In [3]:
import json
import re

import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)




In [2]:
# Load the extraction results into a dict. The encoding is pinned to UTF-8 so
# that the read does not depend on the platform's locale default.
extraction_path = "data/deepseek_finetuned.json"
with open(extraction_path, "r", encoding="utf-8") as f:
    extraction_data = json.load(f)


In [None]:
# Checkpoint to load, given as a Hugging Face Hub id.
# NOTE(review): this looks like the base distilled model id rather than a
# fine-tuned checkpoint — confirm this is the intended model.
model_path = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
# 4-bit loading is requested through an explicit quantization config; passing
# `load_in_4bit=True` directly to `from_pretrained` is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.model_max_length = 1024
# Text-generation pipeline with sampling disabled; the sampling-only settings
# (temperature, top_p) are explicitly set to None.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    temperature=None,
    top_p=None,
    do_sample=False,
)

In [None]:
# System prompt for the name-normalisation task: states the expected JSON
# schema and gives two worked examples. Every ```json fence is closed so the
# examples show the model a complete, well-formed reply.
PREFIX = """
You are a helpful scientific assistant. Given a molecule name, return the full name of the molecule.
You should provide a JSON object with the full name of the molecule.

**JSON Structure:**
```json
{
    "full_name": "The full name of the molecule"
}
```

Only return the JSON object. Ensure that the full name returned can be parsed into SMILES format.

Example 1:
Input: benzylhydrazine (BHC)
Output:
```json
{
    "full_name": "benzylhydrazine"
}
```

Example 2:
Input: PEA2ZnX4
Output:
```json
{
    "full_name": "Phenylethylammonium Zinc Halide"
}
```

Here is the molecule name you need to format:
"""
def create_prompt(system, user, max_user_tokens=60000):
    """Build the chat-format message list passed to the generation pipeline.

    The user text is encoded and decoded with the module-level ``tokenizer``
    so that it can be cut to at most ``max_user_tokens`` tokens.

    Args:
        system: Content of the system message.
        user: Content of the user message; truncated when it is too long.
        max_user_tokens: Maximum number of user-text tokens to keep. The
            default (60000) was chosen to prevent CUDA memory errors on the
            GPU in use.

    Returns:
        A list of two ``{"role": ..., "content": ...}`` dicts: the system
        message followed by the (possibly truncated) user message.
    """
    # add_special_tokens=False: by default encode() may add special tokens
    # (e.g. a beginning-of-sequence marker), and decode() would then write
    # them into the user text as literal markers.
    tokens = tokenizer.encode(
        user,
        max_length=max_user_tokens,
        truncation=True,
        add_special_tokens=False,
    )
    truncated_user = tokenizer.decode(tokens)
    return [
        {"role": "system", "content": system},
        {"role": "user", "content": truncated_user},
    ]

In [None]:
def get_full_passivator(molecule_name):
    """Ask the language model for the full name of a molecule.

    Builds a chat prompt from ``PREFIX`` and ``molecule_name``, runs the
    module-level ``pipe`` on it, prints the raw reply, and parses the JSON
    object found in that reply.

    Args:
        molecule_name: The molecule string to normalise.

    Returns:
        The ``"full_name"`` string from the model's JSON reply, or
        ``molecule_name`` unchanged when the reply contains no JSON object,
        the JSON cannot be parsed, or it has no non-empty ``"full_name"``
        string.
    """
    instruction = create_prompt(PREFIX, molecule_name)
    json_string = pipe(instruction, max_new_tokens=1024)[0]["generated_text"][-1]['content']
    print(json_string)

    # A reasoning model may put a "<think>...</think>" section before its
    # answer; braces inside that section would otherwise be swallowed by the
    # greedy search below and break parsing. Keep only the text after it.
    answer = json_string.rsplit("</think>", 1)[-1]

    json_match = re.search(r"\{.*\}", answer, re.DOTALL)
    if not json_match:
        print("No JSON found")
        return molecule_name

    try:
        parsed_data = json.loads(json_match.group(0).strip())
    except json.JSONDecodeError as e:
        print("Error creating JSON", e)
        return molecule_name

    # A syntactically valid reply can still lack the expected key or carry a
    # non-string/empty value; fall back to the input instead of raising.
    new_molecule = parsed_data.get("full_name")
    if not isinstance(new_molecule, str) or not new_molecule.strip():
        print("No usable full_name in JSON")
        return molecule_name
    return new_molecule

In [None]:
# Replace every "passivating_molecule" stored under the consecutive keys
# "test_1", "test_2", ... of each record with the model-normalised full name.
# NOTE(review): the record displayed for key "0" shows "passivating_molecule"
# at the top level and no "test_N" keys — confirm the schema this loop expects.
for key in extraction_data:
    item = extraction_data[key]
    i = 1
    # The key must be built with an f-string; a plain "test_{i}" literal never
    # matches "test_2", "test_3", ... and would stop after the first entry.
    curr_test = f"test_{i}"
    while curr_test in item:
        unformatted_passivator = item[curr_test]["passivating_molecule"]
        # Entries without a molecule are left untouched.
        if unformatted_passivator is not None:
            full_passivator = get_full_passivator(unformatted_passivator)
            item[curr_test]["passivating_molecule"] = full_passivator
        i += 1
        curr_test = f"test_{i}"


In [12]:
# Display the record stored under key "0".
extraction_data["0"]

{'control_pce': 21.0,
 'control_voc': 1.17,
 'treated_pce': 23.77,
 'treated_voc': 1.145,
 'passivating_molecule': 'Cyclohexylmethylammonium iodide (CMAI)',
 'perovskite_composition': 'α-formamidinium lead triiodide (FAPbI3)',
 'electron_transport_layer': 'Fluorine-doped tin oxide (FTO)',
 'pin_nip_structure': 'NIP',
 'hole_transport_layer': 'Mixed SAMs (2PACz and Me-4PACz)',
 'stability_tests': [{'test_name': 'ISOS-D-2I',
   'temperature': 85,
   'time': 1500,
   'humidity': 50,
   'control_efficiency': 23.2,
   'treatment_efficiency': 19.9},
  {'test_name': 'ISOS-L-3',
   'temperature': 65,
   'time': 1200,
   'humidity': 50,
   'control_efficiency': 23.2,
   'treatment_efficiency': 14.2},
  {'test_name': 'ISOS-T',
   'temperature': 65,
   'time': 500,
   'humidity': 50,
   'control_efficiency': None,
   'treatment_efficiency': None},
  {'test_name': 'ISOS-LC',
   'temperature': None,
   'time': None,
   'humidity': None,
   'control_efficiency': None,
   'treatment_efficiency': None