In [1]:
import pandas as pd
import json
import re
from openai import OpenAI

In [3]:
extraction_path = "data/deepseek_finetuned.json"
# Pin the encoding: the extracted records contain non-ASCII text
# (e.g. "α-formamidinium ..."), and without an explicit encoding open()
# falls back to the platform default, which is not UTF-8 everywhere.
with open(extraction_path, "r", encoding="utf-8") as f:
    extraction_data = json.load(f)


In [2]:
def get_all_passivators(extraction_data):
    """Collect every distinct value stored under "passivating_molecule".

    For each record in ``extraction_data`` the key is looked for at the
    record's top level and one level down, inside any field whose value is a
    dict. Values are added as-is (including ``None``).

    Args:
        extraction_data: mapping of record key -> record mapping.

    Returns:
        set: the distinct values found.
    """
    target = "passivating_molecule"
    found = set()
    for record_key in extraction_data:
        record = extraction_data[record_key]
        for field in record:
            value = record[field]
            if field == target:
                # Top-level hit: take the value whatever its type.
                found.add(value)
                continue
            # Otherwise look exactly one level down, in dict-valued fields only.
            if isinstance(value, dict) and target in value:
                found.add(value[target])
    return found
        

In [4]:
# Show the distinct passivator names found in the loaded extraction data.
print(get_all_passivators(extraction_data=extraction_data))

{'TiOxNy', '4-trifluoromethyl-phenylammonium', 'benzene', 'Al2O3', '4-ethyl-1H-pyrrole-2-carboxylic acid', 'PEA2ZnX4', '3F-PEA', 'PSP', 'anisole', '4-vinylbenzylammonium bromide', 'Cl-cPP and Cl-bSO', '(PbI2)2RbCl', 'NaF', '1,3-propane diammonium iodide', 'formamidinium lead iodide (FAPbI3)', 'CB-NH2', 'sodium thioglycolate', 'AZO/SnOx', 'ortho-(phenylene)di(ethylammonium) iodide', 'allylammonium (ALA)', "2,2',6,6'-bis(4-methoxy-2,4,6-trimethylphenyl)-1,3,5-triazine", 'methylammonium chloride', 'azetidinium iodide', 'formamidinium formate (FAHCOO)', 'none', '4-chlorophenylethylammonium iodide', 'Spiro-mF', 'octylammonium iodide', 'pentanamidine hydrochloride', 'Chlorine-capped TiO2', 'Oleylammonium iodide', 'lead sulfate', '4-guanidinobenzoic acid hydrochloride (GBAC)', 'poly(methyl methacrylate)', 'CsMAFA-8', '2-TMAI', '4-tert-butyl-benzylammonium iodide', 'phenylethylammonium iodide (PEAI) and 2-thiophenemethylammonium iodide (TMAI)', 'benzylhydrazine (BHC)', 'b-poly(1,1-difluoroethy

In [None]:
# Prompt template asking the model to map each given molecule name to an
# IUPAC name and to answer with a single JSON object. The JSON example is
# kept strictly valid (no trailing comma) so that a model imitating it
# produces output json.loads() accepts.
# NOTE(review): the template mixes a {molecule_names} placeholder with literal
# braces in the JSON example; that is only safe if create_prompt substitutes
# the placeholder without str.format — confirm against create_prompt.
PREFIX = """
You are a helpful scientific assistant that specializes in passivating molecules for perovskite solar cells. 

**Instructions:**
-Given a list of molecule names, find the full name of each molecule in IUPAC nomenclature. This full name should be able to be parsed into SMILES. 
-Remember to think in the context of passivating molecules for perovskite solar cells.
-It is important to note that the name might already be in IUPAC format, in which case you should return the name as is.
-It is also possible that the name given is not a passivating molecule and cannot be parsed into SMILES. In this case, you should return null.
-You should provide a JSON object where each key is the provided molecule name and the value is the full name of the molecule in IUPAC format.
-Do not leave any molecules behind.

**JSON Structure:**
```json
{
    "1st molecule given": "The full name of the molecule in IUPAC format",
    "2nd molecule given": "The full name of the molecule in IUPAC format"
}
```

Only return the JSON object. Ensure that the full name returned can be parsed into SMILES format.
Here are the molecule names you need to format:

{molecule_names}
"""

In [None]:
def _select_full_name(parsed_data, molecule_name):
    """Pick the resolved name for ``molecule_name`` out of the model's JSON object."""
    # PREFIX asks for {"<given name>": "<IUPAC name>"}, so the given name is
    # the key we expect to find.
    if molecule_name in parsed_data:
        return parsed_data[molecule_name]
    # Still accept the {"full_name": ...} shape this function originally read.
    if "full_name" in parsed_data:
        return parsed_data["full_name"]
    # A single-entry object is unambiguous even if the model re-spelled the key.
    if len(parsed_data) == 1:
        return next(iter(parsed_data.values()))
    raise KeyError(molecule_name)


def get_full_passivator(molecule_name):
    """Ask the model for the full (IUPAC) name of one passivating molecule.

    Builds a prompt from ``PREFIX``, sends it through ``pipe``, prints the raw
    reply, and parses the first ``{...}`` span of the reply as JSON.
    (``create_prompt`` and ``pipe`` are not defined in this part of the
    notebook — presumably a prompt builder and a chat text-generation
    pipeline; confirm where they are created.)

    Args:
        molecule_name: the name as it appears in the extracted data.

    Returns:
        The value the model gave for this molecule. This may be ``None`` when
        the model answers ``null``, which the prompt allows for names it
        cannot resolve. If no JSON is found, the JSON is malformed, or the
        object has no usable entry, ``molecule_name`` is returned unchanged.
    """
    instruction = create_prompt(PREFIX, molecule_name)
    json_string = pipe(instruction, max_new_tokens=1024)[0]["generated_text"][-1]['content']
    print(json_string)
    # Greedy match with DOTALL: spans from the first "{" to the last "}".
    json_match = re.search(r"\{.*\}", json_string, re.DOTALL)
    if json_match is None:
        print("No JSON found")
        return molecule_name
    try:
        parsed_data = json.loads(json_match.group(0).strip())
    except json.JSONDecodeError as e:
        print("Error creating JSON", e)
        return molecule_name
    try:
        return _select_full_name(parsed_data, molecule_name)
    except KeyError:
        # Same best-effort fallback as the other failure paths, instead of
        # letting a missing key abort the whole rewrite run.
        print("No entry for molecule in JSON", molecule_name)
        return molecule_name

In [None]:
# Replace each nested test's passivator name with the model-resolved full name.
# Tests are looked up under consecutive keys "test_1", "test_2", ... and the
# scan of a record stops at the first missing index.
# NOTE(review): only nested "test_N" entries are rewritten here; a
# "passivating_molecule" stored at a record's top level is left untouched —
# confirm that is intended.
for key in extraction_data:
    item = extraction_data[key]
    i = 1
    curr_test = f"test_{i}"
    while curr_test in item:
        unformatted_passivator = item[curr_test]["passivating_molecule"]
        # Nothing to resolve when the extraction recorded no passivator.
        if unformatted_passivator is not None:
            full_passivator = get_full_passivator(unformatted_passivator)
            item[curr_test]["passivating_molecule"] = full_passivator
        i += 1
        # Must be an f-string: the plain literal "test_{i}" never matches a
        # key, which ended the loop after test_1.
        curr_test = f"test_{i}"


In [12]:
# Display the record stored under key "0" as a spot check of the data.
extraction_data["0"]

{'control_pce': 21.0,
 'control_voc': 1.17,
 'treated_pce': 23.77,
 'treated_voc': 1.145,
 'passivating_molecule': 'Cyclohexylmethylammonium iodide (CMAI)',
 'perovskite_composition': 'α-formamidinium lead triiodide (FAPbI3)',
 'electron_transport_layer': 'Fluorine-doped tin oxide (FTO)',
 'pin_nip_structure': 'NIP',
 'hole_transport_layer': 'Mixed SAMs (2PACz and Me-4PACz)',
 'stability_tests': [{'test_name': 'ISOS-D-2I',
   'temperature': 85,
   'time': 1500,
   'humidity': 50,
   'control_efficiency': 23.2,
   'treatment_efficiency': 19.9},
  {'test_name': 'ISOS-L-3',
   'temperature': 65,
   'time': 1200,
   'humidity': 50,
   'control_efficiency': 23.2,
   'treatment_efficiency': 14.2},
  {'test_name': 'ISOS-T',
   'temperature': 65,
   'time': 500,
   'humidity': 50,
   'control_efficiency': None,
   'treatment_efficiency': None},
  {'test_name': 'ISOS-LC',
   'temperature': None,
   'time': None,
   'humidity': None,
   'control_efficiency': None,
   'treatment_efficiency': None