In [36]:
import json
import os
import re
from urllib.parse import quote

import pandas as pd
import requests
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables from a local .env file; the API keys read later via os.getenv come from here.
load_dotenv()

True

In [28]:
# Load the extraction results (JSON) that are scanned below for passivating molecules.
extraction_path = "data/deepseek_finetuned_formatted.json"
# Read explicitly as UTF-8 so decoding does not depend on the platform's default encoding.
with open(extraction_path, "r", encoding="utf-8") as f:
    extraction_data = json.load(f)


In [29]:
# Load the stored name -> IUPAC conversions; names already present here are not re-queried.
passivator_conversions_path = "data/passivator_conversions.json"
# Read explicitly as UTF-8 so decoding does not depend on the platform's default encoding.
with open(passivator_conversions_path, "r", encoding="utf-8") as f:
    passivator_conversions = json.load(f)

In [30]:
def get_unconverted_passivators(extraction_data, conversions=None):
    """Collect passivator names that have no stored conversion yet.

    Looks for the key ``"passivating_molecule"`` directly on each record of
    ``extraction_data`` and one level down, inside any dict-valued field of
    the record.

    Args:
        extraction_data: Dict mapping a record key to a record dict.
        conversions: Container of names that are already converted. Defaults
            to the notebook-level ``passivator_conversions``.

    Returns:
        A sorted list of unique names that are not in ``conversions`` and are
        not the placeholder string ``'none'``.
    """
    if conversions is None:
        conversions = passivator_conversions
    search_key = "passivating_molecule"
    found_names = set()
    for item in extraction_data.values():
        # Records that are not dicts cannot carry the key; skip instead of crashing.
        if not isinstance(item, dict):
            continue
        for paper_key, value in item.items():
            if paper_key == search_key:
                candidate = value
            elif isinstance(value, dict) and search_key in value:
                candidate = value[search_key]
            else:
                continue
            # Only strings are usable names; this also drops None and unhashable
            # values (lists/dicts) that would make set.add() raise TypeError.
            if isinstance(candidate, str):
                found_names.add(candidate)
    # Sort so the result (and the prompt built from it) is reproducible across runs.
    return sorted(
        name for name in found_names
        if name != 'none' and name not in conversions
    )
        

In [88]:
# System prompt for the name -> IUPAC conversion calls.
# The model is asked to reply with a JSON object keyed by the molecule names it was given.
PREFIX = """
Role:
You are a helpful scientific assistant specializing in passivating molecules for perovskite solar cells.

Task:
Given a list of molecule names, provide the full name of each molecule in IUPAC nomenclature. The IUPAC name must be parseable into SMILES format.

Rules:
-If the molecule name is already in IUPAC format, return it as is.
-If there are multiple molecule names given, just use the first one.
-If the name contains additional descriptive words (e.g., "passivating," "functionalized"), extract only the molecule name and convert it to IUPAC format.
-If the full molecule name cannot be parsed into SMILES after fully reasoning through it multiple times, return null.
-Ensure no molecules are left out.

Output Format:
Provide a JSON object where each key is the provided molecule name and the value is the corresponding IUPAC name or null.

{
    "molecule_name_1": "IUPAC_name_or_null",
    "molecule_name_2": "IUPAC_name_or_null"
}
Important Notes:

-Focus only on the parts of the string that represent the molecule name.

-Double-check that the IUPAC name can be parsed into SMILES. If not, return null.
-If the molecule is not relevant to passivating perovskite solar cells, return null.

Example Output:

{
    "ethylammonium bromide": "ethylammonium bromide",
    "passivating 2-phenylethylamine": "2-phenylethylamine",
    "CF3PEAI": "2-(4-(Trifluoromethyl)phenyl)ethylammonium iodide",
    "2D perovskite": null
}
Begin converting!
"""

In [89]:
def get_new_passivator_conversions(passivators, api_key_name="OPENAI_API_KEY", base_url="https://api.openai.com/v1/", model_name="gpt-4o"):
    """Ask a chat-completion model to convert molecule names to IUPAC names.

    Sends ``PREFIX`` as the system prompt and the names as the user message,
    then parses the span from the first ``{`` to the last ``}`` of the reply
    as JSON.

    Args:
        passivators: Iterable of molecule names to convert.
        api_key_name: Name of the environment variable that holds the API key.
        base_url: Base URL handed to the ``OpenAI`` client.
        model_name: Model identifier passed to the chat-completions call.

    Returns:
        The parsed JSON object from the reply (the prompt asks for a mapping
        of each given name to an IUPAC name or null). An empty dict is
        returned when there is nothing to convert, when no JSON object is
        found in the reply, or when the JSON cannot be decoded.

    Raises:
        ValueError: If the environment variable ``api_key_name`` is unset or empty.
    """
    names = list(passivators) if passivators else []
    # Nothing to convert: skip the (slow, billed) API round trip entirely.
    if not names:
        return {}

    api_key = os.getenv(api_key_name)
    # Fail loudly here: with api_key=None the OpenAI client falls back to the
    # OPENAI_API_KEY environment variable, which could then be sent to a
    # different provider's base_url.
    if not api_key:
        raise ValueError(f"Environment variable {api_key_name!r} is not set")

    client = OpenAI(api_key=api_key, base_url=base_url)
    messages = [
        {"role": "system", "content": PREFIX},
        # Send the names as a JSON array so quoting is unambiguous and matches
        # the JSON keys the prompt asks the model to echo back.
        {"role": "user", "content": json.dumps(names, ensure_ascii=False, default=str)},
    ]
    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        stream=False
    )
    # The message content can be None; treat that the same as an empty reply.
    output = response.choices[0].message.content or ""
    # Greedy match: the reply may wrap the JSON object in extra text or a code fence.
    json_match = re.search(r"\{.*\}", output, re.DOTALL)
    if not json_match:
        print("No JSON found")
        return {}

    raw_json = json_match.group(0).strip()
    try:
        return json.loads(raw_json)
    except json.JSONDecodeError as e:
        print("Error creating JSON", e)
        return {}

In [64]:
# Collect the passivator names from the extraction data that have no stored conversion yet.
unconverted_passivators = get_unconverted_passivators(extraction_data)

In [None]:
# First pass: convert the unconverted names with the DeepSeek reasoning model.
# The keyword must be `model_name` (the function's parameter); `model=` raises TypeError.
new_passivators = get_new_passivator_conversions(
    unconverted_passivators,
    api_key_name="DEEPSEEK_API_KEY",
    base_url="https://api.deepseek.com",
    model_name="deepseek-reasoner",
)

In [90]:
### second pass to ensure not missing anything (GPT-4o takes less time for this)
# Split the first-pass result into names the model gave up on (null) and resolved names.
null_passivators = [name for name, iupac in new_passivators.items() if iupac is None]
extracted_passivators = {name: iupac for name, iupac in new_passivators.items() if iupac is not None}

# Only call the API again when there is something left to retry.
second_pass = get_new_passivator_conversions(null_passivators) if null_passivators else {}

In [91]:
# Merge the second-pass answers into the first-pass results (second-pass values win on key clashes).
# NOTE(review): second_pass is not filtered, so any name the model still answered with null is merged in as None — confirm this is intended.
extracted_passivators.update(second_pass)

In [92]:
### cleaning any multiples:
# When the model returned a dict of alternatives for one name, keep only its first value.
# Only values of existing keys are reassigned, so mutating during iteration is safe.
for key, value in extracted_passivators.items():
    if isinstance(value, dict):
        # An empty dict has no first value; store None instead of raising.
        extracted_passivators[key] = next(iter(value.values()), None)

In [93]:
# Display the merged name -> IUPAC mapping for a manual check.
extracted_passivators

{'EDTA': "2,2',2'',2'''-(ethane-1,2-diyldinitrilo)tetraacetic acid",
 'azetidinium iodide': 'azetidinium iodide',
 'CF3PEAI': '2-(4-(trifluoromethyl)phenyl)ethylammonium iodide',
 'methylammonium chloride': 'methylammonium chloride',
 'cyclohexylmethylammonium iodide': 'cyclohexylmethylazanium iodide',
 'n-butylamine acetate': 'butan-1-aminium acetate',
 'benzene': 'benzene',
 '4-fluoroaniline': '4-fluoroaniline',
 'formamidinium formate (FAHCOO)': 'formamidinium formate',
 'NH4Cl': 'ammonium chloride',
 '4-guanidinobenzoic acid hydrochloride (GBAC)': '4-guanidinobenzoic acid hydrochloride',
 '4-chlorophenylethylammonium iodide': '2-(4-chlorophenyl)ethylazanium iodide',
 'Choline chloride': '(2-hydroxyethyl)trimethylazanium chloride',
 'anisole': 'methoxybenzene',
 'pentanamidine hydrochloride': 'pentane-1,5-dicarboximidamide hydrochloride',
 'phenethylammonium iodide (PEAI)': '2-phenylethylazanium iodide',
 'sodium thioglycolate': 'sodium 2-sulfanylacetate',
 'NaF': 'sodium fluoride',

In [94]:
# Add the new conversions to the stored mapping (new values overwrite existing entries with the same name).
passivator_conversions.update(extracted_passivators)

In [96]:
# Write the updated mapping back to the file it was loaded from; reusing the
# path variable keeps the load and save locations from drifting apart.
with open(passivator_conversions_path, 'w', encoding="utf-8") as f:
    json.dump(passivator_conversions, f)

In [86]:
# Check each first-pass IUPAC name by requesting its SMILES from the OPSIN web service.
# NOTE(review): this iterates new_passivators, so second-pass conversions (held in
# extracted_passivators) are not checked here — confirm whether they should be.
base_url = "https://opsin.ch.cam.ac.uk/opsin/"
results = {}
for passivator in new_passivators.values():
    # Skip nulls and any non-string value, which cannot be put into the URL.
    if not isinstance(passivator, str):
        continue
    # Percent-encode the name so characters such as '#', '?' or '%' stay part of the path.
    smiles_url = base_url + quote(passivator) + ".smi"
    try:
        # Without a timeout a stalled connection would hang the cell indefinitely.
        r = requests.get(smiles_url, timeout=30)
    except requests.RequestException as e:
        print("OPSIN request failed for", passivator, e)
        results[passivator] = None
        continue
    # Any non-200 status is recorded as None (no SMILES obtained for this name).
    results[passivator] = r.text if r.status_code == 200 else None

In [87]:
# Display the IUPAC name -> SMILES lookups (None where no SMILES was obtained).
results

{"2,2',2'',2'''-(ethane-1,2-diyldinitrilo)tetraacetic acid": 'C(CN(CC(=O)O)CC(=O)O)N(CC(=O)O)CC(=O)O',
 'azetidinium iodide': '[I-].[NH2+]1CCC1',
 '2-(4-(trifluoromethyl)phenyl)ethylammonium iodide': '[I-].FC(C1=CC=C(C=C1)CC[NH3+])(F)F',
 'methylammonium chloride': '[Cl-].C[NH3+]',
 'cyclohexylmethylazanium iodide': '[I-].C1(CCCCC1)C[NH3+]',
 'butan-1-aminium acetate': 'C(C)(=O)[O-].C(CCC)[NH3+]',
 'benzene': 'C1=CC=CC=C1',
 '4-fluoroaniline': 'FC1=CC=C(N)C=C1',
 'formamidinium formate': 'C(=O)[O-].C(=[NH2+])N',
 'ammonium chloride': '[Cl-].[NH4+]',
 '4-guanidinobenzoic acid hydrochloride': 'Cl.N(C(=N)N)C1=CC=C(C(=O)O)C=C1',
 '2-(4-chlorophenyl)ethylazanium iodide': '[I-].ClC1=CC=C(C=C1)CC[NH3+]',
 '(2-hydroxyethyl)trimethylazanium chloride': '[Cl-].OCC[N+](C)(C)C',
 'methoxybenzene': 'COC1=CC=CC=C1',
 'pentane-1,5-dicarboximidamide hydrochloride': 'Cl.C(CCCCC(N)=N)C(N)=N',
 '2-phenylethylazanium iodide': '[I-].C1(=CC=CC=C1)CC[NH3+]',
 'sodium 2-sulfanylacetate': 'SCC(=O)[O-].[Na+]',
 