In [46]:
import pandas as pd
import ast
import numpy as np
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors

In [48]:
annotation_df = pd.read_csv('annotation.csv')
annotation_df

Unnamed: 0,first_num,output
0,0,{'perovskite_composition': 'Cs 0.05 FA 0.85 MA...
1,1,"{'perovskite_composition': None, 'electron_tra..."
2,2,{'perovskite_composition': 'dibutylammonium le...
3,3,{'perovskite_composition': 'Cs0.05(MA0.10FA0.8...
4,4,{'perovskite_composition': '(MAPbBr3)0.05(FAPb...
...,...,...
144,145,{'perovskite_composition': 'Cs0.05(MA0.17FA0.8...
145,146,{'perovskite_composition': 'Cs0.05(MA0.05FA0.9...
146,147,{'perovskite_composition': 'formamidinium lead...
147,148,{'perovskite_composition': 'Cs0.05FA0.85MA0.10...


In [50]:
sample_data = annotation_df['output']


# Convert string representation of dictionary into actual dictionary
expanded_data = []
for row in sample_data:
    row_dict = ast.literal_eval(row)  # Convert string to dictionary
    
    # Extract common fields
    common_fields = {
        "perovskite_composition": row_dict.get("perovskite_composition"),
        "electron_transport_layer": row_dict.get("electron_transport_layer"),
        "hole_transport_layer": row_dict.get("hole_transport_layer"),
        "structure_pin_nip": row_dict.get("structure_pin_nip"),
    }

    # Extract test data
    for key, test_data in row_dict.items():
        if key.startswith("test_"):
            test_row = common_fields.copy()
            test_row["test"] = key  # Store test name
            test_row.update(test_data)  # Merge test details
            expanded_data.append(test_row)

# Convert list of dictionaries into DataFrame
df_expanded = pd.DataFrame(expanded_data)

# Fill missing passivating_molecule values based on the first test in each group
df_expanded['passivating_molecule'] = df_expanded.groupby('perovskite_composition')['passivating_molecule'].transform(lambda x: x.ffill())

In [52]:
df_expanded

Unnamed: 0,perovskite_composition,electron_transport_layer,hole_transport_layer,structure_pin_nip,test,stability_type,passivating_molecule,humidity,temperature,time,control_pce,treated_pce,control_voc,treated_voc,efficiency_control,efficiency_tret,efficiency_cont
0,Cs 0.05 FA 0.85 MA 0.1 PbI 3,C60,2PACz and Me-4PACz,PIN,test_1,ISOSL,4-chlorobenzenesulfonate (4Cl-BZS),,65.0,1200.0,24.0,26.90,,1.18,,95.00,
1,,TinOxide,"poly[bis(4-phenyl) (2,4,6-trimethylphenyl)amin...",PIN,test_1,ISOST,,,85.0,500.0,,19.10,,1.16,,,
2,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_1,ISOSL,butylamine,85.0,25.0,1620.0,,24.30,,1.18,,0.98,0.58
3,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_1_2,ISOSD,butylamine,85.0,85.0,1056.0,,21.30,,,,0.94,
4,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_2,ISOSLT,butylamine,,25.0,1620.0,,24.00,,,,0.98,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,formamidinium lead iodide,,Spiro-OMeTAD,NIP,test_1,ISOSD,carbazole-triphenylamine and phenylammonium io...,85.0,85.0,1000.0,22.3,,,1.11,,0.92,
230,formamidinium lead iodide,,Spiro-OMeTAD,NIP,test_2,ISOSLT,carbazole-triphenylamine and phenylammonium io...,,,1100.0,,24.70,,,,0.94,0.66
231,Cs0.05FA0.85MA0.10Pb(I0.97Br0.03)3,,Spiro-OMeTAD,NIP,test_1_2,ISOSLT,phenylethylammonium iodide,507.0,,500.0,,0.84,,,,,0.70
232,Cs0.05FA0.85MA0.10Pb(I0.97Br0.03)3,,Spiro-OMeTAD,NIP,test_1,,4-tert-butyl-benzylammonium iodide,,,,21.2,22.70,1.09,1.12,,0.95,


In [54]:
df_expanded.isna().sum()

perovskite_composition       71
electron_transport_layer    105
hole_transport_layer         96
structure_pin_nip            73
test                          0
stability_type               73
passivating_molecule        131
humidity                    156
temperature                 112
time                         58
control_pce                 163
treated_pce                  92
control_voc                 195
treated_voc                 156
efficiency_control          234
efficiency_tret              40
efficiency_cont             137
dtype: int64

In [56]:
# Function to select columns
def select_data(df):
    # Convert PCE and VOC to numeric
    for col in ['control_pce', 'control_voc', 'treated_pce', 'treated_voc']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Drop rows where treated_pce or passivating_molecule is missing
    df = df.dropna(subset=['treated_pce', 'passivating_molecule', 'perovskite_composition'])

    return df

data = select_data(df_expanded)
data.head()

Unnamed: 0,perovskite_composition,electron_transport_layer,hole_transport_layer,structure_pin_nip,test,stability_type,passivating_molecule,humidity,temperature,time,control_pce,treated_pce,control_voc,treated_voc,efficiency_control,efficiency_tret,efficiency_cont
0,Cs 0.05 FA 0.85 MA 0.1 PbI 3,C60,2PACz and Me-4PACz,PIN,test_1,ISOSL,4-chlorobenzenesulfonate (4Cl-BZS),,65.0,1200.0,24.0,26.9,,1.18,,95.0,
2,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_1,ISOSL,butylamine,85.0,25.0,1620.0,,24.3,,1.18,,0.98,0.58
3,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_1_2,ISOSD,butylamine,85.0,85.0,1056.0,,21.3,,,,0.94,
4,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_2,ISOSLT,butylamine,,25.0,1620.0,,24.0,,,,0.98,
6,butylammonium lead iodide,buckminsterfullerene,Spiro-OMeTAD,NIP,test_1,,iso-butylamine iodide,,,,,22.1,,1.13,,,


In [58]:
def fetch_smiles_from_name(molecule_name):
    try:
        # Search for the molecule in PubChem by name
        compounds = pcp.get_compounds(molecule_name, 'name')
        if compounds:
            return compounds[0].isomeric_smiles  # Return the first match's SMILES
        else:
            return np.nan
    except Exception as e:
        print(f"Error fetching SMILES for {molecule_name}: {e}")
        return None

In [60]:
import re

def fix_unmatched_brackets(s):
    """
    Fixes unmatched brackets in the given string by adding the correct brackets where necessary.

    :param s: Input string with potential unmatched brackets.
    :return: A corrected string with properly balanced brackets.
    """
    opening = "({["
    closing = ")}]"
    match = {')': '(', '}': '{', ']': '['}
    stack = []

    # Step 1: Identify missing closing brackets
    fixed_s = []
    for char in s:
        if char in opening:
            stack.append(char)
            fixed_s.append(char)
        elif char in closing:
            if stack and stack[-1] == match[char]:
                stack.pop()
                fixed_s.append(char)
            else:
                # Add missing opening bracket before unmatched closing
                fixed_s.insert(0, match[char])
                fixed_s.append(char)
        else:
            fixed_s.append(char)

    # Step 2: Add missing closing brackets at the end
    while stack:
        open_bracket = stack.pop()
        fixed_s.append(closing[opening.index(open_bracket)])

    return "".join(fixed_s)


def get_chemical_names(chemical_list):
    cleaned_list = []
    for name in chemical_list:
        # Remove text inside parentheses only if it's extra information (abbreviations)
        name = re.sub(r"\s*\([^)]*\)$", "", name).strip() 
        # Remove spaces after a closing bracket (ensure proper chemical formatting)
        name = re.sub(r"\] +", "]", name)

        cleaned_list.append(name)

    return cleaned_list

In [62]:
data['passivating_molecule']

0                     4-chlorobenzenesulfonate (4Cl-BZS)
2                                             butylamine
3                                             butylamine
4                                             butylamine
6                                  iso-butylamine iodide
                             ...                        
228                                      ortho-carborane
230    carbazole-triphenylamine and phenylammonium io...
231                           phenylethylammonium iodide
232                   4-tert-butyl-benzylammonium iodide
233                     4-trifluoromethyl-phenylammonium
Name: passivating_molecule, Length: 67, dtype: object

In [70]:
fetch_smiles_from_name(get_chemical_names([fix_unmatched_brackets('4-chlorobenzenesulfonate (4Cl-BZS)')]))

'C1=CC(=CC=C1S(=O)(=O)[O-])Cl'

In [74]:
lst = data['passivating_molecule']
lst

0                     4-chlorobenzenesulfonate (4Cl-BZS)
2                                             butylamine
3                                             butylamine
4                                             butylamine
6                                  iso-butylamine iodide
                             ...                        
228                                      ortho-carborane
230    carbazole-triphenylamine and phenylammonium io...
231                           phenylethylammonium iodide
232                   4-tert-butyl-benzylammonium iodide
233                     4-trifluoromethyl-phenylammonium
Name: passivating_molecule, Length: 67, dtype: object

In [76]:
cleaned_list = lst.apply(lambda x: fix_unmatched_brackets(x))
cleaned_list

0                     4-chlorobenzenesulfonate (4Cl-BZS)
2                                             butylamine
3                                             butylamine
4                                             butylamine
6                                  iso-butylamine iodide
                             ...                        
228                                      ortho-carborane
230    carbazole-triphenylamine and phenylammonium io...
231                           phenylethylammonium iodide
232                   4-tert-butyl-benzylammonium iodide
233                     4-trifluoromethyl-phenylammonium
Name: passivating_molecule, Length: 67, dtype: object

In [78]:
cleaned_list = get_chemical_names(cleaned_list)

In [80]:
cleaned_list

['4-chlorobenzenesulfonate',
 'butylamine',
 'butylamine',
 'butylamine',
 'iso-butylamine iodide',
 'phenylethylammonium iodide',
 'EDBE',
 'intermediate negative ΔE vac',
 'butylammonium bromide',
 '2-thiopheneethylammonium chloride',
 '2-thiopheneethylammonium chloride',
 'chlorophenylethylammonium iodide',
 'fluorophenylethylammonium iodide',
 'CF3-phenethylammonium',
 '3,4,5-trifluoroanilinium',
 '3,4,5-trifluoroanilinium',
 'cyclohexylmethylammonium iodide dissolved in',
 'phenethylammonium iodide',
 '3-(aminomethyl)pyridine',
 'tri-octyl phosphine oxide',
 '2-thiophenemethylammonium bromide',
 'azetidinium lead iodide',
 'Lithium Fluoride',
 'DMePDAI 2',
 'DMePDAI 2',
 'NIP',
 'NOTE:This is how to prepare the PSC,relevent...?',
 'methylammonium lead iodide',
 'methylammonium',
 'methylammonium lead iodide',
 'poly',
 'CF3 -PEAI',
 'CF3 -PEAI',
 'CF3 -PEAI',
 '4-fluorophenylethylammonium iodide',
 '(phenethylamino)methaniminium iodide',
 'Methylammonium Lead Bromide',
 'HTAB',
 '

In [84]:
data

Unnamed: 0,perovskite_composition,electron_transport_layer,hole_transport_layer,structure_pin_nip,test,stability_type,passivating_molecule,humidity,temperature,time,control_pce,treated_pce,control_voc,treated_voc,efficiency_control,efficiency_tret,efficiency_cont,passivating_molecule_cleaned
0,Cs 0.05 FA 0.85 MA 0.1 PbI 3,C60,2PACz and Me-4PACz,PIN,test_1,ISOSL,4-chlorobenzenesulfonate (4Cl-BZS),,65.0,1200.0,24.0,26.90,,1.18,,95.00,,4-chlorobenzenesulfonate
2,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_1,ISOSL,butylamine,85.0,25.0,1620.0,,24.30,,1.18,,0.98,0.58,butylamine
3,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_1_2,ISOSD,butylamine,85.0,85.0,1056.0,,21.30,,,,0.94,,butylamine
4,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_2,ISOSLT,butylamine,,25.0,1620.0,,24.00,,,,0.98,,butylamine
6,butylammonium lead iodide,buckminsterfullerene,Spiro-OMeTAD,NIP,test_1,,iso-butylamine iodide,,,,,22.10,,1.13,,,,iso-butylamine iodide
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,Cs0.05(MA0.05FA0.95)0.95Pb(I0.95Br0.05)3,,,,test_1,,ortho-carborane,30.0,40.0,400.0,,23.00,,1.17,,0.97,0.89,ortho-carborane
230,formamidinium lead iodide,,Spiro-OMeTAD,NIP,test_2,ISOSLT,carbazole-triphenylamine and phenylammonium io...,,,1100.0,,24.70,,,,0.94,0.66,carbazole-triphenylamine and phenylammonium io...
231,Cs0.05FA0.85MA0.10Pb(I0.97Br0.03)3,,Spiro-OMeTAD,NIP,test_1_2,ISOSLT,phenylethylammonium iodide,507.0,,500.0,,0.84,,,,,0.70,phenylethylammonium iodide
232,Cs0.05FA0.85MA0.10Pb(I0.97Br0.03)3,,Spiro-OMeTAD,NIP,test_1,,4-tert-butyl-benzylammonium iodide,,,,21.2,22.70,1.09,1.12,,0.95,,4-tert-butyl-benzylammonium iodide


In [86]:
data['passivating_molecule_cleaned'] = cleaned_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['passivating_molecule_cleaned'] = cleaned_list


In [30]:
# Example input list
lst = data['passivating_molecule']

# Cleaning the list
cleaned_list = lst.apply(lambda x: fix_unmatched_brackets(x))
cleaned_list = get_chemical_names(cleaned_list)

# Output result

In [44]:
data['passivating_molecule_SMILES'] = data['passivating_molecule_cleaned'].apply(fetch_smiles_from_name)

Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fetching SMILES for nan: 'float' object is not iterable
Error fe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['passivating_molecule_SMILES'] = data['passivating_molecule_cleaned'].apply(fetch_smiles_from_name)


In [106]:
pd.read_json('/Users/kanggun/Documents/DSC180_B11_Q2/data/json_output_deepseek8b/deepseek_8b_newschema_1_OG.json')

Unnamed: 0,77,104,79,138,144,37,143,30,2,93,...,34,147,6,97,46,135,1,90,41,132
perovskite_composition,FA0.9Cs0.1PbI3,FAPbI3,(FAPbI3)0.97 (MAPbBr3)0.03,Cs 0.05 MA 0.10 FA 0.85 PbI 3,Cs0.1FA0.6MA0.3Sn0.5Pb0.5I3,Cs0.2FA0.8Pb(I0.6Br0.4)3,(FAPbI3)0.77 (MAPbBr3)0.14 (CsPbI3)0.09,FA0.9Cs0.1PbI3,(BA)₂PbI₄,FA 1-x MA x PbI 2.87 Br 0.13 (Cl),...,,FAPbI3,MAPbI3,Cs 0.05 (FA 0.92 MA 0.08) 0.95 Pb(I 0.92 Br 0....,CsPbI0.05[(FAPbI3)0.89 (MAPbBr3)0.11]0.95,Cs0.05FA0.85MA0.1PbI3,FA1-x MAx PbI3,(FAPbI3)0.85 (MAPbBr3)0.15,(FAPbI3)0.98(CsI)0.05(PEAI)0.04(MACl)0.05,Cs0.05(FAPbI3)0.85(MAPbBr3)0.15
electron_transport_layer,C60,SnO2,SnO2,FTO,PEDOT:PSS,NiOx,CuSCN,,FTO (fluorine-doped tin(IV) oxide),SnO2,...,,SnO2,TiO2,,C60,2D passivation layer (CF3-PEAI and MAI mixture),SnO2,SnO2,TiO2,SnO2
pin_nip_structure,PIN,PIN,n-i-p,PIN,Not explicitly mentioned,,NIP,PIN,PIN,PIN,...,,NIP,PIN,NIP,PIN,PIN,,PIN,,PIN
hole_transport_layer,PTAA,Spiro-OMeTAD,Spiro-OMeTAD,2PACz:3-MPA,PTAA,Me-4PACz,CuSCN,Spiro-OMeTAD,"Spiro-OMeTAD, P3HT, PTAA",spiro-OMeTAD,...,,Spiro-OMeTAD,spiro-OMeTAD,PTAA,PTAA,NiOx/MeO-4PADBC,spiro-OMeTAD,Spiro-OMeTAD,Spiro-OMeTAD,spiro-OMeTAD
test_1,"{'test_name': 'ISOS-L', 'temperature': 85, 'ti...","{'test_name': 'ISOS-L', 'temperature': 45, 'ti...","{'test_name': 'ISOS-D', 'temperature': '28 °C'...","{'test_name': 'ISOS-L', 'temperature': 65, 'ti...","{'test_name': 'ISOS-D', 'temperature': 25, 'ti...","{'test_name': 'ISOS-LT', 'temperature': None, ...","{'test_name': 'ISOS-D', 'temperature': 25, 'ti...","{'test_name': 'shelf stability', 'temperature'...","{'test_name': 'damp heat test', 'temperature':...",,...,,"{'test_name': 'Damp heat test', 'temperature':...","{'test_name': 'ISOS-D', 'temperature': 90, 'ti...","{'test_name': 'MPP tracking', 'temperature': 1...","{'test_name': 'ISOS-L', 'temperature': 25, 'ti...","{'test_name': 'ISOS-L', 'temperature': 65, 'ti...","{'test_name': 'ISOS-D', 'temperature': 85, 'ti...","{'test_name': None, 'temperature': None, 'time...","{'test_name': None, 'temperature': None, 'time...","{'test_name': 'Wetting stability', 'temperatur..."
test_2,,,"{'test_name': 'ISOS-L', 'temperature': '28 °C'...",,,,,,,,...,,,,,,"{'test_name': 'ISOS-T', 'temperature': 85, 'ti...",,,,"{'test_name': 'Thermal stability', 'temperatur..."
passivating_molecule,,,,,,"1,3-propylene diammonium iodide (PDAI2)",,NH3,(BA)₂PbI₄,,...,,,,Oleylamine (OAm),,,,,o-PDEAI2,
test_3,,,,,,,,,,,...,,,,,,"{'test_name': 'ISOS-LC', 'temperature': 100, '...",,,,"{'test_name': 'Illumination stability', 'tempe..."
control_pce,,,,,,,,,,,...,,,,,,,,,,
control_voc,,,,,,,,,,,...,,,,,,,,,,


In [110]:
pd.read_json('/Users/kanggun/Documents/DSC180_B11_Q2/data/json_output_deepseek8b_newschema/deepseek_4_updateprompt.json')

Unnamed: 0,17,100,107,10,109,62,19,65,147,81
perovskite_composition,Methylammonium Lead Iodide,,MA0.7FA0.3PbI3,Cs 0.12 FA 0.8 MA 0.08 PbI 1.8 Br 1.2,Cs 0.05(FA0.95MA0.05)0.95Pb(I0.95Br0.05)3,Cs0.15FA0.85PbI2.55Br0.45,Formamidinium lead triiodide,CH3NH3PbI3,(MAPbBr3)0.05(FAPbI3)0.95,
electron_transport_layer,,titanium oxynitride,C60,PCBM,buckminsterfullerene,,,SnOx,SnO2,EDTA-complexed SnO2
structure_pin_nip,NIP,NIP,PIN,PIN,PIN,PIN,NIP,PIN,NIP,
hole_transport_layer,,P3HT:CuPc,PTAA,2PACz,poly(triarylamine),,,Al:ZnO,Spiro-OMeTAD,spiro-OMeTAD
test_1,"{'stability_type': 'ISOS-L', 'temperature': 10...","{'stability_type': 'ISOS-L', 'temperature': 85...","{'stability_type': 'ISOS-LT', 'temperature': 6...","{'stability_type': 'thermal stability', 'tempe...","{'stability_type': 'ISOS-LT', 'temperature': 2...","{'stability_type': 'ISOS-D1', 'temperature': 2...","{'stability_type': 'ISOS-D', 'temperature': 25...","{'stability_type': 'ISOS-D', 'temperature': 25...",,"{'stability_type': 'ISOS-D', 'temperature': 12..."
test_2,"{'stability_type': 'ISOS-T', 'temperature': 60...",,,,,"{'stability_type': 'ISOS-D3', 'temperature': 8...","{'stability_type': 'ISOS-L', 'temperature': 25...",,,
test_3,,,,,,"{'stability_type': 'ISOS-O3', 'temperature': N...","{'stability_type': 'ISOS-LT', 'temperature': 5...",,,
