In [1]:
import pandas as pd
import ast
import numpy as np
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors

In [2]:
annotation_df = pd.read_csv('annotation.csv')
annotation_df

Unnamed: 0,first_num,output
0,0,{'perovskite_composition': 'Cs 0.05 FA 0.85 MA...
1,1,"{'perovskite_composition': None, 'electron_tra..."
2,2,{'perovskite_composition': 'dibutylammonium le...
3,3,{'perovskite_composition': 'Cs0.05(MA0.10FA0.8...
4,4,{'perovskite_composition': '(MAPbBr3)0.05(FAPb...
...,...,...
144,145,{'perovskite_composition': 'Cs0.05(MA0.17FA0.8...
145,146,{'perovskite_composition': 'Cs0.05(MA0.05FA0.9...
146,147,{'perovskite_composition': 'formamidinium lead...
147,148,{'perovskite_composition': 'Cs0.05FA0.85MA0.10...


In [3]:
sample_data = annotation_df['output']


# Convert string representation of dictionary into actual dictionary
expanded_data = []
for row in sample_data:
    row_dict = ast.literal_eval(row)  # Convert string to dictionary
    
    # Extract common fields
    common_fields = {
        "perovskite_composition": row_dict.get("perovskite_composition"),
        "electron_transport_layer": row_dict.get("electron_transport_layer"),
        "hole_transport_layer": row_dict.get("hole_transport_layer"),
        "structure_pin_nip": row_dict.get("structure_pin_nip"),
    }

    # Extract test data
    for key, test_data in row_dict.items():
        if key.startswith("test_"):
            test_row = common_fields.copy()
            test_row["test"] = key  # Store test name
            test_row.update(test_data)  # Merge test details
            expanded_data.append(test_row)

# Convert list of dictionaries into DataFrame
df_expanded = pd.DataFrame(expanded_data)

# Fill missing passivating_molecule values based on the first test in each group
df_expanded['passivating_molecule'] = df_expanded.groupby('perovskite_composition')['passivating_molecule'].transform(lambda x: x.ffill())

In [4]:
df_expanded

Unnamed: 0,perovskite_composition,electron_transport_layer,hole_transport_layer,structure_pin_nip,test,stability_type,passivating_molecule,humidity,temperature,time,control_pce,treated_pce,control_voc,treated_voc,efficiency_control,efficiency_tret,efficiency_cont
0,Cs 0.05 FA 0.85 MA 0.1 PbI 3,C60,2PACz and Me-4PACz,PIN,test_1,ISOSL,4-chlorobenzenesulfonate (4Cl-BZS),,65.0,1200.0,24.0,26.90,,1.18,,95.00,
1,,TinOxide,"poly[bis(4-phenyl) (2,4,6-trimethylphenyl)amin...",PIN,test_1,ISOST,,,85.0,500.0,,19.10,,1.16,,,
2,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_1,ISOSL,butylamine,85.0,25.0,1620.0,,24.30,,1.18,,0.98,0.58
3,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_1_2,ISOSD,butylamine,85.0,85.0,1056.0,,21.30,,,,0.94,
4,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_2,ISOSLT,butylamine,,25.0,1620.0,,24.00,,,,0.98,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,formamidinium lead iodide,,Spiro-OMeTAD,NIP,test_1,ISOSD,carbazole-triphenylamine and phenylammonium io...,85.0,85.0,1000.0,22.3,,,1.11,,0.92,
230,formamidinium lead iodide,,Spiro-OMeTAD,NIP,test_2,ISOSLT,carbazole-triphenylamine and phenylammonium io...,,,1100.0,,24.70,,,,0.94,0.66
231,Cs0.05FA0.85MA0.10Pb(I0.97Br0.03)3,,Spiro-OMeTAD,NIP,test_1_2,ISOSLT,phenylethylammonium iodide,507.0,,500.0,,0.84,,,,,0.70
232,Cs0.05FA0.85MA0.10Pb(I0.97Br0.03)3,,Spiro-OMeTAD,NIP,test_1,,4-tert-butyl-benzylammonium iodide,,,,21.2,22.70,1.09,1.12,,0.95,


In [5]:
df_expanded.isna().sum()

perovskite_composition       71
electron_transport_layer    105
hole_transport_layer         96
structure_pin_nip            73
test                          0
stability_type               73
passivating_molecule        131
humidity                    156
temperature                 112
time                         58
control_pce                 163
treated_pce                  92
control_voc                 195
treated_voc                 156
efficiency_control          234
efficiency_tret              40
efficiency_cont             137
dtype: int64

In [6]:
131

131

In [7]:
# Function to select columns
def select_data(df):
    # Convert PCE and VOC to numeric
    for col in ['control_pce', 'control_voc', 'treated_pce', 'treated_voc']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Drop rows where treated_pce or passivating_molecule is missing
    df = df.dropna(subset=['treated_pce', 'passivating_molecule', 'perovskite_composition'])

    return df

data = select_data(df_expanded)
data

Unnamed: 0,perovskite_composition,electron_transport_layer,hole_transport_layer,structure_pin_nip,test,stability_type,passivating_molecule,humidity,temperature,time,control_pce,treated_pce,control_voc,treated_voc,efficiency_control,efficiency_tret,efficiency_cont
0,Cs 0.05 FA 0.85 MA 0.1 PbI 3,C60,2PACz and Me-4PACz,PIN,test_1,ISOSL,4-chlorobenzenesulfonate (4Cl-BZS),,65.0,1200.0,24.0,26.90,,1.18,,95.00,
2,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_1,ISOSL,butylamine,85.0,25.0,1620.0,,24.30,,1.18,,0.98,0.58
3,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_1_2,ISOSD,butylamine,85.0,85.0,1056.0,,21.30,,,,0.94,
4,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_2,ISOSLT,butylamine,,25.0,1620.0,,24.00,,,,0.98,
6,butylammonium lead iodide,buckminsterfullerene,Spiro-OMeTAD,NIP,test_1,,iso-butylamine iodide,,,,,22.10,,1.13,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,Cs0.05(MA0.05FA0.95)0.95Pb(I0.95Br0.05)3,,,,test_1,,ortho-carborane,30.0,40.0,400.0,,23.00,,1.17,,0.97,0.89
230,formamidinium lead iodide,,Spiro-OMeTAD,NIP,test_2,ISOSLT,carbazole-triphenylamine and phenylammonium io...,,,1100.0,,24.70,,,,0.94,0.66
231,Cs0.05FA0.85MA0.10Pb(I0.97Br0.03)3,,Spiro-OMeTAD,NIP,test_1_2,ISOSLT,phenylethylammonium iodide,507.0,,500.0,,0.84,,,,,0.70
232,Cs0.05FA0.85MA0.10Pb(I0.97Br0.03)3,,Spiro-OMeTAD,NIP,test_1,,4-tert-butyl-benzylammonium iodide,,,,21.2,22.70,1.09,1.12,,0.95,


In [8]:
def fetch_smiles_from_name(molecule_name):
    try:
        # Search for the molecule in PubChem by name
        compounds = pcp.get_compounds(molecule_name, 'name')
        if compounds:
            return compounds[0].isomeric_smiles  # Return the first match's SMILES
        else:
            return np.nan
    except Exception as e:
        print(f"Error fetching SMILES for {molecule_name}: {e}")
        return None

In [9]:
import re

def fix_unmatched_brackets(s):
    """
    Fixes unmatched brackets in the given string by adding the correct brackets where necessary.

    :param s: Input string with potential unmatched brackets.
    :return: A corrected string with properly balanced brackets.
    """
    opening = "({["
    closing = ")}]"
    match = {')': '(', '}': '{', ']': '['}
    stack = []

    # Step 1: Identify missing closing brackets
    fixed_s = []
    for char in s:
        if char in opening:
            stack.append(char)
            fixed_s.append(char)
        elif char in closing:
            if stack and stack[-1] == match[char]:
                stack.pop()
                fixed_s.append(char)
            else:
                # Add missing opening bracket before unmatched closing
                fixed_s.insert(0, match[char])
                fixed_s.append(char)
        else:
            fixed_s.append(char)

    # Step 2: Add missing closing brackets at the end
    while stack:
        open_bracket = stack.pop()
        fixed_s.append(closing[opening.index(open_bracket)])

    return "".join(fixed_s)


def get_chemical_names(chemical_list):
    cleaned_list = []
    for name in chemical_list:
        # Remove text inside parentheses only if it's extra information (abbreviations)
        name = re.sub(r"\s*\([^)]*\)$", "", name).strip() 
        # Remove spaces after a closing bracket (ensure proper chemical formatting)
        name = re.sub(r"\] +", "]", name)

        cleaned_list.append(name)

    return cleaned_list

In [10]:
data['passivating_molecule']

0                     4-chlorobenzenesulfonate (4Cl-BZS)
2                                             butylamine
3                                             butylamine
4                                             butylamine
6                                  iso-butylamine iodide
                             ...                        
228                                      ortho-carborane
230    carbazole-triphenylamine and phenylammonium io...
231                           phenylethylammonium iodide
232                   4-tert-butyl-benzylammonium iodide
233                     4-trifluoromethyl-phenylammonium
Name: passivating_molecule, Length: 67, dtype: object

In [11]:
lst = data['passivating_molecule']
lst

0                     4-chlorobenzenesulfonate (4Cl-BZS)
2                                             butylamine
3                                             butylamine
4                                             butylamine
6                                  iso-butylamine iodide
                             ...                        
228                                      ortho-carborane
230    carbazole-triphenylamine and phenylammonium io...
231                           phenylethylammonium iodide
232                   4-tert-butyl-benzylammonium iodide
233                     4-trifluoromethyl-phenylammonium
Name: passivating_molecule, Length: 67, dtype: object

In [12]:
cleaned_list = lst.apply(lambda x: fix_unmatched_brackets(x))
cleaned_list

0                     4-chlorobenzenesulfonate (4Cl-BZS)
2                                             butylamine
3                                             butylamine
4                                             butylamine
6                                  iso-butylamine iodide
                             ...                        
228                                      ortho-carborane
230    carbazole-triphenylamine and phenylammonium io...
231                           phenylethylammonium iodide
232                   4-tert-butyl-benzylammonium iodide
233                     4-trifluoromethyl-phenylammonium
Name: passivating_molecule, Length: 67, dtype: object

In [13]:
cleaned_list = get_chemical_names(cleaned_list)

In [14]:
data['passivating_molecule_cleaned'] = cleaned_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['passivating_molecule_cleaned'] = cleaned_list


In [15]:
data['passivating_molecule_SMILES'] = data['passivating_molecule_cleaned'].apply(fetch_smiles_from_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['passivating_molecule_SMILES'] = data['passivating_molecule_cleaned'].apply(fetch_smiles_from_name)


In [16]:
len(data['passivating_molecule_SMILES'])

67

In [17]:
data['passivating_molecule_SMILES'].isna().sum()

31

In [18]:
data

Unnamed: 0,perovskite_composition,electron_transport_layer,hole_transport_layer,structure_pin_nip,test,stability_type,passivating_molecule,humidity,temperature,time,control_pce,treated_pce,control_voc,treated_voc,efficiency_control,efficiency_tret,efficiency_cont,passivating_molecule_cleaned,passivating_molecule_SMILES
0,Cs 0.05 FA 0.85 MA 0.1 PbI 3,C60,2PACz and Me-4PACz,PIN,test_1,ISOSL,4-chlorobenzenesulfonate (4Cl-BZS),,65.0,1200.0,24.0,26.90,,1.18,,95.00,,4-chlorobenzenesulfonate,C1=CC(=CC=C1S(=O)(=O)[O-])Cl
2,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_1,ISOSL,butylamine,85.0,25.0,1620.0,,24.30,,1.18,,0.98,0.58,butylamine,CCCCN
3,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_1_2,ISOSD,butylamine,85.0,85.0,1056.0,,21.30,,,,0.94,,butylamine,CCCCN
4,dibutylammonium lead iodide,tin dioxide,Spiro-OMeTAD,NIP,test_2,ISOSLT,butylamine,,25.0,1620.0,,24.00,,,,0.98,,butylamine,CCCCN
6,butylammonium lead iodide,buckminsterfullerene,Spiro-OMeTAD,NIP,test_1,,iso-butylamine iodide,,,,,22.10,,1.13,,,,iso-butylamine iodide,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,Cs0.05(MA0.05FA0.95)0.95Pb(I0.95Br0.05)3,,,,test_1,,ortho-carborane,30.0,40.0,400.0,,23.00,,1.17,,0.97,0.89,ortho-carborane,B1=BB=BB=BC=CB=BB=B1
230,formamidinium lead iodide,,Spiro-OMeTAD,NIP,test_2,ISOSLT,carbazole-triphenylamine and phenylammonium io...,,,1100.0,,24.70,,,,0.94,0.66,carbazole-triphenylamine and phenylammonium io...,
231,Cs0.05FA0.85MA0.10Pb(I0.97Br0.03)3,,Spiro-OMeTAD,NIP,test_1_2,ISOSLT,phenylethylammonium iodide,507.0,,500.0,,0.84,,,,,0.70,phenylethylammonium iodide,C1=CC=C(C=C1)CC[NH3+].[I-]
232,Cs0.05FA0.85MA0.10Pb(I0.97Br0.03)3,,Spiro-OMeTAD,NIP,test_1,,4-tert-butyl-benzylammonium iodide,,,,21.2,22.70,1.09,1.12,,0.95,,4-tert-butyl-benzylammonium iodide,CC(C)(C)C1=CC=C(C=C1)CN.I
