### Table of Content
- [`passivating_molecule` into SMILES format](Converting-`passivating_molecule`-into-SMILES-format)
- [`perovskite_composition` into features](`perovskite_composition`-into-features)
- [baseline ML model](baseline-ML-model)

In [2]:
import pandas as pd
import numpy as np
import pubchempy as pcp

In [3]:
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors

In [4]:
df = pd.read_json('data/finetuned_llama_output.json')
data = df.T.sort_index()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 149 entries, 0 to 149
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   control_pce               71 non-null     object
 1   control_voc               53 non-null     object
 2   treated_pce               140 non-null    object
 3   treated_voc               124 non-null    object
 4   passivating_molecule      143 non-null    object
 5   perovskite_composition    134 non-null    object
 6   electron_transport_layer  118 non-null    object
 7   hole_transport_layer      115 non-null    object
 8   pin_nip_structure         147 non-null    object
 9   stability_tests           149 non-null    object
 10  pin_structure             1 non-null      object
dtypes: object(11)
memory usage: 14.0+ KB


In [5]:
# Function to select columns
def select_data(df):
    # Convert PCE and VOC to numeric
    for col in ['control_pce', 'control_voc', 'treated_pce', 'treated_voc']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Drop rows where treated_pce or passivating_molecule is missing
    df = df.dropna(subset=['treated_pce', 'passivating_molecule', 'perovskite_composition'])

    return df

data = select_data(data)
data.head()

Unnamed: 0,control_pce,control_voc,treated_pce,treated_voc,passivating_molecule,perovskite_composition,electron_transport_layer,hole_transport_layer,pin_nip_structure,stability_tests,pin_structure
0,25.7,1.17,26.15,1.18,4-chlorobenzenesulfonate (4Cl-BZS),α-phase FAPbI3,C60,SAMs (self-assembled monolayers),PIN,"[{'test_name': 'ISOS-D-2I', 'temperature': 85,...",
3,24.5,1.2,24.5,1.2,BA2MA2Pb3I10,BA2MA2Pb3I10,SnO2,PTAA,PIN,"[{'test_name': 'ISOS-L-1', 'temperature': None...",
4,,,21.06,1.14,vinylbenzylammonium bromide (VBABr),(FAPbI3)0.95(MAPbBr3)0.05,Spiro-OMeTAD,Spiro-OMeTAD,PIN,"[{'test_name': None, 'temperature': None, 'tim...",
5,,,22.1,1135.0,iso-BAI,FA(MA)PbI3,Spiro-OMeTAD,TPBI,PIN,"[{'test_name': 'ISOS-LT', 'temperature': None,...",
6,,,15.3,1.06,phenylethylammonium,PEA2(CH3NH3)n-1Pb(nI3n+1),TiO2,spiro-OMeTAD,PIN,"[{'test_name': 'ISOS-D', 'temperature': None, ...",


In [6]:
data.isna().sum(axis=0)

control_pce                  59
control_voc                  73
treated_pce                   0
treated_voc                  10
passivating_molecule          0
perovskite_composition        0
electron_transport_layer     19
hole_transport_layer         20
pin_nip_structure             0
stability_tests               0
pin_structure               107
dtype: int64

## `passivating_molecule` into SMILES format

In [8]:
def fetch_smiles_from_name(molecule_name):
    try:
        # Search for the molecule in PubChem by name
        compounds = pcp.get_compounds(molecule_name, 'name')
        if compounds:
            return compounds[0].isomeric_smiles  # Return the first match's SMILES
        else:
            return np.nan
    except Exception as e:
        print(f"Error fetching SMILES for {molecule_name}: {e}")
        return None

In [9]:
data['passivating_molecule']

0                     4-chlorobenzenesulfonate (4Cl-BZS)
3                                           BA2MA2Pb3I10
4                    vinylbenzylammonium bromide (VBABr)
5                                                iso-BAI
6                                    phenylethylammonium
                             ...                        
144                                               GlyHCl
145                                4-fluoroaniline (FAL)
147    4-(3,6-bis(4-(bis(4-methoxyphenyl)amino)phenyl...
148           4-tert-butyl-benzylammonium iodide (tBBAI)
149            4-trifluoromethyl-phenylammonium (CF3-PA)
Name: passivating_molecule, Length: 107, dtype: object

In [10]:
data['passivating_molecule'].apply(fetch_smiles_from_name).isna().sum()

# there are 88 passivating molecules that cannot be converted without cleaning.

84

### Cleaning Data

In [12]:
import re

def fix_unmatched_brackets(s):
    """
    Fixes unmatched brackets in the given string by adding the correct brackets where necessary.

    :param s: Input string with potential unmatched brackets.
    :return: A corrected string with properly balanced brackets.
    """
    opening = "({["
    closing = ")}]"
    match = {')': '(', '}': '{', ']': '['}
    stack = []

    # Step 1: Identify missing closing brackets
    fixed_s = []
    for char in s:
        if char in opening:
            stack.append(char)
            fixed_s.append(char)
        elif char in closing:
            if stack and stack[-1] == match[char]:
                stack.pop()
                fixed_s.append(char)
            else:
                # Add missing opening bracket before unmatched closing
                fixed_s.insert(0, match[char])
                fixed_s.append(char)
        else:
            fixed_s.append(char)

    # Step 2: Add missing closing brackets at the end
    while stack:
        open_bracket = stack.pop()
        fixed_s.append(closing[opening.index(open_bracket)])

    return "".join(fixed_s)


def get_chemical_names(chemical_list):
    cleaned_list = []
    for name in chemical_list:
        # Remove text inside parentheses only if it's extra information (abbreviations)
        name = re.sub(r"\s*\([^)]*\)$", "", name).strip() 
        # Remove spaces after a closing bracket (ensure proper chemical formatting)
        name = re.sub(r"\] +", "]", name)

        cleaned_list.append(name)

    return cleaned_list

In [13]:
# Example input list
lst = data['passivating_molecule']

# Cleaning the list
cleaned_list = lst.apply(lambda x: fix_unmatched_brackets(x))
cleaned_list = get_chemical_names(cleaned_list)

# Output result
data['passivating_molecule_cleaned'] = cleaned_list

In [14]:
data['passivating_molecule_SMILES'] = data['passivating_molecule_cleaned'].apply(fetch_smiles_from_name)

In [15]:
data['passivating_molecule_SMILES'].isna().sum()

64

In [16]:
# temporary smaller data with all SMILES
temp_data = data[~data['passivating_molecule_SMILES'].isna()]

In [17]:
temp_data.isna().sum()

control_pce                     23
control_voc                     30
treated_pce                      0
treated_voc                      4
passivating_molecule             0
perovskite_composition           0
electron_transport_layer         8
hole_transport_layer             8
pin_nip_structure                0
stability_tests                  0
pin_structure                   43
passivating_molecule_cleaned     0
passivating_molecule_SMILES      0
dtype: int64

## `perovskite_composition` into features

In [19]:
import chemparse

# Example formula
formula = "(FAPbI3)0.95(MAPbBr3)0.05"

# Parse formula
result = chemparse.parse_formula(formula)
print(result)  # Output: {'FA': 0.95, 'Pb': 1.0, 'I': 2.85, 'MA': 0.05, 'Br': 0.15}

{'F': 0.95, 'I': 2.8499999999999996, 'A': 1.0, 'Br': 0.15000000000000002, 'M': 0.05, 'Pb': 1.0}


In [20]:
temp_data['perovskite_composition_'] = temp_data['perovskite_composition'].apply(chemparse.parse_formula)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_data['perovskite_composition_'] = temp_data['perovskite_composition'].apply(chemparse.parse_formula)


In [21]:
print(len(temp_data))  # Should be 43

43


## some molecular features

In [23]:
def compute_molecular_features(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            return [
                Descriptors.MolWt(mol),  # Molecular weight
                Descriptors.TPSA(mol),  # Topological Polar Surface Area
                rdMolDescriptors.CalcNumRotatableBonds(mol),  # Rotatable bonds
                rdMolDescriptors.CalcNumHBA(mol),  # Hydrogen bond acceptors
                rdMolDescriptors.CalcNumHBD(mol)  # Hydrogen bond donors
            ]
        else:
            return [np.nan] * 5
    except:
        return [np.nan] * 5

mol_features = temp_data['passivating_molecule_SMILES'].apply(compute_molecular_features)
mol_features_df = pd.DataFrame(mol_features.tolist(), 
                               columns=['MolWt', 'TPSA', 'NumRotBonds', 'NumHBA', 'NumHBD'],
                               index=temp_data.index)

temp_data = pd.concat([temp_data, mol_features_df], axis=1)
temp_data

Unnamed: 0,control_pce,control_voc,treated_pce,treated_voc,passivating_molecule,perovskite_composition,electron_transport_layer,hole_transport_layer,pin_nip_structure,stability_tests,pin_structure,passivating_molecule_cleaned,passivating_molecule_SMILES,perovskite_composition_,MolWt,TPSA,NumRotBonds,NumHBA,NumHBD
0,25.7,1.17,26.15,1.18,4-chlorobenzenesulfonate (4Cl-BZS),α-phase FAPbI3,C60,SAMs (self-assembled monolayers),PIN,"[{'test_name': 'ISOS-D-2I', 'temperature': 85,...",,4-chlorobenzenesulfonate,C1=CC(=CC=C1S(=O)(=O)[O-])Cl,{},191.615,57.2,1,3,0
8,,,23.0,1.177,2-(pyren-1-yl)ethan-1-amine,CsPbI2Br,Spiro-OMeTAD,Spiro-OMeTAD,PIN,"[{'test_name': 'ISOS-LT', 'temperature': None,...",,2-(pyren-1-yl)ethan-1-amine,C1=CC2=C3C(=C1)C=CC4=C(C=CC(=C43)C=C2)CCN,"{'Cs': 1.0, 'Pb': 1.0, 'I': 2.0, 'Br': 1.0}",245.325,26.02,2,1,1
10,,,24.41,1.23,Tosylate ([TsO] -),(FAPbI3) 0.95 (MAPbBr3) 0.05,SnO2,spiro-MeOTAD,PIN,"[{'test_name': 'ISOS-D', 'temperature': None, ...",,Tosylate,CC1=CC=C(C=C1)S(=O)(=O)[O-],"{'F': 1.0, 'I': 3.0, 'A': 2.0, 'Br': 3.0, 'M':...",171.197,57.2,1,3,0
11,,,21.8,1.16,Benzotriazole,FAPbI3,PCBA,Spiro-OMeTAD,NIP,"[{'test_name': None, 'temperature': None, 'tim...",,Benzotriazole,C1=CC2=NNN=C2C=C1,"{'F': 1.0, 'A': 1.0, 'Pb': 1.0, 'I': 3.0}",119.127,41.57,0,2,1
12,,,19.4,1.31,n-butylammonium bromide (BABr),Cs0.17 FA0.83 Pb(I0.6 Br0.4 ) 3,nanoparticle-based SnO 2,"2,2′,7,7′-tetrakis(N, N′-di-p-methoxy phenylam...",PIN,"[{'test_name': 'ISOS-D', 'temperature': None, ...",,n-butylammonium bromide,CCCCN.Br,"{'I': 0.6, 'Cs': 0.17}",154.051,26.02,2,1,1
13,,,22.6,2.1,2-(9H-carbazol-9-yl)ethyl] phosphonic acid (2P...,Cs0.12 FA0.8 MA0.08 PbI1.8 Br1.2,[2-(9H-carbazol-9-yl)ethyl] phosphonic acid (2...,"[6,6]-Phenyl-C61-butyric acid methyl ester (PCBM)",PIN,"[{'test_name': 'ISOS-D', 'temperature': None, ...",,[2-(9H-carbazol-9-yl)ethyl]phosphonic acid,C1=CC=C2C(=C1)C3=CC=CC=C3N2CCP(=O)(O)O,{'Cs': 0.12},275.244,62.46,3,2,2
17,20.04,1.16,22.02,1.19,hexyltrimethylammonium bromide,Cs0.05 FA0.81 MA0.14 PbI2.55 Br0.45,doped spiro-OMeTAD,doped spiro-OMeTAD,PIN,"[{'test_name': 'ISOS-D', 'temperature': None, ...",,hexyltrimethylammonium bromide,CCCCCC[N+](C)(C)C.[Br-],{'Cs': 0.05},224.186,0.0,5,0,0
21,22.6,,22.6,,n-octylammonium bromide (C8 Br),(FAPbI3)0.92(MAPbBr3)0.08,Spiro-OMeTAD,Spiro-OMeTAD,PIN,"[{'test_name': 'ISOS-D', 'temperature': None, ...",,n-octylammonium bromide,CCCCCCCCN.Br,"{'F': 0.92, 'I': 2.7600000000000002, 'A': 1.0,...",210.159,26.02,6,1,1
30,,,23.21,1.16,ammonia,α-FAPbI3,,,PIN,"[{'test_name': None, 'temperature': None, 'tim...",,ammonia,N,{},17.031,35.0,0,1,1
31,23.39,1.12,25.49,1.15,3-(aminomethyl)pyridine (3-APy),Rb0.05Cs0.05MA0.05FA0.85Pb(I0.95Br0.05)3,PEDOT:PSS,BCP,PIN,"[{'test_name': 'ISOS-LT', 'temperature': '55',...",,3-(aminomethyl)pyridine,C1=CC(=CN=C1)CN,"{'F': 1.0, 'A': 0.9, 'Br': 0.15000000000000002...",108.144,38.91,1,2,1


In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Assuming `temp_data` contains the full dataset with the required columns
features = ['MolWt', 'TPSA', 'NumRotBonds', 'NumHBA', 'NumHBD']
target = 'treated_pce'

# Drop rows with missing values in the features or target
data = temp_data

# Split into features (X) and target (y)
X = data[features]
y = data[target]

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train a Random Forest Regressor
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Evaluate the model
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Training Mean Squared Error: {train_mse:.2f}")
print(f"Training R-squared: {train_r2:.2f}")
print(f"Testing Mean Squared Error: {test_mse:.2f}")
print(f"Testing R-squared: {test_r2:.2f}")

Training Mean Squared Error: 1.35
Training R-squared: 0.78
Testing Mean Squared Error: 15.92
Testing R-squared: -0.19


In [25]:
feature_importances = pd.DataFrame({'Feature': features, 'Importance': model.feature_importances_})
print(feature_importances.sort_values(by='Importance', ascending=False))

       Feature  Importance
0        MolWt    0.484621
1         TPSA    0.192432
3       NumHBA    0.137775
2  NumRotBonds    0.100093
4       NumHBD    0.085079


In [26]:
print(list(y_test))

[22.9, 23.0, 21.8, 11.8, 22.6, 20.2, 24.41, 24.63, 20.27]


In [27]:
print(list(y_pred))

NameError: name 'y_pred' is not defined

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Initialize and train the model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Make predictions
y_pred = linear_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared: {r2:.2f}")

In [None]:
coefficients = pd.DataFrame({
    'Feature': features,
    'Coefficient': linear_model.coef_
})

plt.figure(figsize=(8, 6))
coefficients = coefficients.sort_values(by='Coefficient', ascending=False)
sns.barplot(x='Coefficient', y='Feature', data=coefficients)
plt.title('Linear Regression Coefficients')
plt.grid(True)
plt.show()