## Model
**This notebook will do the following:**
1. Expand the CSV file.
2. Convert passivating molecules into SMILES representations and extract their features.
3. Retrieve the composition of the perovskite.
4. Train a model using the dataset.
5. Predict new PCE values for different pairings of passivating molecules and perovskites. 

In [7]:
import pandas as pd
import ast
import json
import numpy as np
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors

import requests

import re

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors

### Load & expand json into DataFrame rows + Selecting

In [10]:
papers_df = pd.read_csv('150_papers_json_update.csv')
papers_df.head()

Unnamed: 0,first_num,id,text,memory,output,second_num
0,0,0_54,\t\t\t of 5 Downloaded from https://www.scienc...,"{""perovskite_composition"": ""Cs0.05FA0.85MA0.1P...","{""perovskite_composition"": ""Cs0.05FA0.85MA0.1P...",54
1,1,1_22,\t\t\t NAture PhotoNiCS | VOL 13 | JULY 2019 |...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra...",22
2,2,2_75,\t\t\t Nature eNerGY | VOL 6 | JANUARY 2021 | ...,"{""perovskite_composition"": ""(BA)2PbI 4"", ""elec...","{""perovskite_composition"": ""(BA)2PbI 4"", ""elec...",75
3,3,3_52,\t\t\t of 6 RESEARCH | REPORT Downloaded from ...,"{""perovskite_composition"": ""Cs0.05(MA0.10FA0.8...","{""perovskite_composition"": ""Cs0.05(MA0.10FA0.8...",52
4,4,4_26,"Proppe 1,2,10 , Andrew Johnston 2,10 , Sam T...","{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": ""(MAPbBr3)0.05(FAPb...",26


In [12]:
# List to store extracted data
expanded_data = []

for index, row in papers_df.iterrows():
    try:
        row_dict = json.loads(row['output'])  # Convert JSON string to dictionary
    except (json.JSONDecodeError, TypeError):
        continue  # Skip rows where conversion fails

    # Extract common fields
    common_fields = {
        "first_num": row['first_num'],
        "perovskite_composition": row_dict.get("perovskite_composition"),
        "electron_transport_layer": row_dict.get("electron_transport_layer"),
        "hole_transport_layer": row_dict.get("hole_transport_layer"),
        "structure_pin_nip": row_dict.get("structure_pin_nip"),
    }

    # Extract test data
    for key, test_data in row_dict.items():
        if key.startswith("test_") and isinstance(test_data, dict):
            test_row = common_fields.copy()
            test_row["test"] = key  # Store test name
            test_row.update(test_data)  # Merge test details
            expanded_data.append(test_row)

# Convert extracted data to DataFrame
df_expanded = pd.DataFrame(expanded_data)
df_expanded.head()

Unnamed: 0,first_num,perovskite_composition,electron_transport_layer,hole_transport_layer,structure_pin_nip,test,stability_type,passivating_molecule,humidity,temperature,time,control_pce,treated_pce,control_voc,treated_voc,efficiency_control,efficiency_tret,efficiency_cont
0,0,Cs0.05FA0.85MA0.1PbI3,C60,2PACz and Me-4PACz,PIN,test_1,ISOSL,4-chlorobenzenesulfonate (4Cl-BZS),,65,1200,24.0,26.9,,1.18,,95.0,
1,1,,TinOxide,PTAA,PIN,test_1,ISOST,phenethylammonium,,85,500,,19.1,,1.16,,,
2,2,(BA)2PbI 4,tin dioxide,Spiro-OMeTAD,NIP,test_1,ISOSL,,85.0,25,1620,22.39,24.35,,1.185,,98.0,58.6
3,2,(BA)2PbI 4,tin dioxide,Spiro-OMeTAD,NIP,test_1_2,ISOSD,,85.0,85,1056,,21.34,,,,94.0,
4,2,(BA)2PbI 4,tin dioxide,Spiro-OMeTAD,NIP,test_2,ISOSLT,,,25,1620,,24.06,,,,98.0,


In [14]:
# Selecting columns of interest
columns_of_interest = ['passivating_molecule', 'treated_pce', 'perovskite_composition']

# Will use this DataFrame called 'data' to train the model
data = df_expanded.dropna(subset=columns_of_interest)[columns_of_interest]

---

In [17]:
data.shape

(51, 3)

### SMILES representation and features

In [20]:
# function that converts names into SMILES representation
def fetch_smiles_from_name(molecule_name):
    try:
        # Search for the molecule in PubChem by name
        compounds = pcp.get_compounds(molecule_name, 'name')
        if compounds:
            return compounds[0].isomeric_smiles  # Return the first match's SMILES
        else:
            return np.nan
    except Exception as e:
        print(f"Error fetching SMILES for {molecule_name}: {e}")
        return None

In [22]:
import pubchempy as pcp
import numpy as np
import requests

def fetch_smiles(molecule_name):
    try:
        # Try fetching SMILES from PubChem
        compounds = pcp.get_compounds(molecule_name, 'name')
        if compounds and compounds[0].isomeric_smiles:
            return compounds[0].isomeric_smiles  # Return first match's SMILES
    except Exception as e:
        print(f"Error fetching from PubChem for {molecule_name}: {e}")

    # If PubChem fails, try OPSIN
    try:
        base_url = "https://opsin.ch.cam.ac.uk/opsin/"
        smiles_url = base_url + molecule_name + ".smi"
        r = requests.get(smiles_url)
        if r.status_code == 200:
            return r.text.strip()  # Remove any trailing newline characters
    except Exception as e:
        print(f"Error fetching from OPSIN for {molecule_name}: {e}")

    return None  # Return None if both methods fail

# Example usage
smiles = fetch_smiles("4-chlorobenzenesulfonate")
print(smiles)

C1=CC(=CC=C1S(=O)(=O)[O-])Cl


In [23]:
# Cleans string formatting
#### Could be improved upon by looking into string more ####

def fix_unmatched_brackets(s):
    """
    Fixes unmatched brackets in the given string by adding the correct brackets where necessary.

    :param s: Input string with potential unmatched brackets.
    :return: A corrected string with properly balanced brackets.
    """
    opening = "({["
    closing = ")}]"
    match = {')': '(', '}': '{', ']': '['}
    stack = []

    # Step 1: Identify missing closing brackets
    fixed_s = []
    for char in s:
        if char in opening:
            stack.append(char)
            fixed_s.append(char)
        elif char in closing:
            if stack and stack[-1] == match[char]:
                stack.pop()
                fixed_s.append(char)
            else:
                # Add missing opening bracket before unmatched closing
                fixed_s.insert(0, match[char])
                fixed_s.append(char)
        else:
            fixed_s.append(char)

    # Step 2: Add missing closing brackets at the end
    while stack:
        open_bracket = stack.pop()
        fixed_s.append(closing[opening.index(open_bracket)])

    return "".join(fixed_s)


def get_chemical_names(chemical_list):
    cleaned_list = []
    for name in chemical_list:
        # Remove text inside parentheses only if it's extra information (abbreviations)
        name = re.sub(r"\s*\([^)]*\)$", "", name).strip() 
        # Remove spaces after a closing bracket (ensure proper chemical formatting)
        name = re.sub(r"\] +", "]", name)

        cleaned_list.append(name)

    return cleaned_list

In [24]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors

def compute_molecular_features(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            return [
                Descriptors.MolWt(mol),  # Molecular weight
                Descriptors.ExactMolWt(mol),  # Exact molecular weight (isotope-specific)
                Descriptors.MolLogP(mol),  # LogP (lipophilicity)
                Descriptors.TPSA(mol),  # Topological Polar Surface Area
                Descriptors.NumValenceElectrons(mol),  # Total valence electrons
                rdMolDescriptors.CalcNumRotatableBonds(mol),  # Rotatable bonds
                rdMolDescriptors.CalcNumHBA(mol),  # Hydrogen bond acceptors
                rdMolDescriptors.CalcNumHBD(mol),  # Hydrogen bond donors
                rdMolDescriptors.CalcFractionCSP3(mol),  # Fraction of sp3 carbons
                rdMolDescriptors.CalcNumAromaticRings(mol),  # Number of aromatic rings
                rdMolDescriptors.CalcNumSaturatedRings(mol),  # Number of saturated rings
                rdMolDescriptors.CalcNumHeteroatoms(mol),  # Number of heteroatoms
                rdMolDescriptors.CalcNumHeavyAtoms(mol),  # Number of heavy atoms
                rdMolDescriptors.CalcNumSpiroAtoms(mol),  # Number of spiro atoms
                rdMolDescriptors.CalcNumBridgeheadAtoms(mol),  # Number of bridgehead atoms
                Descriptors.FpDensityMorgan1(mol),  # Morgan fingerprint density (radius=1)
                Descriptors.FpDensityMorgan2(mol),  # Morgan fingerprint density (radius=2)
                Descriptors.FpDensityMorgan3(mol),  # Morgan fingerprint density (radius=3)
                Descriptors.qed(mol),  # Quantitative Estimate of Drug-likeness
                rdMolDescriptors.CalcNumLipinskiHBA(mol),  # Lipinski Hydrogen Bond Acceptors
                rdMolDescriptors.CalcNumLipinskiHBD(mol),  # Lipinski Hydrogen Bond Donors
                rdMolDescriptors.CalcNumRings(mol),  # Total number of rings
                rdMolDescriptors.CalcNumAmideBonds(mol),  # Number of amide bonds
                Descriptors.BalabanJ(mol),  # Balaban’s connectivity index
                Descriptors.BertzCT(mol),  # Bertz complexity
                Descriptors.Chi0(mol),  # Chi connectivity index (order 0)
                Descriptors.Chi1(mol),  # Chi connectivity index (order 1)
                Descriptors.Chi2n(mol),  # Chi connectivity index (order 2, non-H)
                Descriptors.Kappa1(mol),  # Kappa Shape Index (order 1)
                Descriptors.Kappa2(mol),  # Kappa Shape Index (order 2)
            ]
        else:
            return [np.nan] * 30  # Return NaN for missing values
    except:
        return [np.nan] * 30  # Return NaN for exceptions

In [28]:
lst = data['passivating_molecule']
cleaned_list = lst.apply(lambda x: fix_unmatched_brackets(x))
cleaned_list = get_chemical_names(cleaned_list)
data['passivating_molecule_cleaned'] = cleaned_list
data['passivating_molecule_SMILES'] = data['passivating_molecule_cleaned'].apply(fetch_smiles)

In [29]:
# Apply function to dataset
mol_features = data['passivating_molecule_SMILES'].apply(compute_molecular_features)

# Convert list to DataFrame
mol_features_df = pd.DataFrame(mol_features.tolist(), 
                               columns=[
                                   'MolWt', 'ExactMolWt', 'LogP', 'TPSA', 'NumValenceElectrons',
                                   'NumRotBonds', 'NumHBA', 'NumHBD', 'FractionCSP3', 'AromaticRings',
                                   'SaturatedRings', 'Heteroatoms', 'HeavyAtoms', 'SpiroAtoms', 
                                   'BridgeheadAtoms', 'FpDensityMorgan1', 'FpDensityMorgan2', 
                                   'FpDensityMorgan3', 'QED', 'LipinskiHBA', 
                                   'LipinskiHBD', 'NumRings', 'NumAmideBonds', 'BalabanJ', 
                                   'BertzCT', 'Chi0', 'Chi1', 'Chi2n', 'Kappa1', 'Kappa2'
                               ],
                               index=data.index)

# Merge with original dataset
data = pd.concat([data, mol_features_df], axis=1)
data = data.dropna(subset=['passivating_molecule_SMILES'])
data

Unnamed: 0,passivating_molecule,treated_pce,perovskite_composition,passivating_molecule_cleaned,passivating_molecule_SMILES,MolWt,ExactMolWt,LogP,TPSA,NumValenceElectrons,...,LipinskiHBD,NumRings,NumAmideBonds,BalabanJ,BertzCT,Chi0,Chi1,Chi2n,Kappa1,Kappa2
0,4-chlorobenzenesulfonate (4Cl-BZS),26.9,Cs0.05FA0.85MA0.1PbI3,4-chlorobenzenesulfonate,C1=CC(=CC=C1S(=O)(=O)[O-])Cl,191.615,190.957516,1.2441,57.2,60.0,...,0.0,1.0,0.0,3.201516,340.595074,8.483128,4.999019,1.908044,8.515969,2.808706
6,iso-butylamine iodide,22.1,BA2PbI 4,iso-butylamine iodide,[I-].C(C(C)C)N,200.043,199.994171,-2.3949,26.02,40.0,...,2.0,0.0,0.0,0.0,21.509775,4.284457,2.270056,1.629549,9.847023,5.69
7,phenylethylammonium iodide,18.89,MAPbI 3,phenylethylammonium iodide,C1=CC=C(C=C1)CC[NH3+].[I-],249.095,249.001447,-2.525,27.64,56.0,...,3.0,1.0,0.0,8.099999e-07,162.168377,6.527098,4.431852,2.089152,9.91,5.676536
14,butylammonium bromide,19.8,Cs0.15FA0.85PbI2.19Br0.81,butylammonium bromide,CCCCN.Br,154.051,153.015311,1.3231,26.02,40.0,...,2.0,0.0,0.0,0.0,15.01955,4.12132,2.414214,1.142229,9.667579,9.062499
15,2-thiopheneethylammonium chloride,23.8,Cs0.12FA0.8MA0.08PbI1.8Br1,2-thiopheneethylammonium chloride,C1=CSC(=C1)CCN.Cl,163.673,163.022248,1.6711,26.02,52.0,...,2.0,1.0,0.0,6.399999e-07,138.777906,5.819991,3.931852,1.618039,8.95,4.793919
17,chlorophenylethylammonium iodide,23.7,Cs0.05(FA5/6MA1/6)0.95Pb(I0.85Br0.15 )3,chlorophenylethylammonium iodide,[I-].Cl[NH2+]CCC1=CC=CC=C1,283.54,282.962475,-2.0497,16.61,62.0,...,2.0,1.0,0.0,9.999998e-07,178.706749,7.234205,4.931852,2.178577,11.2,6.882398
18,fluorophenylethylammonium iodide,23.79,Cs0.05(FA5/6MA1/6)0.95Pb(I0.9Br0.1)3,fluorophenylethylammonium iodide,[I-].F[NH2+]CCC1=CC=CC=C1,267.085,266.992026,-2.319,16.61,62.0,...,2.0,1.0,0.0,9.999998e-07,178.706749,7.234205,4.931852,2.178577,10.84,6.543966
22,"3,4,5-trifluoroanilinium",24.09,Cs 0.05 MA 0.05 FA 0.9 Pb(I 0.95 Br 0.05 ) 3,"3,4,5-trifluoroanilinium",FC=1C=C([NH3+])C=C(C1F)F,148.107,148.03686,0.9773,27.64,54.0,...,3.0,1.0,0.0,3.310533,236.164996,7.723615,4.609061,1.873448,7.081483,2.301675
26,cyclohexylmethylammonium iodide,23.94,FAPbI 3,cyclohexylmethylammonium iodide,[I-].C1(CCCCC1)C[NH3+],241.116,241.032748,-2.1874,27.64,56.0,...,3.0,1.0,0.0,6.399999e-07,59.936491,5.819991,3.931852,2.792711,9.69,5.472993
31,phenethylammonium iodide,23.91,Cs0.05MA0.1FA0.85PbI3,phenethylammonium iodide,C1=CC=C(C=C1)CCN.I,249.095,249.001447,1.8058,26.02,56.0,...,2.0,1.0,0.0,8.099999e-07,162.168377,6.527098,4.431852,2.024274,9.91,5.676536


---
### Composition of the Perovskite

In [31]:
def parse_perovskite_formula(formula):
    # Define allowed species (order matters for multi-letter elements)
    allowed_species = ["FA", "MA", "CS", "Rb", "Pb", "Sn", "I", "Br", "Cl"]

    # if is the nan we return component dictionary with all zeros
    if formula is np.nan:
        formula = ""    
    
    # Dictionary to store parsed results (initialize with 0.0 for all species)
    parsed_result = {species: 0.0 for species in allowed_species}

    # Step 1: Handle groups in parentheses with coefficients (e.g., (FAPbI3)0.95)
    pattern_group = r"\(([^)]+)\)\s*([0-9\.]+)"

    
    
    groups = re.findall(pattern_group, formula)

    if groups:
        for group, coef in groups:
            coef = float(coef)  # Convert coefficient to float
            elements = re.findall(r"(FA|MA|CS|Rb|Pb|Sn|I|Br|Cl)\s*([\d\.]*)", group)
            for element, count in elements:
                count = float(count) if count else 1.0
                parsed_result[element] += count * coef  # Distribute coefficient

    # Step 2: Handle formulas without parentheses (e.g., FA1-xMAxPbI3)
    remaining_formula = re.sub(r"\([^)]*\)\s*[0-9\.]+", "", formula)  # Remove processed groups
    elements = re.findall(r"(FA|MA|CS|Rb|Pb|Sn|I|Br|Cl)\s*([\d\.]*)", remaining_formula)

    for element, count in elements:
        count = float(count) if count and 'x' not in count else 1.0  # Ignore '-x' or 'x'
        parsed_result[element] += count

    # Round to 2 decimal places for all values
    parsed_result = {k: round(v, 2) for k, v in parsed_result.items()}

    return parsed_result

# Test cases
formulas = [
    "(FAPbI3)0.95(MAPbBr3)0.05",
    "FA1-xMAxPbI3",
    "FA0.9CS0.1Rb0.05PbI2.9Br0.1",
    "(CS0.8Rb0.2FAPbI3)0.9(MAPbBr3)0.1",
    "(C4H9NH3)2PbI 4"  # Test case with space
]

for formula in formulas:
    print(f"Formula: {formula}")
    print("Parsed:", parse_perovskite_formula(formula))
    print()

Formula: (FAPbI3)0.95(MAPbBr3)0.05
Parsed: {'FA': 0.95, 'MA': 0.05, 'CS': 0.0, 'Rb': 0.0, 'Pb': 1.0, 'Sn': 0.0, 'I': 2.85, 'Br': 0.15, 'Cl': 0.0}

Formula: FA1-xMAxPbI3
Parsed: {'FA': 1.0, 'MA': 1.0, 'CS': 0.0, 'Rb': 0.0, 'Pb': 1.0, 'Sn': 0.0, 'I': 3.0, 'Br': 0.0, 'Cl': 0.0}

Formula: FA0.9CS0.1Rb0.05PbI2.9Br0.1
Parsed: {'FA': 0.9, 'MA': 0.0, 'CS': 0.1, 'Rb': 0.05, 'Pb': 1.0, 'Sn': 0.0, 'I': 2.9, 'Br': 0.1, 'Cl': 0.0}

Formula: (CS0.8Rb0.2FAPbI3)0.9(MAPbBr3)0.1
Parsed: {'FA': 0.9, 'MA': 0.1, 'CS': 0.72, 'Rb': 0.18, 'Pb': 1.0, 'Sn': 0.0, 'I': 2.7, 'Br': 0.3, 'Cl': 0.0}

Formula: (C4H9NH3)2PbI 4
Parsed: {'FA': 0.0, 'MA': 0.0, 'CS': 0.0, 'Rb': 0.0, 'Pb': 1.0, 'Sn': 0.0, 'I': 4.0, 'Br': 0.0, 'Cl': 0.0}



In [32]:
temp = data['perovskite_composition'].apply(parse_perovskite_formula).apply(pd.Series)
data = data.join(temp)
data

Unnamed: 0,passivating_molecule,treated_pce,perovskite_composition,passivating_molecule_cleaned,passivating_molecule_SMILES,MolWt,ExactMolWt,LogP,TPSA,NumValenceElectrons,...,Kappa2,FA,MA,CS,Rb,Pb,Sn,I,Br,Cl
0,4-chlorobenzenesulfonate (4Cl-BZS),26.9,Cs0.05FA0.85MA0.1PbI3,4-chlorobenzenesulfonate,C1=CC(=CC=C1S(=O)(=O)[O-])Cl,191.615,190.957516,1.2441,57.2,60.0,...,2.808706,0.85,0.1,0.0,0.0,1.0,0.0,3.0,0.0,0.0
6,iso-butylamine iodide,22.1,BA2PbI 4,iso-butylamine iodide,[I-].C(C(C)C)N,200.043,199.994171,-2.3949,26.02,40.0,...,5.69,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0
7,phenylethylammonium iodide,18.89,MAPbI 3,phenylethylammonium iodide,C1=CC=C(C=C1)CC[NH3+].[I-],249.095,249.001447,-2.525,27.64,56.0,...,5.676536,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0
14,butylammonium bromide,19.8,Cs0.15FA0.85PbI2.19Br0.81,butylammonium bromide,CCCCN.Br,154.051,153.015311,1.3231,26.02,40.0,...,9.062499,0.85,0.0,0.0,0.0,1.0,0.0,2.19,0.81,0.0
15,2-thiopheneethylammonium chloride,23.8,Cs0.12FA0.8MA0.08PbI1.8Br1,2-thiopheneethylammonium chloride,C1=CSC(=C1)CCN.Cl,163.673,163.022248,1.6711,26.02,52.0,...,4.793919,0.8,0.08,0.0,0.0,1.0,0.0,1.8,1.0,0.0
17,chlorophenylethylammonium iodide,23.7,Cs0.05(FA5/6MA1/6)0.95Pb(I0.85Br0.15 )3,chlorophenylethylammonium iodide,[I-].Cl[NH2+]CCC1=CC=CC=C1,283.54,282.962475,-2.0497,16.61,62.0,...,6.882398,4.75,0.95,0.0,0.0,1.0,0.0,2.55,0.45,0.0
18,fluorophenylethylammonium iodide,23.79,Cs0.05(FA5/6MA1/6)0.95Pb(I0.9Br0.1)3,fluorophenylethylammonium iodide,[I-].F[NH2+]CCC1=CC=CC=C1,267.085,266.992026,-2.319,16.61,62.0,...,6.543966,4.75,0.95,0.0,0.0,1.0,0.0,2.7,0.3,0.0
22,"3,4,5-trifluoroanilinium",24.09,Cs 0.05 MA 0.05 FA 0.9 Pb(I 0.95 Br 0.05 ) 3,"3,4,5-trifluoroanilinium",FC=1C=C([NH3+])C=C(C1F)F,148.107,148.03686,0.9773,27.64,54.0,...,2.301675,0.9,0.05,0.0,0.0,1.0,0.0,2.85,0.15,0.0
26,cyclohexylmethylammonium iodide,23.94,FAPbI 3,cyclohexylmethylammonium iodide,[I-].C1(CCCCC1)C[NH3+],241.116,241.032748,-2.1874,27.64,56.0,...,5.472993,1.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0
31,phenethylammonium iodide,23.91,Cs0.05MA0.1FA0.85PbI3,phenethylammonium iodide,C1=CC=C(C=C1)CCN.I,249.095,249.001447,1.8058,26.02,56.0,...,5.676536,0.85,0.1,0.0,0.0,1.0,0.0,3.0,0.0,0.0


In [33]:
data.to_csv('data.csv')

---
## Model Building

In [177]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Define feature matrix X and target variable y
X = data.drop(columns=["perovskite_composition", "passivating_molecule", 
                       "passivating_molecule_cleaned", "passivating_molecule_SMILES", "treated_pce"])
y = data["treated_pce"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models and hyperparameter grids
models = {
    "Ridge": (Ridge(), {"alpha": [0.1, 1, 10, 100]}),
    "Lasso": (Lasso(), {"alpha": [0.1, 1, 10, 100]}),
    "SVR": (SVR(), {"C": [0.1, 1, 10], "gamma": ["scale", "auto"], "kernel": ["rbf", "linear"]}),
    "RandomForest": (RandomForestRegressor(), {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]}),
    "GradientBoosting": (GradientBoostingRegressor(), {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 0.2]})
}

# Store results
results = []

# Train and evaluate models
for name, (model, param_grid) in models.items():
    pipeline = Pipeline([("scaler", StandardScaler()), ("model", model)])
    grid_search = GridSearchCV(pipeline, {"model__" + key: value for key, value in param_grid.items()}, cv=5, scoring="neg_mean_squared_error")
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results.append({"Model": name, "Best Params": grid_search.best_params_, "MSE": mse, "R2": r2})

# Convert results to a DataFrame and display
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,Best Params,MSE,R2
0,Ridge,{'model__alpha': 100},5.538115,0.123865
1,Lasso,{'model__alpha': 10},6.360932,-0.006305
2,SVR,"{'model__C': 10, 'model__gamma': 'auto', 'mode...",5.682659,0.100998
3,RandomForest,"{'model__max_depth': 20, 'model__n_estimators'...",4.993213,0.210069
4,GradientBoosting,"{'model__learning_rate': 0.01, 'model__n_estim...",5.160751,0.183564


---
### Predict new PCE values

In [213]:
from itertools import product

# Define feature groups
passivating_features = ['MolWt', 'ExactMolWt', 'LogP', 'TPSA', 'NumValenceElectrons',
                        'NumRotBonds', 'NumHBA', 'NumHBD', 'FractionCSP3', 'AromaticRings',
                        'SaturatedRings', 'Heteroatoms', 'HeavyAtoms', 'SpiroAtoms', 
                        'BridgeheadAtoms', 'FpDensityMorgan1', 'FpDensityMorgan2', 
                        'FpDensityMorgan3', 'QED', 'LipinskiHBA', 
                        'LipinskiHBD', 'NumRings', 'NumAmideBonds', 'BalabanJ', 
                        'BertzCT', 'Chi0', 'Chi1', 'Chi2n', 'Kappa1', 'Kappa2']

perovskite_features = ["FA", "MA", "CS", "Rb", "Pb", "Sn", "I", "Br", "Cl"]

# Extract unique passivating molecules and their features
passivating_data = data[['passivating_molecule'] + passivating_features].drop_duplicates()

# Extract unique perovskite compositions and their features
perovskite_data = data[['perovskite_composition'] + perovskite_features].drop_duplicates()

# Generate all unique combinations
new_combinations = list(product(passivating_data.values, perovskite_data.values))

# Construct the new dataframe
new_data = []
for passivating, perovskite in new_combinations:
    passivating_dict = dict(zip(passivating_data.columns, passivating))
    perovskite_dict = dict(zip(perovskite_data.columns, perovskite))
    new_entry = {**passivating_dict, **perovskite_dict}  # Merge dictionaries
    new_data.append(new_entry)

# Convert to DataFrame
new_df = pd.DataFrame(new_data)

In [207]:
new_df.to_csv('permutation_df.csv')

In [209]:
results_df

Unnamed: 0,Model,Best Params,MSE,R2
0,Ridge,{'model__alpha': 100},5.538115,0.123865
1,Lasso,{'model__alpha': 10},6.360932,-0.006305
2,SVR,"{'model__C': 10, 'model__gamma': 'auto', 'mode...",5.682659,0.100998
3,RandomForest,"{'model__max_depth': 20, 'model__n_estimators'...",4.993213,0.210069
4,GradientBoosting,"{'model__learning_rate': 0.01, 'model__n_estim...",5.160751,0.183564


In [179]:
# Assuming you want to use the best model from Gradient Boosting
best_model = results_df.loc[results_df["Model"] == "GradientBoosting", "Best Params"].values[0]

# Retrain the best model with the best parameters on the full dataset
best_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", GradientBoostingRegressor(**best_model))
])

# Fit on the entire training dataset
bbbbest_pipeline.fit(X_train, y_train)

# Predict on new data
y_new_pred = best_pipeline.predict(X_new)

# Convert to DataFrame for easy analysis
new_predictions_df = pd.DataFrame({"Predicted_treated_pce": y_new_pred})

new_predictions_df

TypeError: GradientBoostingRegressor.__init__() got an unexpected keyword argument 'model__learning_rate'

In [181]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Define feature matrix X and target variable y
X = data.drop(columns=["perovskite_composition", "passivating_molecule", 
                       "passivating_molecule_cleaned", "passivating_molecule_SMILES", "treated_pce"])
y = data["treated_pce"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models and hyperparameter grids
models = {
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "SVR": SVR(),
    "RandomForest": RandomForestRegressor(),
    "GradientBoosting": GradientBoostingRegressor()
}

param_grids = {
    "Ridge": {"model__alpha": [0.1, 1, 10, 100]},
    "Lasso": {"model__alpha": [0.1, 1, 10, 100]},
    "SVR": {"model__C": [0.1, 1, 10], "model__gamma": ["scale", "auto"], "model__kernel": ["rbf", "linear"]},
    "RandomForest": {"model__n_estimators": [50, 100, 200], "model__max_depth": [None, 10, 20]},
    "GradientBoosting": {"model__n_estimators": [50, 100, 200], "model__learning_rate": [0.01, 0.1, 0.2]}
}

# Dictionary to store the best models
best_models = {}

# Train and evaluate models
for name, model in models.items():
    pipeline = Pipeline([
        ("scaler", StandardScaler()),  # Standardize features
        ("model", model)
    ])
    
    grid_search = GridSearchCV(pipeline, param_grids[name], cv=5, scoring="neg_mean_squared_error")
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_models[name] = best_model  # Store the best model
    
    # Evaluate on test set
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Model: {name}, Best Params: {grid_search.best_params_}, MSE: {mse:.4f}, R2: {r2:.4f}")

# The best_models dictionary now contains trained models ready for prediction

Model: Ridge, Best Params: {'model__alpha': 100}, MSE: 5.5381, R2: 0.1239
Model: Lasso, Best Params: {'model__alpha': 10}, MSE: 6.3609, R2: -0.0063
Model: SVR, Best Params: {'model__C': 10, 'model__gamma': 'auto', 'model__kernel': 'rbf'}, MSE: 5.6827, R2: 0.1010
Model: RandomForest, Best Params: {'model__max_depth': 20, 'model__n_estimators': 50}, MSE: 4.7870, R2: 0.2427
Model: GradientBoosting, Best Params: {'model__learning_rate': 0.2, 'model__n_estimators': 200}, MSE: 5.0675, R2: 0.1983


In [215]:
def predict_new_data(new_data, model_name):
    """
    Predict 'treated_pce' for new data using a trained model.

    Parameters:
        new_data (pd.DataFrame): The new dataset (should have the same features as training data).
        model_name (str): The name of the trained model to use. Choose from:
                          'Ridge', 'Lasso', 'SVR', 'RandomForest', 'GradientBoosting'.
    
    Returns:
        pd.DataFrame: DataFrame with predictions.
    """
    if model_name not in best_models:
        raise ValueError(f"Model '{model_name}' not found. Choose from {list(best_models.keys())}")

    # Prepare new data (drop same columns as before)
    X_new = new_data.drop(columns=["perovskite_composition", "passivating_molecule", 
                                   "passivating_molecule_cleaned", "passivating_molecule_SMILES", "treated_pce"], errors="ignore")

    # Make predictions
    model = best_models[model_name]
    y_pred = model.predict(X_new)

    # Return DataFrame with predictions
    return pd.DataFrame({"Predicted_treated_pce": y_pred})

# Example usage:
new_predictions_df = predict_new_data(new_df, "GradientBoosting")
new_df['Predicted_treated_pce'] = new_predictions_df['Predicted_treated_pce']
pce_result_df = new_df[['perovskite_composition', 'passivating_molecule', 'Predicted_treated_pce']]

In [231]:
ordered_df = pce_result_df.sort_values(by='Predicted_treated_pce', ascending=False).reset_index(drop=True)

In [239]:
idx = ordered_df.groupby('passivating_molecule')['Predicted_treated_pce'].idxmax()
ordered_df.loc[idx, ['passivating_molecule', 'perovskite_composition', 'Predicted_treated_pce']]

Unnamed: 0,passivating_molecule,perovskite_composition,Predicted_treated_pce
136,(phenethylamino)methaniminium iodide,Cs0.05MA0.1FA0.85PbI3,24.338766
112,2-thiopheneethylammonium chloride,Cs0.05MA0.1FA0.85PbI3,24.574131
312,2-thiophenemethylammonium bromide,Cs0.05MA0.1FA0.85PbI3,23.33524
131,"3,4,5-trifluoroanilinium",FAPbI 3,24.378346
19,3-(aminomethyl)pyridine,Cs0.05FA0.85MA0.1PbI3,25.978622
0,4-chlorobenzenesulfonate (4Cl-BZS),Cs0.05FA0.85MA0.1PbI3,26.9
632,4-fluoroaniline,FAPbI 3,21.757426
609,4-fluorophenylethylammonium iodide,Cs0.05MA0.1FA0.85PbI3,21.943087
363,4-tert-butyl-benzylammonium iodide,Cs0.05FA0.85MA0.1PbI3,23.086051
202,4-trifluoromethyl-phenylammonium,Cs0.05MA0.1FA0.85PbI3,23.818161
