### Table of Content
- [`passivating_molecule` into SMILES format](Converting-`passivating_molecule`-into-SMILES-format)
- [`perovskite_composition` into features](`perovskite_composition`-into-features)
- [baseline ML model](baseline-ML-model)

In [1]:
import seaborn as sns
import pandas as pd

In [2]:
df = pd.read_json('data/finetuned_llama_output.json')
data = df.T.sort_index()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149 entries, 0 to 149
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   control_pce               71 non-null     object
 1   control_voc               53 non-null     object
 2   treated_pce               140 non-null    object
 3   treated_voc               124 non-null    object
 4   passivating_molecule      143 non-null    object
 5   perovskite_composition    134 non-null    object
 6   electron_transport_layer  118 non-null    object
 7   hole_transport_layer      115 non-null    object
 8   pin_nip_structure         147 non-null    object
 9   stability_tests           149 non-null    object
 10  pin_structure             1 non-null      object
dtypes: object(11)
memory usage: 14.0+ KB


In [3]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

# Function to clean and standardize data
def clean_data(df):
    # Convert PCE and VOC to numeric
    for col in ['control_pce', 'control_voc', 'treated_pce', 'treated_voc']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Drop rows where treated_pce or passivating_molecule is missing
    df = df.dropna(subset=['treated_pce', 'passivating_molecule', 'perovskite_composition'])

    return df

data = clean_data(data)
data.to_csv("cleaned_data.csv", index=False) 
data.head()

Unnamed: 0,control_pce,control_voc,treated_pce,treated_voc,passivating_molecule,perovskite_composition,electron_transport_layer,hole_transport_layer,pin_nip_structure,stability_tests,pin_structure
0,25.7,1.17,26.15,1.18,4-chlorobenzenesulfonate (4Cl-BZS),α-phase FAPbI3,C60,SAMs (self-assembled monolayers),PIN,"[{'test_name': 'ISOS-D-2I', 'temperature': 85,...",
3,24.5,1.2,24.5,1.2,BA2MA2Pb3I10,BA2MA2Pb3I10,SnO2,PTAA,PIN,"[{'test_name': 'ISOS-L-1', 'temperature': None...",
4,,,21.06,1.14,vinylbenzylammonium bromide (VBABr),(FAPbI3)0.95(MAPbBr3)0.05,Spiro-OMeTAD,Spiro-OMeTAD,PIN,"[{'test_name': None, 'temperature': None, 'tim...",
5,,,22.1,1135.0,iso-BAI,FA(MA)PbI3,Spiro-OMeTAD,TPBI,PIN,"[{'test_name': 'ISOS-LT', 'temperature': None,...",
6,,,15.3,1.06,phenylethylammonium,PEA2(CH3NH3)n-1Pb(nI3n+1),TiO2,spiro-OMeTAD,PIN,"[{'test_name': 'ISOS-D', 'temperature': None, ...",


In [4]:
data.isna().sum(axis=0)

control_pce                  59
control_voc                  73
treated_pce                   0
treated_voc                  10
passivating_molecule          0
perovskite_composition        0
electron_transport_layer     19
hole_transport_layer         20
pin_nip_structure             0
stability_tests               0
pin_structure               107
dtype: int64

## `passivating_molecule` into SMILES format

### Cleaning Data

In [28]:
def fix_unmatched_brackets(s):
    """
    Fixes unmatched brackets in the given string by adding the correct brackets where necessary.

    :param s: Input string with potential unmatched brackets.
    :return: A corrected string with properly balanced brackets.
    """
    opening = "({["
    closing = ")}]"
    match = {')': '(', '}': '{', ']': '['}
    stack = []

    # Step 1: Identify missing closing brackets
    fixed_s = []
    for char in s:
        if char in opening:
            stack.append(char)
            fixed_s.append(char)
        elif char in closing:
            if stack and stack[-1] == match[char]:
                stack.pop()
                fixed_s.append(char)
            else:
                # Add missing opening bracket before unmatched closing
                fixed_s.insert(0, match[char])
                fixed_s.append(char)
        else:
            fixed_s.append(char)

    # Step 2: Add missing closing brackets at the end
    while stack:
        open_bracket = stack.pop()
        fixed_s.append(closing[opening.index(open_bracket)])

    return "".join(fixed_s)

Original: (Hello [World]
Fixed: (Hello [World])

Original: [(])
Fixed: [[(])]

Original: {[()]}
Fixed: {[()]}

Original: {[Hello(]
Fixed: [{[Hello(])]}

Original: ((Test))]
Fixed: [((Test))]



In [None]:
def fix_unmatched_brackets(s):
    """
    Fixes unmatched brackets in the given string by adding the correct brackets where necessary.

    :param s: Input string with potential unmatched brackets.
    :return: A corrected string with properly balanced brackets.
    """
    opening = "({["
    closing = ")}]"
    match = {')': '(', '}': '{', ']': '['}
    stack = []

    # Step 1: Identify missing closing brackets
    fixed_s = []
    for char in s:
        if char in opening:
            stack.append(char)
            fixed_s.append(char)
        elif char in closing:
            if stack and stack[-1] == match[char]:
                stack.pop()
                fixed_s.append(char)
            else:
                # Add missing opening bracket before unmatched closing
                fixed_s.insert(0, match[char])
                fixed_s.append(char)
        else:
            fixed_s.append(char)

    # Step 2: Add missing closing brackets at the end
    while stack:
        open_bracket = stack.pop()
        fixed_s.append(closing[opening.index(open_bracket)])

    return "".join(fixed_s)

# Example usage:
test_strings = [
    "(Hello [World]",   # Missing closing ')'
    "[(])",             # Incorrect nesting
    "{[()]}",          # Already correct
    "{[Hello(]",       # Missing closing '}'
    "((Test))]",       # Extra closing bracket
]

for test in test_strings:
    print(f"Original: {test}\nFixed: {fix_unmatched_brackets(test)}\n")


In [26]:
import re

def clean_chemical_names(chemical_list):
    cleaned_list = []
    for name in chemical_list:
        # Remove text inside parentheses *only* if it's extra information (e.g., abbreviations)
        name = re.sub(r"\s*\([^)]*\)$", "", name).strip()

        # If there is an unmatched closing bracket at the end, add an opening bracket at the beginning
        if name.endswith("]") and "[" not in name:
            name = "[" + name

        cleaned_list.append(name)

    return cleaned_list

def clean_chemical_names(chemical_list):
    cleaned_list = []
    for name in chemical_list:
        # Remove text inside parentheses at the end if it's extra information (abbreviations)
        name = re.sub(r"\s*\([^)]*\)$", "", name).strip()

        # If there is an unmatched closing bracket at the end, add an opening bracket at the beginning
        if name.endswith("]") and "[" not in name:
            name = "[" + name

        cleaned_list.append(name)

    return cleaned_list

# Example input list
chemical_list = [
    "4-chlorobenzenesulfonate (4Cl-BZS)",
    "BA2MA2Pb3I10",
    "vinylbenzylammonium bromide (VBABr)",
    "iso-BAI",
    "phenylethylammonium",
    "2-(pyren-1-yl)ethan-1-amine",
    "Tosylate ([TsO] -)",
    "Benzotriazole",
    "n-butylammonium bromide (BABr)",
    "2-(9H-carbazol-9-yl)ethyl] phosphonic acid (2PACz)",
    "2-phenylethylammonium iodide (PEAI), 4-chlorophenylethylammonium iodide (Cl-PEAI), and 4-fluorophenylethylammonium iodide (F-PEAI)"
]

# Cleaning the list
cleaned_list = clean_chemical_names(chemical_list)

# Output result
cleaned_list

['4-chlorobenzenesulfonate',
 'BA2MA2Pb3I10',
 'vinylbenzylammonium bromide',
 'iso-BAI',
 'phenylethylammonium',
 '2-(pyren-1-yl)ethan-1-amine',
 'Tosylate',
 'Benzotriazole',
 'n-butylammonium bromide',
 '2-(9H-carbazol-9-yl)ethyl] phosphonic acid',
 '2-phenylethylammonium iodide (PEAI), 4-chlorophenylethylammonium iodide (Cl-PEAI), and 4-fluorophenylethylammonium iodide']

In [5]:
list(data['passivating_molecule'])

['4-chlorobenzenesulfonate (4Cl-BZS)',
 'BA2MA2Pb3I10',
 'vinylbenzylammonium bromide (VBABr)',
 'iso-BAI',
 'phenylethylammonium',
 '2-(pyren-1-yl)ethan-1-amine',
 'Tosylate ([TsO] -)',
 'Benzotriazole',
 'n-butylammonium bromide (BABr)',
 '2-(9H-carbazol-9-yl)ethyl] phosphonic acid (2PACz)',
 '2-phenylethylammonium iodide (PEAI), 4-chlorophenylethylammonium iodide (Cl-PEAI), and 4-fluorophenylethylammonium iodide (F-PEAI)',
 'phenylethylammonium (PEAI) and 4-fluoro-phenylethylammonium iodide (F-PEAI)',
 'CF3 PEAI',
 'hexyltrimethylammonium bromide',
 '2-thiophenemethylammonium iodide (TMAI) and phenylethylammonium iodide (PEAI)',
 '3,4,5-trifluoroanilinium (345FAn)',
 'n-octylammonium bromide (C8 Br)',
 'Cyclohexylmethylammonium iodide (CMAI)',
 '3-fluoro-phenethylammonium (3F-PEA)',
 'Cs2PbI2Cl2',
 'CF3-PEA',
 'ammonia',
 '3-(aminomethyl)pyridine (3-APy)',
 'CH3NH3+',
 'Benzene',
 'C60',
 '1,3-propylene diammonium (PDA)',
 '2-thiophenemethylammonium bromide (2-TMABr)',
 'Oleylammoni

In [23]:
import pubchempy as pcp

# Function to fetch SMILES from PubChem

count = 0

def fetch_smiles_from_name(molecule_name):
    try:
        # Search for the molecule in PubChem by name
        compounds = pcp.get_compounds(molecule_name, 'name')
        if compounds:
            return compounds[0].isomeric_smiles  # Return the first match's SMILES
        else:
            return None
    except Exception as e:
        print(f"Error fetching SMILES for {molecule_name}: {e}")
        return None

chemical_list = [
    "(2-(9H-Carbazol-9-yl)ethyl)phosphonic acid", 
    "4-chlorobenzenesulfonate (4Cl-BZS)",
    "BA2MA2Pb3I10",
    "vinylbenzylammonium bromide (VBABr)",
    "iso-BAI",
    "phenylethylammonium",
    "2-(pyren-1-yl)ethan-1-amine",
    "Tosylate ([TsO] -)",
    "Benzotriazole",
    "n-butylammonium bromide (BABr)",
    "2-(9H-carbazol-9-yl)ethyl] phosphonic acid (2PACz)",
    "2-phenylethylammonium iodide (PEAI), 4-chlorophenylethylammonium iodide (Cl-PEAI), and 4-fluorophenylethylammonium iodide (F-PEAI)"
]
molecules = chemical_list


# Convert each molecule name to SMILES
for molecule in molecules:
    smiles = fetch_smiles_from_name(molecule)
    print(f"Molecule: {molecule}, SMILES: {smiles}")

Molecule: (2-(9H-Carbazol-9-yl)ethyl)phosphonic acid, SMILES: C1=CC=C2C(=C1)C3=CC=CC=C3N2CCP(=O)(O)O
Molecule: 4-chlorobenzenesulfonate (4Cl-BZS), SMILES: None
Molecule: BA2MA2Pb3I10, SMILES: None
Molecule: vinylbenzylammonium bromide (VBABr), SMILES: None
Molecule: iso-BAI, SMILES: None
Molecule: phenylethylammonium, SMILES: None
Molecule: 2-(pyren-1-yl)ethan-1-amine, SMILES: C1=CC2=C3C(=C1)C=CC4=C(C=CC(=C43)C=C2)CCN
Molecule: Tosylate ([TsO] -), SMILES: None
Molecule: Benzotriazole, SMILES: C1=CC2=NNN=C2C=C1
Molecule: n-butylammonium bromide (BABr), SMILES: None
Molecule: 2-(9H-carbazol-9-yl)ethyl] phosphonic acid (2PACz), SMILES: None
Molecule: 2-phenylethylammonium iodide (PEAI), 4-chlorophenylethylammonium iodide (Cl-PEAI), and 4-fluorophenylethylammonium iodide (F-PEAI), SMILES: None


In [24]:
molecules = cleaned_list

# Convert each molecule name to SMILES
for molecule in molecules:
    smiles = fetch_smiles_from_name(molecule)
    print(f"Molecule: {molecule}, SMILES: {smiles}")

Molecule: 4-chlorobenzenesulfonate, SMILES: C1=CC(=CC=C1S(=O)(=O)[O-])Cl
Molecule: BA2MA2Pb3I10, SMILES: None
Molecule: vinylbenzylammonium bromide, SMILES: None
Molecule: iso-BAI, SMILES: None
Molecule: phenylethylammonium, SMILES: None
Molecule: 2-(pyren-1-yl)ethan-1-amine, SMILES: C1=CC2=C3C(=C1)C=CC4=C(C=CC(=C43)C=C2)CCN
Molecule: Tosylate, SMILES: CC1=CC=C(C=C1)S(=O)(=O)[O-]
Molecule: Benzotriazole, SMILES: C1=CC2=NNN=C2C=C1
Molecule: n-butylammonium bromide, SMILES: CCCCN.Br
Molecule: 2-(9H-carbazol-9-yl)ethyl] phosphonic acid, SMILES: None
Molecule: 2-phenylethylammonium iodide (PEAI), 4-chlorophenylethylammonium iodide (Cl-PEAI), and 4-fluorophenylethylammonium iodide, SMILES: None


In [None]:
data['passivating_molecule']

In [7]:
count

0

## `perovskite_composition` into features

In [None]:
list(data['perovskite_composition'])#.apply(lambda x: x.split(' ')))

## baseline ML model