### Table of Contents
- [`passivating_molecule` into SMILES format](#passivating_molecule-into-SMILES-format)
- [`perovskite_composition` into features](#perovskite_composition-into-features)
- [baseline ML model](#baseline-ML-model)

In [None]:
pip uninstall -y -r requirements.txt

[0mFound existing installation: aiohappyeyeballs 2.4.3
Uninstalling aiohappyeyeballs-2.4.3:
  Successfully uninstalled aiohappyeyeballs-2.4.3
Found existing installation: aiohttp 3.10.10
Uninstalling aiohttp-3.10.10:
  Successfully uninstalled aiohttp-3.10.10
Found existing installation: aiosignal 1.3.1
Uninstalling aiosignal-1.3.1:
  Successfully uninstalled aiosignal-1.3.1
Found existing installation: annotated-types 0.7.0
Uninstalling annotated-types-0.7.0:
  Successfully uninstalled annotated-types-0.7.0
Found existing installation: anyio 3.5.0
Uninstalling anyio-3.5.0:
  Successfully uninstalled anyio-3.5.0
Found existing installation: asttokens 2.0.5
Uninstalling asttokens-2.0.5:
  Successfully uninstalled asttokens-2.0.5
Found existing installation: attrs 24.2.0
Uninstalling attrs-24.2.0:
  Successfully uninstalled attrs-24.2.0
Found existing installation: beautifulsoup4 4.11.1
Uninstalling beautifulsoup4-4.11.1:
  Successfully uninstalled beautifulsoup4-4.11.1
[0mFound existi

Found existing installation: psutil 5.9.0
Uninstalling psutil-5.9.0:
  Successfully uninstalled psutil-5.9.0
Found existing installation: ptyprocess 0.7.0
Uninstalling ptyprocess-0.7.0:
  Successfully uninstalled ptyprocess-0.7.0
Found existing installation: pure-eval 0.2.2
Uninstalling pure-eval-0.2.2:
  Successfully uninstalled pure-eval-0.2.2
[0mFound existing installation: pydantic 2.9.2
Uninstalling pydantic-2.9.2:
  Successfully uninstalled pydantic-2.9.2
Found existing installation: pydantic_core 2.23.4
Uninstalling pydantic_core-2.23.4:
  Successfully uninstalled pydantic_core-2.23.4
Found existing installation: Pygments 2.18.0
Uninstalling Pygments-2.18.0:
  Successfully uninstalled Pygments-2.18.0
[0mFound existing installation: PySocks 1.7.1
Uninstalling PySocks-1.7.1:
  Successfully uninstalled PySocks-1.7.1
Found existing installation: python-dateutil 2.8.2
Uninstalling python-dateutil-2.8.2:
  Successfully uninstalled python-dateutil-2.8.2
Found existing installation: p

In [1]:
import pandas as pd
import numpy as np
import pubchempy as pcp

  from pandas.core import (


In [None]:
# Read the fine-tuned LLaMA output, then transpose so the JSON's top-level keys
# become rows (presumably one extracted record each) and order them by index.
df = pd.read_json('data/finetuned_llama_output.json')
data = df.transpose().sort_index()
data.info()

In [None]:
def select_data(df):
    """
    Coerce the PCE/VOC columns to numbers and drop rows missing key fields.

    The input frame is not modified: the conversion is done on a new frame so
    that re-running the calling cell, or inspecting the raw data afterwards,
    still sees the original values.

    :param df: DataFrame with the columns 'control_pce', 'control_voc',
        'treated_pce', 'treated_voc', 'passivating_molecule' and
        'perovskite_composition'.
    :return: A new DataFrame in which the four PCE/VOC columns are numeric
        (unparseable values become NaN) and every row has a non-missing
        'treated_pce', 'passivating_molecule' and 'perovskite_composition'.
    :raises KeyError: If any of the columns listed above is absent.
    """
    numeric_cols = ['control_pce', 'control_voc', 'treated_pce', 'treated_voc']
    required_cols = ['treated_pce', 'passivating_molecule', 'perovskite_composition']

    # assign() returns a new frame, so the caller's data is left untouched.
    # errors='coerce' turns text that is not a number into NaN.
    converted = df.assign(
        **{col: pd.to_numeric(df[col], errors='coerce') for col in numeric_cols}
    )

    # 'treated_pce' is checked after coercion, so rows whose value could not
    # be parsed as a number are dropped as well.
    return converted.dropna(subset=required_cols)

# Rebind `data` to the filtered frame returned by select_data, then preview
# the first rows.
data = select_data(data)
data.head()

In [None]:
# Number of missing values in each column (axis=0 sums down the rows).
data.isna().sum(axis=0)

## `passivating_molecule` into SMILES format

In [None]:
def fetch_smiles_from_name(molecule_name):
    """
    Look up a molecule by name on PubChem and return its isomeric SMILES.

    Every failure mode yields the same missing-value marker (NaN), so callers
    can rely on ``isna()`` without having to distinguish ``None`` from NaN.

    :param molecule_name: Name to search for. Values that are not strings, or
        are empty/blank, are treated as missing and never sent to PubChem.
    :return: The isomeric SMILES string of the first match, or ``np.nan`` when
        the name is missing, has no match, or the lookup raised an error.
    """
    # A missing or blank name cannot match anything; skip the network call.
    if not isinstance(molecule_name, str) or not molecule_name.strip():
        return np.nan

    try:
        # Search for the molecule in PubChem by name
        compounds = pcp.get_compounds(molecule_name, 'name')
        if not compounds:
            return np.nan
        return compounds[0].isomeric_smiles  # first match wins
    except Exception as e:
        # Best-effort lookup: report the problem and keep going so one bad
        # name (or a transient network error) does not abort a whole column.
        print(f"Error fetching SMILES for {molecule_name}: {e}")
        return np.nan

In [None]:
# Display the passivating-molecule names as they are before any cleaning.
data['passivating_molecule']

In [None]:
# Count how many raw names fetch_smiles_from_name cannot resolve (missing result).
# Note: this performs one lookup per row, so it can be slow on a large frame.
data['passivating_molecule'].apply(fetch_smiles_from_name).isna().sum()

### Cleaning Data

In [None]:
import re

def fix_unmatched_brackets(s):
    """
    Fixes unmatched brackets in the given string by adding the correct brackets where necessary.

    The result is always properly nested, also when bracket types are mixed
    up (e.g. "(a]" becomes "[(a)]" rather than the crossed "[(a])"):

    * an opening bracket that is never closed gets its closer appended at the
      end of the string;
    * a closing bracket whose opener is still pending, but hidden behind inner
      brackets that were left open, first closes those inner brackets;
    * a closing bracket with no pending opener at all closes whatever is still
      open and gets its opener prepended at the very start of the string.

    :param s: Input string with potential unmatched brackets.
    :return: A corrected string with properly balanced brackets.
    """
    closer_for = {'(': ')', '{': '}', '[': ']'}
    opener_for = {')': '(', '}': '{', ']': '['}

    pending = []   # openers still waiting for a closer, innermost last
    prepended = []  # openers to add at the front, in the order they were needed
    body = []

    for char in s:
        if char in closer_for:
            pending.append(char)
        elif char in opener_for:
            wanted = opener_for[char]
            if wanted in pending:
                # Close any inner brackets left open so nesting stays valid,
                # then consume the matching opener itself.
                while pending[-1] != wanted:
                    body.append(closer_for[pending.pop()])
                pending.pop()
            else:
                # No opener for this closer anywhere: finish what is open,
                # then wrap everything seen so far with the missing opener.
                while pending:
                    body.append(closer_for[pending.pop()])
                prepended.append(wanted)
        body.append(char)

    # Add missing closing brackets at the end, innermost first
    while pending:
        body.append(closer_for[pending.pop()])

    # The most recently required opener must end up outermost (leftmost).
    return "".join(reversed(prepended)) + "".join(body)


def get_chemical_names(chemical_list):
    """
    Strips a trailing parenthetical (typically an abbreviation) from each name.

    For example "phenethylammonium iodide (PEAI)" becomes
    "phenethylammonium iodide". When the whole name is a single parenthetical
    (e.g. "(PEAI)"), removing it would leave an empty string, so the name is
    kept as it is (only trimmed) instead.

    Note that any parenthetical at the very end is removed, including one that
    is part of the name itself (e.g. "iron(III)" becomes "iron").

    :param chemical_list: Iterable of name strings.
    :return: A list with the cleaned names, in the same order as the input.
    """
    # Optional whitespace followed by one "(...)" group at the end of the string.
    trailing_parenthetical = re.compile(r"\s*\([^)]*\)$")

    cleaned_list = []
    for name in chemical_list:
        stripped = trailing_parenthetical.sub("", name).strip()
        # Never return an empty name just because it was fully parenthesised.
        cleaned_list.append(stripped if stripped else name.strip())

    return cleaned_list

In [None]:
# Example input list
lst = data['passivating_molecule']

# Cleaning the list
cleaned_list = lst.apply(lambda x: fix_unmatched_brackets(x))
cleaned_list = get_chemical_names(cleaned_list)

# Output result
data['passivating_molecule_cleaned'] = cleaned_list

In [None]:
# Resolve each cleaned name to a SMILES string (one lookup per row); names
# that cannot be resolved end up as missing values in the new column.
data['passivating_molecule_SMILES'] = data['passivating_molecule_cleaned'].apply(fetch_smiles_from_name)

## `perovskite_composition` into features

In [None]:
# Display the raw perovskite composition strings.
data['perovskite_composition']

In [None]:
pip install chemparse

In [None]:
import chemparse

# Example mixed-perovskite formula with fractional group multipliers
formula = "(FAPbI3)0.95(MAPbBr3)0.05"

# Parse the formula into a mapping of symbol -> amount
result = chemparse.parse_formula(formula)
# Intended result: {'FA': 0.95, 'Pb': 1.0, 'I': 2.85, 'MA': 0.05, 'Br': 0.15}
# NOTE(review): not verified against an actual run. FA and MA are
# abbreviations, not element symbols, so confirm how chemparse tokenizes them
# (they may be split into separate single-letter symbols).
print(result)

In [None]:
from sympy import Symbol, sympify

# Example formula with weights
formula = "0.95 * (FA + Pb + 3*I) + 0.05 * (MA + Pb + 3*Br)"

# sympify resolves names against SymPy's own namespace, where `I` is the
# imaginary unit (and E, S, N, O, Q are predefined objects too). Map every
# species to a plain Symbol so iodine is not silently treated as sqrt(-1).
species = {name: Symbol(name) for name in ("FA", "MA", "Pb", "I", "Br")}

# Evaluate: the weights are distributed over each group and like terms combined
expanded_formula = sympify(formula, locals=species)
print(expanded_formula)

## baseline ML model