In [1]:
import pandas as pd 
import json

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt

  from pandas.core import (


In [2]:
def load_molecular_data(smiles):
    """Load the property file of one molecule and flatten its nested entries.

    Reads ``molecule_properties/{smiles}.json`` and extends the loaded dict with:

    * ``<key>_min``, ``<key>_max`` and ``<key>_mean`` for each of the dict-valued
      entries ``charges``, ``fukui_electrophilicity``, ``fukui_nucleophilicity``
      and ``fukui_radical`` (statistics over the values of that dict);
    * ``dipole_x``, ``dipole_y`` and ``dipole_z`` taken from the first three
      components of ``dipole``.

    The original entries are left in place next to the derived ones.

    Raises:
        FileNotFoundError: if no property file exists for ``smiles``.
        KeyError: if one of the expected entries is absent from the file.
    """
    with open(f'molecule_properties/{smiles}.json', 'r') as handle:
        properties = json.load(handle)

    # Collapse every dict-valued descriptor into three scalar summaries
    summarised_keys = ('charges', 'fukui_electrophilicity', 'fukui_nucleophilicity', 'fukui_radical')
    for name in summarised_keys:
        values = list(properties[name].values())
        properties.update({
            f'{name}_min': min(values),
            f'{name}_max': max(values),
            f'{name}_mean': sum(values) / len(values),
        })

    # Expose the dipole vector as one scalar entry per Cartesian axis
    for position, axis in enumerate('xyz'):
        properties[f'dipole_{axis}'] = properties['dipole'][position]

    return properties

In [3]:
def molecular_features(smiles):
    """Return the float-valued properties of one molecule as a flat dict.

    The properties come from ``load_molecular_data(smiles)``; every entry whose
    value is not a Python ``float`` (nested dicts, lists, strings, ints, ...) is
    left out.
    """
    features = {}
    for name, value in load_molecular_data(smiles).items():
        # keep scalar float descriptors only
        if isinstance(value, float):
            features[name] = value
    return features

In [4]:
# Smoke-test the feature extraction on a single monomer identifier.
# This reads molecule_properties/C_C=C_C.json and fails with FileNotFoundError
# when that file is not present relative to the working directory.
molecular_features('C_C=C_C')

FileNotFoundError: [Errno 2] No such file or directory: 'molecule_properties/C_C=C_C.json'

In [None]:
# Load the reaction entries; `data` is iterated entry by entry further down and
# later turned into a DataFrame.
with open('processed_reactions/all_data_filtered.json', 'r') as file:
    data = json.load(file)

def is_within_deviation(actual_product, expected_product, deviation=0.10):
    """Tell whether ``actual_product`` matches ``expected_product`` within a relative tolerance.

    Args:
        actual_product: value to check.
        expected_product: reference value the relative error is measured against.
        deviation: largest accepted relative error (inclusive), 0.10 by default.

    Returns:
        True when ``|actual - expected| / |expected| <= deviation``. For a
        reference of exactly zero the relative error is undefined, so only an
        actual value of zero is accepted.
    """
    if expected_product != 0:
        relative_error = abs(actual_product - expected_product) / abs(expected_product)
        return relative_error <= deviation
    # Zero reference: fall back to an exact comparison
    return actual_product == 0


# Flag every entry whose reported 'r-product' disagrees with constant_1 * constant_2.
# entry['r-product_filter'] is True when the reaction should be filtered out and
# False when it passes the check or when the check cannot be performed.
for entry in data:
    # 'r_values' may be absent or not a dict; treat that like missing constants
    r_values = entry.get('r_values')
    if not isinstance(r_values, dict):
        r_values = {}
    r1 = r_values.get('constant_1')
    r2 = r_values.get('constant_2')
    r_product = entry.get('r-product')

    # Nothing to cross-check without a reported product or without both constants
    if r_product is None or r1 is None or r2 is None:
        entry['r-product_filter'] = False
        continue

    actual_product = r1 * r2

    # is_within_deviation already covers the r_product == 0 case
    entry['r-product_filter'] = not is_within_deviation(actual_product, r_product)


def filter_conf_intervals(row):
    """Decide whether a reaction row passes the confidence-interval filter.

    Args:
        row: mapping-like record (a dict or a pandas Series) that may carry
            ``conf_intervals`` (with ``constant_conf_1`` / ``constant_conf_2``)
            and ``r_values`` (with ``constant_1`` / ``constant_2``).

    Returns:
        True if the row should be kept, False if it should be dropped. A row is
        dropped only when all four numbers are available and at least one
        confidence interval is larger than its r-value. Rows where the check
        cannot be carried out (missing or non-dict ``conf_intervals`` /
        ``r_values``, or any of the four numbers missing) are kept.
    """
    conf_intervals = row.get('conf_intervals')
    r_values = row.get('r_values')

    # A DataFrame row always has the column, but the cell itself may be NaN/None
    # for entries that had no such field; membership tests on it would raise.
    if not isinstance(conf_intervals, dict) or not isinstance(r_values, dict):
        return True

    conf_1 = conf_intervals.get('constant_conf_1')
    conf_2 = conf_intervals.get('constant_conf_2')
    r1 = r_values.get('constant_1')
    r2 = r_values.get('constant_2')

    if conf_1 is None or conf_2 is None or r1 is None or r2 is None:
        return True

    # Confidence intervals must not be greater than the corresponding r-values
    return (conf_1 <= r1) and (conf_2 <= r2)

In [None]:
# Convert JSON data to DataFrame
df_full = pd.DataFrame(data)

print('Initial datapoints: ', len(df_full))
# Keep only the rows for which filter_conf_intervals returns True
df_full = df_full[df_full.apply(filter_conf_intervals, axis=1)]
print('Datapoints after confidence filter:', len(df_full))

# Keep the rows that passed the r-product consistency check ('r-product_filter' is False).
# The explicit copy makes df_filtered an independent frame, so column assignments and
# in-place edits on it never write into a slice of df_full (SettingWithCopyWarning).
df_filtered = df_full[df_full['r-product_filter'] == False].copy()
print('Datapoints after r-product filter:', len(df_filtered))

In [None]:
# Columns a reaction needs in order to be usable; rows missing any of them are dropped
required_columns = ['r1', 'r2', 'solvent', 'monomer1_s', 'monomer2_s', 'temperature', 'calculation_method', 'polymerization_type']

# Raw / bookkeeping columns that are no longer needed once r1 and r2 are extracted
obsolete_columns = ['r-product_filter', 'r_values', 'r-product', 'monomer1_data', 'monomer2_data', 'conf_intervals']

# Build the cleaned frame as one non-mutating chain: nothing is assigned into or modified
# in place on the incoming frame, which may be a slice of a larger DataFrame.
df_filtered = (
    df_filtered
    .assign(
        # r1 / r2 are the two constants stored in the 'r_values' dict (None if unavailable)
        r1=lambda frame: frame['r_values'].apply(lambda x: x.get('constant_1') if isinstance(x, dict) else None),
        r2=lambda frame: frame['r_values'].apply(lambda x: x.get('constant_2') if isinstance(x, dict) else None),
    )
    .dropna(subset=required_columns)
    .drop(columns=obsolete_columns)
)

In [None]:
# Create the same rows, but with flipped monomers, i.e. monomer1_s <-> monomer2_s,
# monomer1 <-> monomer2, and r1 <-> r2.
# The swap is done column-wise in one step: every right-hand side is read from the
# untouched df_filtered, so the result keeps its index, row order, column order and dtypes.
df_filtered_flipped = df_filtered.assign(
    monomer1_s=df_filtered['monomer2_s'],
    monomer2_s=df_filtered['monomer1_s'],
    monomer1=df_filtered['monomer2'],
    monomer2=df_filtered['monomer1'],
    r1=df_filtered['r2'],
    r2=df_filtered['r1'],
)
df_filtered_flipped

In [None]:
from rdkit import Chem
from rdkit.Chem.Descriptors import MolLogP

In [None]:
# now, add the features for each monomer by loading the corresponding JSON file with `molecular_features` based on the monomer{i}_s column
def add_molecular_features(df):
    """Attach monomer and solvent descriptors to every reaction row.

    For each row of ``df`` the result contains:

    * all original columns of the row;
    * the output of ``molecular_features(row['monomer1_s'])`` with ``_1`` appended
      to every key, and of ``molecular_features(row['monomer2_s'])`` with ``_2``;
    * ``polymerization_method``, a copy of the row's ``polymerization_type``;
    * ``solvent_logp``, RDKit's ``MolLogP`` of the molecule parsed from the
      row's ``solvent`` SMILES.

    Rows for which a monomer property file does not exist are reported on stdout
    and left out, so the returned frame can be shorter than ``df``. The returned
    frame has a fresh 0..n-1 index.

    Raises:
        ValueError: if a solvent string cannot be parsed as SMILES.
    """
    # The same few solvents repeat across rows, so compute each logP only once
    logp_by_solvent = {}
    new_rows = []

    for _, row in df.iterrows():
        solvent = row['solvent']
        if solvent not in logp_by_solvent:
            solvent_mol = Chem.MolFromSmiles(solvent)
            # MolFromSmiles returns None instead of raising on an invalid SMILES;
            # fail here with a message that names the offending solvent
            if solvent_mol is None:
                raise ValueError(f"Could not parse solvent SMILES: {solvent!r}")
            logp_by_solvent[solvent] = MolLogP(solvent_mol)

        try:
            monomer1_data = molecular_features(row['monomer1_s'])
            monomer2_data = molecular_features(row['monomer2_s'])
        except FileNotFoundError as e:
            # No descriptor file for one of the monomers: skip this reaction
            print(f"File not found: {e}")
            continue

        # add _1 to the keys of monomer1_data and _2 to the keys of monomer2_data
        monomer1_data = {f'{k}_1': v for k, v in monomer1_data.items()}
        monomer2_data = {f'{k}_2': v for k, v in monomer2_data.items()}

        new_rows.append({
            **row,
            **monomer1_data,
            **monomer2_data,
            'polymerization_method': row['polymerization_type'],
            'solvent_logp': logp_by_solvent[solvent],
        })

    return pd.DataFrame(new_rows)

In [None]:
# Attach monomer / solvent descriptors to the original and to the flipped dataset.
# Both frames go through the same function, and a row is skipped when the property
# file of either of its monomers is missing.
df_filtered = add_molecular_features(df_filtered)
df_filtered_flipped = add_molecular_features(df_filtered_flipped)

In [None]:
# Number of reactions left after attaching descriptors (rows whose monomer
# property files were not found have been skipped).
len(df_filtered)

In [None]:
# Inspect the reaction table including the added descriptor columns.
df_filtered

## Modeling

In [None]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Feature groups and target name are the same for every fold, so they are defined once.
# NOTE(review): 'best_conformer_energy_1' has no '_2' counterpart in this list, so the two
# monomers are not described symmetrically - confirm whether that is intended.
numerical_features = ['temperature',
   'ip_corrected_1', 'ea_1', 'homo_1', 'lumo_1',
   'global_electrophilicity_1', 'global_nucleophilicity_1',
   'best_conformer_energy_1', 'charges_min_1', 'charges_max_1',
   'charges_mean_1', 'fukui_electrophilicity_min_1',
   'fukui_electrophilicity_max_1', 'fukui_electrophilicity_mean_1',
   'fukui_nucleophilicity_min_1', 'fukui_nucleophilicity_max_1',
   'fukui_nucleophilicity_mean_1', 'fukui_radical_min_1',
   'fukui_radical_max_1', 'fukui_radical_mean_1', 'dipole_x_1',
   'dipole_y_1', 'dipole_z_1',  'ip_corrected_2', 'ea_2', 'homo_2',
   'lumo_2', 'global_electrophilicity_2', 'global_nucleophilicity_2',  'charges_min_2', 'charges_max_2',
   'charges_mean_2', 'fukui_electrophilicity_min_2',
   'fukui_electrophilicity_max_2', 'fukui_electrophilicity_mean_2',
   'fukui_nucleophilicity_min_2', 'fukui_nucleophilicity_max_2',
   'fukui_nucleophilicity_mean_2', 'fukui_radical_min_2',
   'fukui_radical_max_2', 'fukui_radical_mean_2', 'dipole_x_2',
   'dipole_y_2', 'dipole_z_2', 'solvent_logp']

# NOTE(review): 'polymerization_method' is filled from 'polymerization_type' in
# add_molecular_features, so these two columns carry the same information.
categorical_features = ['temperature_unit', 'solvent', 'calculation_method', 'polymerization_type', 'polymerization_method']

# Regression target: the product of the two r-values
label = 'r1r2'

# The same positional fold indices are applied to the original and to the flipped frame,
# which keeps a reaction and its mirrored copy on the same side of every split. That only
# works if both frames have the same number of rows.
assert len(df_filtered) == len(df_filtered_flipped), \
    "df_filtered and df_filtered_flipped must have the same number of rows"

fold_scores = []

for train_idx, test_idx in KFold(n_splits=10).split(df_filtered):
    train = df_filtered.iloc[train_idx]
    test = df_filtered.iloc[test_idx]

    # Augment both splits with the monomer-swapped version of the very same reactions
    train_flipped = df_filtered_flipped.iloc[train_idx]
    test_flipped = df_filtered_flipped.iloc[test_idx]
    train = pd.concat([train, train_flipped])
    test = pd.concat([test, test_flipped])

    train[label] = train['r1'] * train['r2']
    test[label] = test['r1'] * test['r2']

    # Scale numerical columns and one-hot encode categorical ones.
    # sparse_threshold=0 forces a dense result: with many one-hot columns the default
    # threshold can switch the output to a sparse matrix, which
    # HistGradientBoostingRegressor does not accept.
    transformer = ColumnTransformer([
        ('numerical', StandardScaler(), numerical_features),
        ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ], sparse_threshold=0)

    # Fit on the training split only, then apply to both splits
    transformer.fit(train)
    train_transformed = transformer.transform(train)
    test_transformed = transformer.transform(test)

    # Names of the transformed columns (not used by the fit itself, kept for inspection)
    feature_names = transformer.get_feature_names_out()

    # Power transform the label; the transform is fitted on the training labels only.
    # From here on train[label] / test[label] hold the transformed target, so the scores
    # below are measured in transformed space.
    pt = PowerTransformer(method='yeo-johnson')
    pt.fit(train[label].values.reshape(-1, 1))

    # ravel() turns the (n, 1) transformer output back into a 1-D column
    train[label] = pt.transform(train[label].values.reshape(-1, 1)).ravel()
    test[label] = pt.transform(test[label].values.reshape(-1, 1)).ravel()

    # Fixed seed so that any internal randomness of the model is reproducible
    model = HistGradientBoostingRegressor(random_state=0)
    model.fit(train_transformed, train[label])
    y_pred = model.predict(test_transformed)
    mse = mean_squared_error(test[label], y_pred)
    r2 = r2_score(test[label], y_pred)
    print(f"MSE: {mse}, R2: {r2}")
    fold_scores.append((mse, r2))

# Summary over all folds
mean_mse = sum(fold_mse for fold_mse, _ in fold_scores) / len(fold_scores)
mean_r2 = sum(fold_r2 for _, fold_r2 in fold_scores) / len(fold_scores)
print(f"Mean over {len(fold_scores)} folds - MSE: {mean_mse}, R2: {mean_r2}")