In [98]:
import pandas as pd 
import json

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt

In [99]:
def load_molecular_data(smiles):
    with open(f'molecule_properties/{smiles}.json', 'r') as handle:
        d = json.load(handle)

    # for charges, fukui_electrophilicity, fukui_nucleophilicity, fukui_radical
    # it is a dict. take the min, max, mean of the values of the dict and make a new key/value pairs in the root 
    # of d
    for key in ['charges', 'fukui_electrophilicity', 'fukui_nucleophilicity', 'fukui_radical']:
        d[key + '_min'] = min(d[key].values())
        d[key + '_max'] = max(d[key].values())
        d[key + '_mean'] = sum(d[key].values()) / len(d[key].values())

    d['dipole_x'] = d['dipole'][0]
    d['dipole_y'] = d['dipole'][1]
    d['dipole_z'] = d['dipole'][2]

    return d

In [100]:
def molecular_features(smiles):
    d = load_molecular_data(smiles)
    # select only the keys for which we have float values
    d = {k: v for k, v in d.items() if isinstance(v, float)}
    return d

In [101]:
molecular_features('C_C=C_C')

{'ip': 13.682973015266317,
 'ip_corrected': 8.836973015266317,
 'ea': 1.2218398518750326,
 'homo': -0.3816405868399534,
 'lumo': -0.1874791546885192,
 'global_electrophilicity': 0.27258173104659283,
 'global_nucleophilicity': -8.836973015266317,
 'best_conformer_energy': -12.612394464140023,
 'charges_min': -0.10224603368115683,
 'charges_max': 0.04109184075729364,
 'charges_mean': -1.9024134119878985e-16,
 'fukui_electrophilicity_min': -0.05083731556660036,
 'fukui_electrophilicity_max': 0.15345645443935063,
 'fukui_electrophilicity_mean': 0.08333333333333313,
 'fukui_nucleophilicity_min': -0.026684857821392627,
 'fukui_nucleophilicity_max': 0.13523914792194044,
 'fukui_nucleophilicity_mean': 0.08333333333340077,
 'fukui_radical_min': -0.038761080705294174,
 'fukui_radical_max': 0.11937515617831551,
 'fukui_radical_mean': 0.08333333333336708,
 'dipole_x': -3.718452829648182e-07,
 'dipole_y': -3.899439447074193e-07,
 'dipole_z': -2.845134707431372e-09}

In [102]:
with open('processed_reactions/all_reactions.json', 'r') as file:
    data = json.load(file)

def is_within_deviation(actual_product, expected_product, deviation=0.10):
    if expected_product == 0:
        return actual_product == 0
    return abs(actual_product - expected_product) / abs(expected_product) <= deviation


for entry in data:
    r1 = entry['r_values'].get('constant_1')
    r2 = entry['r_values'].get('constant_2')
    r_product = entry.get('r-product')
    
    if r_product is None:
        entry['r-product_filter'] = False
        continue
    
    actual_product = r1 * r2
    
    # Check for division by zero
    if r_product == 0:
        deviation = float('inf') if actual_product != 0 else 0
    else:
        deviation = abs(actual_product - r_product) / abs(r_product)
    
    if is_within_deviation(actual_product, r_product):
        entry['r-product_filter'] = False
    else:
        entry['r-product_filter'] = True # reaction should be filtered out


def filter_conf_intervals(row):
    if 'conf_intervals' in row and 'constant_conf_1' in row['conf_intervals'] and 'constant_conf_2' in row['conf_intervals']:
        conf_1 = row['conf_intervals']['constant_conf_1']
        conf_2 = row['conf_intervals']['constant_conf_2']
        
        # Ensure 'r1' and 'r2' are correctly retrieved from the row
        r1 = row.get('r_values', {}).get('constant_1')
        r2 = row.get('r_values', {}).get('constant_2')
        
        if r1 is not None and r2 is not None and conf_1 is not None and conf_2 is not None:
            # Filter condition: Confidence intervals should not be greater than the corresponding r-values
            return (conf_1 <= 1 * r1) and (conf_2 <= 1 * r2)
    
    # If conditions are not met, return True by default, meaning the row will not be filtered out
    return True

In [103]:
# Convert JSON data to DataFrame
df_full = pd.DataFrame(data)

print('Initial datapoints: ', len(df_full))
df_full = df_full[df_full.apply(filter_conf_intervals, axis=1)]
print('Datapoints after confidence filter:', len(df_full))

# Separate the filtered data
df_filtered = df_full[df_full['r-product_filter'] == False]
print('Datapoints after r-product filter:', len(df_filtered))

Initial datapoints:  1138
Datapoints after confidence filter: 1060
Datapoints after r-product filter: 1037


In [112]:
df_filtered['r1'] = df_filtered['r_values'].apply(lambda x: x['constant_1'] if isinstance(x, dict) and 'constant_1' in x else None)
df_filtered['r2'] = df_filtered['r_values'].apply(lambda x: x['constant_2'] if isinstance(x, dict) and 'constant_2' in x else None)

df_filtered.dropna(subset=['r1', 'r2', 'solvent', 'monomer1_s', 'monomer2_s', 'temperature', 'calculation_method', 'polymerization_type'], inplace=True)

df_filtered.drop(columns=['r-product_filter', 'r_values', 'r-product', 'monomer1_data', 'monomer2_data', 'conf_intervals'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['r1'] = df_filtered['r_values'].apply(lambda x: x['constant_1'] if isinstance(x, dict) and 'constant_1' in x else None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['r2'] = df_filtered['r_values'].apply(lambda x: x['constant_2'] if isinstance(x, dict) and 'constant_2' in x else None)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [113]:
df_filtered_flipped = []

# create the same rows, but with flipped monomers, i.e. monomer1_s <-> monomer2_s, monomer1 <-> monomer2, and r1 <-> r2
for index, row in df_filtered.iterrows():
    flipped_row = row.copy()
    flipped_row['monomer1_s'] = row['monomer2_s']
    flipped_row['monomer2_s'] = row['monomer1_s']
    flipped_row['monomer1'] = row['monomer2']
    flipped_row['monomer2'] = row['monomer1']
    flipped_row['r1'] = row['r2']
    flipped_row['r2'] = row['r1']
    df_filtered_flipped.append(flipped_row)

df_filtered_flipped = pd.DataFrame(df_filtered_flipped)
df_filtered_flipped

Unnamed: 0,file,monomer1_s,monomer2_s,monomer1,monomer2,temperature,temperature_unit,solvent,method,source,calculation_method,polymerization_type,logP,r1,r2
0,paper01.json,C=Cc1ccccc1,C=C(C)C(=O)O,styrene,methacrylic acid,60.0,°C,ClC(Cl)(Cl)Cl,solvent,https://doi.org/10.1002/macp.1985.021860819,Kelen-Tudor,free radical,2.55290,0.06,0.54
1,paper01.json,C=Cc1ccccc1,C=C(C)C(=O)O,styrene,methacrylic acid,60.0,°C,ClC(Cl)Cl,solvent,https://doi.org/10.1002/macp.1985.021860819,Kelen-Tudor,free radical,1.98640,0.08,0.51
2,paper01.json,C=Cc1ccccc1,C=C(C)C(=O)O,styrene,methacrylic acid,60.0,°C,CC(C)=O,solvent,https://doi.org/10.1002/macp.1985.021860819,Kelen-Tudor,free radical,0.59530,0.65,0.43
3,paper01.json,C=Cc1ccccc1,C=C(C)C(=O)O,styrene,methacrylic acid,60.0,°C,C1COCCO1,solvent,https://doi.org/10.1002/macp.1985.021860819,Kelen-Tudor,free radical,0.03320,0.59,0.41
4,paper01.json,C=Cc1ccccc1,C=C(C)C(=O)O,styrene,methacrylic acid,60.0,°C,CC#N,solvent,https://doi.org/10.1002/macp.1985.021860819,Kelen-Tudor,free radical,0.52988,0.29,0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1133,paper_99.json,C=CC#N,C=CC(=O)OCCCCCCCC,Acrylonitrile,Octyl acrylate,60.0,°C,CC(C)(C)O,solvent,https://doi.org/10.1002/app.1960.070040111,Fineman and Ross,free radical,0.77720,0.83,1.93
1134,paper_99.json,C=CC#N,C=CC(=O)OCCCCCCCCCCCCCCCCCC,Acrylonitrile,Octadecyl acrylate,60.0,°C,CC(C)(C)O,solvent,https://doi.org/10.1002/app.1960.070040111,Fineman and Ross,free radical,0.77720,0.68,1.74
1135,paper_99.json,C=C(Cl)Cl,C=CC(=O)OCCCC,Vinylidene chloride,Butyl acrylate,50.0,°C,CC(C)(C)O,solvent,https://doi.org/10.1002/app.1960.070040111,Fineman and Ross,free radical,0.77720,0.83,0.88
1136,paper_99.json,C=C(Cl)Cl,C=CC(=O)OCCCCCCCC,Vinylidene chloride,Octyl acrylate,50.0,°C,CC(C)(C)O,solvent,https://doi.org/10.1002/app.1960.070040111,Fineman and Ross,free radical,0.77720,0.70,0.87


In [114]:
from rdkit import Chem
from rdkit.Chem.Descriptors import MolLogP

In [115]:
# now, add the features for each monomer by loading the corresponding JSON file with `molecular_features` based on the monomer{i}_s column
def add_molecular_features(df): 
    new_rows = []
    for index, row in df.iterrows():
        try:
            monomer1_s = row['monomer1_s']
            monomer2_s = row['monomer2_s']
            monomer1 = row['monomer1']
            monomer2 = row['monomer2']
            temperature = row['temperature']
            solvent = row['solvent']
            calculation_method = row['calculation_method']
            polymerization_method = row['polymerization_type']
            solvent_logp = MolLogP(Chem.MolFromSmiles(solvent))

            monomer1_data = molecular_features(monomer1_s)
            monomer2_data = molecular_features(monomer2_s)
            
            # add _1 to the keys of monomer1_data and _2 to the keys of monomer2_data
            monomer1_data = {f'{k}_1': v for k, v in monomer1_data.items()}
            monomer2_data = {f'{k}_2': v for k, v in monomer2_data.items()}

            # now, create new dict with all the data 
            new_row = {**row, **monomer1_data, **monomer2_data, 'temperature': temperature, 'solvent': solvent, 'calculation_method': calculation_method, 'polymerization_method': polymerization_method, 'solvent_logp': solvent_logp}

            new_rows.append(new_row)
        except FileNotFoundError as e: 
            print(f"File not found: {e}")
    return pd.DataFrame(new_rows) 

In [116]:
df_filtered = add_molecular_features(df_filtered)
df_filtered_flipped = add_molecular_features(df_filtered_flipped)

File not found: [Errno 2] No such file or directory: 'molecule_properties/C=Cc1cc(O)ccc1O.O=C(O)c1ccccc1.O=C(O)c1ccccc1.json'
File not found: [Errno 2] No such file or directory: 'molecule_properties/C=Cc1cc(O)ccc1O.O=C(O)c1ccccc1.O=C(O)c1ccccc1.json'
File not found: [Errno 2] No such file or directory: 'molecule_properties/C/C=C/C.json'
File not found: [Errno 2] No such file or directory: 'molecule_properties/C/C=C/C.json'
File not found: [Errno 2] No such file or directory: 'molecule_properties/C/C=C\\C.json'
File not found: [Errno 2] No such file or directory: 'molecule_properties/C/C=C\\C.json'
File not found: [Errno 2] No such file or directory: 'molecule_properties/C=CC(=O)[O-].C=CC(=O)[O-].[Zn+2].json'
File not found: [Errno 2] No such file or directory: 'molecule_properties/O=C(O)/C=C/C=C/C(=O)O.json'
File not found: [Errno 2] No such file or directory: 'molecule_properties/O=C(O)/C=C/C=C/C(=O)O.json'
File not found: [Errno 2] No such file or directory: 'molecule_properties/CCO

In [117]:
len(df_filtered)

535

In [118]:
df_filtered

Unnamed: 0,file,monomer1_s,monomer2_s,monomer1,monomer2,temperature,temperature_unit,solvent,method,source,...,fukui_nucleophilicity_max_2,fukui_nucleophilicity_mean_2,fukui_radical_min_2,fukui_radical_max_2,fukui_radical_mean_2,dipole_x_2,dipole_y_2,dipole_z_2,polymerization_method,solvent_logp
0,paper01.json,C=C(C)C(=O)O,C=Cc1ccccc1,methacrylic acid,styrene,60.0,°C,ClC(Cl)(Cl)Cl,solvent,https://doi.org/10.1002/macp.1985.021860819,...,0.095350,0.062500,0.028710,0.097048,0.062500,-0.005997,0.000196,-0.001846,free radical,2.55290
1,paper01.json,C=C(C)C(=O)O,C=Cc1ccccc1,methacrylic acid,styrene,60.0,°C,ClC(Cl)Cl,solvent,https://doi.org/10.1002/macp.1985.021860819,...,0.095350,0.062500,0.028710,0.097048,0.062500,-0.005997,0.000196,-0.001846,free radical,1.98640
2,paper01.json,C=C(C)C(=O)O,C=Cc1ccccc1,methacrylic acid,styrene,60.0,°C,CC(C)=O,solvent,https://doi.org/10.1002/macp.1985.021860819,...,0.095350,0.062500,0.028710,0.097048,0.062500,-0.005997,0.000196,-0.001846,free radical,0.59530
3,paper01.json,C=C(C)C(=O)O,C=Cc1ccccc1,methacrylic acid,styrene,60.0,°C,C1COCCO1,solvent,https://doi.org/10.1002/macp.1985.021860819,...,0.095350,0.062500,0.028710,0.097048,0.062500,-0.005997,0.000196,-0.001846,free radical,0.03320
4,paper01.json,C=C(C)C(=O)O,C=Cc1ccccc1,methacrylic acid,styrene,60.0,°C,CC#N,solvent,https://doi.org/10.1002/macp.1985.021860819,...,0.095350,0.062500,0.028710,0.097048,0.062500,-0.005997,0.000196,-0.001846,free radical,0.52988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530,paper_99.json,C=CC(=O)OCCCCCCCC,C=CC#N,Octyl acrylate,Acrylonitrile,60.0,°C,CC(C)(C)O,solvent,https://doi.org/10.1002/app.1960.070040111,...,0.334867,0.142857,0.073560,0.307145,0.142857,-1.368240,0.419375,0.307398,free radical,0.77720
531,paper_99.json,C=CC(=O)OCCCCCCCCCCCCCCCCCC,C=CC#N,Octadecyl acrylate,Acrylonitrile,60.0,°C,CC(C)(C)O,solvent,https://doi.org/10.1002/app.1960.070040111,...,0.334867,0.142857,0.073560,0.307145,0.142857,-1.368240,0.419375,0.307398,free radical,0.77720
532,paper_99.json,C=CC(=O)OCCCC,C=C(Cl)Cl,Butyl acrylate,Vinylidene chloride,50.0,°C,CC(C)(C)O,solvent,https://doi.org/10.1002/app.1960.070040111,...,0.334595,0.166667,0.057445,0.287168,0.166667,-0.552206,-0.056956,0.110539,free radical,0.77720
533,paper_99.json,C=CC(=O)OCCCCCCCC,C=C(Cl)Cl,Octyl acrylate,Vinylidene chloride,50.0,°C,CC(C)(C)O,solvent,https://doi.org/10.1002/app.1960.070040111,...,0.334595,0.166667,0.057445,0.287168,0.166667,-0.552206,-0.056956,0.110539,free radical,0.77720


## Modeling

In [120]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

for train_idx, test_idx in KFold(n_splits=10).split(df_filtered):
    train = df_filtered.iloc[train_idx]
    test = df_filtered.iloc[test_idx]
    
    train_flipped = df_filtered_flipped.iloc[train_idx]
    test_flipped = df_filtered_flipped.iloc[test_idx]
    train = pd.concat([train, train_flipped])
    test = pd.concat([test, test_flipped])

    train['r1r2'] = train['r1'] * train['r2']
    test['r1r2'] = test['r1'] * test['r2']

    # Separate numerical and categorical features
    numerical_features = ['temperature', 
       'ip_corrected_1', 'ea_1', 'homo_1', 'lumo_1',
       'global_electrophilicity_1', 'global_nucleophilicity_1',
       'best_conformer_energy_1', 'charges_min_1', 'charges_max_1',
       'charges_mean_1', 'fukui_electrophilicity_min_1',
       'fukui_electrophilicity_max_1', 'fukui_electrophilicity_mean_1',
       'fukui_nucleophilicity_min_1', 'fukui_nucleophilicity_max_1',
       'fukui_nucleophilicity_mean_1', 'fukui_radical_min_1',
       'fukui_radical_max_1', 'fukui_radical_mean_1', 'dipole_x_1',
       'dipole_y_1', 'dipole_z_1',  'ip_corrected_2', 'ea_2', 'homo_2',
       'lumo_2', 'global_electrophilicity_2', 'global_nucleophilicity_2',  'charges_min_2', 'charges_max_2',
       'charges_mean_2', 'fukui_electrophilicity_min_2',
       'fukui_electrophilicity_max_2', 'fukui_electrophilicity_mean_2',
       'fukui_nucleophilicity_min_2', 'fukui_nucleophilicity_max_2',
       'fukui_nucleophilicity_mean_2', 'fukui_radical_min_2',
       'fukui_radical_max_2', 'fukui_radical_mean_2', 'dipole_x_2',
       'dipole_y_2', 'dipole_z_2', 'solvent_logp']
    
    categorical_features = ['temperature_unit', 'solvent', 'calculation_method', 'polymerization_type', 'polymerization_method']
  
    # Use column transformation
    transformer = ColumnTransformer([
        ('numerical', StandardScaler(), numerical_features),
        ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
    
    # Fit the transformer to the train data
    transformer.fit(train)
    
    # Transform the train and test data
    train_transformed = transformer.transform(train)
    test_transformed = transformer.transform(test)
    
    feature_names = transformer.get_feature_names_out()
    label = 'r1r2'

    # power transform the label
    pt = PowerTransformer(method='yeo-johnson')
    pt.fit(train[label].values.reshape(-1, 1))

    train[label] = pt.transform(train[label].values.reshape(-1, 1))
    test[label] = pt.transform(test[label].values.reshape(-1, 1))
    
    model = HistGradientBoostingRegressor()
    model.fit(train_transformed, train[label])
    y_pred = model.predict(test_transformed)
    mse = mean_squared_error(test[label], y_pred)
    r2 = r2_score(test[label], y_pred)
    print(f"MSE: {mse}, R2: {r2}")

MSE: 0.8654872304613355, R2: 0.11338807629113667
MSE: 0.6418953414035224, R2: 0.2554662619283343
MSE: 0.7727371013025197, R2: 0.2909045740548504
MSE: 0.5427371991577247, R2: 0.5150527013016566
MSE: 0.7902721554318949, R2: 0.3469681580551546
MSE: 0.7420987709960015, R2: -0.34608244513199793
MSE: 1.2266112722460636, R2: -0.1982477295812295
MSE: 0.4821799881230051, R2: 0.5412702716935469
MSE: 0.7626146455596584, R2: 0.005818656958897828
MSE: 0.5874720336953873, R2: 0.11064168243801398
