In [1]:
import os, glob
import base64
import json
import graphviz
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import matplotlib.pyplot as plt
from matplotlib.colors import ColorConverter
from rdkit import Chem
from rdkit.Chem import Draw

# Display the updated DataFrame in the notebook
from IPython.display import display

# Presumably the seed for any stochastic steps; it is not referenced in the cells shown here.
random_seed = 42
# Directory (relative to the working directory) that holds the .pkl / .json input files read below.
data_folder = 'data'

In [2]:
# Read the pickled DataFrames stored in `data_folder`.
# Targets (left) and file names (right) are matched by position, in the same order.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
(
    select_properties_df,
    yields_df,
    yield_data_df,
    select_properties_data_df,
    select_properties_data_removed_highlycorr_df,
    custom_descriptors_df,
) = (
    pd.read_pickle(os.path.join(data_folder, pickle_name))
    for pickle_name in (
        'Select_properties.pkl',
        'Yields.pkl',
        'yield_data_df.pkl',
        'select_properties_data_df.pkl',
        'select_properties_data_removed_highlycorr_df.pkl',
        'custom_descriptors.pkl',
    )
)

def _load_json(file_name):
    """Parse the JSON file `file_name` located in `data_folder` and return the result.

    Parameters
    ----------
    file_name : str
        Name of the file inside `data_folder`.

    Returns
    -------
    object
        Whatever `json.load` produces for the file's content.
    """
    # Pin the encoding so decoding does not depend on the platform's locale default.
    with open(os.path.join(data_folder, file_name), 'r', encoding='utf-8') as f:
        return json.load(f)


# Load the four molecule-image JSON files (plain and captioned variants).
mol_image_paths = _load_json('mol_image_paths.json')
mol_image_data = _load_json('mol_image_data.json')
mol_image_paths_captioned = _load_json('mol_image_paths_captioned.json')
mol_image_data_captioned = _load_json('mol_image_data_captioned.json')
# Map each value of the "id" column to the RDKit molecule built from the row's SMILES string.
compound_to_mol = {
    compound_id: Chem.MolFromSmiles(smiles)
    for compound_id, smiles in zip(yields_df["id"], yields_df["SMILES"])
}

In [3]:
# Keep only the numeric columns of the custom descriptors, then show the result.
custom_descriptors_data_df = custom_descriptors_df.select_dtypes(include=["number"])
custom_descriptors_data_df

Unnamed: 0,primary,secondary,tertiary,N1_1,N1_2,N1_3,N2_1,N2_2,N2_3,phenyl,biphenyl,ring_size
0,1,0,0,1,0,0,0,0,0,0,1,6
1,1,0,0,1,0,0,0,0,0,0,1,6
2,1,0,0,1,0,0,0,0,0,0,1,6
3,1,0,0,1,0,0,0,0,0,0,1,6
4,0,0,1,1,0,0,0,0,0,0,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...
247,1,0,0,1,0,0,0,0,0,1,0,6
248,0,1,0,1,0,0,0,0,0,1,0,6
249,1,0,0,1,0,0,0,0,0,1,0,6
250,1,0,0,1,0,0,0,0,0,1,0,6


In [4]:
# helper: standardize each column to zero mean and unit standard deviation (z-score)
def standardize_data(df):
    """Return `df` with every column z-scored: (value - mean) / standard deviation.

    The standard deviation is whatever ``df.std()`` returns (pandas uses the
    sample standard deviation, ddof=1).

    A column with zero standard deviation (all values identical) would turn
    into NaN through 0/0; such columns are mapped to 0 instead, since every
    value already equals the mean.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Numeric data to standardize.

    Returns
    -------
    Same type as `df`
        Standardized copy; the input is not modified.
    """
    scale = df.std()
    if isinstance(scale, pd.Series):
        # DataFrame input: one std per column; neutralise the zero entries only.
        scale = scale.where(scale != 0, 1.0)
    elif scale == 0:
        # Series input: `std()` is a single scalar.
        scale = 1.0
    return (df - df.mean()) / scale

# z-score every numeric descriptor column (ring_size is still part of the frame here).
# NOTE(review): the least-squares fit further down uses the unstandardized frame.
custom_descriptors_data_standardized_df = custom_descriptors_data_df.pipe(standardize_data)

In [5]:
# Inspect the standardized descriptors.
custom_descriptors_data_standardized_df

Unnamed: 0,primary,secondary,tertiary,N1_1,N1_2,N1_3,N2_1,N2_2,N2_3,phenyl,biphenyl,ring_size
0,0.631199,-0.508889,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,-1.728611,1.728611,0.141995
1,0.631199,-0.508889,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,-1.728611,1.728611,0.141995
2,0.631199,-0.508889,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,-1.728611,1.728611,0.141995
3,0.631199,-0.508889,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,-1.728611,1.728611,0.141995
4,-1.577999,-0.508889,3.399113,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,-1.728611,1.728611,0.141995
...,...,...,...,...,...,...,...,...,...,...,...,...
247,0.631199,-0.508889,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,0.576204,-0.576204,0.141995
248,-1.577999,1.957266,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,0.576204,-0.576204,0.141995
249,0.631199,-0.508889,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,0.576204,-0.576204,0.141995
250,0.631199,-0.508889,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,0.576204,-0.576204,0.141995


In [6]:
# NOTE(review): this cell repeats the previous one verbatim; consider removing it.
custom_descriptors_data_standardized_df

Unnamed: 0,primary,secondary,tertiary,N1_1,N1_2,N1_3,N2_1,N2_2,N2_3,phenyl,biphenyl,ring_size
0,0.631199,-0.508889,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,-1.728611,1.728611,0.141995
1,0.631199,-0.508889,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,-1.728611,1.728611,0.141995
2,0.631199,-0.508889,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,-1.728611,1.728611,0.141995
3,0.631199,-0.508889,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,-1.728611,1.728611,0.141995
4,-1.577999,-0.508889,3.399113,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,-1.728611,1.728611,0.141995
...,...,...,...,...,...,...,...,...,...,...,...,...
247,0.631199,-0.508889,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,0.576204,-0.576204,0.141995
248,-1.577999,1.957266,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,0.576204,-0.576204,0.141995
249,0.631199,-0.508889,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,0.576204,-0.576204,0.141995
250,0.631199,-0.508889,-0.293027,0.843476,-0.618942,-0.400812,-0.202875,-0.359901,-0.284994,0.576204,-0.576204,0.141995


In [7]:
# Exclude ring_size from the descriptors used in the least-squares fit below.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
custom_descriptors_data_df = custom_descriptors_data_df.drop(columns=["ring_size"], errors="ignore")

In [8]:
# Inspect the yield table that serves as the right-hand side of the fit below.
yield_data_df

Unnamed: 0,Britton,Maity Stahl 2AP,Maity Stahl 4AP,DeLuca,Golden Stahl,Newkome,Li Xu,Schreiner
0,11.064759,16.616248,18.157002,20.925286,27.389265,9.284346,12.020000,23.471340
1,20.914884,6.897721,31.601265,70.746624,29.475752,0.697700,13.340000,24.785947
2,39.067826,78.611392,28.557671,32.555724,31.541641,21.070600,47.370000,56.752826
3,0.000000,4.374196,0.000000,3.896726,0.000000,24.263953,14.470000,3.891398
4,0.000000,2.272457,63.001771,30.797186,62.327967,25.801782,0.000000,28.485916
...,...,...,...,...,...,...,...,...
247,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
248,7.649008,5.982042,6.543653,12.004268,0.000000,7.109793,5.964534,4.473617
249,37.313310,44.363352,0.000000,9.193061,16.172496,17.925838,6.936924,67.092649
250,27.262636,6.821651,0.000000,5.331128,20.671050,21.421993,7.084672,41.457825


In [9]:
# Fit one weight vector per column of yield_data_df by least squares, i.e. find W with
#     custom_descriptors_data_df @ W  ~=  yield_data_df
# `w` keeps the full tuple returned by np.linalg.lstsq; the weight matrix itself is w[0].
w = np.linalg.lstsq(
    custom_descriptors_data_df.to_numpy(),
    yield_data_df.to_numpy(),
    rcond=None,
)

In [10]:
# Predicted yields: descriptors times the fitted weights, labelled like yield_data_df.
predicted_yield = custom_descriptors_data_df @ w[0]
predicted_yield.columns = yield_data_df.columns
# Squared prediction error per entry.
diff = predicted_yield.sub(yield_data_df).pow(2)
# Per-column RMSE divided by that column's mean yield, i.e. a *normalized* RMSE
# (a value of 1 means the error is as large as the average yield).
rmse = np.sqrt(diff.mean()).div(yield_data_df.mean())
rmse

Britton            1.858515
Maity Stahl 2AP    2.123859
Maity Stahl 4AP    2.240660
DeLuca             1.232739
Golden Stahl       1.820205
Newkome            1.138532
Li Xu              1.796083
Schreiner          1.554538
dtype: float64

In [11]:
# Wrap the fitted weights (w[0]) in a labelled DataFrame:
#   rows    -> descriptor names (columns of custom_descriptors_data_df)
#   columns -> columns of yield_data_df
# and display it transposed.
result = pd.DataFrame(
    data=w[0],
    index=custom_descriptors_data_df.columns,
    columns=yield_data_df.columns,
)
result.transpose()

Unnamed: 0,primary,secondary,tertiary,N1_1,N1_2,N1_3,N2_1,N2_2,N2_3,phenyl,biphenyl
Britton,3.632157,3.458474,-1.124277,4.851859,-1.302834,2.417329,-8.350924,-1.594941,-1.734736,-3.185893,9.152248
Maity Stahl 2AP,4.303056,6.368479,-0.98537,9.790012,-1.234709,1.130863,-11.005671,-1.378565,-10.202459,-4.269114,13.955279
Maity Stahl 4AP,2.037157,4.869989,4.432711,2.271952,-1.301282,10.369186,-5.522476,-1.07283,7.18811,-3.324682,14.664538
DeLuca,5.643497,9.093501,-0.820024,8.896383,3.45795,1.562642,-17.844385,-10.367315,-10.216201,0.305332,13.611643
Golden Stahl,1.381313,8.18248,1.613408,6.116031,5.274222,-0.213053,-10.405834,-9.942613,-7.53376,2.185283,8.991917
Newkome,4.929843,7.478267,1.617269,6.33534,3.759013,3.931026,-6.077628,-0.661846,-6.968368,3.619764,10.405615
Li Xu,3.255746,6.589109,2.078553,4.002838,-1.705717,9.626287,-8.224664,-0.219733,2.518849,-4.189497,16.112906
Schreiner,4.105575,6.382959,0.176594,6.710126,-0.619778,4.57478,-12.913274,-5.685858,-2.912227,-3.730017,14.395145
