In [1]:
import os
import pandas as pd
import numpy as np
import plotly.express as px

# `display` renders DataFrames as rich tables inside the notebook
from IPython.display import display

# Seed value for reproducibility; it is not consumed by any of the cells shown
# here -- TODO confirm where it is used.
random_seed = 42
# Folder (relative to the working directory) that holds the input workbook and
# receives the pickled DataFrames written by the cells below.
data_folder = "data"

### Load the data

In [2]:
# Location of the Excel workbook that contains every sheet read in this cell.
file_path = os.path.join(
    data_folder, "Summary_Properties_all_species_published_252mols.xlsx"
)

# Each sheet goes through the same steps: read it, replace missing values
# with 0, sanity-check that no nulls remain, and cache the result as a pickle.

# Sheet "Properties"
select_properties_df = pd.read_excel(file_path, sheet_name="Properties").fillna(0)
assert not select_properties_df.isnull().values.any()
select_properties_df.to_pickle(os.path.join(data_folder, "Select_properties.pkl"))

# Sheet "Yields"; `df` keeps referring to the frame exactly as read,
# i.e. before the missing values are filled.
df = pd.read_excel(file_path, sheet_name="Yields")
yields_df = df.fillna(0)
assert not yields_df.isnull().values.any()
yields_df.to_pickle(os.path.join(data_folder, "yields.pkl"))

# Sheet "Robustness_screen"
robussness_df = pd.read_excel(file_path, sheet_name="Robustness_screen").fillna(0)
assert not robussness_df.isnull().values.any()
robussness_df.to_pickle(os.path.join(data_folder, "robustness_screen.pkl"))

# Preview the first three rows of all three DataFrames
display(
    select_properties_df.head(3),
    yields_df.head(3),
    robussness_df.head(3),
)

Unnamed: 0,Compound_Name,BDE (kcal/mol),BDFE (kcal/mol),E_spc (Hartree),E_spc (Hartree)_anion,E_spc (Hartree)_openshell,H_spc(Hartree),H_spc(Hartree)_anion,H_spc(Hartree)_openshell,qh_G(T)_spc(Hartree),...,Sterimol_L_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_L_C1_C2(Amgstrom)_morfeus_Boltz_openshell,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz_openshell,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_openshell,pyramidalization_Gavrish_C1(deg)_Boltz,pyramidalization_Agranat-Radhakrishnan_C1_Boltz
0,Het036,90.912876,83.109802,-3014.81067,-3014.22574,-3014.156362,-3014.646485,-3014.076477,-3014.005828,-3014.693505,...,8.900579,8.783913,1.966938,1.736366,1.735037,5.920017,5.885243,5.946735,5.895945,0.798319
1,Het037,91.034613,83.325038,-716.522879,-715.951137,-715.868367,-716.328927,-715.772602,-715.688076,-716.380591,...,8.871687,8.774516,1.974611,1.755033,1.752779,6.797259,6.771417,6.903904,5.935005,0.80233
2,Het038,91.467594,83.692758,-1197.852061,-1197.276752,-1197.197092,-1197.699107,-1197.138324,-1197.057566,-1197.750542,...,8.947646,8.762297,1.979748,1.84511,1.838265,6.361003,6.182154,6.404913,5.882007,0.796915


Unnamed: 0,SMILES,Name,screening_id,id,HTE,Heteroaromatic_scaffold,Informer_Library,Britton,Maity Stahl 2AP,Maity Stahl 4AP,DeLuca,Golden Stahl,Newkome,Li Xu,Schreiner
0,Cc1ccc2cccc(Br)c2n1,8-bromo-2-methyl-quinoline,INF36,Het036,Selected,Quinoline,Fused,11.064759,16.616248,18.157002,20.925286,27.389265,9.284346,12.02,23.47134
1,Cc1ccc2ccc(F)c(B(O)O)c2n1,(7-fluoro-2-methyl-8-quinolyl)boronic acid,INF37,Het037,Selected,Quinoline,Fused,20.914884,6.897721,31.601265,70.746624,29.475752,0.6977,13.34,24.785947
2,Cc1cc(Cl)c2ccc(I)cc2n1,4-chloro-7-iodo-2-methyl-quinoline,INF38,Het038,Selected,Quinoline,Fused,39.067826,78.611392,28.557671,32.555724,31.541641,21.0706,47.37,56.752826


Unnamed: 0,Additive_smiles,Additive_name,Britton,Maity Stahl (2AP),Maity Stahl (4AP),Golden Stahl,DeLuca,Newkome,Li Xu,Schreiner,% Change LCAP in Chlorination Method
0,FC1=CC=CC=C1,ADT1,20.615155,1.800241,26.916542,9.264748,0.977556,0.939578,2.711945,0.571018,0.0
1,ClC1=CC=CC=C1,ADT2,23.015253,1.422139,31.109814,5.548014,8.991435,1.553725,2.403315,2.308031,0.0
2,BrC1=CC=CC=C1,ADT3,24.801006,4.924011,30.250697,6.665015,14.006317,4.089579,2.409219,1.981927,0.0


In [3]:
# Keep only the numeric columns of each table; text metadata columns
# (SMILES, names, ids, ...) are left out.

# Yields: numeric columns only, previewed and cached as a pickle.
yield_data_df = yields_df.select_dtypes(include=np.number)
yield_columns = yield_data_df.columns
display(yield_data_df.head(3))
yield_data_df.to_pickle(os.path.join(data_folder, "yield_data_df.pkl"))

# Properties: same treatment.
select_properties_data_df = select_properties_df.select_dtypes(include=np.number)
select_properties_data_columns = select_properties_data_df.columns
display(select_properties_data_df.head(3))
select_properties_data_df.to_pickle(
    os.path.join(data_folder, "select_properties_data_df.pkl")
)

Unnamed: 0,Britton,Maity Stahl 2AP,Maity Stahl 4AP,DeLuca,Golden Stahl,Newkome,Li Xu,Schreiner
0,11.064759,16.616248,18.157002,20.925286,27.389265,9.284346,12.02,23.47134
1,20.914884,6.897721,31.601265,70.746624,29.475752,0.6977,13.34,24.785947
2,39.067826,78.611392,28.557671,32.555724,31.541641,21.0706,47.37,56.752826


Unnamed: 0,BDE (kcal/mol),BDFE (kcal/mol),E_spc (Hartree),E_spc (Hartree)_anion,E_spc (Hartree)_openshell,H_spc(Hartree),H_spc(Hartree)_anion,H_spc(Hartree)_openshell,qh_G(T)_spc(Hartree),qh_G(T)_spc(Hartree)_anion,...,Sterimol_L_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_L_C1_C2(Amgstrom)_morfeus_Boltz_openshell,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz_openshell,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_openshell,pyramidalization_Gavrish_C1(deg)_Boltz,pyramidalization_Agranat-Radhakrishnan_C1_Boltz
0,90.912876,83.109802,-3014.81067,-3014.22574,-3014.156362,-3014.646485,-3014.076477,-3014.005828,-3014.693505,-3014.122552,...,8.900579,8.783913,1.966938,1.736366,1.735037,5.920017,5.885243,5.946735,5.895945,0.798319
1,91.034613,83.325038,-716.522879,-715.951137,-715.868367,-716.328927,-715.772602,-715.688076,-716.380591,-715.822869,...,8.871687,8.774516,1.974611,1.755033,1.752779,6.797259,6.771417,6.903904,5.935005,0.80233
2,91.467594,83.692758,-1197.852061,-1197.276752,-1197.197092,-1197.699107,-1197.138324,-1197.057566,-1197.750542,-1197.188905,...,8.947646,8.762297,1.979748,1.84511,1.838265,6.361003,6.182154,6.404913,5.882007,0.796915


### Remove highly correlated features

In [4]:
# Absolute pairwise correlation between all numeric property columns.
# Nothing is removed here; this matrix is the input for the plots and for the
# column filtering done in later cells.
corr_matrix = abs(select_properties_data_df.corr())

In [5]:
# Heat map of the complete absolute-correlation matrix
fig = px.imshow(
    corr_matrix.abs().to_numpy(),
    labels={"x": "Property", "y": "Property", "color": "Correlation"},
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    color_continuous_scale="Viridis",
    title="Correlation Matrix of Select Properties",
)

# Square figure: 30 px per property along each axis
fig_side_px = len(select_properties_data_columns) * 30
fig.update_layout(
    height=fig_side_px,
    width=fig_side_px,
    xaxis={"tickangle": 30},
    template="plotly",
    font={"size": 12, "family": "Arial", "color": "black", "weight": "bold"},
)

fig.update_coloraxes(colorbar_len=0.8, colorbar_title="Correlation")

fig.show()
# Also export to temp.html (auto_open=False: the file is not opened automatically)
fig.write_html("temp.html", auto_open=False)

In [6]:
# Heat map of the leading part of the correlation matrix: every property that
# comes before "SASA_sphericity_Boltz_openshell" (that column itself is excluded).
column_cutoff_index = select_properties_data_df.columns.get_loc(
    "SASA_sphericity_Boltz_openshell"
)
leading_corr = corr_matrix.iloc[:column_cutoff_index, :column_cutoff_index]
fig = px.imshow(
    leading_corr.abs().to_numpy(),
    labels={"x": "Property", "y": "Property", "color": "Correlation"},
    x=leading_corr.columns,
    y=leading_corr.columns,
    color_continuous_scale="Viridis",
    title="Correlation Matrix of Select Properties",
)

# Square figure: 25 px per plotted property along each axis
leading_side_px = column_cutoff_index * 25
fig.update_layout(
    height=leading_side_px,
    width=leading_side_px,
    xaxis={"tickangle": 30},
    font={"size": 11, "family": "Arial", "color": "black", "weight": "bold"},
)

fig.update_coloraxes(
    colorbar_len=0.7,
    colorbar_title="Correlation",
    colorbar_title_font_size=9,
    colorbar_tickfont_size=9,
)

fig.show()
# Also export to temp.html (auto_open=False: the file is not opened automatically)
fig.write_html("temp.html", auto_open=False)

In [7]:
# Heat map of the trailing part of the correlation matrix: the property
# "SASA_sphericity_Boltz_openshell" and every property after it.
column_cutoff_index = select_properties_data_df.columns.get_loc(
    "SASA_sphericity_Boltz_openshell"
)
trailing_corr = corr_matrix.iloc[column_cutoff_index:, column_cutoff_index:]
fig = px.imshow(
    trailing_corr.abs().values,
    labels=dict(x="Property", y="Property", color="Correlation"),
    x=trailing_corr.columns,
    y=trailing_corr.columns,
    color_continuous_scale="Viridis",
    title="Correlation Matrix of Select Properties",
)

# Size the square figure from the number of properties actually plotted
# (25 px each). The cutoff index counts the columns *before* this block, so
# using it here would scale the figure by the wrong number of rows/columns.
n_plotted = trailing_corr.shape[1]
fig.update_layout(
    height=n_plotted * 25,
    width=n_plotted * 25,
    xaxis=dict(tickangle=30),
    font=dict(size=11, family="Arial", color="black", weight="bold"),
)

fig.update_coloraxes(
    colorbar_len=0.6,
    colorbar_title="Correlation",
    colorbar_title_font_size=10,
    colorbar_tickfont_size=10,
)

fig.show()
# Also export to temp.html (auto_open=False: the file is not opened automatically)
fig.write_html("temp.html", auto_open=False)

In [8]:
# Keep only the strict upper triangle of the correlation matrix (k=1 skips the
# diagonal); every other entry becomes NaN, so each pair of properties is
# represented exactly once.
# The mask uses the builtin `bool`: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
upper

Unnamed: 0,BDE (kcal/mol),BDFE (kcal/mol),E_spc (Hartree),E_spc (Hartree)_anion,E_spc (Hartree)_openshell,H_spc(Hartree),H_spc(Hartree)_anion,H_spc(Hartree)_openshell,qh_G(T)_spc(Hartree),qh_G(T)_spc(Hartree)_anion,...,Sterimol_L_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_L_C1_C2(Amgstrom)_morfeus_Boltz_openshell,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz_openshell,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_openshell,pyramidalization_Gavrish_C1(deg)_Boltz,pyramidalization_Agranat-Radhakrishnan_C1_Boltz
BDE (kcal/mol),,0.989422,0.022494,0.022495,0.022499,0.022484,0.022484,0.022488,0.022484,0.022485,...,0.117698,0.110716,0.211905,0.097188,0.126565,0.008586,0.053980,0.079321,0.462053,0.472088
BDFE (kcal/mol),,,0.021528,0.021528,0.021532,0.021515,0.021515,0.021519,0.021515,0.021516,...,0.136076,0.128492,0.240856,0.119547,0.145535,0.011969,0.046833,0.075162,0.378906,0.389287
E_spc (Hartree),,,,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.050612,0.048870,0.041753,0.079013,0.070255,0.096561,0.096137,0.091882,0.077245,0.075914
E_spc (Hartree)_anion,,,,,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.050614,0.048873,0.041753,0.079013,0.070256,0.096563,0.096138,0.091883,0.077246,0.075916
E_spc (Hartree)_openshell,,,,,,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.050611,0.048869,0.041754,0.079013,0.070255,0.096561,0.096136,0.091882,0.077247,0.075917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz,,,,,,,,,,,...,,,,,,,0.952445,0.952478,0.054935,0.050398
Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_anion,,,,,,,,,,,...,,,,,,,,0.985590,0.055207,0.050915
Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_openshell,,,,,,,,,,,...,,,,,,,,,0.073628,0.070451
pyramidalization_Gavrish_C1(deg)_Boltz,,,,,,,,,,,...,,,,,,,,,,0.999201


In [9]:
# A column is dropped when its absolute correlation with any column that
# precedes it (the entries kept in `upper`) exceeds the cutoff.
corr_cutoff = 0.8
exceeds_cutoff = upper.abs().gt(corr_cutoff).any(axis=0)
to_drop = exceeds_cutoff[exceeds_cutoff].index.tolist()
# Report which columns are removed
print(f"Columns to drop: {to_drop}")
# Build the reduced properties frame without the highly correlated columns
select_properties_data_removed_highlycorr_df = select_properties_data_df.drop(
    columns=to_drop
)
print(f"Number of columns to drop: {len(to_drop)}, remaining columns: {select_properties_data_removed_highlycorr_df.shape[1]}")
# Cache the reduced properties frame as a pickle file
select_properties_data_removed_highlycorr_df.to_pickle(
    os.path.join(data_folder, "select_properties_data_removed_highlycorr_df.pkl")
)

Columns to drop: ['BDFE (kcal/mol)', 'E_spc (Hartree)_anion', 'E_spc (Hartree)_openshell', 'H_spc(Hartree)', 'H_spc(Hartree)_anion', 'H_spc(Hartree)_openshell', 'qh_G(T)_spc(Hartree)', 'qh_G(T)_spc(Hartree)_anion', 'qh_G(T)_spc(Hartree)_openshell', 'LUMO_Boltz_openshell', 'mu_Boltz', 'mu_Boltz_anion', 'mu_Boltz_openshell', 'omega_Boltz', 'omega_Boltz_anion', 'omega_Boltz_openshell', 'polar_iso(Debye)_Boltz_anion', 'polar_iso(Debye)_Boltz_openshell', 'polar_aniso(Debye)_Boltz_openshell', 'dipole(Debye)_Boltz_openshell', 'volume(Bohr_radius^3/mol)_Boltz', 'volume(Bohr_radius^3/mol)_Boltz_anion', 'volume(Bohr_radius^3/mol)_Boltz_openshell', 'SASA_surface_area(Amgstrom^3)_Boltz', 'SASA_surface_area(Amgstrom^3)_Boltz_anion', 'SASA_surface_area(Amgstrom^3)_Boltz_openshell', 'SASA_volume(Amgstrom^3)_Boltz', 'SASA_volume(Amgstrom^3)_Boltz_anion', 'SASA_volume(Amgstrom^3)_Boltz_openshell', 'SASA_sphericity_Boltz', 'SASA_sphericity_Boltz_anion', 'SASA_sphericity_Boltz_openshell', 'NBO_charge_C1_