In [7]:
import os
import pandas as pd
import numpy as np
import plotly.express as px

# Display the updated DataFrame in the notebook
from IPython.display import display

# Notebook-wide configuration.
# Seed intended for any stochastic step; NOTE(review): not referenced in the cells visible here.
random_seed = 42
# Directory (relative to the working directory) holding the input workbook and the pickle outputs.
data_folder = "data"

### Load the data

In [8]:
# Source workbook and the pickle files that cache each sheet.
file_path = os.path.join(data_folder, "Summary_Properties_all_species_published_v3.xlsx")
properties_pickle_path = os.path.join(data_folder, "Select_properties.pkl")
yields_pickle_path = os.path.join(data_folder, "yields.pkl")

# Sheet "Properties_renamed" -> computed properties, one row per compound.
select_properties_df = pd.read_excel(file_path, sheet_name="Properties_renamed")
select_properties_df.to_pickle(properties_pickle_path)

# Sheet "Yields" -> reported yields, one column per literature method.
yields_df = pd.read_excel(file_path, sheet_name="Yields")
df = yields_df  # alias bound by the original cell; kept in case later cells use it
yields_df.to_pickle(yields_pickle_path)

# Show the first three rows of each table as a quick sanity check.
display(select_properties_df.head(3), yields_df.head(3))

Unnamed: 0,Compound_Name,BDE (kcal/mol),BDFE (kcal/mol),E_spc (Hartree),E_spc (Hartree)_anion,E_spc (Hartree)_openshell,H_spc(Hartree),H_spc(Hartree)_anion,H_spc(Hartree)_openshell,qh_G(T)_spc(Hartree),...,C1_Electro-Valency_Boltz,C1_Electro-Valency_Boltz_anion,C1_FormalCharge_Boltz,C1_FormalCharge_Boltz_anion,C2_Co-Valency_Boltz,C2_Co-Valency_Boltz_anion,C2_Electro-Valency_Boltz,C2_Electro-Valency_Boltz_anion,C2_FormalCharge_Boltz,C2_FormalCharge_Boltz_anion
0,Het001,90.96998,83.154355,-441.215801,-440.620459,-440.561375,-441.043098,-440.463017,-440.40235,-441.085759,...,0.6784,0.5491,0.0006,-0.2896,3.5377,3.4047,0.4177,0.5367,-0.0308,-0.0333
1,Het002,89.224877,81.40235,-441.212895,-440.61349,-440.561089,-441.040035,-440.456235,-440.402068,-441.082728,...,0.689,0.6075,-0.0078,-0.2527,3.7837,3.5178,0.1419,0.3552,-0.0467,-0.0673
2,Het003,90.373846,82.194266,-441.213933,-440.625214,-440.560232,-441.04084,-440.467396,-440.401042,-441.082982,...,0.6948,0.5667,0.0212,-0.2805,3.8102,3.5553,0.1638,0.3499,-0.0203,-0.0444


Unnamed: 0,SMILES,id,Heterobenzylic_Cl_Pdt,Britton,Maity2AP Stahl,Maity4AP Stahl,Golden Stahl,DeLuca,Fujisaki,Newkome,Xu Zhang,Schreiner,Chen,Wu,Lopez Stahl,Ariarfard,Kanai
0,CC1=NC2=C(C=CC=C2)C=C1,Het001,1b,37.0,0.0,39.0,50.0,83.0,6.0,33.0,46.0,50.0,21.0,8.9,3.0,24.0,49.0
1,CC1=CC(C=CC=C2)=C2N=C1,Het002,2b,0.5,0.0,0.0,48.0,11.0,8.1,11.5,0.0,2.8,2.7,6.7,0.0,3.1,5.4
2,CC1=CC=NC2=C1C=CC=C2,Het003,3b,13.0,12.0,22.0,6.0,28.0,6.1,2.9,18.0,12.0,5.2,8.5,0.0,14.0,22.0


In [9]:
# Prepare data for the grid plot.
# Only the numeric columns are kept, which drops the text metadata columns
# (SMILES, id, product label) from the yields table.
yield_data_df = yields_df.select_dtypes(include=[np.number])
yield_columns = yield_data_df.columns
display(yield_data_df.head(3))
# Cache the numeric yields table as a pickle file.
yield_data_df.to_pickle(os.path.join(data_folder, "yield_data_df.pkl"))

# Same treatment for the properties table: numeric columns only, then cache.
select_properties_data_df = select_properties_df.select_dtypes(include=[np.number])
select_properties_data_columns = select_properties_data_df.columns
display(select_properties_data_df.head(3))
select_properties_data_df.to_pickle(os.path.join(data_folder, "select_properties_data_df.pkl"))

Unnamed: 0,Britton,Maity2AP Stahl,Maity4AP Stahl,Golden Stahl,DeLuca,Fujisaki,Newkome,Xu Zhang,Schreiner,Chen,Wu,Lopez Stahl,Ariarfard,Kanai
0,37.0,0.0,39.0,50.0,83.0,6.0,33.0,46.0,50.0,21.0,8.9,3.0,24.0,49.0
1,0.5,0.0,0.0,48.0,11.0,8.1,11.5,0.0,2.8,2.7,6.7,0.0,3.1,5.4
2,13.0,12.0,22.0,6.0,28.0,6.1,2.9,18.0,12.0,5.2,8.5,0.0,14.0,22.0


Unnamed: 0,BDE (kcal/mol),BDFE (kcal/mol),E_spc (Hartree),E_spc (Hartree)_anion,E_spc (Hartree)_openshell,H_spc(Hartree),H_spc(Hartree)_anion,H_spc(Hartree)_openshell,qh_G(T)_spc(Hartree),qh_G(T)_spc(Hartree)_anion,...,C1_Electro-Valency_Boltz,C1_Electro-Valency_Boltz_anion,C1_FormalCharge_Boltz,C1_FormalCharge_Boltz_anion,C2_Co-Valency_Boltz,C2_Co-Valency_Boltz_anion,C2_Electro-Valency_Boltz,C2_Electro-Valency_Boltz_anion,C2_FormalCharge_Boltz,C2_FormalCharge_Boltz_anion
0,90.96998,83.154355,-441.215801,-440.620459,-440.561375,-441.043098,-440.463017,-440.40235,-441.085759,-440.504702,...,0.6784,0.5491,0.0006,-0.2896,3.5377,3.4047,0.4177,0.5367,-0.0308,-0.0333
1,89.224877,81.40235,-441.212895,-440.61349,-440.561089,-441.040035,-440.456235,-440.402068,-441.082728,-440.497959,...,0.689,0.6075,-0.0078,-0.2527,3.7837,3.5178,0.1419,0.3552,-0.0467,-0.0673
2,90.373846,82.194266,-441.213933,-440.625214,-440.560232,-441.04084,-440.467396,-440.401042,-441.082982,-440.509091,...,0.6948,0.5667,0.0212,-0.2805,3.8102,3.5553,0.1638,0.3499,-0.0203,-0.0444


### Remove highly correlated features

In [10]:
# Pairwise Pearson correlation between all numeric property columns.
# Nothing is removed here; this matrix is the input for the correlation filter.
corr_matrix = select_properties_data_df.corr(method="pearson")

In [11]:

# Heatmap of the absolute pairwise correlations between properties.
abs_corr_values = corr_matrix.abs().values
property_labels = corr_matrix.columns
# Scale the square figure with the number of properties (30 units per column).
figure_side = len(select_properties_data_columns) * 30

fig = px.imshow(
    abs_corr_values,
    x=property_labels,
    y=property_labels,
    labels={"x": "Property", "y": "Property", "color": "Correlation"},
    color_continuous_scale="Viridis",
    title="Correlation Matrix of Select Properties",
)

fig.update_layout(
    template="plotly",
    width=figure_side,
    height=figure_side,
    xaxis={"tickangle": 30},
    yaxis={"tickangle": 30},
)

fig.show()

In [12]:
# Keep only the strict upper triangle of the correlation matrix (k=1 excludes the
# diagonal) so that every column pair appears exactly once; all other cells become NaN.
# The mask uses the builtin `bool` dtype: the `np.bool` alias was removed in NumPy 1.24
# and raises AttributeError there.
upper_triangle_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_triangle_mask)
upper

Unnamed: 0,BDE (kcal/mol),BDFE (kcal/mol),E_spc (Hartree),E_spc (Hartree)_anion,E_spc (Hartree)_openshell,H_spc(Hartree),H_spc(Hartree)_anion,H_spc(Hartree)_openshell,qh_G(T)_spc(Hartree),qh_G(T)_spc(Hartree)_anion,...,C1_Electro-Valency_Boltz,C1_Electro-Valency_Boltz_anion,C1_FormalCharge_Boltz,C1_FormalCharge_Boltz_anion,C2_Co-Valency_Boltz,C2_Co-Valency_Boltz_anion,C2_Electro-Valency_Boltz,C2_Electro-Valency_Boltz_anion,C2_FormalCharge_Boltz,C2_FormalCharge_Boltz_anion
BDE (kcal/mol),,0.99531,0.367444,0.367501,0.367480,0.367341,0.367398,0.367379,0.367349,0.367412,...,0.918134,0.746933,0.589578,-0.483966,-0.085148,-0.379581,0.148446,0.411169,0.158058,-0.053261
BDFE (kcal/mol),,,0.392512,0.392562,0.392548,0.392408,0.392459,0.392445,0.392416,0.392473,...,0.932322,0.772010,0.581289,-0.455589,-0.094030,-0.382603,0.154108,0.406134,0.116606,-0.084873
E_spc (Hartree),,,,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.286469,0.510037,0.370614,0.014605,0.139140,-0.058893,-0.146442,-0.095811,-0.153298,-0.525813
E_spc (Hartree)_anion,,,,,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.286526,0.510068,0.370610,0.014401,0.139166,-0.058944,-0.146465,-0.095743,-0.153261,-0.525790
E_spc (Hartree)_openshell,,,,,,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.286503,0.510061,0.370633,0.014584,0.139135,-0.058908,-0.146434,-0.095793,-0.153289,-0.525807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C2_Co-Valency_Boltz_anion,,,,,,,,,,,...,,,,,,,-0.867506,-0.958035,0.142176,0.443639
C2_Electro-Valency_Boltz,,,,,,,,,,,...,,,,,,,,0.871600,-0.106758,-0.301277
C2_Electro-Valency_Boltz_anion,,,,,,,,,,,...,,,,,,,,,0.017921,-0.201301
C2_FormalCharge_Boltz,,,,,,,,,,,...,,,,,,,,,,0.607007


In [13]:
# Collect every column whose correlation with an earlier column exceeds the cutoff.
# The comparison uses the absolute value: a strongly negative correlation marks a
# column as just as redundant as a strongly positive one, and this matches the
# heatmap, which also shows absolute correlations. NaN cells (lower triangle and
# diagonal of `upper`) compare as False and are therefore ignored.
corr_cutoff = 0.98
to_drop = [column for column in upper.columns if (upper[column].abs() > corr_cutoff).any()]
# Drop the highly correlated features
select_properties_data_removed_highlycorr_df = select_properties_data_df.drop(columns=to_drop)
print(f"Number of columns to drop: {len(to_drop)}, remaining columns: {len(select_properties_data_removed_highlycorr_df.columns)}")
# Cache the de-correlated properties table as a pickle file.
select_properties_data_removed_highlycorr_df.to_pickle(os.path.join(data_folder, "select_properties_data_removed_highlycorr_df.pkl"))

Number of columns to drop: 20, remaining columns: 63
