In [1]:
import os
import pandas as pd
import numpy as np
import plotly.express as px

# Display the updated DataFrame in the notebook
from IPython.display import display

random_seed = 42
data_folder = "data"

### Load the data

In [2]:
file_path = os.path.join(
    data_folder, "Summary_Properties_all_species_published_252mols.xlsx"
)

# Load sheets "Select_Properties" and "Yields"
select_properties_df = pd.read_excel(file_path, sheet_name="Properties")
select_properties_df = select_properties_df.fillna(0)
assert not select_properties_df.isnull().values.any()
select_properties_df.to_pickle(os.path.join(data_folder, "Select_properties.pkl"))
yields_df = df = pd.read_excel(file_path, sheet_name="Yields")
yields_df = yields_df.fillna(0)
assert not yields_df.isnull().values.any()
yields_df.to_pickle(os.path.join(data_folder, "yields.pkl"))

# read the custom descriptor sheet
custom_descriptor_df = pd.read_excel(file_path, sheet_name="Custom descriptors")
assert not custom_descriptor_df.isnull().values.any()
custom_descriptor_df.to_pickle(os.path.join(data_folder, "custom_descriptors.pkl"))


# Display a preview of both DataFrames
display(select_properties_df.head(3))
display(yields_df.head(3))
display(custom_descriptor_df.head(3))

Unnamed: 0,Compound_Name,primary,secondary,tertiary,N1_1,N1_2,N1_3,N2_1,N2_2,N2_3,...,Sterimol_L_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_L_C1_C2(Amgstrom)_morfeus_Boltz_openshell,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz_openshell,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_openshell,pyramidalization_Gavrish_C1(deg)_Boltz,pyramidalization_Agranat-Radhakrishnan_C1_Boltz
0,Het036,1,0,0,1,0,0,0,0,0,...,8.900579,8.783913,1.966938,1.736366,1.735037,5.920017,5.885243,5.946735,5.895945,0.798319
1,Het037,1,0,0,1,0,0,0,0,0,...,8.871687,8.774516,1.974611,1.755033,1.752779,6.797259,6.771417,6.903904,5.935005,0.80233
2,Het038,1,0,0,1,0,0,0,0,0,...,8.947646,8.762297,1.979748,1.84511,1.838265,6.361003,6.182154,6.404913,5.882007,0.796915


Unnamed: 0,SMILES,Name,screening_id,id,HTE,Heteroaromatic_scaffold,Informer_Library,Britton,Maity Stahl 2AP,Maity Stahl 4AP,DeLuca,Golden Stahl,Newkome,Li Xu,Schreiner
0,Cc1ccc2cccc(Br)c2n1,8-bromo-2-methyl-quinoline,INF36,Het036,Selected,Quinoline,Fused,11.064759,16.616248,18.157002,20.925286,27.389265,9.284346,12.02,23.47134
1,Cc1ccc2ccc(F)c(B(O)O)c2n1,(7-fluoro-2-methyl-8-quinolyl)boronic acid,INF37,Het037,Selected,Quinoline,Fused,20.914884,6.897721,31.601265,70.746624,29.475752,0.6977,13.34,24.785947
2,Cc1cc(Cl)c2ccc(I)cc2n1,4-chloro-7-iodo-2-methyl-quinoline,INF38,Het038,Selected,Quinoline,Fused,39.067826,78.611392,28.557671,32.555724,31.541641,21.0706,47.37,56.752826


Unnamed: 0,Compound_Name,primary,secondary,tertiary,N1_1,N1_2,N1_3,N2_1,N2_2,N2_3,phenyl,biphenyl,ring_size
0,Het036,1,0,0,1,0,0,0,0,0,0,1,6
1,Het037,1,0,0,1,0,0,0,0,0,0,1,6
2,Het038,1,0,0,1,0,0,0,0,0,0,1,6


In [3]:
# Prepare data for the grid plot
# Extract yield columns (ignoring the first few metadata columns like id, SMILES, and captions)
yield_columns = yields_df.select_dtypes(include=[np.number]).columns
yield_data_df = yields_df[yield_columns]
display(yield_data_df.head(3))
# dump the yield dataframes to pickle files
yield_data_df.to_pickle(os.path.join(data_folder, "yield_data_df.pkl"))

# Same for the select properties
select_properties_data_columns = select_properties_df.select_dtypes(include=[np.number]).columns
select_properties_data_df = select_properties_df[select_properties_data_columns]
display(select_properties_data_df.head(3))
select_properties_data_df.to_pickle(os.path.join(data_folder, "select_properties_data_df.pkl"))

Unnamed: 0,Britton,Maity Stahl 2AP,Maity Stahl 4AP,DeLuca,Golden Stahl,Newkome,Li Xu,Schreiner
0,11.064759,16.616248,18.157002,20.925286,27.389265,9.284346,12.02,23.47134
1,20.914884,6.897721,31.601265,70.746624,29.475752,0.6977,13.34,24.785947
2,39.067826,78.611392,28.557671,32.555724,31.541641,21.0706,47.37,56.752826


Unnamed: 0,primary,secondary,tertiary,N1_1,N1_2,N1_3,N2_1,N2_2,N2_3,phenyl,...,Sterimol_L_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_L_C1_C2(Amgstrom)_morfeus_Boltz_openshell,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz_openshell,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_openshell,pyramidalization_Gavrish_C1(deg)_Boltz,pyramidalization_Agranat-Radhakrishnan_C1_Boltz
0,1,0,0,1,0,0,0,0,0,0,...,8.900579,8.783913,1.966938,1.736366,1.735037,5.920017,5.885243,5.946735,5.895945,0.798319
1,1,0,0,1,0,0,0,0,0,0,...,8.871687,8.774516,1.974611,1.755033,1.752779,6.797259,6.771417,6.903904,5.935005,0.80233
2,1,0,0,1,0,0,0,0,0,0,...,8.947646,8.762297,1.979748,1.84511,1.838265,6.361003,6.182154,6.404913,5.882007,0.796915


### Remove highly correlated features

In [4]:
# remove highly correlated columns from the properties dataframe
corr_matrix = select_properties_data_df.corr().abs()

In [5]:
# plot the correlation matrix
fig = px.imshow(
    corr_matrix.abs().values,
    labels=dict(x="Property", y="Property", color="Correlation"),
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    color_continuous_scale="Viridis",
    title="Correlation Matrix of Select Properties",
)

fig.update_layout(
    height=len(select_properties_data_columns) * 30,
    width=len(select_properties_data_columns) * 30,
    xaxis=dict(tickangle=30),
    template="plotly",
    font=dict(size=12, family="Arial", color="black", weight="bold"),
)

fig.update_coloraxes(
    colorbar_len=0.8,
    colorbar_title="Correlation",
)

fig.show()
# save to html and open in browser
fig.write_html("temp.html", auto_open=False)

In [6]:
# plot the correlation matrix, only plot from the first column to "SASA_surface_area(amstrong^2)_Boltz"
# get the index of the column "SASA_surface_area(amstrong^2)_Boltz"
column_cutoff_index = select_properties_data_df.columns.get_loc(
    "SASA_sphericity_Boltz_openshell"
)
fig = px.imshow(
    corr_matrix.iloc[:column_cutoff_index, :column_cutoff_index].abs().values,
    labels=dict(x="Property", y="Property", color="Correlation"),
    x=corr_matrix.columns[:column_cutoff_index],
    y=corr_matrix.columns[:column_cutoff_index],
    color_continuous_scale="Viridis",
    title="Correlation Matrix of Select Properties",
)

fig.update_layout(
    height=column_cutoff_index * 25,
    width=column_cutoff_index * 25,
    xaxis=dict(tickangle=30),
    # increase font size
    font=dict(size=11, family="Arial", color="black", weight="bold"),
)

fig.update_coloraxes(
    colorbar_len=0.7,
    colorbar_title="Correlation",
    colorbar_title_font_size=9,
    colorbar_tickfont_size=9,
)

fig.show()
# save to html and open in browser
fig.write_html("temp.html", auto_open=False)

In [7]:
# plot the correlation matrix, only starting from the "SASA_surface_area(amstrong^2)_Boltz" column
# get the index of the column "SASA_surface_area(amstrong^2)_Boltz"
column_cutoff_index = select_properties_data_df.columns.get_loc(
    "SASA_sphericity_Boltz_openshell"
)
fig = px.imshow(
    corr_matrix.iloc[column_cutoff_index:, column_cutoff_index:].abs().values,
    labels=dict(x="Property", y="Property", color="Correlation"),
    x=corr_matrix.columns[column_cutoff_index:],
    y=corr_matrix.columns[column_cutoff_index:],
    color_continuous_scale="Viridis",
    title="Correlation Matrix of Select Properties",
)

fig.update_layout(
    height=column_cutoff_index * 25,
    width=column_cutoff_index * 25,
    xaxis=dict(tickangle=30),
    # increase font size
    font=dict(size=11, family="Arial", color="black", weight="bold"),
)

fig.update_coloraxes(
    colorbar_len=0.6,
    colorbar_title="Correlation",
    colorbar_title_font_size=10,
    colorbar_tickfont_size=10,
)

fig.show()
# save to html and open in browser
fig.write_html("temp.html", auto_open=False)

In [8]:
custom_descriptor_df["ring_size"].value_counts()

ring_size
6    247
5      5
Name: count, dtype: int64

In [9]:
# first remove data with zero variance
# remove columns with zero variance
def remove_zero_variance_columns(df):
    columns_to_remove = []
    for column in df.columns:
        if df[column].var() == 0:
            columns_to_remove.append(column)
    print(f"Removing {len(columns_to_remove)} columns with 0 variance")
    df.drop(columns=columns_to_remove, inplace=True)


remove_zero_variance_columns(select_properties_data_df)

Removing 0 columns with 0 variance




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [10]:
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper

Unnamed: 0,primary,secondary,tertiary,N1_1,N1_2,N1_3,N2_1,N2_2,N2_3,phenyl,...,Sterimol_L_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_L_C1_C2(Amgstrom)_morfeus_Boltz_openshell,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_B1_C1_C2(Amgstrom)_morfeus_Boltz_openshell,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_anion,Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_openshell,pyramidalization_Gavrish_C1(deg)_Boltz,pyramidalization_Agranat-Radhakrishnan_C1_Boltz
primary,,0.806226,0.464238,0.053452,0.019612,0.050800,0.038569,0.047188,0.047528,0.142002,...,0.170737,0.166774,0.253003,0.180702,0.221252,0.059465,0.023140,0.013655,0.369680,0.358530
secondary,,,0.149712,0.013260,0.012163,0.034655,0.053421,0.030240,0.034191,0.113228,...,0.229372,0.224697,0.081025,0.059039,0.094351,0.080106,0.065649,0.017562,0.016874,0.026311
tertiary,,,,0.069481,0.050985,0.033017,0.015518,0.124136,0.028242,0.067806,...,0.058053,0.057677,0.544137,0.213610,0.228514,0.020547,0.059611,0.049114,0.643094,0.638586
N1_1,,,,,0.733799,0.475191,0.171802,0.027325,0.149887,0.190533,...,0.131795,0.135234,0.141402,0.046043,0.019001,0.014803,0.010206,0.022732,0.000299,0.018227
N1_2,,,,,,0.249068,0.126068,0.081746,0.076432,0.173914,...,0.043246,0.045382,0.121317,0.043195,0.001183,0.100733,0.082850,0.108662,0.016021,0.002476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz,,,,,,,,,,,...,,,,,,,0.952445,0.952478,0.054935,0.050398
Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_anion,,,,,,,,,,,...,,,,,,,,0.985590,0.055207,0.050915
Sterimol_B5_C1_C2(Amgstrom)_morfeus_Boltz_openshell,,,,,,,,,,,...,,,,,,,,,0.073628,0.070451
pyramidalization_Gavrish_C1(deg)_Boltz,,,,,,,,,,,...,,,,,,,,,,0.999201


In [11]:
# Find index of feature columns with correlation greater than the cutoff
corr_cutoff = 0.95
to_drop = [column for column in upper.columns if any(upper[column].abs() > corr_cutoff)]
# display the columns to drop
print(f"Columns to drop: {to_drop}")
# Drop the highly correlated features
select_properties_data_removed_highlycorr_df = select_properties_data_df.drop(columns=to_drop)
print(f"Number of columns to drop: {len(to_drop)}, remaining columns: {len(select_properties_data_removed_highlycorr_df.columns)}")
# dump the yield dataframes to pickle files
select_properties_data_removed_highlycorr_df.to_pickle(os.path.join(data_folder, "select_properties_data_removed_highlycorr_df.pkl"))

Columns to drop: ['biphenyl', 'BDFE (kcal/mol)', 'E_spc (Hartree)_anion', 'E_spc (Hartree)_openshell', 'H_spc(Hartree)', 'H_spc(Hartree)_anion', 'H_spc(Hartree)_openshell', 'qh_G(T)_spc(Hartree)', 'qh_G(T)_spc(Hartree)_anion', 'qh_G(T)_spc(Hartree)_openshell', 'LUMO_Boltz_openshell', 'omega_Boltz', 'omega_Boltz_anion', 'omega_Boltz_openshell', 'polar_iso(Debye)_Boltz_anion', 'polar_iso(Debye)_Boltz_openshell', 'SASA_surface_area(Amgstrom^3)_Boltz', 'SASA_surface_area(Amgstrom^3)_Boltz_anion', 'SASA_surface_area(Amgstrom^3)_Boltz_openshell', 'SASA_volume(Amgstrom^3)_Boltz', 'SASA_volume(Amgstrom^3)_Boltz_anion', 'SASA_volume(Amgstrom^3)_Boltz_openshell', 'SASA_sphericity_Boltz', 'SASA_sphericity_Boltz_anion', 'SASA_sphericity_Boltz_openshell', 'NBO_charge_C2_Boltz_anion', '%Vbur_C1_2.0Amgstrom_Boltz', '%Vbur_C1_2.0Amgstrom_Boltz_openshell', '%Vbur_C2_2.0Amgstrom_Boltz_openshell', 'Sterimol_L_C1_C2(Amgstrom)_morfeus_Boltz_anion', 'Sterimol_L_C1_C2(Amgstrom)_morfeus_Boltz_openshell', 'Ste