In [1]:
import os
import pandas as pd
import numpy as np
from tabulate import tabulate

In [2]:
# load all xls files in this directory, read the Summary_Properties sheet, and concatenate them into a single dataframe, skip those that don't have the sheet
# list all files in the current directory
files = [f for f in os.listdir() if f.endswith('.xls') or f.endswith('.xlsx')]
files.remove('smiles.xlsx')

print(f"Found {len(files)} files")
files

Found 4 files


['pyrdz_properties_postprocessed_for_pyrdz1_to_pyrdz1.xlsx',
 'pyrd_properties_postprocessed_for_pyrd1_to_pyrd13.xlsx',
 'pyrmd_properties_postprocessed_for_pyrmd1_to_pyrmd7.xlsx',
 'pyrz_properties_postprocessed_for_pyrz1_to_pyrz3.xlsx']

In [3]:
files = ['pyrd_properties_postprocessed_for_pyrd1_to_pyrd13.xlsx']

In [4]:
# read the smiles.xlsx file, keep the first two columns
smiles = pd.read_excel('smiles.xlsx')
smiles.rename(columns={'SMILES': 'smiles'}, inplace=True)
smiles = smiles.iloc[:, :2]
smiles

Unnamed: 0,smiles,id
0,BrC1=C2C(OCCC2)=CN=C1,pyrd1
1,ClC1=NC=CC=C1C,pyrd2
2,BrC1=CC(C)=NC=C1,pyrd3
3,O=C(NC1=NC=CC(C2CC2)=C1)C3=CC=C(B4OC(C)(C)C(C)...,pyrd4
4,ClC1=NC(C(F)(F)F)=CC(C)=C1,pyrd5
5,BrC1=CC(C)=CC(NC(OC(C)(C)C)=O)=N1,pyrd6
6,FC(F)(C1=CC(C(F)(F)F)=CC(C2=CN=CC(C)=C2)=C1)F,pyrd7
7,BrC1=NC(C2=CN=CS2)=CC(C)=C1,pyrd8
8,CC1=NC(Cl)=CC2=C1C=NN2,pyrd9
9,BrCCCC1=CN=CC=C1,pyrd10


In [5]:
dict = {}

# combine columns that have the same name
for file in files:
    temp = pd.read_excel(file, sheet_name="Summary_Properties", header=0)
    temp.rename(columns={'Compound_Name': 'id'}, inplace=True)
    # based on the Compound_Name, append a new column with the respective smiles
    temp = temp.merge(smiles, on='id', how='left')
    display(temp)
    # add it to the dictionary
    dict[file] = temp

Unnamed: 0,id,HOMO_Boltz,HOMO_Boltz_stdev,HOMO_min,HOMO_max,HOMO_range,HOMO_low_E,HOMO_Vbur_min,LUMO_Boltz,LUMO_Boltz_stdev,...,NBO_LP_occupancy_N1_low_E,NBO_LP_occupancy_N1_Vbur_min,NBO_LP_energy_N1_Boltz,NBO_LP_energy_N1_Boltz_stdev,NBO_LP_energy_N1_min,NBO_LP_energy_N1_max,NBO_LP_energy_N1_range,NBO_LP_energy_N1_low_E,NBO_LP_energy_N1_Vbur_min,smiles
0,pyrd1,-0.29088,0.0,-0.29088,-0.29088,0.0,-0.29088,-0.29088,-0.00553,0.0,...,1.90959,1.90959,-0.38489,0.0,-0.38489,-0.38489,0.0,-0.38489,-0.38489,BrC1=C2C(OCCC2)=CN=C1
1,pyrd2,-0.31,0.0,-0.31,-0.31,0.0,-0.31,-0.31,-0.00748,0.0,...,1.89792,1.89792,-0.38716,0.0,-0.38716,-0.38716,0.0,-0.38716,-0.38716,ClC1=NC=CC=C1C
2,pyrd3,-0.32162,0.0,-0.32162,-0.32162,0.0,-0.32162,-0.32162,-0.01138,0.0,...,1.91612,1.91612,-0.38423,0.0,-0.38423,-0.38423,0.0,-0.38423,-0.38423,BrC1=CC(C)=NC=C1
3,pyrd4,-0.28368,5.2e-05,-0.28569,-0.28364,0.00205,-0.28364,-0.28364,-0.032606,2.2e-05,...,1.91279,1.91279,-0.383852,8.7e-05,-0.38392,-0.36897,0.01495,-0.38391,-0.38391,O=C(NC1=NC=CC(C2CC2)=C1)C3=CC=C(B4OC(C)(C)C(C)...
4,pyrd5,-0.33403,0.0,-0.33403,-0.33403,0.0,-0.33403,-0.33403,-0.03023,0.0,...,1.8943,1.8943,-0.40792,0.0,-0.40792,-0.40792,0.0,-0.40792,-0.40792,ClC1=NC(C(F)(F)F)=CC(C)=C1
5,pyrd6,-0.288716,0.000161,-0.29521,-0.2887,0.00651,-0.2887,-0.29521,-0.004725,0.000447,...,1.89384,1.88823,-0.392468,0.000175,-0.39362,-0.38993,0.00369,-0.39245,-0.38993,BrC1=CC(C)=CC(NC(OC(C)(C)C)=O)=N1
6,pyrd7,-0.310806,0.00017,-0.31093,-0.31069,0.00024,-0.31069,-0.31069,-0.043408,8.5e-05,...,1.91468,1.91468,-0.390319,5.7e-05,-0.39036,-0.39028,8e-05,-0.39028,-0.39028,FC(F)(C1=CC(C(F)(F)F)=CC(C2=CN=CC(C)=C2)=C1)F
7,pyrd8,-0.28815,0.000481,-0.28849,-0.28781,0.00068,-0.28849,-0.28849,-0.04343,0.00024,...,1.89506,1.89506,-0.394048,0.002008,-0.39547,-0.39263,0.00284,-0.39263,-0.39263,BrC1=NC(C2=CN=CS2)=CC(C)=C1
8,pyrd9,-0.2928,0.0,-0.2928,-0.2928,0.0,-0.2928,-0.2928,-0.02545,0.0,...,1.89837,1.89837,-0.38438,0.0,-0.38438,-0.38438,0.0,-0.38438,-0.38438,CC1=NC(Cl)=CC2=C1C=NN2
9,pyrd10,-0.312966,0.001518,-0.31572,-0.30007,0.01565,-0.3121,-0.31568,-0.003239,0.002187,...,1.91714,1.91761,-0.380174,0.001563,-0.38336,-0.3727,0.01066,-0.37962,-0.38014,BrCCCC1=CN=CC=C1


In [6]:
# drop columns that are not present in all dataframes
columns = set.intersection(*[set(d.columns) for d in dict.values()])
print(f"Columns: {columns}, number of columns: {len(columns)}")
for key in dict.keys():
    dict[key] = dict[key][list(columns)]
    
# concatenate all dataframes
df = pd.concat(dict.values(), ignore_index=True)
df

Columns: {'Sterimol_B1_N1_C1(Å)_dbstep_Vbur_min', 'pyramidalization_Agranat-Radhakrishnan_C2_Vbur_min', 'NMR_shift_N1_low_E', 'η_range', 'μ_low_E', 'pyramidalization_Gavrish_C3(°)_low_E', '%Vbur_C3_3.5Å_max', '%Vbur_C5_4.0Å_min', 'Buried_Sterimol_B1_N1_C5_5.0(Å)_Vbur_min', '%Vbur_C3_3.0Å_max', '%Vbur_C4_3.5Å_Vbur_min', 'μ_max', 'NBO_charge_C2_Boltz_stdev', 'NMR_shift_N1_Vbur_min', 'NBO_LP_occupancy_N1_min', 'μ_Boltz_stdev', '%Vbur_C1_2.5Å_Boltz_stdev', 'Sterimol_B5_N1_C5(Å)_dbstep_max', 'Buried_Sterimol_L_N1_C1_5.0(Å)_Boltz', 'angle_C5_N1_C1(°)_low_E', 'Sterimol_B1_N1_C5(Å)_morfeus_range', 'dihedral_C4_C5_N1_C1(°)_max', '%Vbur_C1_2.0Å_Boltz', 'SASA_sphericity_Vbur_min', 'distance_N1_C5(Å)_Boltz_stdev', 'NBO_LP_energy_N1_Boltz_stdev', '%Vbur_C1_2.0Å_min', '%Vbur_C1_3.5Å_min', 'Sterimol_Bmax_N1_C5_0.0Å(Å)_Boltz_stdev', 'pyramidalization_Gavrish_C4(°)_Vbur_min', 'NMR_shift_C2_range', '%Vbur_N1_3.5Å_Boltz', 'polar_aniso(Debye)_Boltz', 'Sterimol_Bmin_N1_C1_1.0Å(Å)_Boltz_stdev', 'Buried_Ster

Unnamed: 0,Sterimol_B1_N1_C1(Å)_dbstep_Vbur_min,pyramidalization_Agranat-Radhakrishnan_C2_Vbur_min,NMR_shift_N1_low_E,η_range,μ_low_E,pyramidalization_Gavrish_C3(°)_low_E,%Vbur_C3_3.5Å_max,%Vbur_C5_4.0Å_min,Buried_Sterimol_B1_N1_C5_5.0(Å)_Vbur_min,%Vbur_C3_3.0Å_max,...,%Vbur_C5_3.5Å_low_E,Buried_Sterimol_B1_N1_C1_5.0(Å)_range,Sterimol_L_N1_C5(Å)_morfeus_Boltz,NBO_charge_C4_low_E,NBO_LP_energy_N1_max,NBO_charge_C4_Boltz,pyramidalization_Agranat-Radhakrishnan_C2_min,%Vbur_C4_4.0Å_Boltz,pyramidalization_Agranat-Radhakrishnan_C4_max,%Vbur_C5_4.0Å_range
0,1.6787,0.01262264,-152.6619,0.0,-0.148205,0.070941,71.657193,41.624511,1.7,81.114981,...,50.091432,0.0,6.231718,-0.09579,-0.38489,-0.09579,0.01262264,52.881572,0.0008571091,0.0
1,1.902733,2.21733e-11,-136.0998,0.0,-0.15874,0.0,52.435473,35.784557,1.7,63.720761,...,45.218156,0.0,4.676141,-0.25681,-0.38716,-0.25681,2.21733e-11,36.456846,8.160089e-11,0.0
2,1.919161,6.620001e-07,-142.7935,0.0,-0.1665,2.8e-05,60.471604,37.010816,1.7,74.493445,...,45.674734,0.0,4.70391,-0.29592,-0.38423,-0.29592,6.620001e-07,41.864642,3.086916e-07,0.0
3,2.810193,0.005380573,-87.0908,0.0058,-0.158135,0.009002,66.609206,36.890944,1.7,78.56358,...,45.640957,0.641758,5.154963,-0.28309,-0.36897,-0.288895,0.00481769,44.947271,0.001947814,3.157005
4,2.300057,7.990694e-06,-119.6215,0.0,-0.18213,0.000512,60.095392,52.661613,2.345237,74.61729,...,65.056606,0.0,5.190009,-0.23877,-0.40792,-0.23877,7.990694e-06,51.486174,2.843578e-05,0.0
5,2.045036,0.01617399,-83.5707,0.00191,-0.14669,0.102502,60.441903,47.222395,1.994585,74.787694,...,60.351635,0.265141,7.693055,-0.30027,-0.38993,-0.300469,0.001145209,51.728743,0.003469078,7.197722
6,1.6787,0.005864158,-157.3613,0.00012,-0.17702,0.130686,60.906634,44.884706,1.7,69.782665,...,53.392308,0.0,9.633156,-0.09367,-0.39028,-0.093728,0.005767447,54.903879,0.00440898,0.114828
7,1.818733,0.0004243139,-115.3763,0.00102,-0.165875,0.023674,60.212449,48.427317,1.873574,74.678747,...,59.64289,0.039266,5.245081,-0.28553,-0.39263,-0.263249,0.0004243139,49.761201,0.001882305,6.4397
8,1.858054,5.359846e-06,-106.6853,0.0,-0.159125,0.000182,60.618943,44.781128,1.863661,72.997058,...,57.343692,0.0,4.744074,-0.31004,-0.38438,-0.31004,5.359846e-06,45.30639,8.777287e-06,0.0
9,1.95,2.126168e-12,-155.3219,0.00982,-0.1571,0.022973,61.665463,34.717351,1.7,71.193385,...,44.899599,0.278176,7.451562,-0.25013,-0.3727,-0.094883,2.126168e-12,50.550643,0.0284504,16.506191


In [7]:
# drop column that match one capital letter followed by a number in any place
df = df[df.columns.drop(list(df.filter(regex='[C][2-9]')))]
df

Unnamed: 0,Sterimol_B1_N1_C1(Å)_dbstep_Vbur_min,NMR_shift_N1_low_E,η_range,μ_low_E,μ_max,NMR_shift_N1_Vbur_min,NBO_LP_occupancy_N1_min,μ_Boltz_stdev,%Vbur_C1_2.5Å_Boltz_stdev,Buried_Sterimol_L_N1_C1_5.0(Å)_Boltz,...,NBO_charge_C1_Boltz,Sterimol_B1_N1_C1(Å)_dbstep_range,SASA_volume(Å³)_Boltz_stdev,SASA_volume(Å³)_low_E,distance_N1_C1(Å)_max,NMR_shift_C1_Boltz,Buried_Sterimol_L_N1_C1_5.0(Å)_range,Sterimol_Bmax_N1_C1_0.0Å(Å)_range,Buried_Sterimol_B1_N1_C1_5.0(Å)_range,NBO_LP_energy_N1_max
0,1.6787,-152.6619,0.0,-0.148205,-0.148205,-152.6619,1.90959,0.0,0.0,6.315951,...,-0.00773,0.0,0.0,481.283655,1.33171,24.3521,0.0,0.0,0.0,-0.38489
1,1.902733,-136.0998,0.0,-0.15874,-0.15874,-136.0998,1.89792,0.0,0.0,5.74044,...,0.20474,0.0,0.0,389.546728,1.31767,3.0323,0.0,0.0,0.0,-0.38716
2,1.919161,-142.7935,0.0,-0.1665,-0.1665,-142.7935,1.91612,0.0,0.0,4.677361,...,0.22601,0.0,0.0,406.872695,1.34382,-2.658,0.0,0.0,0.0,-0.38423
3,2.810193,-87.0908,0.0058,-0.158135,-0.15712,-87.0908,1.90871,1.7e-05,0.04233,6.053364,...,0.364875,0.165456,0.300843,1050.322955,1.34199,10.473907,0.632022,1.088059,0.641758,-0.36897
4,2.300057,-119.6215,0.0,-0.18213,-0.18213,-119.6215,1.8943,0.0,0.0,4.847387,...,0.20832,0.0,0.0,493.760497,1.31594,2.3581,0.0,0.0,0.0,-0.40792
5,2.045036,-83.5707,0.00191,-0.14669,-0.14669,-97.5078,1.88823,0.000304,0.148577,4.799928,...,0.184721,0.748284,0.838528,712.496262,1.34529,4.844362,2.918696,1.412308,0.265141,-0.38993
6,1.6787,-157.3613,0.00012,-0.17702,-0.17702,-157.3613,1.91467,0.000127,0.026457,5.730348,...,0.03601,0.0,0.217,743.328356,1.33775,9.704503,4.1e-05,0.208993,0.0,-0.39028
7,1.818733,-115.3763,0.00102,-0.165875,-0.165705,-115.3763,1.89323,0.00012,0.805727,5.337227,...,0.178518,0.025064,0.004898,576.026799,1.353,6.168314,1.131644,0.258775,0.039266,-0.39263
8,1.858054,-106.6853,0.0,-0.159125,-0.159125,-106.6853,1.89837,0.0,0.0,5.803871,...,0.28641,0.0,0.0,464.536762,1.33291,7.0581,0.0,0.0,0.0,-0.38438
9,1.95,-155.3219,0.00982,-0.1571,-0.148095,-155.5752,1.91605,0.00181,1.131357,5.012735,...,0.024168,0.3,5.836909,504.392277,1.34032,12.192396,3.21399,4.470544,0.278176,-0.3727


In [8]:
# move coloumn "id" and "SMILES" to first position, order the rest of the columns alphabetically
cols = list(df.columns)
cols.remove("id")
cols.remove("smiles")
cols.sort()
cols.insert(0, "id")
cols.insert(1, "smiles")
df = df[cols]

# remove columns that contain any cell with value = 0
columns_with_zeros = df.columns[(df == 0).any()]
df = df.drop(columns=columns_with_zeros)
display(df)

Unnamed: 0,id,smiles,%Vbur_C1_2.0Å_Boltz,%Vbur_C1_2.0Å_Vbur_min,%Vbur_C1_2.0Å_low_E,%Vbur_C1_2.0Å_max,%Vbur_C1_2.0Å_min,%Vbur_C1_2.5Å_Boltz,%Vbur_C1_2.5Å_Vbur_min,%Vbur_C1_2.5Å_low_E,...,μ_Boltz,μ_Vbur_min,μ_low_E,μ_max,μ_min,ω_Boltz,ω_Vbur_min,ω_low_E,ω_max,ω_min
0,pyrd1,BrC1=C2C(OCCC2)=CN=C1,90.705708,90.705708,90.705708,90.705708,90.705708,74.778617,74.778617,74.778617,...,-0.148205,-0.148205,-0.148205,-0.148205,-0.148205,0.03849,0.03849,0.03849,0.03849,0.03849
1,pyrd2,ClC1=NC=CC=C1C,95.180139,95.180139,95.180139,95.180139,95.180139,85.671962,85.671962,85.671962,...,-0.15874,-0.15874,-0.15874,-0.15874,-0.15874,0.04165,0.04165,0.04165,0.04165,0.04165
2,pyrd3,BrC1=CC(C)=NC=C1,95.716038,95.716038,95.716038,95.716038,95.716038,85.74847,85.74847,85.74847,...,-0.1665,-0.1665,-0.1665,-0.1665,-0.1665,0.04468,0.04468,0.04468,0.04468,0.04468
3,pyrd4,O=C(NC1=NC=CC(C2CC2)=C1)C3=CC=C(B4OC(C)(C)C(C)...,94.741043,94.731405,94.731405,94.976756,94.731405,84.651442,84.703412,84.703412,...,-0.158143,-0.158135,-0.158135,-0.15712,-0.158165,0.049803,0.04981,0.04981,0.04981,0.04817
4,pyrd5,ClC1=NC(C(F)(F)F)=CC(C)=C1,95.276989,95.276989,95.276989,95.276989,95.276989,85.032231,85.032231,85.032231,...,-0.18213,-0.18213,-0.18213,-0.18213,-0.18213,0.05459,0.05459,0.05459,0.05459,0.05459
5,pyrd6,BrC1=CC(C)=CC(NC(OC(C)(C)C)=O)=N1,95.147784,94.957386,95.154313,95.154313,94.724948,85.525035,85.614989,85.540109,...,-0.146721,-0.153625,-0.14669,-0.14669,-0.153625,0.0379,0.04167,0.03788,0.04167,0.03788
6,pyrd7,FC(F)(C1=CC(C(F)(F)F)=CC(C2=CN=CC(C)=C2)=C1)F,90.977815,91.002712,91.002712,91.002712,90.951059,75.694937,75.712983,75.712983,...,-0.177107,-0.17702,-0.17702,-0.17702,-0.1772,0.058653,0.05861,0.05861,0.0587,0.05861
7,pyrd8,BrC1=NC(C2=CN=CS2)=CC(C)=C1,95.543657,95.858084,95.858084,95.858084,95.228564,86.068125,86.637257,86.637257,...,-0.16579,-0.165875,-0.165875,-0.165705,-0.165875,0.05616,0.0561,0.0561,0.05622,0.0561
8,pyrd9,CC1=NC(Cl)=CC2=C1C=NN2,95.699897,95.699897,95.699897,95.699897,95.699897,85.989387,85.989387,85.989387,...,-0.159125,-0.159125,-0.159125,-0.159125,-0.159125,0.04736,0.04736,0.04736,0.04736,0.04736
9,pyrd10,BrCCCC1=CN=CC=C1,90.734536,91.160899,90.902634,92.252066,90.625,74.209673,77.646829,76.258302,...,-0.158102,-0.160415,-0.1571,-0.148095,-0.162225,0.040361,0.04143,0.03981,0.04286,0.03604


In [9]:
# export to csv
df.to_csv("combined.csv", index=False)