In [13]:
import pandas as pd
import numpy as np

In [14]:
# Load the merged dataset from disk into `df`.
merged_csv_path = r"C:\Users\nandi\Documents\ML_CO2_ER\data\df_merged.csv"
df = pd.read_csv(merged_csv_path)

In [15]:
# --- Start with original features ---
# The identifier columns and the target are split off so that only candidate
# features are filtered below.
meta_cols = ['solvent_ID', 'smiles']
target = 'Binding_Energy_eV'
X = df.drop(columns=meta_cols + [target])
y = df[target]

# Step 1.1: Remove constant or quasi-constant features
constant_thresh = 0.98  # 98% of rows have same value
quasi_constant = []
for col in X.columns:
    # value_counts() ignores NaN and sorts descending, so the first entry is
    # the share of the most frequent non-null value.
    value_shares = X[col].value_counts(normalize=True)
    # A column with no non-null values yields an empty result; it carries no
    # information, so it is dropped instead of raising IndexError.
    if value_shares.empty or value_shares.iloc[0] > constant_thresh:
        quasi_constant.append(col)
X_filtered = X.drop(columns=quasi_constant)

print(f"Removed {len(quasi_constant)} quasi-constant features")

# Step 1.2: Remove highly correlated features
corr_thresh = 0.95  # absolute Pearson correlation above which a feature is dropped
# Correlation is only defined for numeric data; restricting the columns here
# avoids the error pandas >= 2.0 raises when corr() meets a non-numeric column.
# Non-numeric columns are left untouched in X_filtered.
numeric_features = X_filtered.select_dtypes(include=["number", "bool"])
corr_matrix = numeric_features.corr().abs()
# Keep only the strict upper triangle so each pair is inspected once and the
# diagonal (self-correlation) is ignored.
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# A column is dropped when it correlates above the threshold with any column
# that precedes it; NaN entries compare as False and never trigger a drop.
exceeds_thresh = (upper_tri > corr_thresh).any(axis=0)
high_corr = [column for column, flagged in exceeds_thresh.items() if flagged]
X_filtered = X_filtered.drop(columns=high_corr)

print(f"Removed {len(high_corr)} highly correlated features")

# Output the shape after filtering
print("Remaining features:", X_filtered.shape[1])


Removed 355 quasi-constant features
Removed 564 highly correlated features
Remaining features: 498


In [16]:
# Re-attach the identifier columns and the target to the filtered feature
# table; pandas aligns the assigned Series on the row index.
for restored_col in ("solvent_ID", "smiles", "Binding_Energy_eV"):
    X_filtered[restored_col] = df[restored_col].copy()

In [17]:
# Write the filtered table to disk without the row index.
filtered_csv_path = r"C:\Users\nandi\Documents\ML_CO2_ER\data\df_filtered.csv"
X_filtered.to_csv(filtered_csv_path, index=False)

In [18]:
# Preview the first five rows of the filtered table.
X_filtered.head(n=5)

Unnamed: 0,solubility_CO2 [mol/L],viscosity [Pa s],nBase,SpAbs_A,SpMax_A,SpDiam_A,SpMAD_A,VE1_A,VE2_A,VE3_A,...,homo_lumo_gap_eV,min_partial_charge,max_partial_charge,min_abs_partial_charge,max_abs_partial_charge,vib_freq_min_cm1,vib_freq_max_cm1,solvent_ID,smiles,Binding_Energy_eV
0,0.097848,0.011764,0.0,12.906117,2.122425,4.24485,1.173283,2.894598,0.263145,1.158157,...,181.930993,-0.832151,0.392923,0.034817,0.832151,23.0414,3627.7857,zinc_5593217,COCCNC(=O)[C@@H](CC)C,-0.246414
1,0.117575,0.007448,1.0,17.3437,2.348592,4.557405,1.334131,3.134592,0.241122,1.404863,...,161.701629,-1.140739,0.562172,0.001836,1.140739,28.8558,3225.798,zinc_400210175,CO[C@H](C1CC1)CN1CCCCC1,-0.239933
2,0.097619,0.002281,1.0,7.727407,1.931852,3.863703,1.103915,2.48138,0.354483,0.55214,...,168.343554,-0.802413,0.379028,0.149089,0.802413,52.144,3517.7383,zinc_5696920,NOCCN(C)C,-0.219656
3,0.088216,0.025603,1.0,14.943352,2.294945,4.451333,1.149489,3.070361,0.236182,1.384159,...,171.357226,-0.794729,0.647513,0.076133,0.794729,26.9357,3729.397,zinc_685002938,C[C@@H](CN(CC1CC1)CC(F)F)O,-0.325154
4,0.131763,0.00111,1.0,10.199109,2.119166,4.238332,1.133234,2.712314,0.301368,0.892442,...,167.803019,-0.847175,0.397601,0.172656,0.847175,62.569,3522.0361,zinc_52602135,CNC[C@@H](C(C)C)OC,-0.288984
