In [None]:
import pandas as pd
import numpy as np
import os
from src.config import CONFIG
from src.pipeline import run_pipeline


#file paths
TRAIN_CSV = "EGFR_train_scaffold.csv"
TEST_CSV  = "EGFR_test_scaffold.csv"

# Load
df_train = pd.read_csv(TRAIN_CSV)
df_test  = pd.read_csv(TEST_CSV)

print(f"Loaded: train={df_train.shape}, test={df_test.shape}")

#Label column
# Expect a binary column named 'y' already (1=active, 0=inactive)
assert "y" in df_train.columns and "y" in df_test.columns, \
    "Missing 'y' column in train/test."


Loaded: train=(6342, 2072), test=(1278, 2072)


In [10]:
df_test.columns[:25]

Index(['molecule_chembl_id', 'canonical_smiles', 'standard_value', 'class',
       'standardized_smiles', 'MolWt', 'LogP', 'TPSA', 'HBD', 'HBA', 'RotB',
       'RingCount', 'AromaticRingCount', 'FractionCSP3', 'HeavyAtomCount',
       'HeteroAtomCount', 'HasPyridine', 'HasPyrimidine', 'MurckoScaffold',
       'MW_norm', 'TPSA_norm', 'LogP_norm', 'SP3_norm', 'ECFP_0', 'ECFP_1'],
      dtype='object')

In [12]:
#
# Choose feature columns 
# Drop obvious non-features if present; keep everything else numeric.
non_features = {
    "y", "class", "MurckoScaffold",'standard_value'
    "canonical_smiles", "standardized_smiles",
    "molecule_chembl_id", 
}
train_feat_cols = [c for c in df_train.columns if c not in non_features]
test_feat_cols  = [c for c in df_test.columns  if c not in non_features]

# Align to common columns (defensive)
common_cols = [c for c in train_feat_cols if c in test_feat_cols]
assert len(common_cols) > 0, "No common feature columns between train and test!"

#Keep only numeric features
common_cols = [c for c in common_cols if pd.api.types.is_numeric_dtype(df_train[c])]

# Show a quick view of features
print("\nFeature columns (first 20 shown):")
print(common_cols[:20])
print(f"Total feature columns: {len(common_cols)}")

# Optional: peek at dtypes
print("\nDtypes of first few features:")
print(df_train[common_cols[:20]].dtypes)



Feature columns (first 20 shown):
['standard_value', 'MolWt', 'LogP', 'TPSA', 'HBD', 'HBA', 'RotB', 'RingCount', 'AromaticRingCount', 'FractionCSP3', 'HeavyAtomCount', 'HeteroAtomCount', 'HasPyridine', 'HasPyrimidine', 'MW_norm', 'TPSA_norm', 'LogP_norm', 'SP3_norm', 'ECFP_0', 'ECFP_1']
Total feature columns: 2066

Dtypes of first few features:
standard_value       float64
MolWt                float64
LogP                 float64
TPSA                 float64
HBD                    int64
HBA                    int64
RotB                   int64
RingCount              int64
AromaticRingCount      int64
FractionCSP3         float64
HeavyAtomCount         int64
HeteroAtomCount        int64
HasPyridine            int64
HasPyrimidine          int64
MW_norm              float64
TPSA_norm            float64
LogP_norm            float64
SP3_norm             float64
ECFP_0                 int64
ECFP_1                 int64
dtype: object


In [18]:
# Build X/y
X_train = df_train[common_cols].values
y_train = df_train["y"].astype(int).values

X_test  = df_test[common_cols].values
y_test  = df_test["y"].astype(int).values

print(f"\nShapes --> X_train={X_train.shape}, y_train={y_train.shape}")
print(f"           X_test ={X_test.shape},  y_test ={y_test.shape}")

#Sanity checks
# NaNs?
n_nan_train = int(np.isnan(X_train).sum())
n_nan_test  = int(np.isnan(X_test).sum())
print(f"\nNaN check: train NaNs={n_nan_train}, test NaNs={n_nan_test}")
if n_nan_train or n_nan_test:
    print("Filling NaNs with 0 to avoid model errors.")
    X_train = np.nan_to_num(X_train, nan=0.0)
    X_test  = np.nan_to_num(X_test,  nan=0.0)

# Class balance
print("\nClass balance:")
print("train:", np.bincount(y_train))
print("test :", np.bincount(y_test))




Shapes --> X_train=(6342, 2066), y_train=(6342,)
           X_test =(1278, 2066),  y_test =(1278,)

NaN check: train NaNs=0, test NaNs=0

Class balance:
train: [1755 4587]
test : [336 942]


In [None]:
# Ensure output directory exists before running pipeline (important in notebooks)
os.makedirs(CONFIG["output_dir"], exist_ok=True)

#Run pipeline
selected = ["forest", "xgboost", "svm"]   # your selection
artefacts = run_pipeline(
    X_train, y_train,
    X_test, y_test,
    voting="soft",
    selected_models=selected
)

print("\nPipeline done. Keys in summary:", list(artefacts.keys()))