In [1]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

# Read the modelling table from disk.
# Point INPUT at a different CSV if your file is named otherwise.
INPUT = "EGFR_final_modeling_dataset.csv"
df = pd.read_csv(INPUT)
print(f"Loaded: {INPUT}  |  shape: {df.shape}")

# Fail fast if a column that the later cells rely on is absent.
for required_col, message in (
    ("class", "Expected a 'class' column."),
    ("MurckoScaffold", "Expected 'MurckoScaffold' column."),
):
    assert required_col in df.columns, message


Loaded: EGFR_final_modeling_dataset.csv  |  shape: (10074, 2071)


In [3]:

# Restrict the table to the two modelling classes and derive a 0/1 target.
# Labels are lower-cased and stripped first so case/whitespace variants still match.
# NOTE: this cell rebinds `df` to the filtered frame, so re-running it without
# re-running the load cell compares the filtered frame with itself and reports 0 drops.
df["class"] = df["class"].astype(str).str.lower().str.strip()
before = df.shape[0]
is_modelling_class = df["class"].isin(["active", "inactive"])
df = df[is_modelling_class].copy()
after = df.shape[0]
print(f"Dropped 'intermediate' (and any others): {before - after} rows  |  remaining: {after}")

# Integer target column: 1 where the class is "active", 0 where it is "inactive".
is_active = df["class"] == "active"
df["y"] = is_active.astype(int)

# Row count per class after filtering.
print("\nClass counts (overall):")
class_counts = df["class"].value_counts()
print(class_counts.to_string())

df.head()

Dropped 'intermediate' (and any others): 0 rows  |  remaining: 7620

Class counts (overall):
class
active      5529
inactive    2091


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class,standardized_smiles,MolWt,LogP,TPSA,HBD,HBA,...,ECFP_2039,ECFP_2040,ECFP_2041,ECFP_2042,ECFP_2043,ECFP_2044,ECFP_2045,ECFP_2046,ECFP_2047,y
0,CHEMBL68920,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,41.0,active,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,383.814,4.45034,82.7,3,4,...,0,0,0,0,0,0,0,0,1,1
1,CHEMBL69960,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,170.0,active,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,482.903,3.61432,112.24,3,6,...,0,0,0,0,0,0,0,0,0,1
3,CHEMBL306988,CC(=C(C#N)C#N)c1ccc(NC(=O)CCC(=O)O)cc1,500000.0,inactive,CC(=C(C#N)C#N)c1ccc(NC(=O)CCC(=O)O)cc1,283.287,2.31056,113.98,2,4,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL66879,O=C(O)/C=C/c1ccc(O)cc1,3000000.0,inactive,O=C(O)/C=C/c1ccc(O)cc1,164.16,1.49,57.53,2,2,...,0,0,0,0,0,0,0,0,0,0
5,CHEMBL77085,N#CC(C#N)=Cc1cc(O)ccc1[N+](=O)[O-],96000.0,inactive,N#CC(C#N)=Cc1cc(O)ccc1[N+](=O)[O-],215.168,1.73096,110.95,1,5,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Scaffold-grouped split: rows sharing a MurckoScaffold value are kept on the same
# side, so no scaffold is present in both train and test.
TEST_SIZE = 0.20
RANDOM_STATE = 42

# Rows with a missing scaffold are pooled under one placeholder group.
groups = df["MurckoScaffold"].fillna("NA_SCAFFOLD")

# NOTE: GroupShuffleSplit applies test_size to the groups (scaffolds), not to rows,
# so the row-level test fraction can differ from TEST_SIZE.
gss = GroupShuffleSplit(n_splits=1, test_size=TEST_SIZE, random_state=RANDOM_STATE)
split_iter = gss.split(df, df["y"], groups=groups)
train_idx, test_idx = next(split_iter)

# The splitter yields positional indices, hence iloc; copies keep the parts independent.
train_df, test_df = (df.iloc[positions].copy() for positions in (train_idx, test_idx))

print(f"\nSplit sizes ----> train: {train_df.shape}, test: {test_df.shape}")



Split sizes ----> train: (6342, 2072), test: (1278, 2072)


In [6]:
# Sanity checks
# A. No scaffold leakage: the two splits must not share any scaffold value.
#    Missing scaffolds are mapped to the same placeholder used when splitting.
train_scafs, test_scafs = (
    set(split["MurckoScaffold"].fillna("NA_SCAFFOLD"))
    for split in (train_df, test_df)
)
overlap = train_scafs.intersection(test_scafs)
assert not overlap, f"Scaffold leakage detected! Overlap count: {len(overlap)}"

#B. Class balance in each split
def report(part, name):
    """Print a one-line class-balance summary for one partition.

    Parameters
    ----------
    part : DataFrame-like object with a "y" column; rows with y == 1 are counted
        as active and rows with y == 0 as inactive.
    name : label printed at the start of the line.

    The active fraction is reported as 0 when ``part`` has no rows.
    """
    n = len(part)
    counts = part["y"].value_counts()
    # Look the labels up by value so a class that is absent counts as zero.
    act, inact = (int(counts.get(label, 0)) for label in (1, 0))
    if n:
        frac_active = act / n
    else:
        frac_active = 0
    print(f"{name}: n={n}, active={act}, inactive={inact}, frac_active={frac_active:.3f}")

print("\nClass balance:")
# One summary line for the full table and for each split.
for name, part in (("Overall", df), ("Train", train_df), ("Test", test_df)):
    report(part, name)

# C. Targets are binary: y may only take the values 0 and 1 in either split.
for name, part in (("Train", train_df), ("Test", test_df)):
    observed = set(part["y"].unique())
    assert observed <= {0, 1}, f"{name} has non-binary y!"



Class balance:
Overall: n=7620, active=5529, inactive=2091, frac_active=0.726
Train: n=6342, active=4587, inactive=1755, frac_active=0.723
Test: n=1278, active=942, inactive=336, frac_active=0.737


In [7]:
# Write each split to its own CSV; the DataFrame index is not written.
OUT_TRAIN = "EGFR_train_scaffold.csv"
OUT_TEST  = "EGFR_test_scaffold.csv"
for split_frame, out_path in ((train_df, OUT_TRAIN), (test_df, OUT_TEST)):
    split_frame.to_csv(out_path, index=False)
print(f"\nSaved:\n  {OUT_TRAIN}\n  {OUT_TEST}")



Saved:
  EGFR_train_scaffold.csv
  EGFR_test_scaffold.csv
