In [1]:
# Colab only: mount Google Drive at /content/drive so the dataset paths
# used below (under /content/drive/MyDrive/...) resolve.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from pathlib import Path

# Location of the labeled BindingDB table on the mounted Drive
DATA_DIR = Path('/content/drive/MyDrive/ProteinMO/Dataset')
in_csv = DATA_DIR.joinpath('bindingdb_final_labeled.csv')

# Read the full table, then report its shape and column names
df = pd.read_csv(in_csv)
print(f"Loaded: {df.shape}")
print(list(df.columns))
df.head()


Mounted at /content/drive
Loaded: (1262353, 7)
['smiles', 'ic50_nM', 'protein_seq', 'protein_name', 'uniprot_id', 'protein_desc', 'label']


Unnamed: 0,smiles,ic50_nM,protein_seq,protein_name,uniprot_id,protein_desc,label
0,CC(C)[C@H](NC(C)=O)C(=O)N[C@@H](Cc1ccccc1)[C@@...,8.5,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Gag-Pol polyprotein,P12497,Gag-Pol polyprotein,1.0
1,CCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1)[...,177.0,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Gag-Pol polyprotein,P12497,Gag-Pol polyprotein,1.0
2,COCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1...,164.0,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Gag-Pol polyprotein,P12497,Gag-Pol polyprotein,1.0
3,COCCOCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccc...,67.0,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Gag-Pol polyprotein,P12497,Gag-Pol polyprotein,1.0
4,COC(=O)N[C@@H](C(C)C)C(=O)NN(C[C@H](O)[C@H](Cc...,27.0,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Gag-Pol polyprotein,P12497,Gag-Pol polyprotein,1.0


In [2]:
# Sanity checks: required columns present, number of distinct proteins, class balance
required_cols = {'smiles', 'protein_seq', 'uniprot_id', 'label'}
assert required_cols <= set(df.columns), "Missing required columns!"

n_proteins = df['uniprot_id'].nunique()
label_fractions = df['label'].value_counts(normalize=True)
print("Unique proteins:", n_proteins)
print("Label counts:\n", label_fractions)


Unique proteins: 4933
Label counts:
 label
1.0    0.850808
0.0    0.149192
Name: proportion, dtype: float64


In [3]:
from sklearn.model_selection import GroupShuffleSplit

def group_split(df, groups, test_size, random_state):
    """Build a boolean mask marking the rows of one group-wise held-out split.

    A single GroupShuffleSplit is drawn, so all rows sharing a group value
    end up on the same side of the split.

    Args:
        df: DataFrame to split.
        groups: Group label per row of ``df`` (passed to ``GroupShuffleSplit.split``).
        test_size: Forwarded to ``GroupShuffleSplit`` as ``test_size``.
        random_state: Seed forwarded to ``GroupShuffleSplit``.

    Returns:
        pd.Series of bool indexed like ``df``; True marks held-out rows.
    """
    splitter = GroupShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=random_state,
    )
    # Only the held-out positions are needed; the training positions are the complement.
    _, holdout_positions = next(splitter.split(df, groups=groups))

    holdout_mask = pd.Series(False, index=df.index)
    # The splitter yields positional indices, hence .iloc rather than .loc.
    holdout_mask.iloc[holdout_positions] = True
    return holdout_mask

RANDOM_STATE = 42

# Stage 1: hold out 10% of the protein groups as the test split.
# The fraction applies to uniprot_id groups, so row fractions are only approximate.
test_mask = group_split(df, groups=df['uniprot_id'], test_size=0.10, random_state=RANDOM_STATE)
df_test = df.loc[test_mask]
df_rest = df.loc[~test_mask]

# Stage 2: carve validation out of the remainder.
# 10% of the total corresponds to 10/90 ~ 11.11% of what is left; the rest is train (~80%).
val_mask = group_split(df_rest, groups=df_rest['uniprot_id'], test_size=0.1111, random_state=RANDOM_STATE)
df_val, df_train = df_rest.loc[val_mask], df_rest.loc[~val_mask]

# Summarise rows and distinct proteins per split, in train / val / test order
split_frames = (df_train, df_val, df_test)
print("Train / Val / Test sizes:", *(len(part) for part in split_frames))
print("Proteins per split:", *(part['uniprot_id'].nunique() for part in split_frames))



Train / Val / Test sizes: 1033794 121461 107098
Proteins per split: 3945 494 494


In [4]:
# Distinct UniProt IDs present in each split
train_prot, val_prot, test_prot = (
    set(part['uniprot_id'].unique()) for part in (df_train, df_val, df_test)
)

# Every pair of splits must have an empty intersection of protein IDs
assert not (train_prot & val_prot)
assert not (train_prot & test_prot)
assert not (val_prot & test_prot)
print("No protein leakage across splits ✅")


No protein leakage across splits ✅


In [5]:
def describe_split(name, d):
    """Print the row count and the absolute/relative label frequencies of a split.

    Args:
        name: Display name of the split, used in the header line.
        d: DataFrame with a 'label' column.
    """
    label_counts = d['label'].value_counts()
    total = label_counts.sum()
    # Divide the counts (rather than re-counting) so both tables share index order.
    label_fractions = label_counts.div(total)

    print(f"\n{name}: {len(d)} rows")
    print("Counts:\n", label_counts)
    print("Fractions:\n", label_fractions)

# Report the class balance of each split, in train / val / test order
for split_name, split_df in (("Train", df_train), ("Val", df_val), ("Test", df_test)):
    describe_split(split_name, split_df)


Train: 1033794 rows
Counts:
 label
1.0    880799
0.0    152995
Name: count, dtype: int64
Fractions:
 label
1.0    0.852006
0.0    0.147994
Name: count, dtype: float64

Val: 121461 rows
Counts:
 label
1.0    100971
0.0     20490
Name: count, dtype: int64
Fractions:
 label
1.0    0.831304
0.0    0.168696
Name: count, dtype: float64

Test: 107098 rows
Counts:
 label
1.0    92250
0.0    14848
Name: count, dtype: int64
Fractions:
 label
1.0    0.861361
0.0    0.138639
Name: count, dtype: float64


In [6]:
"""
(Explanation: if you encode your labels as {0,1} and use BCEWithLogitsLoss, you pass pos_weight=torch.tensor([pos_weight]),
where pos_weight > 1 up-weights the positive (label=1) class.
Given your dataset has more 1s than 0s, pos_weight will be < 1. Alternatively, you can flip the encoding or use weighted sampling.)
"""
# Use the training split only to compute weights (val/test must not influence them).
# reindex guarantees both labels are present in the table, so a missing class
# shows up as a count of 0 instead of a bare KeyError.
train_counts = df_train['label'].value_counts().reindex([0.0, 1.0], fill_value=0)
N0, N1 = train_counts[0.0], train_counts[1.0]
assert N0 > 0 and N1 > 0, "Both classes must be present in the training split to compute class weights"
N = N0 + N1

# Balanced per-class weights (common heuristic): N / (n_classes * N_class)
w0 = N / (2 * N0)  # for label 0 (non-binder)
w1 = N / (2 * N1)  # for label 1 (binder)
print("Class weights -> label=0 (non-binder):", w0, ", label=1 (binder):", w1)

# For PyTorch BCEWithLogitsLoss, pos_weight is the weight of the positive class
# relative to the negative class: w1 / w0, which simplifies to N0 / N1
# (number of negatives per positive). It is NOT w0 / w1.
pos_weight = (N0 / N1)
print("PyTorch BCEWithLogitsLoss pos_weight:", pos_weight)


Class weights -> label=0 (non-binder): 3.3785221739272524 , label=1 (binder): 0.5868501213103103
PyTorch BCEWithLogitsLoss pos_weight: 0.1737002426206206


In [9]:
# Output directory for the split CSVs
DT_DIR = Path('/content/drive/MyDrive/ProteinMO/Dataset/Cleaned')
# DataFrame.to_csv does not create missing parent directories; make sure the
# target exists so the cell also works on a fresh Drive / fresh kernel.
DT_DIR.mkdir(parents=True, exist_ok=True)

out_train = DT_DIR / 'train.csv'
out_val   = DT_DIR / 'val.csv'
out_test  = DT_DIR / 'test.csv'

# index=False: the row index carries no information and would otherwise be
# written as an extra unnamed column.
df_train.to_csv(out_train, index=False)
df_val.to_csv(out_val, index=False)
df_test.to_csv(out_test, index=False)

print("Saved:")
print("  ", out_train)
print("  ", out_val)
print("  ", out_test)


Saved:
   /content/drive/MyDrive/ProteinMO/Dataset/Cleaned/train.csv
   /content/drive/MyDrive/ProteinMO/Dataset/Cleaned/val.csv
   /content/drive/MyDrive/ProteinMO/Dataset/Cleaned/test.csv
