In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

# Validation split (Zero shot prediction)

- This code prepares a dataset for zero-shot prediction in drug response modeling:
    - Loads and processes a drug response dataset
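    - Splits data into train, validation, and test sets (validation is a random 20% of the training rows, so it shares cell lines and compounds with train; only the test set is zero-shot)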
    - Ensures no overlap of cell lines or compounds between train and test sets
    - Creates features (X) and labels (y) for each set
    - Saves processed data as CSV files and NumPy arrays
- Key aspects of the zero-shot prediction setup:
    - Completely different cell lines and compounds in train vs. test sets
    - Evaluates model's ability to predict for unseen cell-compound combinations
    - Tests generalization to novel cell lines and compounds

In [2]:
# Load the drug-response table; the first CSV column is used as the row index.
DATASET_CSV = "../data/dataset.csv"
df = pd.read_csv(DATASET_CSV, index_col=0)
df

Unnamed: 0,NSC,CELL_NAME_nci,HI_CONCENTRATION
0,17,786_0,0.0001
1,17,A498,0.0001
2,17,A549,0.0001
3,17,ACHN,0.0001
4,17,BT_549,0.0001
...,...,...,...
219999,849222,T47D,0.0001
220000,849222,TK_10,0.0001
220001,849222,U251,0.0001
220002,849222,UACC_257,0.0001


In [3]:
# Gene table, transposed so that the CSV's column labels become the index.
genes = pd.read_csv("../data/genes.csv").transpose()

# Drug-gene score table, restricted to rows whose `gene` value appears in
# the transposed gene index.
dti = pd.read_csv("../../DTI-quantification/data/drug_gene_score.csv.gz")
in_gene_index = dti["gene"].isin(genes.index)
dti = dti.loc[in_gene_index]
dti

Unnamed: 0,NSC,gene,PMID_count,log,Y,log_Y,CID
0,3188.0,AAK1,0.0,0.0,0.5,0.5,4.0
1,3188.0,ADRB1,0.0,0.0,0.5,0.5,4.0
2,3188.0,BMP2K,0.0,0.0,0.5,0.5,4.0
5,3188.0,CACNB3,0.0,0.0,0.5,0.5,4.0
10,3188.0,CREBBP,0.0,0.0,0.5,0.5,4.0
...,...,...,...,...,...,...,...
1038831,852991.0,BRAF,0.0,0.0,0.5,0.5,156297592.0
1038832,852991.0,NRAS,0.0,0.0,0.5,0.5,156297592.0
1038833,841442.0,ATM,0.0,0.0,0.5,0.5,156487652.0
1038834,841442.0,CDK12,0.0,0.0,0.5,0.5,156487652.0


In [4]:
# Distinct cell-line names and NSC identifiers present in the full table.
unique_cells = df.CELL_NAME_nci.unique()
unique_nscs = df.NSC.unique()

# Report how many distinct values each column holds.
for label, values in (("unique cells: ", unique_cells), ("unique nscs: ", unique_nscs)):
    print(label, len(values))

unique cells:  60
unique nscs:  3897


In [5]:
# Seed NumPy's global RNG so the two draws below are reproducible. The cell
# draw must stay ahead of the NSC draw: both consume the same random stream.
np.random.seed(42)

# Cell lines: 70% (rounded down) are drawn without replacement for training;
# everything not drawn becomes the test pool.
n_train_cells = int(len(unique_cells) * 0.7)
train_cells = np.random.choice(unique_cells, size=n_train_cells, replace=False)
test_cells = np.setdiff1d(unique_cells, train_cells)

# NSCs: same scheme with a 60% training share.
n_train_nscs = int(len(unique_nscs) * 0.6)
train_nscs = np.random.choice(unique_nscs, size=n_train_nscs, replace=False)
test_nscs = np.setdiff1d(unique_nscs, train_nscs)

In [6]:
# A row goes to train only when BOTH its NSC and its cell line are in the
# train pools, and to test only when BOTH are in the test pools. Rows that mix
# a train-pool value with a test-pool value end up in neither frame.
in_train = df["NSC"].isin(train_nscs) & df["CELL_NAME_nci"].isin(train_cells)
in_test = df["NSC"].isin(test_nscs) & df["CELL_NAME_nci"].isin(test_cells)

# Shuffle each split with a fixed seed.
train = df.loc[in_train].sample(frac=1, random_state=42)
test = df.loc[in_test].sample(frac=1, random_state=42)

# Carve 20% of the (shuffled) train rows out as validation and remove them
# from train by index label.
# NOTE(review): validation is sampled from the train rows, so it draws on the
# same cell-line and NSC pools as train — confirm this is intended.
val_size = int(len(train) * 0.2)
val = train.sample(n=val_size, random_state=42)
train = train.drop(index=val.index)

In [7]:
# Training split summary: row count plus distinct cell lines and NSCs.
train_unique_cells = train.CELL_NAME_nci.unique()
train_unique_nscs = train.NSC.unique()

for label, count in (
    ("# of data: ", len(train)),
    ("unique cells: ", len(train_unique_cells)),
    ("unique nscs: ", len(train_unique_nscs)),
):
    print(label, count)

# of data:  74457
unique cells:  42
unique nscs:  2338


In [8]:
# Validation split summary: row count plus distinct cell lines and NSCs.
val_unique_cells = val.CELL_NAME_nci.unique()
val_unique_nscs = val.NSC.unique()

for label, count in (
    ("# of data: ", len(val)),
    ("unique cells: ", len(val_unique_cells)),
    ("unique nscs: ", len(val_unique_nscs)),
):
    print(label, count)

# of data:  18614
unique cells:  42
unique nscs:  2337


In [9]:
# Test split summary: row count plus distinct cell lines and NSCs.
test_unique_cells = test.CELL_NAME_nci.unique()
test_unique_nscs = test.NSC.unique()

for label, count in (
    ("# of data: ", len(test)),
    ("unique cells: ", len(test_unique_cells)),
    ("unique nscs: ", len(test_unique_nscs)),
):
    print(label, count)

# of data:  25861
unique cells:  18
unique nscs:  1558


In [10]:
# Row counts of the full table and of each split, one line per frame.
# The three splits need not add up to the total: rows pairing a train-pool
# value with a test-pool value were left out of every split.
size_report = (
    f"Total data size: {len(df)}",
    f"Train data size: {len(train)}",
    f"Validation data size: {len(val)}",
    f"Test data size: {len(test)}",
)
for line in size_report:
    print(line)

Total data size: 220004
Train data size: 74457
Validation data size: 18614
Test data size: 25861


In [11]:
# For every split, report how many distinct NSCs and cell-line names it holds.
splits = {"Train": train, "Validation": val, "Test": test}
for name, dataset in splits.items():
    print(f"\n{name} set:")
    print(f"Number of unique NSCs: {dataset['NSC'].nunique()}")
    print(f"Number of unique CELL_NAMEs: {dataset['CELL_NAME_nci'].nunique()}")


Train set:
Number of unique NSCs: 2338
Number of unique CELL_NAMEs: 42

Validation set:
Number of unique NSCs: 2337
Number of unique CELL_NAMEs: 42

Test set:
Number of unique NSCs: 1558
Number of unique CELL_NAMEs: 18


In [12]:
# Leakage check for the zero-shot split: no NSC and no cell line may appear in
# both the training data and the test data.
#
# NOTE: the four names below are rebound here. Earlier in the notebook they
# hold the NumPy arrays drawn for the split; from this cell on they hold sets
# of the values actually present in the final `train` / `test` frames.
train_nscs = set(train["NSC"])
train_cells = set(train["CELL_NAME_nci"])
test_nscs = set(test["NSC"])
test_cells = set(test["CELL_NAME_nci"])

nsc_overlap = train_nscs & test_nscs
cell_overlap = train_cells & test_cells

print("\nOverlap check:")
print(f"NSC overlap between train and test: {len(nsc_overlap)}")
print(f"CELL_NAME overlap between train and test: {len(cell_overlap)}")

# Printing a count is easy to overlook; stop the notebook before anything is
# written to disk if the split leaks.
assert not nsc_overlap, (
    f"{len(nsc_overlap)} NSC(s) appear in both train and test, "
    f"e.g. {sorted(nsc_overlap)[:5]}"
)
assert not cell_overlap, (
    f"{len(cell_overlap)} cell line(s) appear in both train and test, "
    f"e.g. {sorted(cell_overlap)[:5]}"
)

# Validation rows were carved out of the training rows, so they must be
# disjoint from test as well.
assert test_nscs.isdisjoint(val["NSC"]), "validation and test share NSCs"
assert test_cells.isdisjoint(val["CELL_NAME_nci"]), "validation and test share cell lines"


Overlap check:
NSC overlap between train and test: 0
CELL_NAME overlap between train and test: 0


In [13]:
# Separate each split into labels (the HI_CONCENTRATION column, as a plain
# Python list) and features (every remaining column).
label_col = "HI_CONCENTRATION"

y_train = train[label_col].tolist()
y_val = val[label_col].tolist()
y_test = test[label_col].tolist()

X_train = train.drop(columns=label_col)
X_val = val.drop(columns=label_col)
X_test = test.drop(columns=label_col)

In [14]:
# Persist the feature frames as CSV (row index omitted) ...
feature_outputs = (
    ("../data/train_IC50.csv", X_train),
    ("../data/test_IC50.csv", X_test),
    ("../data/val_IC50.csv", X_val),
)
for csv_path, features in feature_outputs:
    features.to_csv(csv_path, index=False)

# ... and the label lists as .npy files.
# NOTE(review): the labels come from the HI_CONCENTRATION column while the
# files are named *_IC50* — confirm the column really holds the intended target.
label_outputs = (
    ("../data/train_IC50_labels.npy", y_train),
    ("../data/test_IC50_labels.npy", y_test),
    ("../data/val_IC50_labels.npy", y_val),
)
for npy_path, labels in label_outputs:
    np.save(npy_path, labels)