In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from ydata_profiling import ProfileReport
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
from tqdm import tqdm

In [27]:
# https://depmap.org/portal/data_page/?tab=allData&releasename=Sanger+GDSC1+and+GDSC2&filename=sanger-dose-response.csv
# IC50 score of drugs (DRUG_ID) per cell line (COSMIC_ID) for GDSC1 and GDSC2 
try:
    df_dose_resp = pd.read_csv("D:\\auth\\dataset\\depmap\\sanger-dose-response.csv", delimiter=',')
    df_dose_resp_gdsc2 = df_dose_resp[(df_dose_resp.DATASET == "GDSC2")]
    df_dose_resp_gdsc2_edited = df_dose_resp_gdsc2.filter(items = ['DRUG_ID', 'ARXSPAN_ID', 'IC50_PUBLISHED'])
    print("Shape of df_dose_resp_gdsc2 = {}".format(df_dose_resp_gdsc2_edited.shape))
    print("Unique cell lines (ARXSPAN_ID) = {}".format(df_dose_resp_gdsc2_edited['ARXSPAN_ID'].nunique()))
    print("Unique drugs = {}".format(df_dose_resp_gdsc2_edited['DRUG_ID'].nunique()))
    print("Unique combinations of cell line x drug = {}".format(df_dose_resp_gdsc2_edited.groupby(['ARXSPAN_ID', 'DRUG_ID']).size().count()))
    print(df_dose_resp_gdsc2_edited.head)
except pd.errors.ParserError as e:
    print(f"Error: {e}")

Shape of df_dose_resp_gdsc2 = (118908, 3)
Unique cell lines (ARXSPAN_ID) = 793
Unique drugs = 175
Unique combinations of cell line x drug = 116202
<bound method NDFrame.head of         DRUG_ID  ARXSPAN_ID  IC50_PUBLISHED
268718     1003  ACH-000958        0.025129
268719     1003  ACH-000651        0.049577
268720     1003  ACH-000856        0.028549
268721     1003  ACH-000360        0.039996
268722     1003  ACH-001199        1.986678
...         ...         ...             ...
387621     2172  ACH-000288       25.410793
387622     2172  ACH-001065        0.339325
387623     2172  ACH-000930        7.780877
387624     2172  ACH-000859      534.688321
387625     2172  ACH-000536      120.177282

[118908 rows x 3 columns]>


In [3]:
# https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap+Public+24Q2&filename=Model.csv
# Mapping between 'ModelID', 'PatientID', 'SangerModelID', 'COSMICID', etc
try:
    df_depmap_model = pd.read_csv("D:\\auth\\dataset\\depmap\\Model.csv", delimiter=',')
    df_depmap_model_edited = df_depmap_model.filter(items=['ModelID', 'PatientID', 'SangerModelID', 'COSMICID'])
    print("Shape of df_depmap_model = {}".format(df_depmap_model_edited.shape))
    print("Unique cell lines (ModelID) ACH-XXXXXX = {}".format(df_depmap_model_edited['ModelID'].nunique()))
    print("Unique cell lines (SangerModelID) = {}".format(df_depmap_model_edited['SangerModelID'].nunique()))
    print("Unique cell lines (COSMICID) = {}".format(df_depmap_model_edited['COSMICID'].nunique()))
    # print(df_depmap_model_edited.head())
except pd.errors.ParserError as e:
    print(f"Error: {e}")

Shape of df_depmap_model = (1959, 4)
Unique cell lines (ModelID) ACH-XXXXXX = 1959
Unique cell lines (SangerModelID) = 1215
Unique cell lines (COSMICID) = 977


In [7]:
# https://depmap.org/portal/data_page/?tab=customDownloads -> CRISPR (DepMap Public 24Q2+Score, Chronos)
# https://depmap.org/portal/download/data_slicer/download?file_path=20240719%2F26f7bf40-b499-4d1d-ab35-964b10b3dfa0%2Fexport.csv&name=CRISPR_%28DepMap_Public_24Q2%2BScore%2C_Chronos%29_subsetted.csv
# 1150 unique cell lines (+7 info columns) x 18435 genes.
# About the values:
# Gene Effect scores derived from CRISPR knockout screens published by Broad’s Achilles and Sanger’s SCORE projects.
# Negative scores imply cell growth inhibition and/or death following gene knockout. Scores are normalized such that nonessential genes have a median score of 0 and independently identified common essentials have a median score of -1.
# Gene Effect scores were inferenced by Chronos (a cell population dynamics model of CRISPR experiments that improves inference of gene fitness effects)
try:
    df_crispr = pd.read_csv("D:\\auth\\dataset\\depmap\\CRISPR_(DepMap_Public_24Q2+Score,_Chronos)_subsetted.csv", delimiter=',', low_memory=False)
    # df_crispr_edited = df_crispr.drop(columns = ['cell_line_display_name', 'lineage_1', 'lineage_2', 'lineage_3', 'lineage_5', 'lineage_6', 'lineage_4'])
    df_crispr_edited = df_crispr.filter(items = ['depmap_id', 'A1BG', 'A1CF', 'A2M', 'A2ML1', 'A3GALT2'])
    print("Shape of df_crispr = {}".format(df_crispr_edited.shape))
    print("Unique cell lines (depmap_id) ACH-XXXXXX = {}".format(df_crispr_edited['depmap_id'].nunique()))
    print(df_crispr_edited.head())
    # print(df_crispr_edited.describe())
    

    # # Identify columns with object data type
    # object_columns = df_crispr_edited.select_dtypes(include=['object']).columns
    # print("Columns with object data type:", object_columns)
    
    # # Check each column for mixed data types
    # for col in object_columns:
    #     unique_types = df_crispr_edited[col].map(type).nunique()
    #     if unique_types > 1:
    #         print(f"Column '{col}' has mixed data types")

    # df_crispr_edited_for_profiling = df_crispr_edited.drop(columns = ['depmap_id'])
    # df_crispr_edited_for_profiling = df_crispr_edited_for_profiling.fillna(0)
    # df_crispr_edited_for_profiling.to_csv('df_crispr_edited_for_profiling.csv', index=False)

    # # print(df_crispr_edited_for_profiling.head())
    # profile = ProfileReport(df_crispr_edited_for_profiling, title="Profiling Report")
    # profile.to_file("profiling_report.html")

except pd.errors.ParserError as e:
    print(f"Error: {e}")

Shape of df_crispr = (1150, 6)
Unique cell lines (depmap_id) ACH-XXXXXX = 1150
    depmap_id      A1BG      A1CF       A2M     A2ML1   A3GALT2
0  ACH-001270 -0.125700 -0.190342  0.034115  0.114257  0.041126
1  ACH-002680 -0.056783 -0.074939  0.070967 -0.047154 -0.189602
2  ACH-002401 -0.039836 -0.116822  0.094329  0.110162 -0.192706
3  ACH-002399  0.013372 -0.082142  0.036879  0.014451 -0.232645
4  ACH-000520 -0.222447 -0.096272  0.081117  0.123556 -0.010445
              A1BG         A1CF          A2M        A2ML1      A3GALT2
count  1150.000000  1150.000000  1150.000000  1150.000000  1150.000000
mean     -0.073435    -0.077739     0.026734     0.070217    -0.130254
std       0.101372     0.110613     0.097544     0.098477     0.123472
min      -0.625954    -0.816378    -0.387043    -0.383116    -0.743550
25%      -0.133659    -0.136863    -0.031875     0.017431    -0.202255
50%      -0.074917    -0.078141     0.025570     0.068801    -0.126639
75%      -0.015774    -0.017229     0.08

In [11]:
# Merge ic50 dataset with model dataset
# This should have been 118908 rows just like df_dose_resp_gdsc2 but it's 115502 probably because model df doesn't have some cell lines of ic50 df.
try:
    assert df_dose_resp_gdsc2_edited["ARXSPAN_ID"].dtype == df_depmap_model_edited["ModelID"].dtype
    df_dose_model = pd.merge(df_dose_resp_gdsc2_edited, df_depmap_model_edited, left_on="ARXSPAN_ID", right_on="ModelID")
    print("Shape of df_dose_model = {}".format(df_dose_model.shape))
    print(df_dose_model.head())
except pd.errors.ParserError as e:
    print(f"Error: {e}")

Shape of df_dose_model = (115502, 7)
   DRUG_ID  ARXSPAN_ID  IC50_PUBLISHED     ModelID  PatientID SangerModelID  \
0     1003  ACH-000958        0.025129  ACH-000958  PT-EB6qeM     SIDM00837   
1     1003  ACH-000651        0.049577  ACH-000651  PT-IPboWn     SIDM00841   
2     1003  ACH-000856        0.028549  ACH-000856  PT-PgOEtc     SIDM00933   
3     1003  ACH-000360        0.039996  ACH-000360  PT-M0lnCb     SIDM00777   
4     1003  ACH-001199        1.986678  ACH-001199  PT-GfrL06     SIDM00781   

   COSMICID  
0  909751.0  
1  905962.0  
2  910927.0  
3  908442.0  
4  909740.0  


In [15]:
# Merge ic50/model dataset with crispr dataset
# 75735 cell lines x info columns + genes
try:
    assert df_dose_model["ARXSPAN_ID"].dtype == df_crispr_edited["depmap_id"].dtype
    df_dose_model_crispr = pd.merge(df_dose_model, df_crispr_edited, left_on="ARXSPAN_ID", right_on="depmap_id")
    print("Shape of df_dose_model_crispr = {}".format(df_dose_model_crispr.shape))
    print(df_dose_model_crispr.head())
except pd.errors.ParserError as e:
    print(f"Error: {e}")

Shape of df_dose_model_crispr = (75735, 13)
   DRUG_ID  ARXSPAN_ID  IC50_PUBLISHED     ModelID  PatientID SangerModelID  \
0     1003  ACH-000958        0.025129  ACH-000958  PT-EB6qeM     SIDM00837   
1     1003  ACH-000651        0.049577  ACH-000651  PT-IPboWn     SIDM00841   
2     1003  ACH-000856        0.028549  ACH-000856  PT-PgOEtc     SIDM00933   
3     1003  ACH-001081        0.398397  ACH-001081  PT-JITDXL     SIDM00118   
4     1003  ACH-000699        0.111788  ACH-000699  PT-8MisfZ     SIDM00884   

   COSMICID   depmap_id      A1BG      A1CF       A2M     A2ML1   A3GALT2  
0  909751.0  ACH-000958 -0.214908 -0.023742 -0.006509  0.116443 -0.049269  
1  905962.0  ACH-000651  0.039654 -0.157516 -0.034580  0.034380 -0.151170  
2  910927.0  ACH-000856 -0.318675 -0.238497  0.010796  0.047775 -0.036154  
3  905971.0  ACH-001081  0.053277 -0.025616 -0.036041 -0.175023 -0.051108  
4  749712.0  ACH-000699 -0.086116 -0.091741 -0.008565  0.076621 -0.208394  


In [19]:
# Filter columns to be used as features in X
dataset_final = df_dose_model_crispr.select_dtypes(include=[np.number])
print("Shape of dataset_final = {}".format(dataset_final.shape))

# TODO preprocessing to convert non-numerical columns we want to keep as features

Shape of dataset_final = (75735, 8)


In [21]:
# # Check for NANs
# print(dataset_final.isna().sum().sort_values(ascending=False))
# # dataset_final_filled = dataset_final.fillna(0)
print(dataset_final.isna().sum().sort_values(ascending=False))

DRUG_ID           0
IC50_PUBLISHED    0
COSMICID          0
A1BG              0
A1CF              0
A2M               0
A2ML1             0
A3GALT2           0
dtype: int64


In [23]:
# Define a simple ranking dataset
class RankingDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

# Define a simple neural network for ranking
class SimpleRankingNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleRankingNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 50)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(50, 1)  # Output single score for ranking

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [25]:
# Split into features (X) and target (y)
y = dataset_final['IC50_PUBLISHED'].values
X = dataset_final.drop(columns=['IC50_PUBLISHED']).values

# Standardize the dataset
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Convert to PyTorch tensors
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

# Training parameters
learning_rate = 0.001
num_epochs = 20
batch_size = 32
k_folds = 5

# Cross-validation
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)
results = {}

for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
    print(f'Fold {fold + 1}/{k_folds}')

    # Create data loaders
    train_dataset = RankingDataset(X[train_idx], y[train_idx])
    val_dataset = RankingDataset(X[val_idx], y[val_idx])
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize the model, loss function, and optimizer
    model = SimpleRankingNN(input_size=X.shape[1])
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop with progress bar
    for epoch in tqdm(range(num_epochs), desc=f"Training Fold {fold+1}/{k_folds}"):
        model.train()
        for batch_features, batch_targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False):
            optimizer.zero_grad()
            outputs = model(batch_features).squeeze()
            loss = criterion(outputs, batch_targets)
            loss.backward()
            optimizer.step()

    # Validation with progress bar
    model.eval()
    val_loss = 0
    val_rmse = 0
    total = 0
    with torch.no_grad():
        for batch_features, batch_targets in tqdm(val_loader, desc=f"Validating Fold {fold+1}/{k_folds}", leave=False):
            outputs = model(batch_features).squeeze()
            loss = criterion(outputs, batch_targets)
            val_loss += loss.item()
            val_rmse += torch.sqrt(loss).item() * batch_targets.size(0)
            total += batch_targets.size(0)

    avg_val_loss = val_loss / len(val_loader)
    avg_val_rmse = val_rmse / total
    print(f'Validation Loss: {avg_val_loss:.4f}, RMSE: {avg_val_rmse:.4f}')
    results[fold] = avg_val_rmse

# Print fold results
print(f'\nK-Fold Cross Validation results for {k_folds} folds')
for key, value in results.items():
    print(f'Fold {key+1}: RMSE {value:.4f}')
print(f'Average RMSE: {np.mean(list(results.values())):.4f}')


Fold 1/5


Training Fold 1/5:   0%|          | 0/20 [00:00<?, ?it/s]
Epoch 1/20:   0%|          | 0/1894 [00:00<?, ?it/s][A
Epoch 1/20:   9%|▊         | 164/1894 [00:00<00:01, 1577.00it/s][A
Epoch 1/20:  20%|█▉        | 373/1894 [00:00<00:00, 1863.31it/s][A
Epoch 1/20:  31%|███       | 590/1894 [00:00<00:00, 1894.53it/s][A
Epoch 1/20:  43%|████▎     | 808/1894 [00:00<00:00, 1964.33it/s][A
Epoch 1/20:  54%|█████▍    | 1021/1894 [00:00<00:00, 2015.22it/s][A
Epoch 1/20:  65%|██████▍   | 1229/1894 [00:00<00:00, 2033.37it/s][A
Epoch 1/20:  76%|███████▌  | 1441/1894 [00:00<00:00, 2058.71it/s][A
Epoch 1/20:  87%|████████▋ | 1648/1894 [00:00<00:00, 2030.32it/s][A
Epoch 1/20:  99%|█████████▊| 1869/1894 [00:00<00:00, 2054.31it/s][A
Training Fold 1/5:   5%|▌         | 1/20 [00:00<00:18,  1.05it/s][A
Epoch 2/20:   0%|          | 0/1894 [00:00<?, ?it/s][A
Epoch 2/20:   8%|▊         | 156/1894 [00:00<00:01, 1545.47it/s][A
Epoch 2/20:  19%|█▊        | 352/1894 [00:00<00:00, 1783.93it/s][A
Epoch 2/

Validation Loss: 734625.0940, RMSE: 435.4770
Fold 2/5


Training Fold 2/5:   0%|          | 0/20 [00:00<?, ?it/s]
Epoch 1/20:   0%|          | 0/1894 [00:00<?, ?it/s][A
Epoch 1/20:   9%|▉         | 173/1894 [00:00<00:01, 1620.05it/s][A
Epoch 1/20:  22%|██▏       | 413/1894 [00:00<00:00, 1896.99it/s][A
Epoch 1/20:  34%|███▎      | 638/1894 [00:00<00:00, 1986.49it/s][A
Epoch 1/20:  47%|████▋     | 882/1894 [00:00<00:00, 2038.70it/s][A
Epoch 1/20:  58%|█████▊    | 1107/1894 [00:00<00:00, 2062.71it/s][A
Epoch 1/20:  70%|██████▉   | 1322/1894 [00:00<00:00, 2087.14it/s][A
Epoch 1/20:  83%|████████▎ | 1563/1894 [00:00<00:00, 2089.45it/s][A
Epoch 1/20:  94%|█████████▍| 1789/1894 [00:00<00:00, 2094.98it/s][A
Training Fold 2/5:   5%|▌         | 1/20 [00:00<00:17,  1.08it/s][A
Epoch 2/20:   0%|          | 0/1894 [00:00<?, ?it/s][A
Epoch 2/20:   8%|▊         | 158/1894 [00:00<00:01, 1576.09it/s][A
Epoch 2/20:  21%|██        | 389/1894 [00:00<00:00, 1902.56it/s][A
Epoch 2/20:  32%|███▏      | 602/1894 [00:00<00:00, 1998.56it/s][A
Epoch 2/2

Validation Loss: 503195.6795, RMSE: 388.5273
Fold 3/5


Training Fold 3/5:   0%|          | 0/20 [00:00<?, ?it/s]
Epoch 1/20:   0%|          | 0/1894 [00:00<?, ?it/s][A
Epoch 1/20:   8%|▊         | 157/1894 [00:00<00:01, 1562.60it/s][A
Epoch 1/20:  19%|█▊        | 353/1894 [00:00<00:00, 1786.98it/s][A
Epoch 1/20:  29%|██▉       | 549/1894 [00:00<00:00, 1858.98it/s][A
Epoch 1/20:  39%|███▉      | 735/1894 [00:00<00:00, 1856.56it/s][A
Epoch 1/20:  50%|████▉     | 945/1894 [00:00<00:00, 1937.15it/s][A
Epoch 1/20:  61%|██████    | 1148/1894 [00:00<00:00, 1968.24it/s][A
Epoch 1/20:  71%|███████▏  | 1351/1894 [00:00<00:00, 1985.70it/s][A
Epoch 1/20:  82%|████████▏ | 1550/1894 [00:00<00:00, 1979.58it/s][A
Epoch 1/20:  92%|█████████▏| 1748/1894 [00:00<00:00, 1954.38it/s][A
Training Fold 3/5:   5%|▌         | 1/20 [00:00<00:18,  1.01it/s][A
Epoch 2/20:   0%|          | 0/1894 [00:00<?, ?it/s][A
Epoch 2/20:   9%|▊         | 162/1894 [00:00<00:01, 1617.80it/s][A
Epoch 2/20:  20%|█▉        | 372/1894 [00:00<00:00, 1899.10it/s][A
Epoch 2/2

Validation Loss: 333952.3350, RMSE: 375.9351
Fold 4/5


Training Fold 4/5:   0%|          | 0/20 [00:00<?, ?it/s]
Epoch 1/20:   0%|          | 0/1894 [00:00<?, ?it/s][A
Epoch 1/20:   8%|▊         | 157/1894 [00:00<00:01, 1567.86it/s][A
Epoch 1/20:  19%|█▉        | 357/1894 [00:00<00:00, 1815.67it/s][A
Epoch 1/20:  29%|██▉       | 555/1894 [00:00<00:00, 1889.06it/s][A
Epoch 1/20:  40%|███▉      | 755/1894 [00:00<00:00, 1932.77it/s][A
Epoch 1/20:  50%|█████     | 950/1894 [00:00<00:00, 1932.52it/s][A
Epoch 1/20:  61%|██████    | 1146/1894 [00:00<00:00, 1935.85it/s][A
Epoch 1/20:  71%|███████   | 1349/1894 [00:00<00:00, 1961.64it/s][A
Epoch 1/20:  82%|████████▏ | 1546/1894 [00:00<00:00, 1957.71it/s][A
Epoch 1/20:  92%|█████████▏| 1742/1894 [00:00<00:00, 1957.12it/s][A
Training Fold 4/5:   5%|▌         | 1/20 [00:00<00:18,  1.02it/s][A
Epoch 2/20:   0%|          | 0/1894 [00:00<?, ?it/s][A
Epoch 2/20:   9%|▊         | 161/1894 [00:00<00:01, 1609.21it/s][A
Epoch 2/20:  19%|█▊        | 352/1894 [00:00<00:00, 1785.85it/s][A
Epoch 2/2

Validation Loss: 606181.7200, RMSE: 409.4458
Fold 5/5


Training Fold 5/5:   0%|          | 0/20 [00:00<?, ?it/s]
Epoch 1/20:   0%|          | 0/1894 [00:00<?, ?it/s][A
Epoch 1/20:   8%|▊         | 159/1894 [00:00<00:01, 1585.84it/s][A
Epoch 1/20:  19%|█▉        | 361/1894 [00:00<00:00, 1838.99it/s][A
Epoch 1/20:  30%|██▉       | 561/1894 [00:00<00:00, 1910.78it/s][A
Epoch 1/20:  41%|████      | 770/1894 [00:00<00:00, 1979.67it/s][A
Epoch 1/20:  52%|█████▏    | 980/1894 [00:00<00:00, 2021.77it/s][A
Epoch 1/20:  63%|██████▎   | 1184/1894 [00:00<00:00, 2027.21it/s][A
Epoch 1/20:  74%|███████▍  | 1397/1894 [00:00<00:00, 2059.96it/s][A
Epoch 1/20:  85%|████████▍ | 1605/1894 [00:00<00:00, 2065.19it/s][A
Epoch 1/20:  96%|█████████▌| 1812/1894 [00:00<00:00, 2058.45it/s][A
Training Fold 5/5:   5%|▌         | 1/20 [00:00<00:18,  1.05it/s][A
Epoch 2/20:   0%|          | 0/1894 [00:00<?, ?it/s][A
Epoch 2/20:   9%|▊         | 164/1894 [00:00<00:01, 1637.50it/s][A
Epoch 2/20:  19%|█▉        | 368/1894 [00:00<00:00, 1870.10it/s][A
Epoch 2/2

Validation Loss: 404756.1283, RMSE: 386.7219

K-Fold Cross Validation results for 5 folds
Fold 1: RMSE 435.4770
Fold 2: RMSE 388.5273
Fold 3: RMSE 375.9351
Fold 4: RMSE 409.4458
Fold 5: RMSE 386.7219
Average RMSE: 399.2214


