In [None]:
# conda install pytorch torchvision torchaudio cpuonly -c pytorch

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
from tqdm import tqdm

In [2]:
# https://depmap.org/portal/data_page/?tab=allData&releasename=Sanger+GDSC1+and+GDSC2&filename=sanger-dose-response.csv
# IC50 score of drugs (DRUG_ID) per cell line (COSMIC_ID) for GDSC1 and GDSC2 
try:
    df_dose_resp = pd.read_csv("C:\\Users\\chris\\rank-rx\\data\\sanger-dose-response.csv", delimiter=',')
    df_dose_resp_gdsc2 = df_dose_resp[(df_dose_resp.DATASET == "GDSC2")]
    df_dose_resp_gdsc2_edited = df_dose_resp_gdsc2.filter(items = ['DRUG_ID', 'ARXSPAN_ID', 'IC50_PUBLISHED'])
    print("Shape of df_dose_resp_gdsc2 = {}".format(df_dose_resp_gdsc2_edited.shape))
    print("Unique cell lines (ARXSPAN_ID) = {}".format(df_dose_resp_gdsc2_edited['ARXSPAN_ID'].nunique()))
    print("Unique drugs = {}".format(df_dose_resp_gdsc2_edited['DRUG_ID'].nunique()))
    print("Unique combinations of cell line x drug = {}".format(df_dose_resp_gdsc2_edited.groupby(['ARXSPAN_ID', 'DRUG_ID']).size().count()))
    print(df_dose_resp_gdsc2_edited.head)
except pd.errors.ParserError as e:
    print(f"Error: {e}")

Shape of df_dose_resp_gdsc2 = (118908, 3)
Unique cell lines (ARXSPAN_ID) = 793
Unique drugs = 175
Unique combinations of cell line x drug = 116202
<bound method NDFrame.head of         DRUG_ID  ARXSPAN_ID  IC50_PUBLISHED
268718     1003  ACH-000958        0.025129
268719     1003  ACH-000651        0.049577
268720     1003  ACH-000856        0.028549
268721     1003  ACH-000360        0.039996
268722     1003  ACH-001199        1.986678
...         ...         ...             ...
387621     2172  ACH-000288       25.410793
387622     2172  ACH-001065        0.339325
387623     2172  ACH-000930        7.780877
387624     2172  ACH-000859      534.688321
387625     2172  ACH-000536      120.177282

[118908 rows x 3 columns]>


In [3]:
# https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap+Public+24Q2&filename=Model.csv
# Mapping between 'ModelID', 'PatientID', 'SangerModelID', 'COSMICID', etc
try:
    df_depmap_model = pd.read_csv("C:\\Users\\chris\\rank-rx\\data\\Model.csv", delimiter=',')
    df_depmap_model_edited = df_depmap_model.filter(items=['ModelID', 'PatientID', 'SangerModelID', 'COSMICID'])
    print("Shape of df_depmap_model = {}".format(df_depmap_model_edited.shape))
    print("Unique cell lines (ModelID) ACH-XXXXXX = {}".format(df_depmap_model_edited['ModelID'].nunique()))
    print("Unique cell lines (SangerModelID) = {}".format(df_depmap_model_edited['SangerModelID'].nunique()))
    print("Unique cell lines (COSMICID) = {}".format(df_depmap_model_edited['COSMICID'].nunique()))
    # print(df_depmap_model_edited.head())
except pd.errors.ParserError as e:
    print(f"Error: {e}")

Shape of df_depmap_model = (1959, 4)
Unique cell lines (ModelID) ACH-XXXXXX = 1959
Unique cell lines (SangerModelID) = 1215
Unique cell lines (COSMICID) = 977


In [4]:
# gene expression data OmicsExpressionProteinCodingGenesTPMLogp1BatchCorrected
try:
    df_gene_express = pd.read_csv("C:\\Users\\chris\\rank-rx\\data\\OmicsExpressionProteinCodingGenesTPMLogp1BatchCorrected.csv", delimiter=',', low_memory=False)
    print("Shape of df_gene_express = {}".format(df_gene_express.shape))
    
except pd.errors.ParserError as e:
    print(f"Error: {e}")

Shape of df_gene_express = (1517, 19138)


In [9]:
# Merge ic50 dataset with model dataset
# This should have been 118908 rows just like df_dose_resp_gdsc2 but it's 115502 probably because model df doesn't have some cell lines of ic50 df.
try:
    assert df_dose_resp_gdsc2_edited["ARXSPAN_ID"].dtype == df_depmap_model_edited["ModelID"].dtype
    df_dose_model = pd.merge(df_dose_resp_gdsc2_edited, df_depmap_model_edited, left_on="ARXSPAN_ID", right_on="ModelID")
    print("Shape of df_dose_model = {}".format(df_dose_model.shape))
    print(df_dose_model.head())
except pd.errors.ParserError as e:
    print(f"Error: {e}")

Shape of df_dose_model = (115502, 7)
   DRUG_ID  ARXSPAN_ID  IC50_PUBLISHED     ModelID  PatientID SangerModelID  \
0     1003  ACH-000958        0.025129  ACH-000958  PT-EB6qeM     SIDM00837   
1     1003  ACH-000651        0.049577  ACH-000651  PT-IPboWn     SIDM00841   
2     1003  ACH-000856        0.028549  ACH-000856  PT-PgOEtc     SIDM00933   
3     1003  ACH-000360        0.039996  ACH-000360  PT-M0lnCb     SIDM00777   
4     1003  ACH-001199        1.986678  ACH-001199  PT-GfrL06     SIDM00781   

   COSMICID  
0  909751.0  
1  905962.0  
2  910927.0  
3  908442.0  
4  909740.0  


In [None]:
# Merge ic50/model dataset with gene expression dataset
try:
    assert df_dose_model["ARXSPAN_ID"].dtype == df_gene_express["depmap_id"].dtype
    df_dose_model_gene_express = pd.merge(df_dose_model, df_gene_express, left_on="ARXSPAN_ID", right_on=df_gene_express.columns[0])
    print("Shape of df_dose_model_gene_express = {}".format(df_dose_model_gene_express.shape))
    print(df_dose_model_gene_express.head())
except pd.errors.ParserError as e:
    print(f"Error: {e}")

In [None]:
# Filter columns to be used as features in X
dataset_final = df_dose_model_crispr.select_dtypes(include=[np.number])
print("Shape of dataset_final = {}".format(dataset_final.shape))

# TODO preprocessing to convert non-numerical columns we want to keep as features

In [None]:
# # Check for NANs
# print(dataset_final.isna().sum().sort_values(ascending=False))
# # dataset_final_filled = dataset_final.fillna(0)
print(dataset_final.isna().sum().sort_values(ascending=False))

In [None]:
# Define a simple ranking dataset
class RankingDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

# Define a simple neural network for ranking
class SimpleRankingNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleRankingNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 50)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(50, 1)  # Output single score for ranking

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [None]:
# Split into features (X) and target (y)
y = dataset_final['IC50_PUBLISHED'].values
X = dataset_final.drop(columns=['IC50_PUBLISHED']).values

# Standardize the dataset
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Convert to PyTorch tensors
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

# Training parameters
learning_rate = 0.001
num_epochs = 20
batch_size = 32
k_folds = 5

# Cross-validation
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)
results = {}

for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
    print(f'Fold {fold + 1}/{k_folds}')

    # Create data loaders
    train_dataset = RankingDataset(X[train_idx], y[train_idx])
    val_dataset = RankingDataset(X[val_idx], y[val_idx])
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize the model, loss function, and optimizer
    model = SimpleRankingNN(input_size=X.shape[1])
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop with progress bar
    for epoch in tqdm(range(num_epochs), desc=f"Training Fold {fold+1}/{k_folds}"):
        model.train()
        for batch_features, batch_targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False):
            optimizer.zero_grad()
            outputs = model(batch_features).squeeze()
            loss = criterion(outputs, batch_targets)
            loss.backward()
            optimizer.step()

    # Validation with progress bar
    model.eval()
    val_loss = 0
    val_rmse = 0
    total = 0
    with torch.no_grad():
        for batch_features, batch_targets in tqdm(val_loader, desc=f"Validating Fold {fold+1}/{k_folds}", leave=False):
            outputs = model(batch_features).squeeze()
            loss = criterion(outputs, batch_targets)
            val_loss += loss.item()
            val_rmse += torch.sqrt(loss).item() * batch_targets.size(0)
            total += batch_targets.size(0)

    avg_val_loss = val_loss / len(val_loader)
    avg_val_rmse = val_rmse / total
    print(f'Validation Loss: {avg_val_loss:.4f}, RMSE: {avg_val_rmse:.4f}')
    results[fold] = avg_val_rmse

# Print fold results
print(f'\nK-Fold Cross Validation results for {k_folds} folds')
for key, value in results.items():
    print(f'Fold {key+1}: RMSE {value:.4f}')
print(f'Average RMSE: {np.mean(list(results.values())):.4f}')
