In [None]:
# conda install pytorch torchvision torchaudio cpuonly -c pytorch
# conda install conda-forge::polars
# conda install conda-forge::xgboost

In [None]:
import polars as pl
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.datasets import make_regression
from tqdm import tqdm
import xgboost as xgb

In [None]:
# Read the drug-response table: one IC50 measurement per (cell line, drug).
# https://depmap.org/portal/data_page/?tab=allData&releasename=Sanger+GDSC1+and+GDSC2&filename=sanger-dose-response.csv
# The file covers GDSC1 and GDSC2; only GDSC2 rows are kept, and cell lines are
# identified by ARXSPAN_ID (the key used for every later join).
try:
    df_dose_resp = pl.read_csv("C:\\Users\\chris\\rank-rx\\data\\sanger-dose-response.csv")
    df_dose_resp_gdsc2 = df_dose_resp.filter(pl.col("DATASET") == "GDSC2")
    df_dose_resp_gdsc2_edited = df_dose_resp_gdsc2.select(["DRUG_ID", "ARXSPAN_ID", "IC50_PUBLISHED"])
    print("Shape of df_dose_resp_gdsc2 = {}".format(df_dose_resp_gdsc2_edited.shape))
    print("Unique cell lines (ARXSPAN_ID) = {}".format(df_dose_resp_gdsc2_edited['ARXSPAN_ID'].unique().len()))
    print("Unique drugs = {}".format(df_dose_resp_gdsc2_edited['DRUG_ID'].unique().len()))
    # Number of distinct (cell line, drug) pairs; smaller than the row count if pairs repeat.
    grouped = df_dose_resp_gdsc2_edited.group_by(['ARXSPAN_ID', 'DRUG_ID']).agg(pl.len())
    print("Unique combinations of cell line x drug = {}".format(grouped.shape[0]))
    # head must be *called*; printing the bare attribute only shows the bound-method repr.
    print(df_dose_resp_gdsc2_edited.head())
except Exception as e:
    print(f"Error: {e}")

In [None]:
# Load the DepMap model sheet (demographics and cancer type per cell line).
# https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap+Public+24Q2&filename=Model.csv
# The sheet maps between 'ModelID', 'PatientID', 'SangerModelID', 'COSMICID', etc.
# ModelID is the cell line id; OncotreeCode is the type of cancer.
try:
    model_columns = ['ModelID', 'OncotreeCode', 'AgeCategory', 'Sex', 'PatientRace', 'PrimaryOrMetastasis']
    df_depmap_model = pl.read_csv("C:\\Users\\chris\\rank-rx\\data\\Model.csv")
    # Keep only the identifier and the demographic / cancer-type columns used downstream.
    df_depmap_model_edited = df_depmap_model.select(model_columns)
    print(f"Shape of df_depmap_model = {df_depmap_model_edited.shape}")
    n_model_ids = df_depmap_model_edited['ModelID'].unique().len()
    print(f"Unique cell lines (ModelID) ACH-XXXXXX = {n_model_ids}")
    print(df_depmap_model_edited.head())
except Exception as err:
    print(f"Error: {err}")

In [None]:
# Read gene expression data OmicsExpressionProteinCodingGenesTPMLogp1BatchCorrected
try:
    df_gene_express = pl.read_csv("C:\\Users\\chris\\rank-rx\\data\\OmicsExpressionProteinCodingGenesTPMLogp1BatchCorrected.csv")
    # The column with the empty header is renamed to ARXSPAN_ID so it can be
    # used as the join key against the dose-response rows.
    df_gene_express = df_gene_express.rename({'': 'ARXSPAN_ID'})
    print("Shape of df_gene_express = {}".format(df_gene_express.shape))
# `pl.errors.ParserError` is a pandas name, not part of the polars API (polars
# exceptions live under `pl.exceptions`), so that clause could never match and
# would itself fail when looked up. Catch Exception like the other loading cells.
except Exception as e:
    print(f"Error: {e}")

In [None]:
# Join ic50 dataset with model dataset
# This should have been 118908 rows just like df_dose_resp_gdsc2 but it's 115502 probably because model df doesn't have some cell lines of ic50 df.
try:
    # Both key columns must share a dtype, otherwise the join would fail or match nothing.
    assert df_dose_resp_gdsc2_edited["ARXSPAN_ID"].dtype == df_depmap_model_edited["ModelID"].dtype
    df_dose_model = df_dose_resp_gdsc2_edited.join(other = df_depmap_model_edited, left_on="ARXSPAN_ID", right_on="ModelID")
    print("Shape of df_dose_model = {}".format(df_dose_model.shape))
    print(df_dose_model.head())
# `pl.errors.ParserError` is a pandas name, not part of the polars API (polars
# exceptions live under `pl.exceptions`); catch Exception like the sibling
# gene-expression join cell does.
except Exception as e:
    print(f"Error: {e}")

In [None]:
# Attach the gene-expression columns to the joined ic50 + model rows
try:
    # Key dtypes have to agree before joining on them.
    assert df_dose_model["ARXSPAN_ID"].dtype == df_gene_express["ARXSPAN_ID"].dtype
    # The join key on the gene-expression side is its first column.
    gene_express_key = df_gene_express.columns[0]
    df_dose_model_gene_express = df_dose_model.join(
        other=df_gene_express,
        left_on="ARXSPAN_ID",
        right_on=gene_express_key,
    )
    print(f"Shape of df_dose_model_gene_express = {df_dose_model_gene_express.shape}")
    print(df_dose_model_gene_express.head())
except Exception as err:
    print(f"Error: {err}")

In [None]:
# TODO: compare the patient-profile distributions of the train and test splits (sex ratio, age distribution, etc.).

In [None]:
# Replace each categorical column with integer codes, keeping the fitted
# encoder for every column in `label_encoders`.
categorical_columns = ['ARXSPAN_ID', 'OncotreeCode', 'AgeCategory', 'Sex', 'PatientRace', 'PrimaryOrMetastasis']
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    raw_values = df_dose_model_gene_express[column].to_list()
    # A Series carrying the same name overwrites the original column in with_columns.
    encoded_column = pl.Series(column, le.fit_transform(raw_values))
    df_dose_model_gene_express = df_dose_model_gene_express.with_columns(encoded_column)
    label_encoders[column] = le

In [None]:
# Prepare the features and target
features = ['DRUG_ID', 'ARXSPAN_ID', 'OncotreeCode', 'AgeCategory', 'Sex', 'PatientRace', 'PrimaryOrMetastasis']
target = 'IC50_PUBLISHED'

# Subset of `features` that is actually fed to the model (no drug / cell-line ids).
prediction_features = ['OncotreeCode', 'AgeCategory', 'Sex', 'PatientRace', 'PrimaryOrMetastasis']

# X = df_dose_model_gene_express[features]
# print(X.head(20))

# Highest integer grade the target is binned into (grades run 0..MAX_RELEVANCE).
# NOTE(review): 31 is presumably chosen because XGBoost's NDCG with exponential
# gain only accepts relevance labels up to 31 -- confirm against the XGBoost docs.
MAX_RELEVANCE = 31

# Min-max scale the target to 0..MAX_RELEVANCE and round ONCE. The values are no
# longer rounded to whole numbers before scaling: that quantised the target
# twice and discarded differences below one unit for no benefit.
# NOTE(review): the grades grow with IC50_PUBLISHED; ranking objectives treat a
# larger label as "more relevant" -- confirm this is the intended direction.
ic50_min = df_dose_model_gene_express[target].min()
ic50_max = df_dose_model_gene_express[target].max()
ic50_span = ic50_max - ic50_min
if ic50_span > 0:
    relevance = (pl.col(target) - ic50_min) / ic50_span * MAX_RELEVANCE
else:
    # Constant target: avoid dividing by zero and give every row grade 0.
    relevance = pl.lit(0.0)

df_dose_model_gene_express = df_dose_model_gene_express.with_columns(
    relevance.round().cast(pl.Int32).alias(target)
)
# y = df_dose_model_gene_express['IC50_PUBLISHED']
# print(y.head(20))


In [None]:
# Group Shuffle and Split using Polars
def group_shuffle_split(df, group_col, test_size=0.2, random_state=42):
    """Split ``df`` into train and test frames without sharing any group.

    The distinct values of ``group_col`` are shuffled and the first
    ``int(test_size * n_groups)`` of them become the test groups; every row
    goes to the side its group was assigned to.

    Args:
        df: polars DataFrame to split.
        group_col: name of the column whose values define the groups.
        test_size: fraction of the *groups* (not rows) assigned to the test set.
        random_state: seed for the shuffle; the same seed and the same input
            frame give the same split.

    Returns:
        A ``(train_df, test_df)`` tuple of polars DataFrames.
    """
    # maintain_order=True: polars' default unique() gives no ordering guarantee,
    # so without it the seeded shuffle could start from a different order on
    # each run and the split would not be reproducible.
    groups = df[group_col].unique(maintain_order=True).to_list()

    # A private RandomState produces the same permutation stream as
    # np.random.seed(random_state) + np.random.shuffle, but does not reseed the
    # process-wide NumPy RNG as a side effect.
    rng = np.random.RandomState(random_state)
    rng.shuffle(groups)

    n_test_groups = int(test_size * len(groups))
    test_groups = groups[:n_test_groups]
    train_groups = groups[n_test_groups:]

    train_df = df.filter(pl.col(group_col).is_in(train_groups))
    test_df = df.filter(pl.col(group_col).is_in(test_groups))

    return train_df, test_df

# Hold out 20% of the cell lines (grouped by ARXSPAN_ID) as the test set.
train_data, test_data = group_shuffle_split(
    df_dose_model_gene_express,
    'ARXSPAN_ID',
    test_size=0.2,
    random_state=42,
)
print(f"Shape of train_data = {train_data.shape}")
print(train_data.head())
print(f"Shape of test_data = {test_data.shape}")
print(test_data.head())


# Split each side into its feature frame and its single-column target frame.
X_train = train_data.select(features)
print(f"Shape of X_train = {X_train.shape}")
print(X_train.head())

y_train = train_data.select('IC50_PUBLISHED')
print(f"Shape of y_train = {y_train.shape}")
print(y_train.head())

X_test = test_data.select(features)
print(f"Shape of X_test = {X_test.shape}")
print(X_test.head())

y_test = test_data.select('IC50_PUBLISHED')
print(f"Shape of y_test = {y_test.shape}")
print(y_test.head())

In [None]:
# Create group parameter for XGBoost
#
# DMatrix.set_group takes a list of group sizes and applies them to the rows in
# order (first n0 rows = group 0, next n1 rows = group 1, ...). The rows of each
# cell line therefore have to be contiguous, and the sizes must be listed in
# that same order. A plain group_by returns groups in arbitrary order and the
# frames were not sorted by cell line, so both are fixed here.
def _sort_by_group(X, y, group_col):
    """Return (X, y) reordered together so that each group's rows are contiguous."""
    # Sort features and target as one frame so their rows stay aligned;
    # maintain_order keeps the original row order inside each group.
    ordered = pl.concat([X, y], how="horizontal").sort(group_col, maintain_order=True)
    return ordered.select(X.columns), ordered.select(y.columns)


def _group_sizes(X, group_col):
    """Return the row count of every group, in the order the groups appear in X."""
    return (
        X.group_by(group_col, maintain_order=True)
        .agg(pl.len().alias("n_rows"))
        .get_column("n_rows")
        .to_list()
    )


# The sorted frames replace the unsorted ones so that later predictions
# (which follow DMatrix row order) still line up with X_test / y_test.
X_train, y_train = _sort_by_group(X_train, y_train, 'ARXSPAN_ID')
X_test, y_test = _sort_by_group(X_test, y_test, 'ARXSPAN_ID')

group_train = _group_sizes(X_train, 'ARXSPAN_ID')
print(f"shape of group_train: {len(group_train)}")
group_test = _group_sizes(X_test, 'ARXSPAN_ID')
print(f"shape of group_test: {len(group_test)}")

# Every row must belong to exactly one group.
assert sum(group_train) == X_train.height
assert sum(group_test) == X_test.height

# Convert data to DMatrix
dtrain = xgb.DMatrix(X_train.select(prediction_features).to_numpy(), label=y_train.to_numpy())
dtrain.set_group(group_train)
print(f"Shape of dtrain DMatrix: ({dtrain.num_row()}, {dtrain.num_col()})")

dtest = xgb.DMatrix(X_test.select(prediction_features).to_numpy(), label=y_test.to_numpy())
dtest.set_group(group_test)
print(f"Shape of dtest DMatrix: ({dtest.num_row()}, {dtest.num_col()})")


In [None]:
# Booster configuration for the XGBoost ranking model
params = dict(
    objective='rank:pairwise',  # pairwise learning-to-rank objective
    eta=0.1,                    # learning rate (step-size shrinkage)
    gamma=1.0,                  # minimum loss reduction required to make a split
    min_child_weight=0.1,       # minimum sum of instance weight needed in a child
    max_depth=6,                # maximum depth of each tree
    eval_metric='ndcg',         # metric reported on the evaluation set
)

In [None]:
# Fit the booster: at most 500 boosting rounds, stopping early once the metric
# on the watched set has not improved for 10 consecutive rounds.
watchlist = [(dtest, 'test')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=watchlist,
    early_stopping_rounds=10,
)

# Score the test rows with the trained booster.
y_pred = model.predict(dtest)