In [1]:
# Objectives:
# 1. Make use of TensorFlow Ranking as alternative to XGBoost

In [2]:
import pdb # pdb.set_trace()

import numpy as np
# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import polars as pl
import pandas as pd
from typing import Dict, Tuple

import tensorflow as tf
print(tf.__version__)

from tensorflow.keras import layers

import tensorflow_datasets as tfds
import tensorflow_ranking as tfr
print(tfr.__version__)


2.15.1

0.5.5.dev


In [3]:
target = 'IC50_PUBLISHED'

In [4]:
# Read cell line to drug id to ic50
# https://depmap.org/portal/data_page/?tab=allData&releasename=Sanger+GDSC1+and+GDSC2&filename=sanger-dose-response.csv
# IC50 score of drugs (DRUG_ID) per cell line (COSMIC_ID) for GDSC1 and GDSC2 
try:
    df_dose_resp = pl.read_csv("C:\\Users\\chris\\rank-rx\\data\\sanger-dose-response.csv")
    df_dose_resp_gdsc2 = df_dose_resp.filter(pl.col("DATASET") == "GDSC2")
    df_dose_resp_gdsc2_edited = df_dose_resp_gdsc2.select(["DRUG_ID", "ARXSPAN_ID", "IC50_PUBLISHED"])
    print("Shape of df_dose_resp_gdsc2 = {}".format(df_dose_resp_gdsc2_edited.shape))
    print("Unique cell lines (ARXSPAN_ID) = {}".format(df_dose_resp_gdsc2_edited['ARXSPAN_ID'].unique().len()))
    print("Unique drugs = {}".format(df_dose_resp_gdsc2_edited['DRUG_ID'].unique().len()))
    grouped = df_dose_resp_gdsc2_edited.group_by(['ARXSPAN_ID', 'DRUG_ID']).agg(pl.len())
    print("Unique combinations of cell line x drug = {}".format(grouped.shape[0]))
    print(df_dose_resp_gdsc2_edited.head)
except Exception as e:
    print(f"Error: {e}")

Shape of df_dose_resp_gdsc2 = (118908, 3)
Unique cell lines (ARXSPAN_ID) = 794
Unique drugs = 175
Unique combinations of cell line x drug = 116377
<bound method DataFrame.head of shape: (118_908, 3)
┌─────────┬────────────┬────────────────┐
│ DRUG_ID ┆ ARXSPAN_ID ┆ IC50_PUBLISHED │
│ ---     ┆ ---        ┆ ---            │
│ i64     ┆ str        ┆ f64            │
╞═════════╪════════════╪════════════════╡
│ 1003    ┆ ACH-000958 ┆ 0.025129       │
│ 1003    ┆ ACH-000651 ┆ 0.049577       │
│ 1003    ┆ ACH-000856 ┆ 0.028549       │
│ 1003    ┆ ACH-000360 ┆ 0.039996       │
│ 1003    ┆ ACH-001199 ┆ 1.986678       │
│ …       ┆ …          ┆ …              │
│ 2172    ┆ ACH-000288 ┆ 25.410793      │
│ 2172    ┆ ACH-001065 ┆ 0.339325       │
│ 2172    ┆ ACH-000930 ┆ 7.780877       │
│ 2172    ┆ ACH-000859 ┆ 534.688321     │
│ 2172    ┆ ACH-000536 ┆ 120.177282     │
└─────────┴────────────┴────────────────┘>


In [5]:
# Read demographics and cancer type
# https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap+Public+24Q2&filename=Model.csv
# Mapping between 'ModelID', 'PatientID', 'SangerModelID', 'COSMICID', etc
# ModelID here is the cell line id.
# OncotreeCode is the type of cancer.
try:
    df_depmap_model = pl.read_csv("C:\\Users\\chris\\rank-rx\\data\\Model.csv")
    df_depmap_model_edited = df_depmap_model.select(['ModelID', 'OncotreeCode', 'AgeCategory', 'Sex', 'PatientRace', 'PrimaryOrMetastasis'])
    print("Shape of df_depmap_model = {}".format(df_depmap_model_edited.shape))
    print("Unique cell lines (ModelID) ACH-XXXXXX = {}".format(df_depmap_model_edited['ModelID'].unique().len()))
    print(df_depmap_model_edited.head())
except Exception as e:
    print(f"Error: {e}")

Shape of df_depmap_model = (1959, 6)
Unique cell lines (ModelID) ACH-XXXXXX = 1959
shape: (5, 6)
┌────────────┬──────────────┬─────────────┬────────┬─────────────┬─────────────────────┐
│ ModelID    ┆ OncotreeCode ┆ AgeCategory ┆ Sex    ┆ PatientRace ┆ PrimaryOrMetastasis │
│ ---        ┆ ---          ┆ ---         ┆ ---    ┆ ---         ┆ ---                 │
│ str        ┆ str          ┆ str         ┆ str    ┆ str         ┆ str                 │
╞════════════╪══════════════╪═════════════╪════════╪═════════════╪═════════════════════╡
│ ACH-000001 ┆ HGSOC        ┆ Adult       ┆ Female ┆ caucasian   ┆ Metastatic          │
│ ACH-000002 ┆ AML          ┆ Adult       ┆ Female ┆ caucasian   ┆ Primary             │
│ ACH-000003 ┆ COAD         ┆ Adult       ┆ Male   ┆ caucasian   ┆ Primary             │
│ ACH-000004 ┆ AML          ┆ Adult       ┆ Male   ┆ caucasian   ┆ Primary             │
│ ACH-000005 ┆ AML          ┆ Adult       ┆ Male   ┆ caucasian   ┆ null                │
└────────────

In [6]:
# Read gene expression data OmicsExpressionProteinCodingGenesTPMLogp1BatchCorrected
try:
    df_gene_express = pl.read_csv("C:\\Users\\chris\\rank-rx\\data\\OmicsExpressionProteinCodingGenesTPMLogp1BatchCorrected.csv")
    df_gene_express = df_gene_express.rename({'': 'ARXSPAN_ID'})
    
    # TODO when this section becomes a method, then all genes should be selected by default if no specific genes or feature selection method are declared.
    # gene_col_names = df_gene_express.columns
    # gene_col_names.remove('ARXSPAN_ID')
    # Based on this research https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-022-04678-y
    gene_col_names = [
        'ZMIZ1 (57178)', 'ENG (2022)', 'FGFR1 (2260)', 
        'PAWR (5074)', 'KRT17 (3872)', 'MPO (4353)', 
        'LAT2 (7462)'
    ]
    
    # # print(gene_col_names)
    print("Shape of df_gene_express = {}".format(df_gene_express.shape))
    print(df_gene_express.head())
except pl.errors.ParserError as e:
    print(f"Error: {e}")

Shape of df_gene_express = (1517, 19138)
shape: (5, 19_138)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ ARXSPAN_I ┆ ZNF891    ┆ ARMC10    ┆ PTGER4    ┆ … ┆ DNMT3B    ┆ ZCCHC10   ┆ PRSS2     ┆ ADAMTSL4 │
│ D         ┆ (10106020 ┆ (83787)   ┆ (5734)    ┆   ┆ (1789)    ┆ (54819)   ┆ (5645)    ┆ (54507)  │
│ ---       ┆ 0)        ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│ str       ┆ ---       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64      │
│           ┆ f64       ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ ACH-00005 ┆ 0.984137  ┆ 4.524944  ┆ 2.019524  ┆ … ┆ 2.320999  ┆ 5.005448  ┆ 0.169594  ┆ 1.356288 │
│ 8         ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ ACH-00008 ┆ 1.335101  ┆ 3.974

In [7]:
# Join ic50 dataset with model dataset
# This should have been 118908 rows just like df_dose_resp_gdsc2 but it's 115502 probably because model df doesn't have some cell lines of ic50 df.
try:
    assert df_dose_resp_gdsc2_edited["ARXSPAN_ID"].dtype == df_depmap_model_edited["ModelID"].dtype
    df_dose_model = df_dose_resp_gdsc2_edited.join(other = df_depmap_model_edited, left_on="ARXSPAN_ID", right_on="ModelID")
    print("Shape of df_dose_model = {}".format(df_dose_model.shape))
    print(df_dose_model.head())
except pl.errors.ParserError as e:
    print(f"Error: {e}")

Shape of df_dose_model = (115502, 8)
shape: (5, 8)
┌─────────┬────────────┬─────────────┬─────────────┬────────────┬────────┬────────────┬────────────┐
│ DRUG_ID ┆ ARXSPAN_ID ┆ IC50_PUBLIS ┆ OncotreeCod ┆ AgeCategor ┆ Sex    ┆ PatientRac ┆ PrimaryOrM │
│ ---     ┆ ---        ┆ HED         ┆ e           ┆ y          ┆ ---    ┆ e          ┆ etastasis  │
│ i64     ┆ str        ┆ ---         ┆ ---         ┆ ---        ┆ str    ┆ ---        ┆ ---        │
│         ┆            ┆ f64         ┆ str         ┆ str        ┆        ┆ str        ┆ str        │
╞═════════╪════════════╪═════════════╪═════════════╪════════════╪════════╪════════════╪════════════╡
│ 1003    ┆ ACH-000958 ┆ 0.025129    ┆ COAD        ┆ Adult      ┆ Female ┆ caucasian  ┆ Primary    │
│ 1003    ┆ ACH-000651 ┆ 0.049577    ┆ COAD        ┆ Adult      ┆ Male   ┆ caucasian  ┆ Metastatic │
│ 1003    ┆ ACH-000856 ┆ 0.028549    ┆ BRCA        ┆ Adult      ┆ Female ┆ null       ┆ Metastatic │
│ 1003    ┆ ACH-000360 ┆ 0.039996    ┆ C

In [8]:
# Join ic50 & model dataset with gene expression dataset
try:
    assert df_dose_model["ARXSPAN_ID"].dtype == df_gene_express["ARXSPAN_ID"].dtype
    df_dose_model_gene_express = df_dose_model.join(df_gene_express, left_on="ARXSPAN_ID", right_on="ARXSPAN_ID")
    # df_dose_model_gene_express = df_dose_model.join(df_gene_express.select(['ARXSPAN_ID'] + gene_col_names), left_on="ARXSPAN_ID", right_on="ARXSPAN_ID")
    print("Shape of df_dose_model_gene_express = {}".format(df_dose_model_gene_express.shape))
    print(df_dose_model_gene_express.head())
except Exception as e:
    print(f"Error: {e}")

Shape of df_dose_model_gene_express = (88976, 19145)
shape: (5, 19_145)
┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ DRUG_ID ┆ ARXSPAN_ID ┆ IC50_PUBL ┆ OncotreeC ┆ … ┆ DNMT3B    ┆ ZCCHC10   ┆ PRSS2     ┆ ADAMTSL4  │
│ ---     ┆ ---        ┆ ISHED     ┆ ode       ┆   ┆ (1789)    ┆ (54819)   ┆ (5645)    ┆ (54507)   │
│ i64     ┆ str        ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│         ┆            ┆ f64       ┆ str       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64       │
╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1003    ┆ ACH-000958 ┆ 0.025129  ┆ COAD      ┆ … ┆ 2.072003  ┆ 4.69381   ┆ 0.240879  ┆ 0.560533  │
│ 1003    ┆ ACH-000651 ┆ 0.049577  ┆ COAD      ┆ … ┆ 2.745968  ┆ 4.884648  ┆ 3.368963  ┆ 0.335325  │
│ 1003    ┆ ACH-000856 ┆ 0.028549  ┆ BRCA      ┆ … ┆ 3.51348   ┆ 4.950103  ┆ -0.054202 ┆ 2.401006  │
│ 1003    ┆ ACH-000

In [9]:
data_pandas = df_dose_model_gene_express.to_pandas()
print(df_final.shape)

(88976, 19145)


In [None]:
data_pandas = data_pandas

In [None]:
# ratings = tfds.load('movielens/100k-ratings', split="train")
# print(type(ratings))
# <class 'tensorflow.python.data.ops.prefetch_op._PrefetchDataset'>

dc_basic = df_dose_model_gene_express.select(["ARXSPAN_ID", "OncotreeCode", "ZMIZ1 (57178)", "IC50_PUBLISHED"])
    # ARXSPAN_ID as user id
    # OncotreeCode and ZMIZ1 (57178) as movie title
    # IC50_PUBLISHED as user rating

In [None]:
# Build vocabularies to convert all user ids and all movie titles into integer indices for embedding layers
cell_line_ids = dc_basic.select(["ARXSPAN_ID"]).to_numpy()
cell_lines = tf.data.Dataset.from_tensor_slices(cell_line_ids)
cell_line_ids_vocab =  tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
cell_line_ids_vocab.adapt(cell_lines.batch(1000))

onco_ids = dc_basic.select(["OncotreeCode"]).fill_nan(0)
onco_ids = onco_ids.to_numpy()

# Check for NaN values
is_nan = np.isnan(onco_ids.astype(float))

# Check for None values
is_none = onco_ids == None  # Using '==' comparison for None

# Check for empty strings
is_empty_string = onco_ids == ""

# Combine checks
has_nan = np.any(is_nan)
has_none = np.any(is_none)
has_empty_string = np.any(is_empty_string)

# Print results
print(f"Contains NaN: {has_nan}")
print(f"Contains None: {has_none}")
print(f"Contains empty string: {has_empty_string}")


oncos = tf.data.Dataset.from_tensor_slices(onco_ids)
onco_ids_vocab = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
onco_ids_vocab.adapt(oncos.batch(1000))

In [None]:
# Group by ARXSPAN_ID to form lists for ranking models
key_func = lambda x: user_ids_vocabulary(x["ARXSPAN_ID"])
reduce_func = lambda key, dataset: dataset.batch(100)
ds_train = dc_basic.group_by_window(
    key_func=key_func, reduce_func=reduce_func, window_size=100)


In [None]:
for x in ds_train.take(1):
  for key, value in x.items():
    print(f"Shape of {key}: {value.shape}")
    print(f"Example values of {key}: {value[:5].numpy()}")
    print()