In [1]:
# Objectives:
# 1. Set up a basic example using TensorFlow Ranking using our dataset.
# 2. Run it successfully end to end even if the results are not good.
# 3. Clean up the data loading part

In [2]:
EPOCHS = 10
BATCH_SIZE = 16

In [3]:
import numpy as np
# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

In [4]:
import pandas as pd

In [5]:
import tensorflow as tf
print(tf.__version__)


2.15.1


In [6]:
from tensorflow.keras import layers
print(layers)
# Our dataset is definitely heterogenous therefore we need to follow this guide and build the preprocessing into the model
# https://www.tensorflow.org/tutorials/load_data/pandas_dataframe#full_example

<module 'keras.api._v2.keras.layers' from 'C:\\Users\\chris\\AppData\\Local\\anaconda3\\envs\\rankenv\\lib\\site-packages\\keras\\api\\_v2\\keras\\layers\\__init__.py'>


In [7]:
import os

# Define the file path
csv_file_path = 'data/df_response_model_expression_all_genes.csv'

# Ensure the 'data' directory exists
os.makedirs('data', exist_ok=True)

OVERWRITE = False

In [8]:

# Import our custom Dataloader
from data_loader import DataLoader

# Create an instance of DataLoader and pass the name of the dir that holds the dataset
data_loader = DataLoader(data_dir="data")


In [9]:

# Read cell line to drug id to ic50
# https://depmap.org/portal/data_page/?tab=allData&releasename=Sanger+GDSC1+and+GDSC2&filename=sanger-dose-response.csv
# IC50 score of drugs (DRUG_ID) per cell line (COSMIC_ID) for GDSC1 and GDSC2 

# Load the "resopnse" data
df_response = data_loader.load_data('sanger-dose-response.csv')

df_response = df_response.query('DATASET == "GDSC2"')
df_response = df_response.filter(items=["DRUG_ID", "ARXSPAN_ID", "IC50_PUBLISHED"])
print(df_response)

sanger-dose-response.csv read successfully.
        DRUG_ID  ARXSPAN_ID  IC50_PUBLISHED
268718     1003  ACH-000958        0.025129
268719     1003  ACH-000651        0.049577
268720     1003  ACH-000856        0.028549
268721     1003  ACH-000360        0.039996
268722     1003  ACH-001199        1.986678
...         ...         ...             ...
387621     2172  ACH-000288       25.410793
387622     2172  ACH-001065        0.339325
387623     2172  ACH-000930        7.780877
387624     2172  ACH-000859      534.688321
387625     2172  ACH-000536      120.177282

[118908 rows x 3 columns]


In [10]:

# Read the Model.csv
# This dataset contains demographics and cancer type
# https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap+Public+24Q2&filename=Model.csv
# Mapping between 'ModelID', 'PatientID', 'SangerModelID', 'COSMICID', etc
# ModelID here is the cell line id.
# OncotreeCode is the type of cancer.

df_model = data_loader.load_data('Model.csv')
df_model = df_model.filter(items=['ModelID', 
                                  'OncotreeCode', 
                                  'AgeCategory', 
                                  'Sex'])
                                  # 'PatientRace', 
                                  # 'PrimaryOrMetastasis'])
print(df_model)

# Check for any remaining missing values
print(df_model.isnull().sum())
# [1959 rows x 4 columns]
# ModelID           0
# OncotreeCode    141
# AgeCategory       0
# Sex              25
# dtype: int64
df_model['OncotreeCode'] = df_model['OncotreeCode'].fillna('Unknown')
df_model['OncotreeCode'] = df_model['OncotreeCode'].fillna('NON_CANCEROUS')


Model.csv read successfully.
         ModelID OncotreeCode AgeCategory     Sex
0     ACH-000001        HGSOC       Adult  Female
1     ACH-000002          AML       Adult  Female
2     ACH-000003         COAD       Adult    Male
3     ACH-000004          AML       Adult    Male
4     ACH-000005          AML       Adult    Male
...          ...          ...         ...     ...
1954  ACH-003161          NaN     Unknown     NaN
1955  ACH-003181          LMS       Adult  Female
1956  ACH-003183          MFS       Adult    Male
1957  ACH-003184          LMS       Adult  Female
1958  ACH-003191         GCTB       Adult    Male

[1959 rows x 4 columns]
ModelID           0
OncotreeCode    141
AgeCategory       0
Sex              25
dtype: int64


In [11]:

# Load the gene expression data from "OmicsExpressionProteinCodingGenesTPMLogp1BatchCorrected"
df_expression = data_loader.load_data("OmicsExpressionProteinCodingGenesTPMLogp1BatchCorrected.csv")

# The first column header which is the cell line id happens to be empty in the dataset, so we set it manually
df_expression = df_expression.rename(columns={'': 'ARXSPAN_ID'})
df_expression_columns = df_expression.columns.tolist()
df_expression_columns.remove('ARXSPAN_ID')
print(df_expression_columns)
# Select specific columns
# df_expression = df_expression[['ARXSPAN_ID',
#                                'ZMIZ1 (57178)', 
#                                'ENG (2022)',
#                                'FGFR1 (2260)',
#                                'PAWR (5074)']] 
#                                # 'KRT17 (3872)', 
#                                # 'MPO (4353)',
#                                # 'LAT2 (7462)']]

OmicsExpressionProteinCodingGenesTPMLogp1BatchCorrected.csv read successfully.
['ZNF891 (101060200)', 'ARMC10 (83787)', 'PTGER4 (5734)', 'EIF1AD (84285)', 'ABCG5 (64240)', 'CXCR4 (7852)', 'CAPNS1 (826)', 'PTGES3L (100885848)', 'DESI1 (27351)', 'STAU2 (27067)', 'FGF9 (2254)', 'TMOD1 (7111)', 'TRNP1 (388610)', 'NOL10 (79954)', 'SPTBN5 (51332)', 'IMPG2 (50939)', 'LRTM1 (57408)', 'ZNF579 (163033)', 'RPL3 (6122)', 'ATP13A2 (23400)', 'TMED7-TICAM2 (100302736)', 'AKR1E2 (83592)', 'OVOL2 (58495)', 'TACR1 (6869)', 'EGFR (1956)', 'COX5A (9377)', 'RELA (5970)', 'PAXBP1 (94104)', 'ZSCAN2 (54993)', 'MRNIP (51149)', 'LY6G6D (58530)', 'ARFGEF3 (57221)', 'PIGC (5279)', 'GPT (2875)', 'ZNF296 (162979)', 'IKZF3 (22806)', 'TSPAN17 (26262)', 'EXOSC8 (11340)', 'CCDC152 (100129792)', 'SRP54 (6729)', 'POU4F1 (5457)', 'SCAF11 (9169)', 'DAZ3 (57054)', 'TANC2 (26115)', 'COMMD7 (149951)', 'ABHD15 (116236)', 'ACTR1B (10120)', 'EPS15L1 (58513)', 'APEX2 (27301)', 'C15orf48 (84419)', 'VTCN1 (79679)', 'C10orf90 (11861

In [12]:

# Join response and model

try:
    assert df_response["ARXSPAN_ID"].dtype == df_model["ModelID"].dtype
    df_response_model = pd.merge(df_response, df_model, left_on='ARXSPAN_ID', right_on='ModelID')
    df_response_model = df_response_model.drop('ModelID', axis=1) # we drop ModelID since it's the same info as ARXSPAN_ID 
    print("Shape of df_response_model = {}".format(df_response_model.shape))
    print(df_response_model.head())
except Exception as e:
    print(f"Error: {e}")
    

Shape of df_response_model = (115502, 6)
   DRUG_ID  ARXSPAN_ID  IC50_PUBLISHED OncotreeCode AgeCategory     Sex
0     1003  ACH-000958        0.025129         COAD       Adult  Female
1     1003  ACH-000651        0.049577         COAD       Adult    Male
2     1003  ACH-000856        0.028549         BRCA       Adult  Female
3     1003  ACH-000360        0.039996         COAD       Adult    Male
4     1003  ACH-001199        1.986678         COAD       Adult  Female


In [13]:

# Join response_model and expression

try:
    assert df_response_model['ARXSPAN_ID'].dtype == df_expression['ARXSPAN_ID'].dtype
    df_response_model_expression = pd.merge(df_response_model, df_expression, left_on='ARXSPAN_ID', right_on='ARXSPAN_ID')
    print("Shape of df_response_model_expression = {}".format(df_response_model_expression.shape))
    print(df_response_model_expression.head())
except Exception as e:
    print(f"Error: {e}")


Shape of df_response_model_expression = (88976, 19143)
   DRUG_ID  ARXSPAN_ID  IC50_PUBLISHED OncotreeCode AgeCategory     Sex  \
0     1003  ACH-000958        0.025129         COAD       Adult  Female   
1     1003  ACH-000651        0.049577         COAD       Adult    Male   
2     1003  ACH-000856        0.028549         BRCA       Adult  Female   
3     1003  ACH-000360        0.039996         COAD       Adult    Male   
4     1003  ACH-000400        0.004820         COAD       Adult    Male   

   ZNF891 (101060200)  ARMC10 (83787)  PTGER4 (5734)  EIF1AD (84285)  ...  \
0            0.061287        6.071350       1.405420        4.934454  ...   
1            0.865975        5.049065       0.208657        5.219542  ...   
2            1.360683        5.366999       3.608600        4.888851  ...   
3            1.105224        5.273343       1.169296        3.465737  ...   
4            0.493279        5.851769       0.452759        4.597744  ...   

   FAM13C (220965)  MUC20 (2009

In [14]:

# Persist csv right after join and before split for classification
if not os.path.exists(csv_file_path) or OVERWRITE:
    df_response_model_expression.to_csv('data/df_response_model_expression_all_genes.csv', index=False)


In [15]:
df_response_model_expression.dtypes

DRUG_ID               int64
ARXSPAN_ID           object
IC50_PUBLISHED      float64
OncotreeCode         object
AgeCategory          object
                     ...   
FCRL6 (343413)      float64
DNMT3B (1789)       float64
ZCCHC10 (54819)     float64
PRSS2 (5645)        float64
ADAMTSL4 (54507)    float64
Length: 19143, dtype: object

In [16]:
features = df_response_model_expression
target = features.pop('IC50_PUBLISHED')

In [17]:
# Necessary setup to convert the problem to classification just for testing.
bins = [0, 1, 10, 100, float('inf')]
labels = ['HIGH', 'MODERATE', 'LOW', 'INACTIVE']
target_binned = pd.cut(target, bins=bins, labels=labels, right=False)
num_classes = len(np.unique(target_binned))
print(num_classes)


# Convert CategoricalDtype to string
target_binned = target_binned.astype(str)

# Convert the categorical labels to numeric indices using StringLookup
lookup = tf.keras.layers.StringLookup(vocabulary=labels, mask_token=None, num_oov_indices=0, output_mode='int')
target_indices = lookup(target_binned)

# Ensure that the indices are in the range [0, len(labels)-1]
num_classes = len(labels)  # Should be 4 in this case

# Now use CategoryEncoding to one-hot encode these indices
target_one_hot = tf.keras.layers.CategoryEncoding(num_tokens=num_classes, output_mode='one_hot')(target_indices)


4




In [18]:
# https://www.tensorflow.org/tutorials/load_data/pandas_dataframe#build_the_preprocessing_head

In [19]:
# Build the preprocessing head
binary_feature_names = []
categorical_feature_names = ['DRUG_ID', 'ARXSPAN_ID', 'OncotreeCode', 'AgeCategory', 'Sex']

inputs = {}
for name, column in features.items():
  if type(column[0]) == str:
    dtype = tf.string
  elif (name in categorical_feature_names or
        name in binary_feature_names):
    dtype = tf.int64
  else:
    dtype = tf.float32

  inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

# inputs

In [20]:
preprocessed = []

In [21]:
# # Binary inputs

# for name in binary_feature_names:
#   inp = inputs[name]
#   preprocessed.append(inp)

# preprocessed

In [24]:
# Numeric inputs

# numeric_feature_names = ['ZMIZ1 (57178)', 'ENG (2022)']
numeric_feature_names = df_expression_columns
# numeric_features = features[numeric_feature_names]
numeric_features = features[df_expression_columns].astype('float32')


# numeric_features_dict = {key: value.to_numpy()[:, tf.newaxis] for key, value in dict(numeric_features).items()}

# Convert the DataFrame to a NumPy array
numeric_features_array = numeric_features.to_numpy(dtype=np.float32)

# Create a tf.data.Dataset from the NumPy array
batch_size = 1024  # You can adjust this based on your system's memory
numeric_features_ds = tf.data.Dataset.from_tensor_slices(numeric_features_array).batch(batch_size)

# normalizer = tf.keras.layers.Normalization(axis=-1)
# normalizer.adapt(np.concatenate([value for key, value in sorted(numeric_features_dict.items())], axis=1))
# Initialize the Normalization layer
normalizer = tf.keras.layers.Normalization(axis=-1)

# Adapt the normalizer using the dataset
normalizer.adapt(numeric_features_ds)

# numeric_inputs = []
# for name in numeric_feature_names:
#   numeric_inputs.append(inputs[name])

# numeric_inputs = tf.keras.layers.Concatenate(axis=-1)(numeric_inputs)
# numeric_normalized = normalizer(numeric_inputs)

# preprocessed.append(numeric_normalized)

# preprocessed
# Prepare the inputs for the model
numeric_inputs = []
for name in numeric_feature_names:
    numeric_inputs.append(inputs[name])

# Concatenate the numeric inputs
numeric_inputs = tf.keras.layers.Concatenate(axis=-1)(numeric_inputs)

# Normalize the concatenated inputs
numeric_normalized = normalizer(numeric_inputs)

# Append to preprocessed layers
preprocessed.append(numeric_normalized)


MemoryError: Unable to allocate 12.7 GiB for an array with shape (19137, 88976) and data type float64

In [None]:
# Categorical features
for name in categorical_feature_names:
    # print(name)
    vocab = sorted(set(features[name]))
    # print(f'name: {name}')
    # print(f'vocab: {vocab}\n')
    
    if type(vocab[0]) is str:
        lookup = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode='one_hot')
    else:
        lookup = tf.keras.layers.IntegerLookup(vocabulary=vocab, output_mode='one_hot')
    x = inputs[name]
    x = lookup(x)
    preprocessed.append(x)


In [None]:
preprocessed_result = tf.keras.layers.Concatenate(axis=1)(preprocessed)
preprocessed_result

In [None]:
preprocessor = tf.keras.Model(inputs, preprocessed_result)

In [None]:
# tf.keras.utils.plot_model(preprocessor, rankdir="LR", show_shapes=True,  show_layer_names=True)

In [None]:
# preprocessor(dict(df_response_model_expression.iloc[:1]))

In [None]:
body = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

In [None]:
# inputs

In [None]:
x = preprocessor(inputs)
x

In [None]:
result = body(x)
result

In [None]:
model = tf.keras.Model(inputs, result)

model.compile(optimizer='adam',
                # loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                loss=tf.keras.losses.CategoricalCrossentropy(),
                metrics=['accuracy'
                    # tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
                    # tf.keras.metrics.Precision(name='precision'),
                    # tf.keras.metrics.Recall(name='recall'),
                    # tf.keras.metrics.AUC(name='auc'),
                    # tf.keras.metrics.TopKCategoricalAccuracy(k=5, name='top_5_accuracy')
                ])

In [None]:
# tf.keras.utils.plot_model(model, show_shapes=True,  show_layer_names=True)

In [None]:
history = model.fit(dict(df_response_model_expression), target_one_hot, epochs=EPOCHS, batch_size=BATCH_SIZE)