In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
kaggle_input_files = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in kaggle_input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Libraries

In [None]:
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

# Load Data

TCGA breast invasive carcinoma (BRCA) gene expression by RNAseq (polyA+ IlluminaHiSeq percentile)


Goldman, M.J., Craft, B., Hastie, M. et al. Visualizing and interpreting cancer genomics data via the Xena platform. Nat Biotechnol (2020). https://doi.org/10.1038/s41587-020-0546-8

In [None]:
# Load the tab-separated gene expression (HiSeqV2 percentile) file into a DataFrame
# and display it.
GENE_EXPRESSION_PATH = '/kaggle/input/breast-cancer/TCGA.BRCA.sampleMap_HiSeqV2_percentile/HiSeqV2_percentile'
geneExpressionData = pd.read_csv(GENE_EXPRESSION_PATH, sep='\t')
geneExpressionData

Curated survival data from the Pan-cancer Atlas paper titled "An Integrated TCGA Pan-Cancer Clinical Data Resource (TCGA-CDR) to drive high quality survival outcome analytics". The paper highlights four types of carefully curated survival endpoints, and recommends the use of the endpoints of OS, PFI, DFI, and DSS for each TCGA cancer type.


OS: overall survival

PFI: progression-free interval

DSS: disease-specific survival

DFI: disease-free interval

In [None]:
# Load the tab-separated curated survival file into a DataFrame and display it.
SURVIVAL_PATH = '/kaggle/input/breast-cancer/tcga-xena-hub.s3.us-east-1.amazonaws.com_download_survival2FBRCA_survival.txt'
survivalData = pd.read_csv(SURVIVAL_PATH, sep='\t')
survivalData

BRCA_clinicalMatrix

In [None]:
# Load the tab-separated BRCA clinical matrix into a DataFrame and display it.
CLINICAL_PATH = '/kaggle/input/breast-cancer/TCGA.BRCA.sampleMap_BRCA_clinicalMatrix'
clinicalData = pd.read_csv(CLINICAL_PATH, sep='\t')
clinicalData

In [None]:
# Display the gene expression frame again for reference.
geneExpressionData

In [None]:
# Print pandas' summary of the gene expression frame.
geneExpressionData.info()

In [None]:
# Build a pandas-profiling report for the survival table and write it to an HTML file
# in the current working directory.
from pandas_profiling import ProfileReport
profile = ProfileReport(survivalData)
profile.to_file('survivalData profile_report.html')
# Uncomment to render the report inline instead of only writing the file:
# profile

In [None]:
from pandas_profiling import ProfileReport
# Profiling the gene expression frame is intentionally disabled: the dataset is too big
# for the profiler. The calls below are kept only as a reference.
# profile = ProfileReport(geneExpressionData)
# profile.to_file('geneExpressionData profile_report.html')
# profile

# Data Pre-processing

In [None]:
# Before joining the datasets, collect the distinct identifiers held in each one so the
# candidate join keys can be compared by eye.
gene_samples = geneExpressionData['sample'].unique()
gene_samples.sort()  # in-place sort of the unique values
survival_samples = survivalData['sample'].unique()
clinical_samples = clinicalData['sampleID'].unique()

# Report the gene-side identifiers and keep a CSV copy of them for inspection.
print('gene_samples\n', gene_samples)
print(gene_samples.shape)
pd.DataFrame(gene_samples).to_csv("gene_samples.csv")

# Report the survival-side and clinical-side identifiers.
print('survival_samples\n', survival_samples)
print('clinical_samples\n', clinical_samples)


Observation: in the gene expression dataset, the sample IDs are in the column names, and gene IDs are in the 'sample' column, so...

We'll need to transpose the gene dataset in order to combine it with survival and clinical data

In [None]:
# Transpose the gene dataset so that every row is one sample and every column one gene.
# The former column labels (the sample IDs) end up in a regular 'sampleID' column, which
# is the key used to join with the survival and clinical data.
geneTranspose = (
    geneExpressionData
    .set_index('sample')
    .transpose()
    .reset_index(names='sampleID')
)

# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" to the output.
geneTranspose.info()
geneTranspose


In [None]:
# Names of all gene columns: every column of the transposed frame except the join key.
geneColNames = geneTranspose.columns.tolist()
geneColNames.remove('sampleID')
geneColNames

In [None]:
# The clinical and survival datasets can be joined on the sample identifier, but merge()
# needs the key column to have the same name on both sides. Work on a copy of the
# survival table that carries its 'sample' values in an additional 'sampleID' column.
mergeSurvival = survivalData.assign(sampleID=survivalData['sample'])
mergeSurvival.info()

In [None]:
# Now that the clinical and survival datasets both have a 'sampleID' column, merge them.
# pd.merge performs an inner join by default, so only samples present in both tables
# are kept.
mergeSurvivalClinical = pd.merge(clinicalData, mergeSurvival, on='sampleID')

# DataFrame.info() prints its summary and returns None; calling it directly avoids the
# stray "None" that print(df.info()) produced.
mergeSurvivalClinical.info()
mergeSurvivalClinical

In [None]:
# Merge the survival+clinical dataset with the transposed gene expression dataset on
# 'sampleID' (inner join by default: samples missing from either side are dropped).
mergeSurvivalClinicalGene = pd.merge(mergeSurvivalClinical, geneTranspose, on='sampleID')

# DataFrame.info() prints its summary and returns None; calling it directly avoids the
# stray "None" that print(df.info()) produced.
mergeSurvivalClinicalGene.info()
mergeSurvivalClinicalGene

# Exploratory Data Analysis

In [None]:
# Heatmap of the gene expression data: the transposed frame is drawn as a colour-coded
# grid with one row per sample and one column per gene.
import pandas as pd
import seaborn as sns

# 12 x 7 inch canvas for the plot
fig, ax = plt.subplots(figsize=(12, 7))

# Column 0 is 'sampleID'; everything after it is expression data.
gene_values = geneTranspose.iloc[:, 1:]
sns.heatmap(gene_values, ax=ax)

In [None]:
# Gene expression heatmap with the genes ordered by their average expression percentile.

# Mean expression of every gene across all samples (column 0 is 'sampleID', so skip it).
geneAvgExpression = geneTranspose.iloc[:, 1:].mean(axis=0)

# Re-order the gene columns from lowest to highest mean. Only the gene columns are named
# in the new column order, so 'sampleID' is not part of the result.
geneByAvgExp = geneTranspose.reindex(geneAvgExpression.sort_values().index, axis=1)
print(geneByAvgExp)

# 12 x 7 inch canvas for the plot
fig, ax = plt.subplots(figsize=(12, 7))
ax.set_title('Gene Expression by Breast Cancer Sample')

sns.heatmap(geneByAvgExp, cmap="mako")

plt.ylabel('Sample ID')
plt.xlabel('Gene')



In [None]:
# Find blanks / NaNs: per-column NaN counts of the merged dataset, largest first,
# showing only the columns that actually contain missing values.
merged_nan_counts = mergeSurvivalClinicalGene.isna().sum().sort_values(ascending=False)
merged_nan_counts[merged_nan_counts > 0]

# Feature Engineering

In [None]:
!pip install scikit-learn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
!pip install fast-ml
from fast_ml.model_development import train_valid_test_split

In [None]:
# Data split: 70% train / 20% validation / 10% test, with 'OS' as the target column.
# A fixed random_state makes the split (and everything fitted on it) reproducible across
# kernel restarts; without it every run trains and validates on different samples.
SPLIT_RANDOM_STATE = 42

X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(
    mergeSurvivalClinicalGene,
    target='OS',
    train_size=0.7,
    valid_size=0.2,
    test_size=0.1,
    random_state=SPLIT_RANDOM_STATE,
)

# Print the feature and target shapes of each split. The original wrote
# "print(a), print(b)", which builds a tuple of the two None return values and made the
# notebook echo "(None, None)" at the end of the cell.
for split_features, split_target in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test)):
    print(split_features.shape)
    print(split_target.shape)

In [None]:
# Spot-check the 'OS.time' column of the training features.
print(X_train['OS.time'])

In [None]:
# Encode the ordinal clinical feature AJCC_Stage_nature2012 (the only ordinal feature
# handled so far) as numbers that follow the declared stage order.
stage_column = 'AJCC_Stage_nature2012'

# Single-column copies of the stage feature, with missing values labelled 'Unknown'.
ordFeatures_train = X_train[[stage_column]].replace(np.nan, 'Unknown')
ordFeatures_valid = X_valid[[stage_column]].replace(np.nan, 'Unknown')

print('ordFeatures_train:\n', ordFeatures_train)

# Distinct stage labels that occur in the training set.
AJCC_Stage_nature2012_LOV = ordFeatures_train.AJCC_Stage_nature2012.unique()
print('AJCC_Stage_nature2012_LOV:\n', AJCC_Stage_nature2012_LOV)

# Ordinal sequence: a label's position in this list becomes its encoded value.
stage_order = [
    'Stage I', 'Stage IA', 'Stage IB',
    'Stage II', 'Stage IIA', 'Stage IIB',
    'Stage III', 'Stage IIIA', 'Stage IIIB', 'Stage IIIC',
    'Stage IV', 'Stage X', 'Unknown',
]
ordinal_encoder = OrdinalEncoder(categories=[stage_order])

# Fit on the training data only, then apply the same encoding to both splits while
# keeping the original row index and the encoder's output column names.
ordinal_encoder.fit(ordFeatures_train)
encoded_column_names = ordinal_encoder.get_feature_names_out()
ordFeatures_train = pd.DataFrame(
    ordinal_encoder.transform(ordFeatures_train),
    index=X_train.index,
    columns=encoded_column_names,
)
ordFeatures_valid = pd.DataFrame(
    ordinal_encoder.transform(ordFeatures_valid),
    index=X_valid.index,
    columns=encoded_column_names,
)

# Take a look at the encoded data.
print('ordFeatures_train:\n', ordFeatures_train)
print('ordFeatures_train:\n', ordFeatures_train.describe())
print('ordFeatures_valid:\n', ordFeatures_valid)
print('ordFeatures_valid:\n', ordFeatures_valid.describe())

In [None]:
# Encode the nominal (unordered) clinical features with one-hot encoding.
nomFeatureColNames = ['ER_Status_nature2012', 'HER2_Final_Status_nature2012','Metastasis_nature2012','Node_nature2012','Tumor_nature2012']

# work with a copy of the dataset...
nominalFeatures_train = X_train[nomFeatureColNames].copy()
nominalFeatures_valid = X_valid[nomFeatureColNames].copy()
print('\nnominalFeatures_train:\n', nominalFeatures_train)

# replace all NaN values with 'Unknown'
nominalFeatures_train = nominalFeatures_train.replace(np.nan, 'Unknown')
nominalFeatures_valid = nominalFeatures_valid.replace(np.nan, 'Unknown')

# Create the one-hot encoder. drop='first' omits one indicator column per feature.
# handle_unknown='ignore' matters because the encoder is fitted on the training split
# only: with the default ('error'), a category that happens to occur only in the
# validation split makes transform() raise. With 'ignore' such a value is encoded as all
# zeros for that feature (the same encoding as the dropped first category).
nominal_encoder = OneHotEncoder(drop='first', sparse_output=True, handle_unknown='ignore')

# Fit on the training data, then apply the fitted encoder to both splits, keeping the
# original row index and the encoder's generated column names.
nominalFeaturesOHE_train = pd.DataFrame(nominal_encoder.fit_transform(nominalFeatures_train).toarray(), index=X_train.index)
nominalFeaturesOHE_train.columns = nominal_encoder.get_feature_names_out()
nominalFeaturesOHE_valid = pd.DataFrame(nominal_encoder.transform(nominalFeatures_valid).toarray(), index=X_valid.index)
nominalFeaturesOHE_valid.columns = nominal_encoder.get_feature_names_out()

# take a look at the one hot encoded features...
print('\nnominalFeaturesOHE_train:\n', nominalFeaturesOHE_train)
print('\nnominalFeaturesOHE_train:\n', nominalFeaturesOHE_train.describe())
print('\nnominalFeaturesOHE_valid:\n', nominalFeaturesOHE_valid)
print('\nnominalFeaturesOHE_valid:\n', nominalFeaturesOHE_valid.describe())

In [None]:
# Scale the numerical features to the range seen in the training data (min-max scaling).
# The feature list is: numerical clinical columns, the survival time columns, and every
# gene expression column.
# NOTE(review): columns such as 'OS.time', 'OS_Time_nature2012' and 'days_to_death' look
# like outcome measurements closely tied to the 'OS' target - confirm they are meant to
# be model inputs.
clinical_numeric_columns = [
    'Age_at_Initial_Pathologic_Diagnosis_nature2012',
    'Days_to_Date_of_Last_Contact_nature2012',
    'Days_to_date_of_Death_nature2012',
    'Integrated_Clusters_no_exp__nature2012',
    'Integrated_Clusters_unsup_exp__nature2012',
    'Integrated_Clusters_with_PAM50__nature2012',
    'OS_Time_nature2012',
    'SigClust_Intrinsic_mRNA_nature2012',
    'SigClust_Unsupervised_mRNA_nature2012',
    'age_at_initial_pathologic_diagnosis',
    'days_to_birth',
    'days_to_collection',
    'days_to_death',
    'days_to_initial_pathologic_diagnosis',
    'days_to_last_followup',
    'days_to_last_known_alive',
    'days_to_new_tumor_event_additional_surgery_procedure',
    'initial_weight',
    'lymph_node_examined_count',
    'methylation_Clusters_nature2012',
    'miRNA_Clusters_nature2012',
    'number_of_lymphnodes_positive_by_he',
    'number_of_lymphnodes_positive_by_ihc',
    'year_of_initial_pathologic_diagnosis',
]
survival_time_columns = ['OS.time', 'DSS.time', 'DFI.time', 'PFI.time']
listOfNumericalColNames = clinical_numeric_columns + survival_time_columns + geneColNames

print(X_train['OS.time'])

# Fit the scaler on the training split and transform it, keeping labels and index.
numericalFeatures_train = X_train[listOfNumericalColNames].copy()
min_max_scaler = preprocessing.MinMaxScaler()
numericalFeaturesMinMax_train = pd.DataFrame(
    min_max_scaler.fit_transform(numericalFeatures_train),
    columns=numericalFeatures_train.columns,
    index=X_train.index,
)
print(numericalFeaturesMinMax_train)

# Apply the already-fitted scaler to the validation split (no re-fitting).
numericalFeatures_valid = X_valid[listOfNumericalColNames].copy()
numericalFeaturesMinMax_valid = pd.DataFrame(
    min_max_scaler.transform(numericalFeatures_valid),
    columns=numericalFeatures_valid.columns,
    index=X_valid.index,
)
print(numericalFeaturesMinMax_valid)

In [None]:
# Features that are used as-is, without scaling or encoding (described as binary 0/1).
# NOTE(review): DSS, DFI and PFI are survival endpoints like the 'OS' target - confirm
# that using them as inputs does not leak the outcome into the model.
listOfReadyColNames = ['DSS', 'DFI', 'PFI']
readyFeatures_train = X_train.loc[:, listOfReadyColNames].copy()
readyFeatures_valid = X_valid.loc[:, listOfReadyColNames].copy()

print('\nreadyFeatures_train:\n', readyFeatures_train)
print('\nreadyFeatures_valid:\n', readyFeatures_valid)

In [None]:
# Build a pandas-profiling report for the survival table and write it to an HTML file.
# NOTE(review): this repeats the earlier survivalData profiling cell and writes to the
# same output file name.
from pandas_profiling import ProfileReport
profile = ProfileReport(survivalData)
profile.to_file('survivalData profile_report.html')


In [None]:
# Build a pandas-profiling report for the clinical table and write it to an HTML file
# in the current working directory.
from pandas_profiling import ProfileReport
profile = ProfileReport(clinicalData)
profile.to_file('clinicalData profile_report.html')
# Uncomment to render the report inline instead of only writing the file:
# profile

In [None]:
# Build a pandas-profiling report for the merged survival+clinical table and write it
# to an HTML file in the current working directory.
from pandas_profiling import ProfileReport
profile = ProfileReport(mergeSurvivalClinical)
profile.to_file('mergeSurvivalClinical profile_report.html')

In [None]:
# Put the training and validation sets together from the four prepared feature groups
# (ordinal, one-hot encoded nominal, min-max scaled numerical, ready-to-use).
train_feature_parts = [
    ('ordFeatures_train', ordFeatures_train),
    ('nominalFeaturesOHE_train', nominalFeaturesOHE_train),
    ('numericalFeaturesMinMax_train', numericalFeaturesMinMax_train),
    ('readyFeatures_train', readyFeatures_train),
]

# DataFrame.info() prints its summary and returns None, so print the label first and
# call info() on its own; print(label, df.info()) showed the summary before the label
# and then printed "None".
for part_name, part_frame in train_feature_parts:
    print(f'\n{part_name}:')
    part_frame.info()

# All parts share the row index of the split they came from, so a column-wise concat
# lines the rows up again.
X_train = pd.concat([part_frame for _, part_frame in train_feature_parts], axis=1)
X_train = X_train.drop_duplicates()
# drop_duplicates removes rows from the features only. Re-select the target by the
# surviving index so X_train and y_train keep the same rows in the same order;
# otherwise model.fit() would receive inputs and targets of different length.
y_train = y_train.loc[X_train.index]

print('\nX_train:')
X_train.info()
print('\nX_train:\n', X_train)

X_valid = pd.concat([ordFeatures_valid, nominalFeaturesOHE_valid, numericalFeaturesMinMax_valid, readyFeatures_valid], axis=1)

print('\ny_train:\n', y_train)

print('\nX_valid:\n', X_valid)
print('\ny_valid:\n', y_valid)


In [None]:
# Print the shapes of the assembled training / validation features and targets.
named_splits = (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_valid', X_valid),
    ('y_valid', y_valid),
)
for split_name, split_data in named_splits:
    print(f'\n{split_name}:\n', split_data.shape)


In [None]:
# Find blanks / NaNs in the training features: per-column NaN counts, largest first,
# showing only the columns that actually contain missing values.
train_nan_counts = X_train.isna().sum().sort_values(ascending=False)
train_nan_counts[train_nan_counts > 0]

In [None]:
# Find blanks / NaNs in the validation features: per-column NaN counts, largest first,
# showing only the columns that actually contain missing values.
valid_nan_counts = X_valid.isna().sum().sort_values(ascending=False)
valid_nan_counts[valid_nan_counts > 0]

In [None]:
# Eliminate the validation samples that have no value for 'PFI.time' (the author notes
# these also lack DSS.time and OS.time; only 'PFI.time' is checked here).
badSample = X_valid.loc[X_valid['PFI.time'].isna()]
bad_valid_index = badSample.index.tolist()
print(bad_valid_index)

# Remove the same rows from the features and from the target so they stay aligned.
X_valid = X_valid.drop(index=bad_valid_index)
y_valid = y_valid.drop(index=bad_valid_index)

In [None]:
# Eliminate the training samples that have no value for 'PFI.time' (the author notes
# these also lack DSS.time and OS.time; only 'PFI.time' is checked here).
badSample = X_train.loc[X_train['PFI.time'].isna()]
bad_train_index = badSample.index.tolist()
print(bad_train_index)

# Remove the same rows from the features and from the target so they stay aligned.
X_train = X_train.drop(index=bad_train_index)
y_train = y_train.drop(index=bad_train_index)

In [None]:
# Display the training features after the rows without 'PFI.time' were removed.
X_train

In [None]:
# Handle the remaining blanks / NaNs: every column that still has a missing value in the
# training features is removed from both the training and the validation features.
train_nan_counts = X_train.isna().sum().sort_values(ascending=False)
NaNCols = pd.DataFrame(train_nan_counts[train_nan_counts > 0])
print(NaNCols.index)

X_train = X_train.drop(columns=NaNCols.index)
X_valid = X_valid.drop(columns=NaNCols.index)

In [None]:
# Check the validation features for columns that still contain blanks / NaNs.
# This only reports them; nothing is dropped here.
valid_nan_counts = X_valid.isna().sum().sort_values(ascending=False)
NaNCols = pd.DataFrame(valid_nan_counts[valid_nan_counts > 0])
print(NaNCols.index)

In [None]:
# this is too big to attempt
# profile = ProfileReport(mergeSurvivalClinicalGene, correlations={"auto": {"calculate": False}})
# profile.to_file('mergeSurvivalClinicalGene profile_report.html')
# profile

# Model

In [None]:
# Print the shape of the training features and their column count (the model's input
# width), then display the frame.
print(X_train.shape)
print(X_train.shape[1])
X_train

In [None]:
# https://www.tensorflow.org/tutorials/structured_data/time_series#linear_model
MAX_EPOCHS = 50

def compile_and_fit(model, X_train, y_train, X_valid, y_valid, patience=2, max_epochs=None):
    """Compile a Keras model for binary classification and train it with early stopping.

    The model is compiled with binary cross-entropy loss, the Adam optimizer and binary
    accuracy as the metric, then fitted on the training data while the validation loss
    is monitored.

    Args:
        model: Keras model to compile and fit (modified in place).
        X_train: training features.
        y_train: training targets.
        X_valid: validation features.
        y_valid: validation targets.
        patience: number of epochs without improvement in 'val_loss' after which
            training stops.
        max_epochs: upper bound on the number of training epochs. Defaults to the
            notebook-level MAX_EPOCHS when omitted.

    Returns:
        The History object returned by ``model.fit``.
    """
    if max_epochs is None:
        # Read the global at call time so changing MAX_EPOCHS in the notebook still works.
        max_epochs = MAX_EPOCHS

    # restore_best_weights=True rolls the model back to the epoch with the lowest
    # validation loss. Without it the model keeps the weights of the last epoch run,
    # i.e. `patience` epochs past the best one, and that is what would get saved.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=patience,
                                                      mode='min',
                                                      restore_best_weights=True)

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=[tf.keras.metrics.BinaryAccuracy()])

    history = model.fit(X_train, y_train, epochs=max_epochs,
                        validation_data=(X_valid, y_valid),
                        callbacks=[early_stopping])
    return history

In [None]:
# https://machinelearningmastery.com/display-deep-learning-model-training-history-in-keras/
def plot_training_loss_accuracy(history):
    """Plot training vs. validation curves for accuracy and for loss.

    Prints the keys recorded in ``history.history``, then shows two separate figures:
    first binary accuracy per epoch, then loss per epoch, each with a 'train' and a
    'valid' line.

    Args:
        history: object whose ``history`` attribute maps metric names to per-epoch
            values and contains 'binary_accuracy', 'val_binary_accuracy', 'loss' and
            'val_loss' (e.g. the return value of ``compile_and_fit``).

    Returns:
        None.
    """
    recorded = history.history
    print(recorded.keys())

    # (training key, validation key, figure title, y-axis label) for each figure
    figures = (
        ('binary_accuracy', 'val_binary_accuracy', 'model accuracy', 'accuracy'),
        ('loss', 'val_loss', 'model loss', 'loss'),
    )
    for train_key, valid_key, figure_title, y_label in figures:
        plt.plot(recorded[train_key])
        plt.plot(recorded[valid_key])
        plt.title(figure_title)
        plt.ylabel(y_label)
        plt.xlabel('epoch')
        plt.legend(['train', 'valid'], loc='upper left')
        plt.show()

In [None]:
# Fully connected binary classifier: two hidden ReLU layers (128 and 64 units) followed
# by a single sigmoid output unit. The input width is the number of feature columns.
n_input_features = X_train.shape[1]
dense_layers = [
    keras.layers.Dense(128, input_shape=(n_input_features,), activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
]
dense_model = keras.models.Sequential(dense_layers)

# Train with early stopping on the validation loss, then save the fitted model.
history = compile_and_fit(dense_model, X_train, y_train, X_valid, y_valid)
dense_model.save('dense_model.keras')

# Show the accuracy and loss curves recorded during training.
plot_training_loss_accuracy(history)


In [None]:
# Evaluate the model on the training data and display the result.
# NOTE(review): this scores the model on the data it was fitted on - confirm whether
# the validation data was intended here.
eval_model=dense_model.evaluate(X_train, y_train)
eval_model

In [None]:
# Predict on the validation features, print the raw model outputs, then threshold them
# at 0.5 to obtain boolean class predictions.
y_prob = dense_model.predict(X_valid)
print(y_prob)
y_pred = y_prob > 0.5
y_pred

In [None]:
# Confusion matrix of the validation predictions, unpacked into its four cells.
from sklearn.metrics import confusion_matrix
validation_confusion = confusion_matrix(y_valid, y_pred)
tn, fp, fn, tp = validation_confusion.ravel()
(tn, fp, fn, tp)


In [None]:
# prepare the clinical data for machine learning...


In [None]:
# heatmap of gene expression data


In [None]:
# clustering, which groups genes and/or samples together based on the similarity of their gene expression patterns

In [None]:
# join data