# Classifying Early and Late stage KIRC tumors from the TCGA database

Replication of the study in [Jagga, Z. & Gupta, D. Classification models for clear cell renal carcinoma stage progression, based on tumor RNAseq expression trained supervised machine learning algorithms. BMC Proc. 8, S2 (2014)](http://bmcproc.biomedcentral.com/articles/10.1186/1753-6561-8-S6-S2)

Data locations:
- [Gene Expression Matrix](https://xenabrowser.net/datapages/?dataset=TCGA.KIRC.sampleMap%2FHiSeqV2&host=https%3A%2F%2Ftcga.xenahubs.net)
- [Clinical Metadata](https://xenabrowser.net/datapages/?dataset=TCGA.KIRC.sampleMap%2FKIRC_clinicalMatrix&host=https%3A%2F%2Ftcga.xenahubs.net)

In [7]:
# utils
%load_ext autoreload
%autoreload 1

import sys
sys.path.append("../utils/")
%aimport utils

import os

# math
import pandas as pd
import numpy as np
import time

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# sklearn utilities
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# classifiers
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, make_scorer
import sklearn.metrics as metrics
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split

# keras
from keras import backend as K
from keras.models import Model, Input
from keras.layers import Dense
from keras.regularizers import l1
from keras.callbacks import TensorBoard

n_cores = 10

K.set_session(K.tf.Session(config=K.tf.ConfigProto(
    intra_op_parallelism_threads=n_cores, 
    inter_op_parallelism_threads=n_cores)))

tb_session_name = "SAE_KIRC"
tb_logs = "/home/nanni/tensorboard_logs"

def get_tensorboard_callback():
    return TensorBoard(log_dir="{}/{}__{}".format(tb_logs, tb_session_name,time.strftime('%Y_%m_%d__%H_%M')))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
X_path = "./data/KIRC_gene_exp.tsv"
X_meta_path = "./data/KIRC_clinicalMatrix.tsv"

We divide the stages in two classes:
- *Early stage*: Stage 1 and Stage 2
- *Late stage*: Stage 3 and Stage 4

We consider only the samples which are labeled as "Primary Tumor"

In [3]:
X, y, \
idx_to_patient, patient_to_idx, \
idx_to_gene, gene_to_idx = utils.load_stage_data(X_path, X_meta_path)

# patients with Primary Tumor and Stage I, Stage II, Stage III, Stage IV: 531


  X = X_exp.loc[patients].as_matrix(gene_names)


In [13]:
out_dir = "./data/KIRC_preprocessed/"

os.makedirs(out_dir, exist_ok=True)
np.save(os.path.join(out_dir, "X"), X)
np.save(os.path.join(out_dir, "y"), y)
idx_to_patient.to_csv(os.path.join(out_dir, "idx_to_patient.tsv"), sep="\t", index=True, header=None)
patient_to_idx.to_csv(os.path.join(out_dir, "patient_to_idx.tsv"), sep="\t", index=True, header=None)
idx_to_gene.to_csv(os.path.join(out_dir, "idx_to_gene.tsv"), sep="\t", index=True, header=None)
gene_to_idx.to_csv(os.path.join(out_dir, "gene_to_idx.tsv"), sep="\t", index=True, header=None)

## Cross-Validation Pipeline

We perform a 10-fold-CV evaluation

### Model:

1. **Pre-processing**: nothing
2. **Classifier**: SVC

### Performance metrics:

- Accuracy
- Sensitivity
- MCC
- F1-score
- Area under the ROC

### Top Variant Feature Selection

In [4]:
from sklearn.base import BaseEstimator
from sklearn.feature_selection.base import SelectorMixin
from sklearn.utils.validation import check_is_fitted

class TopVariantSelector(BaseEstimator, SelectorMixin):
    """ A very simple feature selector which uses the top variant features """
    def __init__(self, top_k):
        self.top_k = top_k
    
    def fit(self, X, y=None):
        stds = X.std(0) # 1 x n.genes
        selected_genes = np.argsort(stds)[::-1][:self.top_k]
        self.selected_genes_ = selected_genes
        self.mask_ = np.in1d(np.arange(X.shape[1]), selected_genes)
        return self
        
    def _get_support_mask(self):
        check_is_fitted(self, 'selected_genes_')
        check_is_fitted(self, 'mask_')
        return self.mask_

In [5]:
pipeline = Pipeline(steps=[
    ("feature_selection", TopVariantSelector(top_k=5000)),
    ("classifier", SVC())
])

In [6]:
scoring = {
    'TRUE_ES': make_scorer(utils.tp),
    'TRUE_LS': make_scorer(utils.tn),
    'FALSE_ES': make_scorer(utils.fp),
    'FLASE_LS': make_scorer(utils.fn),
    
    'accuracy': make_scorer(metrics.accuracy_score),
    'sensitivity': make_scorer(metrics.recall_score, average="weighted"),
    'MCC': make_scorer(metrics.matthews_corrcoef),
    'f-score': make_scorer(metrics.f1_score, average="weighted"),
    'auROC': make_scorer(metrics.roc_auc_score, average="weighted")
}

In [7]:
scores = cross_validate(estimator=pipeline, scoring=scoring, 
                        X=X, y=y, cv=10, n_jobs=10, return_train_score=False)

In [8]:
def get_confusion_matrix(scores):
    test_TRUE_ES = scores['test_TRUE_ES'].sum()
    test_TRUE_LS = scores['test_TRUE_LS'].sum()
    test_FALSE_ES = scores['test_FALSE_ES'].sum()
    test_FLASE_LS = scores['test_FLASE_LS'].sum()
    
    d = np.array([[test_TRUE_ES, test_FALSE_ES], 
                  [test_FLASE_LS, test_TRUE_LS]])
    return d

def cm_to_df(d):
    r = pd.DataFrame(data=d, index=utils.name_labels, columns=utils.name_labels)
    r.index.name = "Actual"
    r.columns.name = "Predicted"
    return r

def get_aggregate_measures(scores):
    res = {
        'accuracy': scores['test_accuracy'],
        'sensitivity': scores['test_sensitivity'],
        'MCC': scores['test_MCC'],
        'f-score': scores['test_f-score'],
        'auROC': scores['test_auROC']
    }
    return pd.DataFrame.from_dict(res)

In [9]:
get_aggregate_measures(scores).mean()

MCC            0.321503
accuracy       0.692484
auROC          0.623949
f-score        0.649093
sensitivity    0.692484
dtype: float64

In [10]:
cm_to_df(get_confusion_matrix(scores))

Predicted,Early_Stage,Late_Stage
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Early_Stage,303,142
Late_Stage,21,65
