In [45]:
import src
import numpy as np
import pandas as pd

import os

from sklearn.base import BaseEstimator
from sklearn.feature_selection.base import SelectorMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.validation import check_is_fitted
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Input, BatchNormalization, Activation
from tensorflow.keras.callbacks import EarlyStopping

In [36]:
# Tissue of origin -> cohort codes whose samples are pooled for that tissue.
tissues = dict(
    breast=["BRCA"],
    lung=["LUSC", "LUAD"],
    kidney=["KIRC", "KICH", "KIRP"],
    bladder=["BLCA"],
)

In [37]:
# tissue name -> (X, idx_to_sample, idx_to_gene, y, classes), pooled over the tissue's cohorts
cohorts = {}

# Only the index -> gene mapping is used here; the second return value is ignored.
idx_to_gene, _ = src.data.load_gene_indices()
n_genes = idx_to_gene.shape[0]

for tissue, datasets in tissues.items():
    print(tissue)

    # Empty accumulators that are grown cohort by cohort.
    X_tissue = np.empty((0, n_genes))
    idx_to_sample_tissue = []
    y_tissue = np.empty(0, dtype=int)

    for cohort_name in datasets:
        X_cohort, samples_cohort, _, y_cohort, classes = src.data.get_normal_vs_tumor_task(cohort_name)
        X_tissue = np.vstack([X_tissue, X_cohort])
        idx_to_sample_tissue = np.append(idx_to_sample_tissue, samples_cohort.values)
        y_tissue = np.append(y_tissue, y_cohort)

    # Re-index the pooled sample identifiers by row position in X_tissue.
    idx_to_sample_tissue = pd.Series(data=idx_to_sample_tissue,
                                     index=np.arange(X_tissue.shape[0], dtype=int))

    # NOTE: `classes` is whatever the last cohort of the tissue returned.
    cohorts[tissue] = (X_tissue, idx_to_sample_tissue, idx_to_gene, y_tissue, classes)

breast
lung
kidney
bladder


In [38]:
class TopVariantSelector(BaseEstimator, SelectorMixin):
    """Feature selector that keeps the ``top_k`` columns with the largest standard deviation.

    Parameters
    ----------
    top_k : int
        Number of features to keep. If it exceeds the number of columns,
        every column is kept (plain slice semantics).

    Attributes
    ----------
    selected_features_ : array of int
        Column indices chosen by ``fit``, ordered from the most to the least variable.
    mask_ : array of bool, length n_features
        True at the positions of the selected columns.
    """
    def __init__(self, top_k):
        self.top_k = top_k

    def fit(self, X, y=None):
        """Rank the columns of ``X`` by standard deviation and remember the top ``top_k``.

        ``y`` is accepted for scikit-learn API compatibility and is not used.
        Returns ``self``.
        """
        stds = X.std(0)  # one value per column
        # argsort is ascending, so reverse it before taking the first top_k entries
        selected_genes = np.argsort(stds)[::-1][:self.top_k]
        self.selected_features_ = selected_genes
        # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0
        self.mask_ = np.isin(np.arange(X.shape[1]), selected_genes)
        return self

    def _get_support_mask(self):
        """Return the boolean support mask computed by ``fit``."""
        check_is_fitted(self, ['selected_features_', 'mask_'])
        return self.mask_

In [39]:
def get_confusion_matrix_old(y_true, y_pred, names):
    """Return the confusion matrix of ``y_pred`` against ``y_true`` as a labelled DataFrame.

    ``names`` labels both axes: rows are named 'Observed', columns 'Predicted'.
    """
    counts = confusion_matrix(y_true=y_true, y_pred=y_pred)
    observed = pd.Series(names, name='Observed')
    predicted = pd.Series(names, name='Predicted')
    return pd.DataFrame(data=counts, index=observed, columns=predicted)

def get_confusion_matrix(y_true, y_pred, labels=(0, 1)):
    """Return the binary confusion matrix as a single-row DataFrame.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted binary labels.
    labels : pair, default (0, 1)
        The two class labels, in matrix order. Passing them explicitly keeps
        the matrix 2x2 even when only one class occurs in ``y_true`` and
        ``y_pred``; without it the matrix collapses to 1x1 and the four-way
        unpacking below raises ``ValueError``.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``['tp', 'fn', 'fp', 'tn']`` and index name
        ``'true_observed'``.

    Notes
    -----
    The FIRST label is treated as the positive class: 'tp' counts samples
    observed and predicted as ``labels[0]``, 'tn' those observed and predicted
    as ``labels[1]``. The local names suggest label 0 is "normal" and label 1
    is "tumor" -- TODO confirm against the data loader.
    """
    counts = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=list(labels))
    # Row-major flattening: [obs 0/pred 0, obs 0/pred 1, obs 1/pred 0, obs 1/pred 1]
    normal_normal, normal_tumor, tumor_normal, tumor_tumor = counts.flatten()
    cm = pd.DataFrame(data=[[normal_normal, normal_tumor, tumor_normal, tumor_tumor]],
                      columns=['tp', 'fn', 'fp', 'tn'])
    cm.index.name = 'true_observed'
    return cm

def sum_confusion_matrices(results_cohort):
    """Sum all confusion matrices stored as the values of ``results_cohort``.

    Parameters
    ----------
    results_cohort : dict
        Mapping from any key to a confusion matrix; the keys are ignored.

    Returns
    -------
    The element-wise sum of the values, or ``None`` for an empty mapping.
    None of the input matrices is modified.
    """
    r_cm = None
    for cm in results_cohort.values():
        # `r_cm + cm` rather than `+=`, so the first input matrix is never mutated in place
        r_cm = cm if r_cm is None else r_cm + cm
    # Return the accumulated total (not the last matrix visited by the loop)
    return r_cm

In [40]:
# Number of top-variance genes kept by the feature selector; also the default encoder input width.
top_variant = 5000

def get_encoder(input_dim=None):
    """Build the binary classifier and the encoder that shares its hidden layers.

    Architecture: input -> Dense(500) -> BatchNorm -> ReLU -> Dense(200, relu)
    -> BatchNorm -> ReLU -> Dense(100, relu) -> Dense(1, sigmoid).

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to the module-level ``top_variant``.

    Returns
    -------
    (model, encoder)
        ``model`` is the compiled classifier ending in the sigmoid unit.
        ``encoder`` maps the same input to the 100-unit hidden representation.
        It is built from the same layers as ``model``, so fitting ``model``
        also trains ``encoder``; ``encoder`` itself is not compiled.
    """
    if input_dim is None:
        input_dim = top_variant

    input_layer = Input(shape=(input_dim, ))
    h1 = Dense(500)(input_layer)

    h1_activated = Activation('relu')(BatchNormalization()(h1))
    h2 = Dense(200, activation='relu')(h1_activated)

    h2_activated = Activation('relu')(BatchNormalization()(h2))
    h3 = Dense(100, activation='relu')(h2_activated)

    out = Dense(1, activation='sigmoid')(h3)

    model = Model(inputs=input_layer, outputs=out)
    encoder = Model(inputs=input_layer, outputs=h3)

    # The metric is registered under the short name 'acc' so that the logged keys
    # are 'acc'/'val_acc' on both TF 1.x and TF 2.x. With 'accuracy', TF 2.x logs
    # 'val_accuracy' and an EarlyStopping(monitor='val_acc') callback never fires.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return model, encoder

def tumor_alone_model(input_dim=100):
    """Build a small NN for the single-tissue tumor classification.

    One hidden layer of 10 ReLU units followed by a single sigmoid output,
    compiled with binary cross-entropy and Adam.

    Parameters
    ----------
    input_dim : int, default 100
        Number of input features. The default is the width this notebook
        feeds in (the 100-unit encoder output).

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(10, input_shape=(input_dim, ), activation='relu'))
    model.add(Dense(1, activation="sigmoid"))
    # 'acc' (not 'accuracy') keeps the logged key 'val_acc' on both TF 1.x and
    # TF 2.x, which is what EarlyStopping(monitor='val_acc') needs to work.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [41]:
# Target tissue for this run; used as a key into `tissues` and `cohorts`.
tissue = 'bladder'
# Base random seed; repetition t of the cross-validation uses seed + t.
seed = 42
# Number of times the k-fold cross-validation is repeated.
n_repeated = 5
# Number of stratified folds per repetition.
n_folds = 5

In [42]:
print("Getting tissue data")
# Features and labels of the target tissue; the other tuple fields are not needed here.
X_tissue, _, _, y_tissue, _ = cohorts[tissue]

print("Getting other tissues")
# Every cohort that does not belong to the target tissue. Sorted because the
# iteration order of a set of strings changes between interpreter sessions,
# which would change the row order of X_others (and the split below) per run.
other_tissues = sorted(set(src.data.TCGA_COHORTS) - set(tissues[tissue]))
X_others, y_others = [], []
for ot in other_tissues:
    X_ot, _, _, y_ot, _ = src.data.get_normal_vs_tumor_task(ot)
    X_others.append(X_ot)
    y_others.append(y_ot)
X_others = np.vstack(X_others)
y_others = np.hstack(y_others)

print("Pre-processing")
# Genes are ranked on the target tissue only; the same columns are then taken from the others.
# NOTE(review): the selector (and the scaler below) see ALL target samples,
# including those later used as cross-validation test folds -- confirm this is acceptable.
topvariant_selector = TopVariantSelector(top_k=top_variant)
topvariant_selector.fit(X_tissue)
X_tissue = topvariant_selector.transform(X_tissue)
X_others = topvariant_selector.transform(X_others)

scaler = MinMaxScaler()
X_tissue = scaler.fit_transform(X_tissue)
# NOTE(review): this REFITS the scaler, so the two matrices are min-max scaled
# independently and `scaler` ends up fitted on X_others. Kept as is; confirm
# that `scaler.transform(X_others)` was not the intention.
X_others = scaler.fit_transform(X_others)

# Seeded so the train/validation split of the source data is reproducible.
X_others_train, X_other_val,\
y_other_train, y_other_val = train_test_split(X_others, y_others, test_size=0.3, stratify=y_others,
                                              random_state=seed)

Getting tissue data
Getting other tissues
Pre-processing


In [43]:
print("Training of OTHER model")
model, encoder = get_encoder()
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()
# Train the classifier on the other tissues; `encoder` is returned by the same
# get_encoder() call and is used below without being fitted separately.
model.fit(X_others_train, y_other_train, validation_data=(X_other_val, y_other_val), epochs=30, verbose=1, batch_size=10,
          callbacks=[EarlyStopping(monitor='val_acc',restore_best_weights=True, patience=10)])
print("Data reduction")
# NOTE: X_tissue is overwritten with its encoded representation, so this cell
# must not be re-run without first re-running the pre-processing cell.
X_tissue = encoder.predict(X_tissue)

Training of OTHER model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 5000)              0         
_________________________________________________________________
dense_204 (Dense)            (None, 500)               2500500   
_________________________________________________________________
batch_normalization_v1_14 (B (None, 500)               2000      
_________________________________________________________________
activation_14 (Activation)   (None, 500)               0         
_________________________________________________________________
dense_205 (Dense)            (None, 200)               100200    
_________________________________________________________________
batch_normalization_v1_15 (B (None, 200)               800       
_________________________________________________________________
activation_15 (Activation)   (None, 200)            

In [44]:
# Repeated stratified k-fold evaluation of the small classifier on the encoded
# target-tissue data. One single-row confusion matrix is collected per fold.
results = []
for t in range(n_repeated):
    print("Repetition {}".format(t + 1))
    skf = StratifiedKFold(n_splits=n_folds, random_state=seed + t, shuffle=True)
    for i, (train_idx, test_idx) in enumerate(skf.split(X_tissue, y_tissue)):
        X_train, y_train = X_tissue[train_idx, :], y_tissue[train_idx]
        X_test, y_test = X_tissue[test_idx, :], y_tissue[test_idx]

        # Class counts (label 0, label 1) of the train and test partitions.
        print("\tSplit {}\tTrain ({}, {})\tTest ({}, {})".format(i, y_train[y_train == 0].shape[0],
                                                                     y_train[y_train == 1].shape[0],
                                                                     y_test[y_test == 0].shape[0],
                                                                     y_test[y_test == 1].shape[0]))

        # Scale with statistics from the training partition only.
        scaler = MinMaxScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # Hold out 30% of the training partition for early stopping; seeded so
        # that the whole evaluation is reproducible.
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, stratify=y_train,
                                                          random_state=seed + t)

        model = tumor_alone_model()
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=30, verbose=1, batch_size=10,
                  callbacks=[EarlyStopping(monitor='val_acc',restore_best_weights=True, patience=10)])

        # Equivalent of Sequential.predict_classes() for a sigmoid output;
        # predict_classes was removed in TensorFlow 2.6. ravel() gives a 1-D vector.
        y_pred = (model.predict(X_test) > 0.5).astype("int32").ravel()
        cm_tip = get_confusion_matrix(y_true=y_test, y_pred=y_pred)
        cm_tip['repetition'] = t
        cm_tip['fold'] = i
        results.append(cm_tip)

In [46]:
# Per-tissue report folder; created on demand.
output_dir = src.reports_dir / "confusion_matrices" / tissue
os.makedirs(output_dir, exist_ok=True)

# Stack the per-fold rows, keeping each row's original index value.
results = pd.concat(results, axis=0, ignore_index=False)

# NOTE(review): the index column is written as "true_predicted", while the
# frames' index name is 'true_observed' -- confirm which label is intended.
results.to_csv(output_dir / "TL.tsv",
               sep="\t", index=True, header=True, index_label="true_predicted")
results

Unnamed: 0_level_0,tp,fn,fp,tn,repetition,fold
true_observed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3,1,1,81,0,0
0,3,1,0,82,0,1
0,3,1,0,81,0,2
0,1,3,0,81,0,3
0,2,1,0,81,0,4
0,2,2,0,82,1,0
0,1,3,0,82,1,1
0,2,2,0,81,1,2
0,3,1,1,80,1,3
0,3,0,0,81,1,4
