In [1]:
# Widen the notebook container to 90% of the browser width so that wide
# DataFrame outputs are easier to read.
# https://stackoverflow.com/questions/21971449/how-do-i-increase-the-cell-width-of-the-jupyter-ipython-notebook-in-my-browser

# `IPython.display` is the public import location for `display`/`HTML`;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import numpy as np
import pandas as pd
import seaborn as sea
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# Use seaborn's "whitegrid" style for every plot drawn in this notebook.
sea.set_style("whitegrid")
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Reload imported modules automatically before each cell runs, so edits to
# local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices from
# pandas / torch / scikit-learn — consider narrowing it to specific categories.
warnings.simplefilter('ignore')

In [4]:
import os
import gc
import time
import copy
import shutil
import torch
import torch.nn as nn
import model_utils as u
import model_classes as c
import torch.nn.functional as F
from torch.autograd import Variable
from torchvision import utils, transforms
from torch.utils.data import Dataset, DataLoader

In [5]:
# Reproducibility switches (https://pytorch.org/docs/stable/notes/randomness.html).

# Workspace configuration read by cuBLAS; exported before any CUDA work is done.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"

# Ask PyTorch to use deterministic algorithm implementations.
torch.use_deterministic_algorithms(True)

In [6]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [7]:
# https://pytorch.org/docs/stable/notes/randomness.html
# Single seed value; it is reused below for the train/test split, the
# stratified K-fold splitter and the per-seed results folder name.
seed = 325
# Seed the random number generators through the project helper.
# NOTE(review): presumably seeds the Python / NumPy / PyTorch RNGs — confirm in model_utils.set_all_seeds.
u.set_all_seeds(seed)

# Load and clean dataset

In [8]:
# Root folder of the Xena TCGA download. Every other location in this notebook
# is built by string concatenation (PATH + "..."), so PATH must end with a
# path separator.
#
# The location can be overridden without editing the notebook by setting the
# XENA_DATA_DIR environment variable; when it is unset the original local
# folder is used.
# PATH = "/content/drive/MyDrive/LUNG_PanCan/XENA Repository/"
PATH = os.environ.get("XENA_DATA_DIR", "D:/CANCER BIOLOGY/DATASET/TCGA/FROM Xena/")
if not PATH.endswith(("/", "\\")):
    # Guarantee the trailing separator that the PATH + "file" concatenations rely on.
    PATH += "/"

In [9]:
# Read the gzip-compressed, tab-separated expression tables for both cohorts.
# https://stackoverflow.com/questions/18885175/read-a-zipped-file-as-a-pandas-dataframe
# https://www.analyticsvidhya.com/blog/2021/04/delimiters-in-pandas-read_csv-function/

# Both files share the same format, so a single read_csv call is applied to
# each file name in turn (LUAD first, then LUSC).
df_luad, df_lusu = (
    pd.read_csv(PATH + file_name, compression="gzip", sep="\t")
    for file_name in (
        "TCGA.LUAD.sampleMap_HiSeqV2_PANCAN.gz",
        "TCGA.LUSC.sampleMap_HiSeqV2_PANCAN.gz",
    )
)

In [10]:
# Preview the raw LUAD table: one row per gene (`sample` column), one column per TCGA sample ID.
df_luad

Unnamed: 0,sample,TCGA-69-7978-01,TCGA-62-8399-01,TCGA-78-7539-01,TCGA-50-5931-11,TCGA-73-4658-01,TCGA-44-6775-01,TCGA-44-2655-01,TCGA-44-3398-01,TCGA-62-8397-01,...,TCGA-75-7025-01,TCGA-55-7726-01,TCGA-L9-A743-01,TCGA-86-8358-01,TCGA-55-6972-01,TCGA-55-7727-01,TCGA-91-6831-01,TCGA-MN-A4N4-01,TCGA-55-8302-01,TCGA-MP-A4TK-01
0,ARHGEF10L,0.125808,0.561708,-0.237592,-1.180492,-0.656192,0.139908,-0.537692,-0.839092,0.677108,...,0.226508,-2.342092,-0.207692,-0.659792,-1.651292,-2.621192,-1.025192,0.070108,0.305608,0.263208
1,HIF3A,-1.294926,6.069174,3.581474,3.927674,-0.525926,-1.497426,-0.021226,0.179974,1.092974,...,2.539674,-1.259526,-0.387226,3.689474,3.509374,1.986874,-1.993426,2.790974,-0.018326,4.657474
2,RNF17,-0.112935,-0.531035,0.592065,0.291065,-0.531035,0.475865,0.071065,-0.531035,-0.531035,...,-0.068235,-0.531035,0.428265,0.202865,0.567665,0.408165,-0.531035,0.440465,-0.531035,0.049365
3,RNF10,-1.411872,-0.228672,-0.108372,-0.043472,-0.156672,-0.605472,0.139328,-0.450172,0.583528,...,-0.451572,0.261228,-0.331772,-0.213372,-0.189472,0.091028,0.492828,0.037428,0.003728,-0.334572
4,RNF11,0.203922,0.052122,-0.499978,0.710822,0.373522,0.129022,0.436522,0.529622,0.314922,...,-0.155778,0.362522,-0.520578,0.031222,-0.966478,0.318322,0.150822,-0.357778,-0.451578,0.156422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20525,PTRF,0.863614,0.317114,-1.243086,2.802714,1.110714,0.879814,-0.028286,0.008714,-0.005186,...,0.544614,1.638214,0.381814,-1.080186,-2.139886,-1.495486,0.472314,0.993014,0.634014,1.222714
20526,BCL6B,0.802173,1.079073,-1.283227,2.250473,1.513973,0.126473,0.576073,0.643573,-0.735227,...,0.842873,0.680873,0.642773,0.154673,-0.849427,-0.359227,0.332973,0.570873,-0.909527,0.671573
20527,GSTK1,0.108205,-0.782695,0.034105,-0.540795,-0.454095,-0.797795,0.531305,0.310605,0.204105,...,-0.454195,-0.466595,-0.011995,-0.503195,0.512405,-0.584495,-1.756895,0.506805,-0.114895,-0.189095
20528,SELP,0.595367,3.114267,0.571467,3.985967,2.893167,1.805567,2.445467,2.575967,1.336567,...,2.817667,-0.301333,2.508367,-0.518033,-1.540033,1.663867,-0.275933,-0.073933,0.848867,1.195667


In [11]:
# Preview the raw LUSC table (same layout as the LUAD table above).
df_lusu

Unnamed: 0,sample,TCGA-18-3417-01,TCGA-22-4613-01,TCGA-90-7769-01,TCGA-77-A5G1-01,TCGA-77-A5G3-01,TCGA-66-2766-01,TCGA-37-4135-01,TCGA-56-8201-01,TCGA-56-7582-11,...,TCGA-77-8144-01,TCGA-J1-A4AH-01,TCGA-56-7580-01,TCGA-63-A5MY-01,TCGA-33-AASL-01,TCGA-85-A512-01,TCGA-85-8354-01,TCGA-O2-A5IB-01,TCGA-77-7335-01,TCGA-56-7731-11
0,ARHGEF10L,-2.032992,-1.109192,-1.270392,0.054708,-1.344192,-1.145092,-0.737892,-0.619892,-0.050992,...,-2.237692,-1.395092,-1.926792,-1.335292,-0.762292,-1.286992,-1.847792,0.568508,-0.849892,-0.888992
1,HIF3A,-0.775126,-1.023426,-3.254826,0.075174,0.826174,1.306874,-2.036826,-0.015326,2.772874,...,-5.006326,3.974574,3.154774,4.021874,-0.831926,-1.863426,-1.297326,0.025974,-2.414126,4.229474
2,RNF17,0.573765,-0.531035,-0.090835,-0.531035,-0.531035,-0.049535,0.829765,-0.531035,3.204265,...,-0.531035,1.349965,4.157765,1.554065,0.984465,-0.531035,-0.531035,0.084865,0.192865,-0.531035
3,RNF10,0.365228,-0.326772,0.160728,-0.147472,-0.364672,-0.697672,-0.765472,0.068428,-0.010572,...,-0.381072,-0.527472,0.171028,-0.292972,-0.651572,0.020328,-0.280072,-0.009372,0.197228,0.162228
4,RNF11,0.364522,0.308122,0.368322,0.826222,-0.312978,-0.792078,0.583822,-0.573278,0.323522,...,0.075622,-0.513978,-0.604678,0.479122,-0.446678,-0.560978,-0.560278,-0.136278,0.443922,0.344522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20525,PTRF,1.025014,-0.396686,0.094714,1.569614,1.269914,-0.263486,-2.675286,1.638114,2.118814,...,1.972214,0.993014,-0.233586,-0.222586,0.774714,0.434414,1.246914,-2.425586,1.550114,2.468514
20526,BCL6B,-0.381027,-0.781327,-2.267827,-0.409227,-0.542127,-0.932927,-1.847227,0.972273,4.176973,...,-0.010327,-1.413827,-0.132027,-1.714727,-2.306727,-0.491427,-0.070027,-0.734427,0.294773,2.667973
20527,GSTK1,0.839305,-0.732495,-1.137095,-0.003395,-1.324995,-0.113095,0.841305,-0.320395,-0.376795,...,-1.539295,-1.318095,-0.900095,0.251605,0.406205,0.318305,-0.620695,-1.926795,0.369005,-0.264695
20528,SELP,-1.085033,1.733867,-1.210233,1.761467,-1.784433,-0.762733,-2.145733,0.812667,1.800567,...,-3.797533,0.044667,-1.761533,-0.018533,-2.346733,-1.011433,-1.390333,-2.121633,1.909267,3.930867


# Dataset preprocessing 

In [12]:
# Combine the two cohort tables into a single feature frame plus labels via the project helper.
# NOTE(review): the rendered output below shows samples as rows and a `label` column
# (1 for the LUAD samples shown, 0 for the LUSC samples shown) — confirm that the
# returned `df` does not itself contain `label`, otherwise the target leaks into the features.
df, labels, columns = u.dataset_preprocess(df_luad, df_lusu)

Unnamed: 0,label,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,RNF13,GTF2IP1,REM1,MTVR2,...,TULP2,NPY5R,GNGT2,GNGT1,TULP3,PTRF,BCL6B,GSTK1,SELP,SELS
TCGA-69-7978-01,1,0.125808,-1.29493,-0.112935,-1.41187,0.203922,0.0993901,-0.222094,0.504354,-0.423399,...,1.13472,-0.845117,1.76017,-1.28139,0.224623,0.863614,0.802173,0.108205,0.595367,-0.222712
TCGA-62-8399-01,1,0.561708,6.06917,-0.531035,-0.228672,0.0521219,-1.20601,-0.338894,1.44985,0.0394006,...,-0.286078,-0.055517,-0.0282335,0.0480102,0.295223,0.317114,1.07907,-0.782695,3.11427,-0.388912
TCGA-78-7539-01,1,-0.237592,3.58147,0.592065,-0.108372,-0.499978,-0.0254099,0.163006,0.131654,-0.0508994,...,2.20992,-1.58712,1.56537,2.63871,0.0491232,-1.24309,-1.28323,0.0341054,0.571467,0.233588
TCGA-50-5931-11,1,-1.18049,3.92767,0.291065,-0.043472,0.710822,1.05089,-0.564394,1.68395,0.398701,...,-0.748878,0.900483,1.90357,-1.28139,-0.549277,2.80271,2.25047,-0.540795,3.98597,0.370988
TCGA-73-4658-01,1,-0.656192,-0.525926,-0.531035,-0.156672,0.373522,0.44729,-0.438994,1.83155,-0.423399,...,-0.748878,-0.931417,1.91517,-0.17659,0.0423232,1.11071,1.51397,-0.454095,2.89317,0.193788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-85-A512-01,0,-1.28699,-1.86343,-0.531035,0.020328,-0.560978,0.51729,0.519806,-0.748146,-0.423399,...,-0.748878,-1.58712,-0.236033,1.41071,1.65372,0.434414,-0.491427,0.318305,-1.01143,0.392288
TCGA-85-8354-01,0,-1.84779,-1.29733,-0.531035,-0.280072,-0.560278,0.85009,-0.166294,-1.51925,0.602101,...,0.276622,-1.58712,-0.784133,1.46201,0.765023,1.24691,-0.0700266,-0.620695,-1.39033,0.918888
TCGA-O2-A5IB-01,0,0.568508,0.0259737,0.084865,-0.00937199,-0.136278,-1.41811,0.173506,-2.19405,0.622701,...,-0.408378,-1.24662,-1.94053,-1.28139,1.28172,-2.42559,-0.734427,-1.92679,-2.12163,-0.637412
TCGA-77-7335-01,0,-0.849892,-2.41413,0.192865,0.197228,0.443922,0.54789,0.0844055,1.01505,-0.423399,...,-0.748878,-1.58712,2.16037,1.19391,0.197023,1.55011,0.294773,0.369005,1.90927,0.117688


In [13]:
# Hold out 10% of the samples as the final test split. The split is stratified
# on the labels and seeded so that it is reproducible.
df_xtrain, df_xtest, df_ytrain, df_ytest = train_test_split(
    df,
    labels,
    train_size=0.9,
    random_state=seed,
    stratify=labels,
)

In [14]:
# Sanity check: (rows, columns) of the train and test feature frames.
df_xtrain.shape, df_xtest.shape

((1016, 20258), (113, 20258))

In [15]:
# From here on `df` / `labels` refer to the TRAIN split only; the K-fold loop
# below never sees the held-out test rows.
labels = df_ytrain

# Give both frames a fresh 0..n-1 index (the old sample-ID index is dropped).
# `reset_index` returns a new frame, so `df_xtrain` itself is left untouched.
df = df_xtrain.reset_index(drop=True)
df_xtest = df_xtest.reset_index(drop=True)

# Setup hyperparameters

In [16]:
# Re-apply the seed right before the hyperparameter / training section.
u.set_all_seeds(seed)

In [17]:
# --- Model dimensions ---
input_dim = df.shape[1]      # number of columns in the training frame
output_dim = 512             # passed to c.AutoEncoder / c.Classifier below

# --- Optimisation ---
epochs = 100
learning_rate = 0.0001

# --- DataLoader batch sizes ---
batch_size_train = 64
batch_size_test = 32
batch_size_df = 64           # only referenced by the commented-out full-train cells

# Start training and validation using K-Fold

In [18]:
## Load the saved model trained at: 1_XENA_LUNG_GeneExp_AutoEncoder
## This will be reloaded after every K-Fold iteration. It will act as reset weights.
##
## map_location=device remaps the stored tensors onto the device selected above,
## so a checkpoint that was saved on a GPU can also be loaded on a CPU-only machine.
## NOTE: torch.load unpickles the file — only load checkpoints produced by this project.

saved_model = torch.load(
    PATH+"SECOND_ITERATION/models/XENA_LUNG_GeneExp_Autoencoder.kd",
    map_location=device,
)

In [19]:
# Stratified K-fold cross-validation: k shuffled folds, seeded for reproducibility.
k = 10
kfold = StratifiedKFold(n_splits=k, random_state=seed, shuffle=True)

# Running totals of the per-fold average accuracies.
cumulative_train_acc = 0.
cumulative_test_acc = 0.

In [20]:
# Stratified K-fold training / validation of the classifier head.
#
# For every fold:
#   1. slice the train / validation rows and standardise them with a scaler
#      fitted on the fold's training rows only,
#   2. rebuild the encoder from the saved autoencoder weights and freeze it,
#   3. train the classifier with u.train_classifier,
#   4. record the mean of the logged train / validation accuracies.

# Per-fold averages of the accuracies logged by u.train_classifier.
list_avg_train_acc_per_fold=[]
list_avg_valid_acc_per_fold=[]

# Reset the running totals in this cell as well, so that re-running the cell
# does not add a second set of folds on top of a previous run.
cumulative_train_acc, cumulative_test_acc = 0., 0.

# The label array does not change between folds; convert it once.
labels_array = np.asarray(labels)

for fold, (train_index, test_index) in enumerate(kfold.split(df, labels)):

    display(HTML("<h1>Fold: {}</h1>".format(fold+1)))
    ##------------------------------------------------------------------------------------##

    ## collect the rows for train and test
    ## kfold.split yields POSITIONAL indices, so they are used with .iloc directly
    ## (this stays correct whatever the frame's index labels are).
    xtrain, xtest = df.iloc[train_index], df.iloc[test_index]
    ytrain, ytest = labels_array[train_index], labels_array[test_index]

    ## standardise the datasets: fit on the fold's training rows only, then
    ## apply the same transform to the validation rows (no leakage)
    ## https://towardsdatascience.com/feature-scaling-and-normalisation-in-a-nutshell-5319af86f89b
    ## https://stackoverflow.com/questions/49444262/normalize-data-before-or-after-split-of-training-and-testing-data
    scaler = StandardScaler()
    xtrain_scaled = scaler.fit_transform(xtrain)
    xtest_scaled = scaler.transform(xtest)

    ## create train_dataset and test_dataset of class LUNG_GeneExp
    train_dataset = c.LUNG_GeneExp(ytrain, xtrain_scaled)
    test_dataset = c.LUNG_GeneExp(ytest, xtest_scaled)

    ## create dataloaders for train_dataset and test_dataset
    ## NOTE(review): shuffling the validation loader is not needed for evaluation;
    ## it is kept so that the random-number stream (and thus the recorded results)
    ## stays the same as in earlier runs.
    train_loader = DataLoader(train_dataset, batch_size=batch_size_train, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size_test, shuffle=True)

    ##------------------------------------------------------------------------------------##

    torch.cuda.empty_cache()

    ## create an object of class AutoEncoder and load the saved model from 1_XENA_LUNG_GeneExp_AutoEncoder
    ## (reloading per fold resets the encoder to the same starting weights)
    only_encoder = c.AutoEncoder(input_dim, output_dim)
    only_encoder.load_state_dict(saved_model)

    ## detach the decoder part from the saved model (drop the last child module)
    only_encoder = nn.Sequential(*list(only_encoder.children())[:-1])

    ## create an object of class Classifier and pass the only_encoder object
    classifier = c.Classifier(only_encoder, output_dim)
    ## freeze the encoder: only parameters outside classifier.encoder are trained
    for params in classifier.encoder.parameters():
        params.requires_grad=False
    classifier.to(device)

    ##------------------------------------------------------------------------------------##

    ## setup the optimizer (trainable parameters only) and lr_scheduler
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, classifier.parameters()), lr=learning_rate, betas=(0.9, 0.999), weight_decay=0.001, amsgrad=False)
    learn = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, verbose=True)

    ## call the train function
    log_dict = u.train_classifier(
        num_epochs = epochs,
        model = classifier,
        optimizer = optimizer,
        scheduler = learn,
        device = device,
        train_loader = train_loader,
        valid_loader = test_loader,
        skip_epoch_stats = True,
        logging_interval = 500
    )

    ## average the logged accuracies once and reuse the two values below
    fold_train_acc = np.mean(list(log_dict['train_acc']))
    fold_valid_acc = np.mean(list(log_dict['valid_acc']))

    print(f"Train accuracy avg: {fold_train_acc}, Valid accuracy avg: {fold_valid_acc}")
    list_avg_train_acc_per_fold.append(fold_train_acc)
    list_avg_valid_acc_per_fold.append(fold_valid_acc)

    cumulative_train_acc += fold_train_acc
    cumulative_test_acc += fold_valid_acc

    ##------------------------------------------------------------------------------------##

    
# Write the per-fold train and validation accuracies (one value per line)
# into the results folder of the current seed.
#
# The folder is derived from PATH so that it follows the configured data
# root, and it is created when missing so that a finished training run is
# not lost to a FileNotFoundError at write time.
results_dir = PATH + "SECOND_ITERATION/seed=%s/" % str(seed)
os.makedirs(results_dir, exist_ok=True)

with open(results_dir + 'neural_classifier_train_acc_k_fold.txt', 'w') as fp:
    for item in list_avg_train_acc_per_fold:
        fp.write("%s\n" % item)
with open(results_dir + 'neural_classifier_valid_acc_k_fold.txt', 'w') as fp:
    for item in list_avg_valid_acc_per_fold:
        fp.write("%s\n" % item)

print('K-Fold Train and Validate accuracies written')

  0%|          | 0/100 [00:00<?, ?it/s]

Total Training Time: 2.07 min
Train accuracy avg: 99.4737474822998, Valid accuracy avg: 94.91177070617675


  0%|          | 0/100 [00:00<?, ?it/s]

Total Training Time: 2.42 min
Train accuracy avg: 99.50000610351563, Valid accuracy avg: 96.44118301391602


  0%|          | 0/100 [00:00<?, ?it/s]

Total Training Time: 2.36 min
Train accuracy avg: 99.36433563232421, Valid accuracy avg: 97.13726020812989


  0%|          | 0/100 [00:00<?, ?it/s]

Total Training Time: 2.42 min
Train accuracy avg: 99.58643119812012, Valid accuracy avg: 92.62745376586913


  0%|          | 0/100 [00:00<?, ?it/s]

Total Training Time: 2.28 min
Train accuracy avg: 99.4420182800293, Valid accuracy avg: 93.91176948547363


  0%|          | 0/100 [00:00<?, ?it/s]

Total Training Time: 2.37 min
Train accuracy avg: 99.45295845031738, Valid accuracy avg: 94.78431968688965


  0%|          | 0/100 [00:00<?, ?it/s]

Total Training Time: 2.28 min
Train accuracy avg: 99.30492614746093, Valid accuracy avg: 91.53464851379394


  0%|          | 0/100 [00:00<?, ?it/s]

Total Training Time: 2.34 min
Train accuracy avg: 99.47869354248047, Valid accuracy avg: 95.67326522827149


  0%|          | 0/100 [00:00<?, ?it/s]

Total Training Time: 2.30 min
Train accuracy avg: 99.44372077941894, Valid accuracy avg: 96.32673179626465


  0%|          | 0/100 [00:00<?, ?it/s]

Total Training Time: 2.37 min
Train accuracy avg: 99.46995010375977, Valid accuracy avg: 95.8514835357666
K-Fold Train and Validate accuracies written


---
---
---

In [21]:
# u.plot_train_test_k_fold_accuracy(
#     list_avg_train_acc_per_fold,
#     list_avg_valid_acc_per_fold,
#     N=k, 
#     width=0.45,
#     width_mult=1,
#     fig_size=(12, 6), 
#     title='K-FOLD Accuracy Chart ===> Overall avg_train_acc: {:.4f}, Overall avg_valid_acc: {:.4f}'.format(cumulative_train_acc/k, cumulative_test_acc/k),
#     x_ticks=('Fold=1', 'Fold=2', 'Fold=3', 'Fold=4', 'Fold=5'),
#     legends=('Train', 'Validation'),
#     file_path=PATH+"SECOND_ITERATION/seed="+str(seed)+"/classifier_on_k_fold",
# )

# Second-to-last step: train the classifier on the complete train split (do NOT run anymore)

In [22]:
# learn = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, verbose=True)

In [23]:
# df_scaler = StandardScaler()
# df_scaled = df_scaler.fit_transform(df)
# df_scaled_xtest = df_scaler.transform(df_xtest)

In [24]:
# df_dataset = c.LUNG_GeneExp(labels, df_scaled)
# df_loader = DataLoader(df_dataset, batch_size=batch_size_df, shuffle=True)

In [25]:
# u.set_all_seeds(seed)
# torch.cuda.empty_cache()

In [26]:
# only_encoder = c.AutoEncoder(input_dim, output_dim)
# only_encoder.load_state_dict(saved_model)
# only_encoder = nn.Sequential(*list(only_encoder.children())[:-1])

In [27]:
# classifier = c.Classifier(only_encoder, output_dim)
# for params in classifier.encoder.parameters():
#         params.requires_grad=False
# classifier.to(device)

In [28]:
# optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, classifier.parameters()), lr=learning_rate, betas=(0.9, 0.999), weight_decay=0.001, amsgrad=False)

In [29]:
# log_dict = u.train_classifier(
#     num_epochs = epochs,
#     model = classifier,
#     optimizer = optimizer,
#     scheduler = learn,
#     device = device,
#     train_loader = df_loader,
#     skip_epoch_stats = True,
#     logging_interval = 500
# )

# Lastly, test the classifier on the Test Split

In [30]:
# avg_train_acc_final = np.mean(list(log_dict['train_acc']))

# df_test_dataset = c.LUNG_GeneExp(df_ytest, df_scaled_xtest)
# df_test_loader = DataLoader(df_test_dataset, batch_size=batch_size_test, shuffle=True)

# avg_test_acc_final = u.compute_accuracy(classifier, df_test_loader, device)

In [31]:
# u.plot_train_test_k_fold_accuracy(
#     avg_train_acc_final,
#     avg_test_acc_final,
#     N=1, 
#     width=0.35,
#     width_mult=1.5,
#     fig_size=(8, 4), 
#     title='Accuracy score on Train/Test Split',
#     x_ticks=(''),
#     legends=('Train', 'Test'),
#     file_path=PATH+"project_summary_seed_wise/seed="+str(seed)+"/classifier_on_train_test",
# )

# Save the classifier model

In [32]:
# Persist the classifier's weights (state dict) next to the saved autoencoder.
# NOTE(review): the cells that retrain on the complete train split are commented out,
# so `classifier` here is the object created in the LAST K-fold iteration (trained on
# that fold's training rows only) — confirm this is the model that should be saved.
torch.save(classifier.state_dict(), PATH+"SECOND_ITERATION/models/XENA_LUNG_GeneExp_Classifier.kd")

---
---
---

# Level 2 complete !!