# Evaluation of PROMO Genes on Maverick/Unseen/Independent Dataset (ICGC)

In [1]:
# https://stackoverflow.com/questions/21971449/how-do-i-increase-the-cell-width-of-the-jupyter-ipython-notebook-in-my-browser

# Stretch the notebook container to (almost) the full browser width so wide
# DataFrames are readable without horizontal scrolling.
# `display` and `HTML` come from the public `IPython.display` module: importing
# them from `IPython.core.display` is deprecated (since IPython 7.14).
from IPython.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

In [2]:
import numpy as np
import pandas as pd
import seaborn as sea
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
# NOTE(review): `plot_confusion_matrix` and `plot_precision_recall_curve` were
# deprecated in scikit-learn 1.0 and removed in 1.2, so this import raises
# ImportError on current releases. Neither name is used in the cells visible
# here; confirm against the rest of the project before dropping them
# (replacements: ConfusionMatrixDisplay / PrecisionRecallDisplay).
from sklearn.metrics import roc_curve, auc, plot_confusion_matrix, plot_precision_recall_curve, classification_report

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Global plotting look: matplotlib's 'ggplot' style sheet first, then seaborn's
# "ticks" style and "paper" context. The calls are kept in this order because
# each one may overwrite rc settings made by the previous one.
plt.style.use('ggplot')
sea.set_style("ticks")
sea.set_context("paper", font_scale=1.5, rc={"lines.linewidth": 2.5})
# NOTE(review): this opens a single 8x8-inch figure; it does not set the default
# size of figures created later. If a default was intended,
# `plt.rcParams["figure.figsize"]` is the usual setting. Confirm intent first.
plt.figure(figsize=(8,8))
# Inline figure rendering, plus the autoreload extension in mode 2 so edits to
# imported modules are picked up without restarting the kernel.
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
import os
import gc
import time
import copy
import torch
import model_utils as u
import model_evaluation as me

In [4]:
# Reproducibility switches: CUBLAS_WORKSPACE_CONFIG is the environment setting
# PyTorch's reproducibility notes ask for alongside deterministic algorithms on
# CUDA; the second call makes PyTorch use deterministic implementations only.
os.environ.update({"CUBLAS_WORKSPACE_CONFIG": ":16:8"})
torch.use_deterministic_algorithms(mode=True)

In [5]:
import warnings
warnings.simplefilter('ignore')

In [6]:
# https://pytorch.org/docs/stable/notes/randomness.html
# Single seed for the whole run; further down it is also passed to me.k_fold and
# embedded in the PATH argument given to it.
seed = 322
# NOTE(review): set_all_seeds lives in model_utils (not visible here); presumably
# it seeds the Python / NumPy / PyTorch RNGs. Confirm.
u.set_all_seeds(seed)

---
---
---

In [7]:
# Base directory handed to u.read_promo_genes and used as the prefix of the PATH
# argument passed to me.k_fold below.
# NOTE(review): machine-specific absolute path; consider making it configurable.
PATH = "D:/CANCER BIOLOGY/DATASET/TCGA/FROM Xena/"

In [8]:
# Load the PROMO gene set from under PATH (helper defined in model_utils, not
# visible here; return type not shown. TODO confirm).
promo_genes = u.read_promo_genes(PATH)

In [9]:
# https://stackoverflow.com/questions/18885175/read-a-zipped-file-as-a-pandas-dataframe
# https://www.analyticsvidhya.com/blog/2021/04/delimiters-in-pandas-read_csv-function/

# Both ICGC expression dumps are gzip-compressed, tab-separated files, so they
# share the same read options.
icgc_read_options = {"compression": "gzip", "sep": "\t"}
icgc_luad = pd.read_csv("D:/CANCER BIOLOGY/DATASET/TCGA/From ICGC/luad_exp_seq.tsv.gz", **icgc_read_options)
icgc_lusu = pd.read_csv("D:/CANCER BIOLOGY/DATASET/TCGA/From ICGC/lusu_exp_seq.tsv.gz", **icgc_read_options)

In [10]:
# Build the evaluation frame and its labels from the two ICGC cohorts and the
# PROMO gene list (helper defined in model_evaluation, not visible here).
# NOTE(review): judging by the output below, the helper also displays both
# per-cohort count tables (with `label` / `pid` columns) and the combined frame,
# whose values look log2-transformed (e.g. 266 -> 8.055282). Confirm in the helper.
icgc_df, icgc_labels = me.prepare_icgc_dataset(icgc_luad, icgc_lusu, promo_genes)

Unnamed: 0,KRT5,DSG3,CALML3,DSC3,LASS3,SERPINB13,BNC1,C10orf99,SPRR2E,CLCA2,...,LOC728758,FTHL3,ID2B,PTTG3P,C9orf69,PRSSL1,LOC442308,PA2G4P4,label,pid
0,266,5,7,11,0,10,6,0,0,14,...,83,1044,6,16,1617,12,2,132,1,TCGA-05-4244-01A-01R-1107-07
1,68,1,1,12,1,2,0,0,0,23,...,89,503,6,6,1770,6,4,104,1,TCGA-05-4249-01A-01R-1107-07
2,721,6,12,55,0,5,8,1,64,41,...,85,611,16,9,2423,7,12,163,1,TCGA-05-4250-01A-01R-1107-07
3,1,1,6,271,0,0,4,0,0,3,...,163,3310,9,6,3217,1,4,209,1,TCGA-05-4382-01A-01R-1206-07
4,12,1,2,5,1,0,1,0,0,8,...,43,1335,2,1,1799,0,1,58,1,TCGA-05-4384-01A-01R-1755-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
538,2226,16,23,109,1,87,8,0,0,146,...,42,412,12,6,879,3,2,110,1,TCGA-MP-A4TJ-01A-51R-A262-07
539,18,1,8,1454,0,0,0,0,1,9,...,30,628,4,10,2322,0,0,109,1,TCGA-MP-A4TK-01A-11R-A24X-07
540,6,3,158,1,0,2,0,0,0,0,...,74,940,18,6,2202,1,7,212,1,TCGA-MP-A5C7-01A-11R-A262-07
541,151,9,0,1,0,1,5,0,9,55,...,148,656,0,0,1612,0,0,110,1,TCGA-NJ-A7XG-01A-12R-A39D-07


Unnamed: 0,KRT5,DSG3,CALML3,DSC3,LASS3,SERPINB13,BNC1,C10orf99,SPRR2E,CLCA2,...,LOC728758,FTHL3,ID2B,PTTG3P,C9orf69,PRSSL1,LOC442308,PA2G4P4,label,pid
0,378342,30933,20443,18368,596,16438,1613,644,6877,33305,...,28,351,1,10,781,0,0,108,0,TCGA-18-3406-01A-01R-0980-07
1,188128,23977,29552,27355,2590,8665,7163,423,13253,33684,...,21,456,2,11,2216,2,4,74,0,TCGA-18-3407-01A-01R-0980-07
2,55357,14595,2383,11500,2000,3541,1016,440,56,67720,...,26,634,7,17,569,0,0,112,0,TCGA-18-3408-01A-01R-0980-07
3,116346,5502,11442,14090,554,4,7574,1,0,6842,...,42,526,1,10,1524,1,3,75,0,TCGA-18-3409-01A-01R-0980-07
4,395862,6544,43585,3388,28,89,87,6,97,448,...,40,414,8,27,2863,1,4,154,0,TCGA-18-3410-01A-01R-0980-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468,568,285,6539,520,549,624,56,1187,2,969,...,121,1316,6,29,3964,2,1,192,0,TCGA-NC-A5HE-01A-11R-A26W-07
469,692335,66849,3717,65828,6458,259,3861,3003,4314,11845,...,111,327,4,9,1989,0,5,299,0,TCGA-NC-A5HG-01A-11R-A26W-07
470,797271,32526,55765,16628,2073,5117,951,171,1825,30664,...,160,522,4,32,1905,0,9,401,0,TCGA-NC-A5HI-01A-11R-A26W-07
471,431055,70546,3075,38606,3265,14552,10427,796,65,37444,...,138,1074,1,15,3143,1,1,387,0,TCGA-NC-A5HN-01A-11R-A26W-07


Unnamed: 0,KRT5,DSG3,CALML3,DSC3,LASS3,SERPINB13,BNC1,C10orf99,SPRR2E,CLCA2,...,NACA2,RPL7,LOC728758,FTHL3,ID2B,PTTG3P,C9orf69,PRSSL1,LOC442308,PA2G4P4
0,8.055282,2.321928,2.807355,3.459432,0.000000,3.321928,2.584963,0.000000,0.000000,3.807355,...,6.392317,11.866506,6.375039,10.027906,2.584963,4.000000,10.659104,3.584963,1.000000,7.044394
1,6.087463,0.000000,0.000000,3.584963,0.000000,1.000000,0.000000,0.000000,0.000000,4.523562,...,5.672425,11.337064,6.475733,8.974415,2.584963,2.584963,10.789534,2.584963,2.000000,6.700440
2,9.493855,2.584963,3.584963,5.781360,0.000000,2.321928,3.000000,0.000000,6.000000,5.357552,...,6.569856,11.472691,6.409391,9.255029,4.000000,3.169925,11.242579,2.807355,3.584963,7.348728
3,0.000000,0.000000,2.584963,8.082149,0.000000,0.000000,2.000000,0.000000,0.000000,1.584963,...,8.515700,10.878051,7.348728,11.692616,3.169925,2.584963,11.651500,0.000000,2.000000,7.707359
4,3.584963,0.000000,1.000000,2.321928,0.000000,0.000000,0.000000,0.000000,0.000000,3.000000,...,6.781360,10.121534,5.426265,10.382624,1.000000,0.000000,10.812979,0.000000,0.000000,5.857981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1011,9.149747,8.154818,12.674854,9.022368,9.100662,9.285402,5.807355,10.213104,1.000000,9.920353,...,8.383704,10.992230,6.918863,10.361944,2.584963,4.857981,11.952741,1.000000,0.000000,7.584963
1012,19.401111,16.028618,11.859923,16.006414,12.656872,8.016808,11.914759,11.552189,12.074810,13.531991,...,8.214319,12.110483,6.794416,8.353147,2.000000,3.169925,10.957828,0.000000,2.321928,8.224002
1013,19.604711,14.989306,15.767072,14.021327,11.017504,12.321083,9.893302,7.417853,10.833681,14.904258,...,9.312883,10.820179,7.321928,9.027906,2.000000,5.000000,10.895575,0.000000,3.169925,8.647458
1014,18.717512,16.106277,11.586371,15.236537,11.672867,13.828930,13.348037,9.636625,6.022368,15.192447,...,8.607330,12.266787,7.108524,10.068778,0.000000,3.906891,11.617927,0.000000,0.000000,8.596190


In [11]:
# Show the combined ICGC frame (the rendered view below is truncated in both rows and columns).
icgc_df

Unnamed: 0,KRT5,DSG3,CALML3,DSC3,LASS3,SERPINB13,BNC1,C10orf99,SPRR2E,CLCA2,...,NACA2,RPL7,LOC728758,FTHL3,ID2B,PTTG3P,C9orf69,PRSSL1,LOC442308,PA2G4P4
0,8.055282,2.321928,2.807355,3.459432,0.000000,3.321928,2.584963,0.000000,0.000000,3.807355,...,6.392317,11.866506,6.375039,10.027906,2.584963,4.000000,10.659104,3.584963,1.000000,7.044394
1,6.087463,0.000000,0.000000,3.584963,0.000000,1.000000,0.000000,0.000000,0.000000,4.523562,...,5.672425,11.337064,6.475733,8.974415,2.584963,2.584963,10.789534,2.584963,2.000000,6.700440
2,9.493855,2.584963,3.584963,5.781360,0.000000,2.321928,3.000000,0.000000,6.000000,5.357552,...,6.569856,11.472691,6.409391,9.255029,4.000000,3.169925,11.242579,2.807355,3.584963,7.348728
3,0.000000,0.000000,2.584963,8.082149,0.000000,0.000000,2.000000,0.000000,0.000000,1.584963,...,8.515700,10.878051,7.348728,11.692616,3.169925,2.584963,11.651500,0.000000,2.000000,7.707359
4,3.584963,0.000000,1.000000,2.321928,0.000000,0.000000,0.000000,0.000000,0.000000,3.000000,...,6.781360,10.121534,5.426265,10.382624,1.000000,0.000000,10.812979,0.000000,0.000000,5.857981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1011,9.149747,8.154818,12.674854,9.022368,9.100662,9.285402,5.807355,10.213104,1.000000,9.920353,...,8.383704,10.992230,6.918863,10.361944,2.584963,4.857981,11.952741,1.000000,0.000000,7.584963
1012,19.401111,16.028618,11.859923,16.006414,12.656872,8.016808,11.914759,11.552189,12.074810,13.531991,...,8.214319,12.110483,6.794416,8.353147,2.000000,3.169925,10.957828,0.000000,2.321928,8.224002
1013,19.604711,14.989306,15.767072,14.021327,11.017504,12.321083,9.893302,7.417853,10.833681,14.904258,...,9.312883,10.820179,7.321928,9.027906,2.000000,5.000000,10.895575,0.000000,3.169925,8.647458
1014,18.717512,16.106277,11.586371,15.236537,11.672867,13.828930,13.348037,9.636625,6.022368,15.192447,...,8.607330,12.266787,7.108524,10.068778,0.000000,3.906891,11.617927,0.000000,0.000000,8.596190


In [12]:
# icgc_df = icgc_df.apply(lambda col:pd.to_numeric(col, errors='coerce'))

In [13]:
# icgc_df.dtypes

In [14]:
# icgc_df.var()

In [15]:
# icgc_df = np.log2(icgc_df)

In [16]:
# icgc_df.replace([np.inf, -np.inf], 0, inplace=True)

In [17]:
# icgc_df

---
---
---

In [18]:
# icgc_xtrain, icgc_xtest, icgc_ytrain, icgc_ytest = train_test_split(icgc_df, icgc_labels, train_size=0.9, random_state=seed, stratify=icgc_labels)

In [19]:
# No hold-out split is taken (the train_test_split cell is commented out): the
# full ICGC frame and label vector are used as-is under the *train names.
icgc_xtrain, icgc_ytrain = icgc_df, icgc_labels

In [20]:
# print(sum(map(lambda x: x==0, icgc_ytrain)))
# print(sum(map(lambda x: x==1, icgc_ytrain)))

In [21]:
# icgc_xtest

In [22]:
# print(sum(map(lambda x: x==0, icgc_ytest)))
# print(sum(map(lambda x: x==1, icgc_ytest)))

In [23]:
# Renumber the rows 0..n-1. The result is rebound instead of using inplace=True
# because icgc_xtrain is the very same object as icgc_df (plain assignment, no
# copy), so an in-place reset would silently modify icgc_df as well.
icgc_xtrain = icgc_xtrain.reset_index(drop=True)
# icgc_xtest.reset_index(drop=True, inplace=True)

In [24]:
# me.plot_feature_importance(icgc_xtrain, icgc_ytrain, PATH=PATH+"essential_genes_set/ICGC_Analysis/seed="+str(seed))

In [25]:
# Run the k-fold evaluation (me.k_fold, defined in model_evaluation) on the full
# ICGC set; the PATH argument carries the seed so runs with different seeds do
# not share a location.
icgc_results_path = f"{PATH}essential_genes_set/ICGC_Analysis/seed={seed}"
me.k_fold(
    icgc_xtrain,
    icgc_ytrain,
    colors=['violet', 'dodgerblue', 'lime'],
    seed=seed,
    PATH=icgc_results_path,
)

  0%|          | 0/4 [00:00<?, ?it/s]

[[45  2]
 [ 4 51]]
0.993036750483559
[[45  2]
 [ 5 50]]
0.9895551257253385
[[45  2]
 [ 2 53]]
0.9961315280464217
[[45  2]
 [ 1 54]]
0.9965183752417796
----------


  0%|          | 0/4 [00:00<?, ?it/s]

[[46  1]
 [ 4 51]]
0.983752417794971
[[45  2]
 [ 3 52]]
0.9810444874274661
[[41  6]
 [ 2 53]]
0.9771760154738879
[[45  2]
 [ 3 52]]
0.981431334622824
----------


  0%|          | 0/4 [00:00<?, ?it/s]

[[44  3]
 [ 2 53]]
0.995357833655706
[[45  2]
 [ 2 53]]
0.9949709864603482
[[44  3]
 [ 1 54]]
0.9918762088974855
[[43  4]
 [ 1 54]]
0.9926499032882011
----------


  0%|          | 0/4 [00:00<?, ?it/s]

[[43  5]
 [ 1 53]]
0.9791666666666667
[[42  6]
 [ 0 54]]
0.9722222222222222
[[42  6]
 [ 2 52]]
0.9756944444444444
[[43  5]
 [ 0 54]]
0.9861111111111112
----------


  0%|          | 0/4 [00:00<?, ?it/s]

[[43  5]
 [ 5 49]]
0.9753086419753085
[[43  5]
 [ 4 50]]
0.970679012345679
[[45  3]
 [ 5 49]]
0.9691358024691358
[[43  5]
 [ 4 50]]
0.9664351851851852
----------


  0%|          | 0/4 [00:00<?, ?it/s]

[[47  1]
 [ 2 52]]
0.996141975308642
[[47  1]
 [ 1 53]]
0.9938271604938271
[[45  3]
 [ 2 52]]
0.9930555555555556
[[47  1]
 [ 1 53]]
0.9972993827160495
----------


  0%|          | 0/4 [00:00<?, ?it/s]

[[47  0]
 [ 1 53]]
0.9976359338061466
[[47  0]
 [ 2 52]]
1.0
[[47  0]
 [ 3 51]]
0.9992119779353822
[[47  0]
 [ 1 53]]
0.9984239558707644
----------


  0%|          | 0/4 [00:00<?, ?it/s]

[[42  5]
 [ 1 53]]
0.9822695035460993
[[44  3]
 [ 1 53]]
0.9802994483845549
[[42  5]
 [ 1 53]]
0.9818754925137905
[[41  6]
 [ 1 53]]
0.9732072498029944
----------


  0%|          | 0/4 [00:00<?, ?it/s]

[[42  5]
 [ 1 53]]
0.987391646966115
[[43  4]
 [ 1 53]]
0.9842395587076438
[[43  4]
 [ 1 53]]
0.9901497241922773
[[43  4]
 [ 1 53]]
0.9858156028368794
----------


  0%|          | 0/4 [00:00<?, ?it/s]

[[46  1]
 [ 0 54]]
0.9988179669030732
[[46  1]
 [ 2 52]]
0.9956658786446021
[[45  2]
 [ 1 53]]
0.9988179669030733
[[46  1]
 [ 0 54]]
1.0
----------


Unnamed: 0,Avg Train Accuracy,AUC,Avg Test Accuracy
MLPClassifier,0.97605,0.988888,0.951825
LogisticRegression,0.970582,0.98625,0.953786
XGBClassifier,1.0,0.987312,0.946894
Support Vector Classifier,0.970909,0.987789,0.957707


---
---
---

In [26]:
# df_intersect_scaler = StandardScaler()
# icgc_X_train_scaled = df_intersect_scaler.fit_transform(icgc_xtrain)
# icgc_X_test_scaled = df_intersect_scaler.transform(icgc_xtest)

In [27]:
# roc_color= ['slateblue', 'darkorange', 'darkkhaki', 'firebrick']
# reverse_roc_color= ['aquamarine', 'darkgray', 'teal', 'cornflowerblue']
# visuals = ['Purples', 'YlOrBr', 'PRGn', 'RdYlGn']

# auc_roc_all = []
# test_acc_all = []

# for i, classifier in enumerate(me.get_sklearn_classifiers(seed)):
#     display(HTML("<hr><hr>"))
#     t1, t2 = me.plot_report_cm_auc(icgc_X_train_scaled, icgc_ytrain, icgc_X_test_scaled, icgc_ytest, classifier, visuals[i], roc_color[i], reverse_roc_color[i], PATH=PATH+"essential_genes_set/ICGC_Analysis/seed="+str(seed))
#     test_acc_all.append(t1)
#     auc_roc_all.append(t2)

In [28]:
# me.plot_accuracy_summary(auc_roc_all, test_acc_all,  colors=['violet', 'dodgerblue'], PATH=PATH+"essential_genes_set/ICGC_Analysis/seed="+str(seed))

# Level 4 complete !!

---
---
---

# annnnnnd..... Scene !!