In [1]:
# https://stackoverflow.com/questions/21971449/how-do-i-increase-the-cell-width-of-the-jupyter-ipython-notebook-in-my-browser

# Inject a CSS rule that sets the `.container` element to 98% width.
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

In [2]:
import numpy as np
import pandas as pd
import seaborn as sea
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_curve, auc, plot_confusion_matrix, plot_precision_recall_curve, classification_report


plt.style.use('ggplot')
sea.set_style("ticks")
sea.set_context("paper", font_scale=1.5, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(8,8))
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
import os
import gc
import time
import copy
import torch
import torch.nn as nn
import model_utils as u
import torch.nn.functional as F
from torch.autograd import Variable
from torchvision import utils, transforms
from torch.utils.data import Dataset, DataLoader

In [4]:
# Reproducibility switches (see the PyTorch randomness notes):
# 1) export the cuBLAS workspace configuration through the environment,
# 2) ask PyTorch to use deterministic algorithms.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
torch.use_deterministic_algorithms(True)

In [5]:
import warnings
warnings.simplefilter('ignore')

In [6]:
# https://pytorch.org/docs/stable/notes/randomness.html
# Single seed value used for the whole notebook.
seed = 322
# `set_all_seeds` lives in the project module `model_utils` (not shown here);
# presumably it seeds every RNG in use (Python / NumPy / PyTorch) — TODO confirm.
u.set_all_seeds(seed)

In [7]:
# Root folder of the UCSC Xena downloads. Every later cell builds file names
# as `PATH + "<relative name>"`, so PATH must end with a path separator.
#
# The location can be overridden without editing the notebook by setting the
# TCGA_XENA_DATA_DIR environment variable, e.g. on Colab:
#   "/content/drive/MyDrive/LUNG_PanCan/XENA Repository/"
# When the variable is unset, the original local default is used.
PATH = os.environ.get("TCGA_XENA_DATA_DIR", "D:/CANCER BIOLOGY/DATASET/TCGA/FROM Xena/")

# Guarantee the trailing separator that the string concatenations rely on.
if not PATH.endswith(("/", "\\")):
    PATH += "/"

In [8]:
# https://stackoverflow.com/questions/18885175/read-a-zipped-file-as-a-pandas-dataframe
# https://www.analyticsvidhya.com/blog/2021/04/delimiters-in-pandas-read_csv-function/

# Both Xena downloads are read as gzip-compressed, tab-separated tables.
xena_read_options = {"compression": "gzip", "sep": "\t"}

# LUAD (lung adenocarcinoma) expression table.
df_luad = pd.read_csv(PATH + "TCGA.LUAD.sampleMap_HiSeqV2_PANCAN.gz", **xena_read_options)
# LUSC (lung squamous cell carcinoma) expression table.
df_lusu = pd.read_csv(PATH + "TCGA.LUSC.sampleMap_HiSeqV2_PANCAN.gz", **xena_read_options)

In [9]:
# Preview the LUAD table as loaded: a 'sample' column holding gene symbols,
# followed by one column per TCGA sample barcode.
df_luad

Unnamed: 0,sample,TCGA-69-7978-01,TCGA-62-8399-01,TCGA-78-7539-01,TCGA-50-5931-11,TCGA-73-4658-01,TCGA-44-6775-01,TCGA-44-2655-01,TCGA-44-3398-01,TCGA-62-8397-01,...,TCGA-75-7025-01,TCGA-55-7726-01,TCGA-L9-A743-01,TCGA-86-8358-01,TCGA-55-6972-01,TCGA-55-7727-01,TCGA-91-6831-01,TCGA-MN-A4N4-01,TCGA-55-8302-01,TCGA-MP-A4TK-01
0,ARHGEF10L,0.125808,0.561708,-0.237592,-1.180492,-0.656192,0.139908,-0.537692,-0.839092,0.677108,...,0.226508,-2.342092,-0.207692,-0.659792,-1.651292,-2.621192,-1.025192,0.070108,0.305608,0.263208
1,HIF3A,-1.294926,6.069174,3.581474,3.927674,-0.525926,-1.497426,-0.021226,0.179974,1.092974,...,2.539674,-1.259526,-0.387226,3.689474,3.509374,1.986874,-1.993426,2.790974,-0.018326,4.657474
2,RNF17,-0.112935,-0.531035,0.592065,0.291065,-0.531035,0.475865,0.071065,-0.531035,-0.531035,...,-0.068235,-0.531035,0.428265,0.202865,0.567665,0.408165,-0.531035,0.440465,-0.531035,0.049365
3,RNF10,-1.411872,-0.228672,-0.108372,-0.043472,-0.156672,-0.605472,0.139328,-0.450172,0.583528,...,-0.451572,0.261228,-0.331772,-0.213372,-0.189472,0.091028,0.492828,0.037428,0.003728,-0.334572
4,RNF11,0.203922,0.052122,-0.499978,0.710822,0.373522,0.129022,0.436522,0.529622,0.314922,...,-0.155778,0.362522,-0.520578,0.031222,-0.966478,0.318322,0.150822,-0.357778,-0.451578,0.156422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20525,PTRF,0.863614,0.317114,-1.243086,2.802714,1.110714,0.879814,-0.028286,0.008714,-0.005186,...,0.544614,1.638214,0.381814,-1.080186,-2.139886,-1.495486,0.472314,0.993014,0.634014,1.222714
20526,BCL6B,0.802173,1.079073,-1.283227,2.250473,1.513973,0.126473,0.576073,0.643573,-0.735227,...,0.842873,0.680873,0.642773,0.154673,-0.849427,-0.359227,0.332973,0.570873,-0.909527,0.671573
20527,GSTK1,0.108205,-0.782695,0.034105,-0.540795,-0.454095,-0.797795,0.531305,0.310605,0.204105,...,-0.454195,-0.466595,-0.011995,-0.503195,0.512405,-0.584495,-1.756895,0.506805,-0.114895,-0.189095
20528,SELP,0.595367,3.114267,0.571467,3.985967,2.893167,1.805567,2.445467,2.575967,1.336567,...,2.817667,-0.301333,2.508367,-0.518033,-1.540033,1.663867,-0.275933,-0.073933,0.848867,1.195667


In [10]:
# Preview the LUSC table as loaded: a 'sample' column holding gene symbols,
# followed by one column per TCGA sample barcode.
df_lusu

Unnamed: 0,sample,TCGA-18-3417-01,TCGA-22-4613-01,TCGA-90-7769-01,TCGA-77-A5G1-01,TCGA-77-A5G3-01,TCGA-66-2766-01,TCGA-37-4135-01,TCGA-56-8201-01,TCGA-56-7582-11,...,TCGA-77-8144-01,TCGA-J1-A4AH-01,TCGA-56-7580-01,TCGA-63-A5MY-01,TCGA-33-AASL-01,TCGA-85-A512-01,TCGA-85-8354-01,TCGA-O2-A5IB-01,TCGA-77-7335-01,TCGA-56-7731-11
0,ARHGEF10L,-2.032992,-1.109192,-1.270392,0.054708,-1.344192,-1.145092,-0.737892,-0.619892,-0.050992,...,-2.237692,-1.395092,-1.926792,-1.335292,-0.762292,-1.286992,-1.847792,0.568508,-0.849892,-0.888992
1,HIF3A,-0.775126,-1.023426,-3.254826,0.075174,0.826174,1.306874,-2.036826,-0.015326,2.772874,...,-5.006326,3.974574,3.154774,4.021874,-0.831926,-1.863426,-1.297326,0.025974,-2.414126,4.229474
2,RNF17,0.573765,-0.531035,-0.090835,-0.531035,-0.531035,-0.049535,0.829765,-0.531035,3.204265,...,-0.531035,1.349965,4.157765,1.554065,0.984465,-0.531035,-0.531035,0.084865,0.192865,-0.531035
3,RNF10,0.365228,-0.326772,0.160728,-0.147472,-0.364672,-0.697672,-0.765472,0.068428,-0.010572,...,-0.381072,-0.527472,0.171028,-0.292972,-0.651572,0.020328,-0.280072,-0.009372,0.197228,0.162228
4,RNF11,0.364522,0.308122,0.368322,0.826222,-0.312978,-0.792078,0.583822,-0.573278,0.323522,...,0.075622,-0.513978,-0.604678,0.479122,-0.446678,-0.560978,-0.560278,-0.136278,0.443922,0.344522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20525,PTRF,1.025014,-0.396686,0.094714,1.569614,1.269914,-0.263486,-2.675286,1.638114,2.118814,...,1.972214,0.993014,-0.233586,-0.222586,0.774714,0.434414,1.246914,-2.425586,1.550114,2.468514
20526,BCL6B,-0.381027,-0.781327,-2.267827,-0.409227,-0.542127,-0.932927,-1.847227,0.972273,4.176973,...,-0.010327,-1.413827,-0.132027,-1.714727,-2.306727,-0.491427,-0.070027,-0.734427,0.294773,2.667973
20527,GSTK1,0.839305,-0.732495,-1.137095,-0.003395,-1.324995,-0.113095,0.841305,-0.320395,-0.376795,...,-1.539295,-1.318095,-0.900095,0.251605,0.406205,0.318305,-0.620695,-1.926795,0.369005,-0.264695
20528,SELP,-1.085033,1.733867,-1.210233,1.761467,-1.784433,-0.762733,-2.145733,0.812667,1.800567,...,-3.797533,0.044667,-1.761533,-0.018533,-2.346733,-1.011433,-1.390333,-2.121633,1.909267,3.930867


---
---
---

# Make raw dataset compatible with PROMO

In [11]:
# Promote the 'sample' column (gene symbols) to the row index of each table.
df_luad = df_luad.set_index(keys='sample')
df_lusu = df_lusu.set_index(keys='sample')

In [12]:
# Flip both tables so the previous row index becomes the columns
# and the previous columns become the rows.
df_luad = df_luad.transpose()
df_lusu = df_lusu.transpose()

In [13]:
# Add a constant 'label' column as the first column of each table,
# tagging every row with its cancer type.
df_luad.insert(loc=0, column='label', value='LUAD')
df_lusu.insert(loc=0, column='label', value='LUSC')

In [14]:
# Make 'label' the row index. This replaces the existing row index, so from
# here on each row is identified only by its class label.
df_luad = df_luad.set_index(keys='label')
df_lusu = df_lusu.set_index(keys='label')

In [15]:
# Inspect the reshaped LUAD table: rows indexed by 'label', one column per gene.
df_luad

sample,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,RNF13,GTF2IP1,REM1,MTVR2,RTN4RL2,...,TULP2,NPY5R,GNGT2,GNGT1,TULP3,PTRF,BCL6B,GSTK1,SELP,SELS
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LUAD,0.125808,-1.294926,-0.112935,-1.411872,0.203922,0.09939,-0.222094,0.504354,-0.423399,2.895028,...,1.134722,-0.845117,1.760167,-1.28139,0.224623,0.863614,0.802173,0.108205,0.595367,-0.222712
LUAD,0.561708,6.069174,-0.531035,-0.228672,0.052122,-1.20601,-0.338894,1.449854,0.039401,0.274628,...,-0.286078,-0.055517,-0.028233,0.04801,0.295223,0.317114,1.079073,-0.782695,3.114267,-0.388912
LUAD,-0.237592,3.581474,0.592065,-0.108372,-0.499978,-0.02541,0.163006,0.131654,-0.050899,-1.770772,...,2.209922,-1.587117,1.565367,2.63871,0.049123,-1.243086,-1.283227,0.034105,0.571467,0.233588
LUAD,-1.180492,3.927674,0.291065,-0.043472,0.710822,1.05089,-0.564394,1.683954,0.398701,-1.954472,...,-0.748878,0.900483,1.903567,-1.28139,-0.549277,2.802714,2.250473,-0.540795,3.985967,0.370988
LUAD,-0.656192,-0.525926,-0.531035,-0.156672,0.373522,0.44729,-0.438994,1.831554,-0.423399,2.108328,...,-0.748878,-0.931417,1.915167,-0.17659,0.042323,1.110714,1.513973,-0.454095,2.893167,0.193788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LUAD,-2.621192,1.986874,0.408165,0.091028,0.318322,0.16679,0.143706,-1.370846,-0.423399,-0.218672,...,-0.748878,-0.647917,1.158367,1.77851,0.555323,-1.495486,-0.359227,-0.584495,1.663867,0.188988
LUAD,-1.025192,-1.993426,-0.531035,0.492828,0.150822,-0.53401,-1.071294,-0.701346,-0.423399,2.515128,...,0.076322,-1.587117,-0.369733,0.44771,0.319623,0.472314,0.332973,-1.756895,-0.275933,0.732388
LUAD,0.070108,2.790974,0.440465,0.037428,-0.357778,-0.32071,-1.039394,1.124154,-0.423399,3.996128,...,2.952822,-1.587117,0.024967,1.66841,0.566123,0.993014,0.570873,0.506805,-0.073933,-0.243412
LUAD,0.305608,-0.018326,-0.531035,0.003728,-0.451578,-0.38671,-0.084494,-0.217746,-0.423399,-0.706872,...,-0.748878,-1.587117,1.078867,-1.28139,0.280623,0.634014,-0.909527,-0.114895,0.848867,0.195588


In [16]:
# Inspect the reshaped LUSC table: rows indexed by 'label', one column per gene.
df_lusu

sample,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,RNF13,GTF2IP1,REM1,MTVR2,RTN4RL2,...,TULP2,NPY5R,GNGT2,GNGT1,TULP3,PTRF,BCL6B,GSTK1,SELP,SELS
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LUSC,-2.032992,-0.775126,0.573765,0.365228,0.364522,0.31389,-0.245994,0.016254,-0.423399,-1.116572,...,-0.748878,-1.587117,0.304967,2.60701,1.302623,1.025014,-0.381027,0.839305,-1.085033,0.533188
LUSC,-1.109192,-1.023426,-0.531035,-0.326772,0.308122,1.19219,-0.814294,-0.618246,1.046401,-2.143072,...,-0.748878,-1.587117,0.779667,1.15821,0.124023,-0.396686,-0.781327,-0.732495,1.733867,0.624488
LUSC,-1.270392,-3.254826,-0.090835,0.160728,0.368322,-0.52621,0.506406,-1.370946,0.016801,-3.830172,...,0.028022,-1.587117,-2.528233,2.07981,-0.431477,0.094714,-2.267827,-1.137095,-1.210233,0.153688
LUSC,0.054708,0.075174,-0.531035,-0.147472,0.826222,0.38929,-0.713794,0.012254,-0.423399,-0.968672,...,0.157422,-1.587117,0.382067,1.36141,0.588023,1.569614,-0.409227,-0.003395,1.761467,0.738688
LUSC,-1.344192,0.826174,-0.531035,-0.364672,-0.312978,0.57109,-0.247594,-1.987746,0.466601,-0.645172,...,0.141122,-1.587117,-1.158633,0.99351,0.583523,1.269914,-0.542127,-1.324995,-1.784433,0.881688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LUSC,-1.286992,-1.863426,-0.531035,0.020328,-0.560978,0.51729,0.519806,-0.748146,-0.423399,-2.228572,...,-0.748878,-1.587117,-0.236033,1.41071,1.653723,0.434414,-0.491427,0.318305,-1.011433,0.392288
LUSC,-1.847792,-1.297326,-0.531035,-0.280072,-0.560278,0.85009,-0.166294,-1.519246,0.602101,-2.013972,...,0.276622,-1.587117,-0.784133,1.46201,0.765023,1.246914,-0.070027,-0.620695,-1.390333,0.918888
LUSC,0.568508,0.025974,0.084865,-0.009372,-0.136278,-1.41811,0.173506,-2.194046,0.622701,2.913828,...,-0.408378,-1.246617,-1.940533,-1.28139,1.281723,-2.425586,-0.734427,-1.926795,-2.121633,-0.637412
LUSC,-0.849892,-2.414126,0.192865,0.197228,0.443922,0.54789,0.084406,1.015054,-0.423399,0.130728,...,-0.748878,-1.587117,2.160367,1.19391,0.197023,1.550114,0.294773,0.369005,1.909267,0.117688


In [17]:
# Stack the two cohorts row-wise: all LUAD rows first, then all LUSC rows.
luad_lusu_raw_data = pd.concat(objs=[df_luad, df_lusu], axis=0)

In [18]:
# Inspect the combined table (LUAD rows followed by LUSC rows).
luad_lusu_raw_data

sample,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,RNF13,GTF2IP1,REM1,MTVR2,RTN4RL2,...,TULP2,NPY5R,GNGT2,GNGT1,TULP3,PTRF,BCL6B,GSTK1,SELP,SELS
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LUAD,0.125808,-1.294926,-0.112935,-1.411872,0.203922,0.09939,-0.222094,0.504354,-0.423399,2.895028,...,1.134722,-0.845117,1.760167,-1.28139,0.224623,0.863614,0.802173,0.108205,0.595367,-0.222712
LUAD,0.561708,6.069174,-0.531035,-0.228672,0.052122,-1.20601,-0.338894,1.449854,0.039401,0.274628,...,-0.286078,-0.055517,-0.028233,0.04801,0.295223,0.317114,1.079073,-0.782695,3.114267,-0.388912
LUAD,-0.237592,3.581474,0.592065,-0.108372,-0.499978,-0.02541,0.163006,0.131654,-0.050899,-1.770772,...,2.209922,-1.587117,1.565367,2.63871,0.049123,-1.243086,-1.283227,0.034105,0.571467,0.233588
LUAD,-1.180492,3.927674,0.291065,-0.043472,0.710822,1.05089,-0.564394,1.683954,0.398701,-1.954472,...,-0.748878,0.900483,1.903567,-1.28139,-0.549277,2.802714,2.250473,-0.540795,3.985967,0.370988
LUAD,-0.656192,-0.525926,-0.531035,-0.156672,0.373522,0.44729,-0.438994,1.831554,-0.423399,2.108328,...,-0.748878,-0.931417,1.915167,-0.17659,0.042323,1.110714,1.513973,-0.454095,2.893167,0.193788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LUSC,-1.286992,-1.863426,-0.531035,0.020328,-0.560978,0.51729,0.519806,-0.748146,-0.423399,-2.228572,...,-0.748878,-1.587117,-0.236033,1.41071,1.653723,0.434414,-0.491427,0.318305,-1.011433,0.392288
LUSC,-1.847792,-1.297326,-0.531035,-0.280072,-0.560278,0.85009,-0.166294,-1.519246,0.602101,-2.013972,...,0.276622,-1.587117,-0.784133,1.46201,0.765023,1.246914,-0.070027,-0.620695,-1.390333,0.918888
LUSC,0.568508,0.025974,0.084865,-0.009372,-0.136278,-1.41811,0.173506,-2.194046,0.622701,2.913828,...,-0.408378,-1.246617,-1.940533,-1.28139,1.281723,-2.425586,-0.734427,-1.926795,-2.121633,-0.637412
LUSC,-0.849892,-2.414126,0.192865,0.197228,0.443922,0.54789,0.084406,1.015054,-0.423399,0.130728,...,-0.748878,-1.587117,2.160367,1.19391,0.197023,1.550114,0.294773,0.369005,1.909267,0.117688


In [19]:
# luad_lusu_raw_data.T.to_csv(PATH+"essential_genes_set/PROMO Analysis/luad_lusu_raw_data.csv", sep='\t')

---
---
---

# Make union of all seed genes compatible with PROMO

In [20]:
# # https://stackoverflow.com/questions/18885175/read-a-zipped-file-as-a-pandas-dataframe
# # https://www.analyticsvidhya.com/blog/2021/04/delimiters-in-pandas-read_csv-function/

# df_luad = pd.read_csv(PATH+"TCGA.LUAD.sampleMap_HiSeqV2_PANCAN.gz", compression = "gzip", sep = "\t")
# df_lusu = pd.read_csv(PATH+"TCGA.LUSC.sampleMap_HiSeqV2_PANCAN.gz", compression = "gzip", sep = "\t")

In [21]:
# df_final= u.prepare_data_for_ranksum_test(df_luad, df_lusu)

In [22]:
# essential_genes_union = u.read_essential_genes_union(PATH)
# essential_genes_union.append('label')

In [23]:
# ## https://stackoverflow.com/questions/40636514/selecting-columns-by-list-and-columns-are-subset-of-list

# df_intersect = df_final[df_final.columns.intersection(essential_genes_union)]

In [24]:
# df_intersect

In [25]:
# df_intersect.set_index('label', inplace=True)

In [26]:
# df_intersect.T.to_csv(PATH+"PROMO Analysis/df_intersect_union_genes.csv", sep='\t')

---
---
---

# Make essential_genes_set dataset compatible with PROMO

In [27]:
# https://stackoverflow.com/questions/18885175/read-a-zipped-file-as-a-pandas-dataframe
# https://www.analyticsvidhya.com/blog/2021/04/delimiters-in-pandas-read_csv-function/

# Load fresh copies of both cohorts: the names df_luad / df_lusu were
# transposed and re-indexed by the PROMO-formatting cells further up.
gzip_tsv_options = dict(compression="gzip", sep="\t")
luad_archive = PATH + "TCGA.LUAD.sampleMap_HiSeqV2_PANCAN.gz"
lusc_archive = PATH + "TCGA.LUSC.sampleMap_HiSeqV2_PANCAN.gz"

df_luad = pd.read_csv(luad_archive, **gzip_tsv_options)
df_lusu = pd.read_csv(lusc_archive, **gzip_tsv_options)

In [28]:
# Build the combined table through the project helper in `model_utils`
# (implementation not shown here). Judging by the rendered result, rows are
# TCGA sample barcodes with a 'label' column followed by one column per
# gene — TODO confirm against the helper.
df_final= u.prepare_data_for_ranksum_test(df_luad, df_lusu)

Unnamed: 0,label,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,RNF13,GTF2IP1,REM1,MTVR2,...,TULP2,NPY5R,GNGT2,GNGT1,TULP3,PTRF,BCL6B,GSTK1,SELP,SELS
TCGA-69-7978-01,LUAD,0.125808,-1.29493,-0.112935,-1.41187,0.203922,0.0993901,-0.222094,0.504354,-0.423399,...,1.13472,-0.845117,1.76017,-1.28139,0.224623,0.863614,0.802173,0.108205,0.595367,-0.222712
TCGA-62-8399-01,LUAD,0.561708,6.06917,-0.531035,-0.228672,0.0521219,-1.20601,-0.338894,1.44985,0.0394006,...,-0.286078,-0.055517,-0.0282335,0.0480102,0.295223,0.317114,1.07907,-0.782695,3.11427,-0.388912
TCGA-78-7539-01,LUAD,-0.237592,3.58147,0.592065,-0.108372,-0.499978,-0.0254099,0.163006,0.131654,-0.0508994,...,2.20992,-1.58712,1.56537,2.63871,0.0491232,-1.24309,-1.28323,0.0341054,0.571467,0.233588
TCGA-50-5931-11,LUAD,-1.18049,3.92767,0.291065,-0.043472,0.710822,1.05089,-0.564394,1.68395,0.398701,...,-0.748878,0.900483,1.90357,-1.28139,-0.549277,2.80271,2.25047,-0.540795,3.98597,0.370988
TCGA-73-4658-01,LUAD,-0.656192,-0.525926,-0.531035,-0.156672,0.373522,0.44729,-0.438994,1.83155,-0.423399,...,-0.748878,-0.931417,1.91517,-0.17659,0.0423232,1.11071,1.51397,-0.454095,2.89317,0.193788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-85-A512-01,LUSC,-1.28699,-1.86343,-0.531035,0.020328,-0.560978,0.51729,0.519806,-0.748146,-0.423399,...,-0.748878,-1.58712,-0.236033,1.41071,1.65372,0.434414,-0.491427,0.318305,-1.01143,0.392288
TCGA-85-8354-01,LUSC,-1.84779,-1.29733,-0.531035,-0.280072,-0.560278,0.85009,-0.166294,-1.51925,0.602101,...,0.276622,-1.58712,-0.784133,1.46201,0.765023,1.24691,-0.0700266,-0.620695,-1.39033,0.918888
TCGA-O2-A5IB-01,LUSC,0.568508,0.0259737,0.084865,-0.00937199,-0.136278,-1.41811,0.173506,-2.19405,0.622701,...,-0.408378,-1.24662,-1.94053,-1.28139,1.28172,-2.42559,-0.734427,-1.92679,-2.12163,-0.637412
TCGA-77-7335-01,LUSC,-0.849892,-2.41413,0.192865,0.197228,0.443922,0.54789,0.0844055,1.01505,-0.423399,...,-0.748878,-1.58712,2.16037,1.19391,0.197023,1.55011,0.294773,0.369005,1.90927,0.117688


In [29]:
# Load the essential-gene names through the project helper in `model_utils`
# (implementation not shown; presumably returns a list of gene symbols —
# TODO confirm).
essential_genes = u.read_essential_genes(PATH)
# Add 'label' so the class column is kept when columns are selected by this list.
essential_genes.append('label')

In [30]:
## https://stackoverflow.com/questions/40636514/selecting-columns-by-list-and-columns-are-subset-of-list

# Keep only the columns of df_final whose names also appear in
# essential_genes. Names in the list that are not columns of df_final are
# skipped without an error, because only the intersection is selected.
#
# .copy() makes df_intersect an independent DataFrame, so the later
# set_index / drop steps modify df_intersect's own data instead of operating
# on a selection derived from df_final.
df_intersect = df_final[df_final.columns.intersection(essential_genes)].copy()

In [31]:
# Inspect the selected columns (the 'label' column plus the retained genes).
df_intersect

Unnamed: 0,label,TMPRSS11D,FLJ44635,C10orf99,NACA2,TMEM40,SPRR1B,LASS3,TMPRSS11A,PTTG3P,...,S100A2,SPRR2A,RNASE7,SPRR2D,HEY1,RHCG,BNC1,SERPINB13,C14orf19,S100A7A
TCGA-69-7978-01,LUAD,-0.845109,-0.998144,-2.7577,-0.256212,-0.159503,5.37031,-2.5496,-1.33597,-0.734992,...,1.50406,-0.280343,-1.73515,5.74156,-0.600271,-4.1308,-1.33005,-1.68125,-0.32905,-0.446876
TCGA-62-8399-01,LUAD,-1.85141,-0.992544,-2.7577,-0.768012,-0.0107027,-2.94299,3.5763,-0.873174,-0.870592,...,-0.576137,-2.40554,-2.15325,-1.93574,-1.64277,-3.668,-1.65285,-1.63655,-1.64165,-1.45318
TCGA-78-7539-01,LUAD,-1.47891,-0.458644,-2.7577,-0.247612,-0.837203,-2.94299,-2.1771,-1.33597,-1.59209,...,-1.12504,-2.40554,-1.48505,-2.39854,-1.99837,-1.1184,-2.99385,-2.09935,0.13245,-1.45318
TCGA-50-5931-11,LUAD,-1.85141,3.23656,-2.7577,0.992388,-1.3847,-2.94299,-1.7275,-1.33597,-1.14249,...,-2.77374,-1.06294,-2.15325,-1.05594,1.03963,2.1652,-2.83995,-2.09935,1.00315,-1.45318
TCGA-73-4658-01,LUAD,-0.746609,-0.234244,-2.7577,0.601888,0.987797,5.18661,-2.5496,-1.33597,-1.30889,...,3.80246,1.59086,2.08225,4.55716,0.296929,-3.026,-1.70715,-2.09935,-0.29975,-0.00637567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-85-A512-01,LUSC,7.52139,-0.102744,6.1288,0.420088,5.194,10.6855,5.8307,5.03793,1.72671,...,4.59366,9.65216,5.74975,9.98326,3.23653,-0.817796,5.47785,7.65565,-0.29205,7.61642
TCGA-85-8354-01,LUSC,4.17569,-1.24654,2.6493,0.270088,5.1918,6.64191,5.854,5.89423,0.778808,...,6.48686,5.01116,-1.12775,5.57076,2.48463,4.7158,5.98285,6.46485,-1.85215,-0.427676
TCGA-O2-A5IB-01,LUSC,-1.00441,-1.71774,-2.7577,-0.157312,0.433197,-0.408895,0.1709,-0.995474,0.00940847,...,2.56396,2.12416,0.880348,-0.00353874,2.89833,0.125004,-1.59345,-1.25235,-1.40815,1.75682
TCGA-77-7335-01,LUSC,10.8078,0.922956,7.7021,0.826388,4.9753,8.25861,7.8454,8.90213,1.46481,...,5.24126,6.54546,2.29635,6.42956,1.02593,7.0034,4.45305,9.63895,-0.89125,-0.729276


In [32]:
# Use 'label' as the row index; the previous row index is replaced by it.
df_intersect = df_intersect.set_index(keys='label')

In [33]:
# Inspect df_intersect after 'label' became the row index.
df_intersect

Unnamed: 0_level_0,TMPRSS11D,FLJ44635,C10orf99,NACA2,TMEM40,SPRR1B,LASS3,TMPRSS11A,PTTG3P,KRT14,...,S100A2,SPRR2A,RNASE7,SPRR2D,HEY1,RHCG,BNC1,SERPINB13,C14orf19,S100A7A
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LUAD,-0.845109,-0.998144,-2.7577,-0.256212,-0.159503,5.37031,-2.5496,-1.33597,-0.734992,0.543442,...,1.50406,-0.280343,-1.73515,5.74156,-0.600271,-4.1308,-1.33005,-1.68125,-0.32905,-0.446876
LUAD,-1.85141,-0.992544,-2.7577,-0.768012,-0.0107027,-2.94299,3.5763,-0.873174,-0.870592,-2.14946,...,-0.576137,-2.40554,-2.15325,-1.93574,-1.64277,-3.668,-1.65285,-1.63655,-1.64165,-1.45318
LUAD,-1.47891,-0.458644,-2.7577,-0.247612,-0.837203,-2.94299,-2.1771,-1.33597,-1.59209,-4.93766,...,-1.12504,-2.40554,-1.48505,-2.39854,-1.99837,-1.1184,-2.99385,-2.09935,0.13245,-1.45318
LUAD,-1.85141,3.23656,-2.7577,0.992388,-1.3847,-2.94299,-1.7275,-1.33597,-1.14249,-5.31016,...,-2.77374,-1.06294,-2.15325,-1.05594,1.03963,2.1652,-2.83995,-2.09935,1.00315,-1.45318
LUAD,-0.746609,-0.234244,-2.7577,0.601888,0.987797,5.18661,-2.5496,-1.33597,-1.30889,-2.98026,...,3.80246,1.59086,2.08225,4.55716,0.296929,-3.026,-1.70715,-2.09935,-0.29975,-0.00637567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LUSC,7.52139,-0.102744,6.1288,0.420088,5.194,10.6855,5.8307,5.03793,1.72671,5.31474,...,4.59366,9.65216,5.74975,9.98326,3.23653,-0.817796,5.47785,7.65565,-0.29205,7.61642
LUSC,4.17569,-1.24654,2.6493,0.270088,5.1918,6.64191,5.854,5.89423,0.778808,10.9679,...,6.48686,5.01116,-1.12775,5.57076,2.48463,4.7158,5.98285,6.46485,-1.85215,-0.427676
LUSC,-1.00441,-1.71774,-2.7577,-0.157312,0.433197,-0.408895,0.1709,-0.995474,0.00940847,-1.98076,...,2.56396,2.12416,0.880348,-0.00353874,2.89833,0.125004,-1.59345,-1.25235,-1.40815,1.75682
LUSC,10.8078,0.922956,7.7021,0.826388,4.9753,8.25861,7.8454,8.90213,1.46481,7.79084,...,5.24126,6.54546,2.29635,6.42956,1.02593,7.0034,4.45305,9.63895,-0.89125,-0.729276


In [34]:
## Drop two genes (this step should be performed only after the Wilcoxon rank-sum test).
## The result is rebound instead of dropped in place, and errors='ignore' skips
## names that are no longer columns, so re-running this cell is harmless
## (an in-place drop raises KeyError on the second run).
genes_to_drop = ['GYPB', 'CLEC1B']
df_intersect = df_intersect.drop(columns=genes_to_drop, errors='ignore')

In [35]:
# Export for PROMO: write the transposed table as a tab-separated file
# (the file keeps its .csv name).
promo_matrix = df_intersect.T
promo_output_file = PATH + "essential_genes_set/PROMO Analysis/df_intersect.csv"
promo_matrix.to_csv(promo_output_file, sep='\t')

---
----
---

# Now go to the PROMO tool and identify top_genes_set from essential_genes_set using df_intersect.