# Model Selection with updated Dataset

## Data Preprocessing

In [147]:
import pandas as pd

# Load dataset
methexpr_df = pd.read_csv('data/processed/ml_with_gene_expr.csv.gz',
				 compression='gzip',
				 index_col=0,
				 low_memory=False)
# Separate feature types
metadata_cols = ['primary site', 'primary histology', 'cosmic_id']
methylation_cols = [col for col in methexpr_df.columns if col.startswith('cg')]
expression_cols = [col for col in methexpr_df.columns if col.startswith('expr_')]
# Extract subsets
X_meth = methexpr_df[methylation_cols]  # 1018 x 10000
X_expr = methexpr_df[expression_cols]   # 1018 x 4956
metadata = methexpr_df[metadata_cols]




In [148]:
methexpr_df

Unnamed: 0,primary site,primary histology,cosmic_id,cg00944421,cg14557185,cg00989853,cg24702147,cg06723863,cg27174108,cg14481208,...,expr_SMAP2,expr_TGFB1,expr_RHAG,expr_ACOT13,expr_MRPL41,expr_PRKCQ,expr_VMP1,expr_PLCG1,expr_IFFO1,expr_ZFYVE21
697,blood,lymphoblastic_leukemia,906800.0,0.995309,0.999933,0.992589,0.993804,1.000000,0.989910,0.991938,...,,,,,,,,,,
5637,urogenital_system,bladder,687452.0,0.010293,0.009700,0.000000,0.019619,0.001614,0.006393,0.037711,...,,,,,,,,,,
201T,lung,lung_NSCLC_adenocarcinoma,1287381.0,0.821831,0.004671,0.091291,0.013206,0.016155,0.003981,0.008285,...,,,,,,,,,,
22RV1,urogenital_system,prostate,924100.0,0.005373,0.995386,0.000000,0.005776,0.004595,0.985648,0.009981,...,,,,,,,,,,
23132-87,digestive_system,stomach,910924.0,0.000000,0.935833,0.000000,0.008962,0.002041,0.002005,0.026887,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,,,,,,,,,,,...,4.468082,3.159499,3.161347,9.481404,8.298121,3.017939,9.832539,3.654862,3.031856,7.945312
2,,,,,,,,,,,...,4.468082,3.159499,3.161347,9.481404,8.298121,3.017939,9.832539,3.654862,3.031856,7.945312
4,,,,,,,,,,,...,4.468082,3.159499,3.161347,9.481404,8.298121,3.017939,9.832539,3.654862,3.031856,7.945312
Q,,,,,,,,,,,...,4.468082,3.159499,3.161347,9.481404,8.298121,3.017939,9.832539,3.654862,3.031856,7.945312


In [149]:
response_df = pd.read_csv('data/processed/ML_dataset_methylation_drug_response.csv.gz',
				 compression='gzip',
				 index_col=0,
				 low_memory=False)


drug_cols = [col for col in response_df.columns if col not in metadata_cols and col not in methylation_cols]

y_all_drugs = response_df[drug_cols]  # 1018 x 265

y_all_drugs

Unnamed: 0,(5Z)-7-Oxozeaenol,5-Fluorouracil,965-D2,993-D2,A-443654,A-770041,A-83-01,ACY-1215,AGI-6780,AICA Ribonucleotide,...,ZG-10,ZL049,ZL109,ZM447439,ZSTK474,Zibotentan,"eEF2K Inhibitor, A-484954",kb NB 142-70,rTRAIL,torin2
22RV1,2.473594,0.579534,3.072015,5.010936,,,5.299894,0.848553,2.072854,8.277564,...,2.216771,3.456567,0.704177,3.415833,-0.571660,5.886388,5.891054,2.989656,-0.586829,
23132-87,1.301368,1.365854,3.036932,4.696090,,,5.371888,1.069621,1.808507,8.504796,...,,3.582629,-0.153728,3.171768,0.052341,5.419213,5.460743,2.988242,-0.347357,
42-MG-BA,0.588826,2.220896,2.992956,4.755989,,,3.793761,2.390197,1.303087,7.562482,...,,2.995327,-0.980930,1.339098,2.235509,5.494359,5.566457,3.082294,-1.651262,
451LU,-3.318480,4.792561,4.815939,4.402518,,,3.534849,1.467267,1.327098,8.796535,...,1.720577,3.290158,-0.756653,3.080837,2.359697,5.481833,5.630239,2.657428,-0.139989,
5637,1.626606,3.166983,4.410360,4.227483,,,3.847780,1.039981,1.127383,8.011505,...,,2.053874,0.046212,0.556022,0.239659,5.144572,5.273391,2.519340,-2.889220,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YH-13,-0.858999,5.626487,4.732312,4.736507,,,4.281385,1.124988,1.000514,9.042915,...,0.712498,2.348748,-0.409551,1.514446,0.242131,5.164013,5.539282,1.610461,0.131337,
YKG-1,0.499784,4.074388,3.765386,4.776605,,,4.390300,1.853255,1.934874,6.477155,...,,2.297815,-0.086315,1.239723,0.718917,5.048165,5.600037,2.003123,-1.560052,
YMB-1-E,,3.490093,,,,,3.552885,2.729916,2.494366,,...,,,,,-0.508052,6.181953,6.255944,3.402388,,
YT,0.506163,-0.173922,4.449358,4.390753,2.238528,3.841682,4.291107,1.042755,1.061003,8.023256,...,,2.737701,0.298806,1.466147,-3.425817,5.165823,5.211461,1.620578,-0.557084,


In [150]:
df = methexpr_df.join(y_all_drugs, how='inner', lsuffix='_caller', rsuffix='_other')
df

Unnamed: 0,primary site,primary histology,cosmic_id,cg00944421,cg14557185,cg00989853,cg24702147,cg06723863,cg27174108,cg14481208,...,ZG-10,ZL049,ZL109,ZM447439,ZSTK474,Zibotentan,"eEF2K Inhibitor, A-484954",kb NB 142-70,rTRAIL,torin2
697,blood,lymphoblastic_leukemia,906800.0,0.995309,0.999933,0.992589,0.993804,1.000000,0.989910,0.991938,...,,1.389685,-1.238606,-0.364143,-2.428664,4.825729,4.690386,0.684583,-3.045563,
5637,urogenital_system,bladder,687452.0,0.010293,0.009700,0.000000,0.019619,0.001614,0.006393,0.037711,...,,2.053874,0.046212,0.556022,0.239659,5.144572,5.273391,2.519340,-2.889220,
22RV1,urogenital_system,prostate,924100.0,0.005373,0.995386,0.000000,0.005776,0.004595,0.985648,0.009981,...,2.216771,3.456567,0.704177,3.415833,-0.571660,5.886388,5.891054,2.989656,-0.586829,
23132-87,digestive_system,stomach,910924.0,0.000000,0.935833,0.000000,0.008962,0.002041,0.002005,0.026887,...,,3.582629,-0.153728,3.171768,0.052341,5.419213,5.460743,2.988242,-0.347357,
42-MG-BA,nervous_system,glioma,687561.0,0.994996,0.000000,1.000000,1.000000,1.000000,1.000000,0.995498,...,,2.995327,-0.980930,1.339098,2.235509,5.494359,5.566457,3.082294,-1.651262,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YAPC,pancreas,pancreas,909904.0,0.009136,0.007066,0.000898,0.015999,0.014864,0.008737,0.019683,...,,5.121817,1.659229,2.946571,2.190982,6.153842,6.538489,3.625790,-1.416428,
YH-13,nervous_system,glioma,909905.0,0.963341,0.005739,0.989838,0.992368,0.993164,0.989321,0.982065,...,0.712498,2.348748,-0.409551,1.514446,0.242131,5.164013,5.539282,1.610461,0.131337,
YKG-1,nervous_system,glioma,687592.0,0.996477,0.007576,0.986879,0.996015,0.993360,0.996321,0.992586,...,,2.297815,-0.086315,1.239723,0.718917,5.048165,5.600037,2.003123,-1.560052,
YT,blood,lymphoid_neoplasm other,946358.0,1.000000,0.998999,0.998225,0.987221,1.000000,0.995139,0.990399,...,,2.737701,0.298806,1.466147,-3.425817,5.165823,5.211461,1.620578,-0.557084,


In [151]:
df.loc['YH-13'][expression_cols].isna().sum()

np.int64(4956)

In [152]:
expression_cols

['expr_RPS4Y1',
 'expr_KRT19',
 'expr_VIM',
 'expr_S100P',
 'expr_TACSTD2',
 'expr_TGFBI',
 'expr_nan',
 'expr_TM4SF1',
 'expr_SRGN',
 'expr_CAV1',
 'expr_DKK1',
 'expr_C19orf33',
 'expr_KRT8',
 'expr_SPINT2',
 'expr_NNMT',
 'expr_EPCAM',
 'expr_BEX1',
 'expr_IFITM3',
 'expr_UCHL1',
 'expr_MYOF',
 'expr_SPOCK1',
 'expr_BASP1',
 'expr_MAL2',
 'expr_HLA-DRA',
 'expr_CYR61',
 'expr_DSP',
 'expr_GNG11',
 'expr_SLPI',
 'expr_MGST1',
 'expr_FN1',
 'expr_PXDN',
 'expr_HSPA1A',
 'expr_SPP1',
 'expr_LGALS1',
 'expr_LGALS3',
 'expr_BST2',
 'expr_ANXA1',
 'expr_NGFRAP1',
 'expr_PRSS23',
 'expr_IFI27',
 'expr_TPD52L1',
 'expr_GDF15',
 'expr_ANXA3',
 'expr_NUPR1',
 'expr_KRT7',
 'expr_S100A14',
 'expr_S100A16',
 'expr_LCN2',
 'expr_MIR205HG',
 'expr_EFEMP1',
 'expr_SPARC',
 'expr_AKR1C1',
 'expr_BEX4',
 'expr_UCA1',
 'expr_HLA-DPA1',
 'expr_TSPAN8',
 'expr_GYPC',
 'expr_LCP1',
 'expr_ESRP1',
 'expr_IGFBP3',
 'expr_RAB25',
 'expr_CYBA',
 'expr_TUBB2B',
 'expr_CAV2',
 'expr_ALDH1A1',
 'expr_CLEC2B',


In [153]:
len(expression_cols)

4956

In [154]:
df.loc['YH-13'][methylation_cols].isna().sum()

np.int64(8)

In [155]:
len(methylation_cols)

10000

In [156]:
[drug for drug in df.columns if 'mimat' in drug]

[]

In [157]:
gdsc_raw = pd.read_csv('data/Cell_line_RMA_proc_basalExp.txt',
				 sep='\t',
				 index_col=0,
				 low_memory=False)

indices = [str(drug) for drug in gdsc_raw.index if pd.notna(drug)]
new_indices = ['expr_' + drug for drug in indices]

len(new_indices) > len(expression_cols)

True

In [158]:
gdsc_raw

Unnamed: 0_level_0,GENE_title,DATA.906826,DATA.687983,DATA.910927,DATA.1240138,DATA.1240139,DATA.906792,DATA.910688,DATA.1240135,DATA.1290812,...,DATA.753584,DATA.907044,DATA.998184,DATA.908145,DATA.1659787,DATA.1298157,DATA.1480372,DATA.1298533,DATA.930299,DATA.905954.1
GENE_SYMBOLS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,tetraspanin 6 [Source:HGNC Symbol;Acc:11858],7.632023,7.548671,8.712338,7.797142,7.729268,7.074533,3.285198,6.961606,5.943046,...,7.105637,3.236503,3.038892,8.373223,6.932178,8.441628,8.422922,8.089255,3.112333,7.153127
TNMD,tenomodulin [Source:HGNC Symbol;Acc:17757],2.964585,2.777716,2.643508,2.817923,2.957739,2.889677,2.828203,2.874751,2.686874,...,2.798847,2.745137,2.976406,2.852552,2.622630,2.639276,2.879890,2.521169,2.870468,2.834285
DPM1,dolichyl-phosphate mannosyltransferase polypep...,10.379553,11.807341,9.880733,9.883471,10.418840,9.773987,10.264385,10.205931,10.299757,...,10.486486,10.442951,10.311962,10.454830,10.418475,11.463742,10.557777,10.792750,9.873902,10.788218
SCYL3,SCY1-like 3 (S. cerevisiae) [Source:HGNC Symbo...,3.614794,4.066887,3.956230,4.063701,4.341500,4.270903,5.968168,3.715033,3.848112,...,3.696835,4.624013,4.348524,3.858121,3.947561,4.425849,3.550390,4.443337,4.266828,4.100493
C1orf112,chromosome 1 open reading frame 112 [Source:HG...,3.380681,3.732485,3.236620,3.558414,3.840373,3.815055,3.011867,3.268449,3.352835,...,3.726833,3.947744,3.806584,3.196988,3.814831,4.384732,4.247189,3.071359,3.230197,3.435795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,hsa-mir-5195 [Source:miRBase;Acc:MI0018174],2.852537,2.776771,2.685307,3.436412,2.951270,3.233383,3.810246,2.792116,2.641117,...,2.699663,5.190438,3.253381,3.000088,2.846830,2.959009,2.974475,2.903894,2.857956,3.033662
POLRMTP1,polymerase (RNA) mitochondrial (DNA directed) ...,3.130696,3.260982,3.176239,3.074432,3.213545,3.382112,3.200106,2.829053,3.158745,...,2.773728,2.988250,3.514337,3.254306,3.139208,3.007502,3.088841,2.847505,2.832840,2.817057
UBL5P2,ubiquitin-like 5 pseudogene 2 [Source:HGNC Sym...,9.986616,9.002814,9.113243,9.958284,9.938978,8.714820,9.396484,9.779745,9.477582,...,9.593772,9.506062,9.945730,9.890244,10.018968,9.332193,10.742651,8.544696,9.900550,9.071943
TBC1D3P5,"TBC1 domain family, member 3 pseudogene 5 [Sou...",3.073724,3.000182,2.916274,3.256500,3.396126,3.497439,3.193505,3.254539,3.143067,...,3.407260,3.256900,3.189972,3.155584,3.357660,3.435411,3.317945,3.174515,3.243563,3.324517


In [159]:
len(indices)

17419

In [160]:
len(new_indices)

17419

In [161]:
len(expression_cols)

4956

In [162]:
gdsc_expr= gdsc_raw.T.dropna(axis=1)
gdsc_expr = gdsc_expr.drop(index=gdsc_expr.index[0])
gdsc_expr

GENE_SYMBOLS,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,ZNF234,NaN,MYH4,LINC00526,PPY2,KRT18P55,NaN.1,POLRMTP1,UBL5P2,TBC1D3P5
DATA.906826,7.632023,2.964585,10.379553,3.614794,3.380681,3.324692,3.56635,8.20453,5.235118,5.369039,...,4.841169,3.005416,2.628932,6.786925,2.997054,3.331134,2.852537,3.130696,9.986616,3.073724
DATA.687983,7.548671,2.777716,11.807341,4.066887,3.732485,3.152404,7.827172,6.616972,5.809264,7.209653,...,4.570476,2.878796,2.783441,5.317911,3.263745,2.992611,2.776771,3.260982,9.002814,3.000182
DATA.910927,8.712338,2.643508,9.880733,3.95623,3.23662,3.241246,2.931034,8.191246,5.426841,5.120747,...,4.214729,2.985562,2.603604,3.143006,3.112145,2.886574,2.685307,3.176239,9.113243,2.916274
DATA.1240138,7.797142,2.817923,9.883471,4.063701,3.558414,3.101247,7.211707,8.630643,5.617714,4.996434,...,4.060761,3.054339,2.61954,3.153896,3.151576,3.812119,3.436412,3.074432,9.958284,3.2565
DATA.1240139,7.729268,2.957739,10.41884,4.3415,3.840373,3.001802,3.375422,8.29695,5.669418,4.180205,...,4.869199,2.93518,2.450375,3.65266,2.918475,3.412586,2.95127,3.213545,9.938978,3.396126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DATA.1298157,8.441628,2.639276,11.463742,4.425849,4.384732,3.229511,3.571204,8.193,5.6716,4.943996,...,4.540545,2.92134,2.595066,5.097882,3.102979,3.343723,2.959009,3.007502,9.332193,3.435411
DATA.1480372,8.422922,2.87989,10.557777,3.55039,4.247189,3.176336,3.321811,8.901706,4.684851,4.215908,...,4.062441,2.834466,2.443743,4.243448,3.034131,3.412558,2.974475,3.088841,10.742651,3.317945
DATA.1298533,8.089255,2.521169,10.79275,4.443337,3.071359,3.238305,5.209472,8.073389,5.643811,5.040952,...,4.68637,2.887736,2.603842,5.084844,2.981869,3.64039,2.903894,2.847505,8.544696,3.174515
DATA.930299,3.112333,2.870468,9.873902,4.266828,3.230197,3.027742,3.407148,5.76061,5.834256,5.550722,...,4.099547,3.029919,2.53128,4.986124,2.992148,3.142641,2.857956,2.83284,9.90055,3.243563


In [163]:
gdsc_expr = gdsc_expr[indices].copy()
gdsc_expr

GENE_SYMBOLS,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,LINC00514,OR1D5,ZNF234,MYH4,LINC00526,PPY2,KRT18P55,POLRMTP1,UBL5P2,TBC1D3P5
DATA.906826,7.632023,2.964585,10.379553,3.614794,3.380681,3.324692,3.56635,8.20453,5.235118,5.369039,...,3.665788,3.134197,4.841169,2.628932,6.786925,2.997054,3.331134,3.130696,9.986616,3.073724
DATA.687983,7.548671,2.777716,11.807341,4.066887,3.732485,3.152404,7.827172,6.616972,5.809264,7.209653,...,3.053174,3.327528,4.570476,2.783441,5.317911,3.263745,2.992611,3.260982,9.002814,3.000182
DATA.910927,8.712338,2.643508,9.880733,3.95623,3.23662,3.241246,2.931034,8.191246,5.426841,5.120747,...,3.226808,3.326309,4.214729,2.603604,3.143006,3.112145,2.886574,3.176239,9.113243,2.916274
DATA.1240138,7.797142,2.817923,9.883471,4.063701,3.558414,3.101247,7.211707,8.630643,5.617714,4.996434,...,3.110801,2.921903,4.060761,2.61954,3.153896,3.151576,3.812119,3.074432,9.958284,3.2565
DATA.1240139,7.729268,2.957739,10.41884,4.3415,3.840373,3.001802,3.375422,8.29695,5.669418,4.180205,...,3.285372,3.474086,4.869199,2.450375,3.65266,2.918475,3.412586,3.213545,9.938978,3.396126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DATA.1298157,8.441628,2.639276,11.463742,4.425849,4.384732,3.229511,3.571204,8.193,5.6716,4.943996,...,3.390231,3.402212,4.540545,2.595066,5.097882,3.102979,3.343723,3.007502,9.332193,3.435411
DATA.1480372,8.422922,2.87989,10.557777,3.55039,4.247189,3.176336,3.321811,8.901706,4.684851,4.215908,...,3.016188,3.841095,4.062441,2.443743,4.243448,3.034131,3.412558,3.088841,10.742651,3.317945
DATA.1298533,8.089255,2.521169,10.79275,4.443337,3.071359,3.238305,5.209472,8.073389,5.643811,5.040952,...,4.133042,3.221974,4.68637,2.603842,5.084844,2.981869,3.64039,2.847505,8.544696,3.174515
DATA.930299,3.112333,2.870468,9.873902,4.266828,3.230197,3.027742,3.407148,5.76061,5.834256,5.550722,...,2.910977,3.116006,4.099547,2.53128,4.986124,2.992148,3.142641,2.83284,9.90055,3.243563


In [164]:
gdsc_expr.reset_index(inplace=True)
gdsc_expr.rename(columns={'index':'cosmic_id'}, inplace= True)
gdsc_expr

GENE_SYMBOLS,cosmic_id,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,LINC00514,OR1D5,ZNF234,MYH4,LINC00526,PPY2,KRT18P55,POLRMTP1,UBL5P2,TBC1D3P5
0,DATA.906826,7.632023,2.964585,10.379553,3.614794,3.380681,3.324692,3.56635,8.20453,5.235118,...,3.665788,3.134197,4.841169,2.628932,6.786925,2.997054,3.331134,3.130696,9.986616,3.073724
1,DATA.687983,7.548671,2.777716,11.807341,4.066887,3.732485,3.152404,7.827172,6.616972,5.809264,...,3.053174,3.327528,4.570476,2.783441,5.317911,3.263745,2.992611,3.260982,9.002814,3.000182
2,DATA.910927,8.712338,2.643508,9.880733,3.95623,3.23662,3.241246,2.931034,8.191246,5.426841,...,3.226808,3.326309,4.214729,2.603604,3.143006,3.112145,2.886574,3.176239,9.113243,2.916274
3,DATA.1240138,7.797142,2.817923,9.883471,4.063701,3.558414,3.101247,7.211707,8.630643,5.617714,...,3.110801,2.921903,4.060761,2.61954,3.153896,3.151576,3.812119,3.074432,9.958284,3.2565
4,DATA.1240139,7.729268,2.957739,10.41884,4.3415,3.840373,3.001802,3.375422,8.29695,5.669418,...,3.285372,3.474086,4.869199,2.450375,3.65266,2.918475,3.412586,3.213545,9.938978,3.396126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,DATA.1298157,8.441628,2.639276,11.463742,4.425849,4.384732,3.229511,3.571204,8.193,5.6716,...,3.390231,3.402212,4.540545,2.595066,5.097882,3.102979,3.343723,3.007502,9.332193,3.435411
1014,DATA.1480372,8.422922,2.87989,10.557777,3.55039,4.247189,3.176336,3.321811,8.901706,4.684851,...,3.016188,3.841095,4.062441,2.443743,4.243448,3.034131,3.412558,3.088841,10.742651,3.317945
1015,DATA.1298533,8.089255,2.521169,10.79275,4.443337,3.071359,3.238305,5.209472,8.073389,5.643811,...,4.133042,3.221974,4.68637,2.603842,5.084844,2.981869,3.64039,2.847505,8.544696,3.174515
1016,DATA.930299,3.112333,2.870468,9.873902,4.266828,3.230197,3.027742,3.407148,5.76061,5.834256,...,2.910977,3.116006,4.099547,2.53128,4.986124,2.992148,3.142641,2.83284,9.90055,3.243563


In [165]:
gdsc_expr

GENE_SYMBOLS,cosmic_id,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,LINC00514,OR1D5,ZNF234,MYH4,LINC00526,PPY2,KRT18P55,POLRMTP1,UBL5P2,TBC1D3P5
0,DATA.906826,7.632023,2.964585,10.379553,3.614794,3.380681,3.324692,3.56635,8.20453,5.235118,...,3.665788,3.134197,4.841169,2.628932,6.786925,2.997054,3.331134,3.130696,9.986616,3.073724
1,DATA.687983,7.548671,2.777716,11.807341,4.066887,3.732485,3.152404,7.827172,6.616972,5.809264,...,3.053174,3.327528,4.570476,2.783441,5.317911,3.263745,2.992611,3.260982,9.002814,3.000182
2,DATA.910927,8.712338,2.643508,9.880733,3.95623,3.23662,3.241246,2.931034,8.191246,5.426841,...,3.226808,3.326309,4.214729,2.603604,3.143006,3.112145,2.886574,3.176239,9.113243,2.916274
3,DATA.1240138,7.797142,2.817923,9.883471,4.063701,3.558414,3.101247,7.211707,8.630643,5.617714,...,3.110801,2.921903,4.060761,2.61954,3.153896,3.151576,3.812119,3.074432,9.958284,3.2565
4,DATA.1240139,7.729268,2.957739,10.41884,4.3415,3.840373,3.001802,3.375422,8.29695,5.669418,...,3.285372,3.474086,4.869199,2.450375,3.65266,2.918475,3.412586,3.213545,9.938978,3.396126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,DATA.1298157,8.441628,2.639276,11.463742,4.425849,4.384732,3.229511,3.571204,8.193,5.6716,...,3.390231,3.402212,4.540545,2.595066,5.097882,3.102979,3.343723,3.007502,9.332193,3.435411
1014,DATA.1480372,8.422922,2.87989,10.557777,3.55039,4.247189,3.176336,3.321811,8.901706,4.684851,...,3.016188,3.841095,4.062441,2.443743,4.243448,3.034131,3.412558,3.088841,10.742651,3.317945
1015,DATA.1298533,8.089255,2.521169,10.79275,4.443337,3.071359,3.238305,5.209472,8.073389,5.643811,...,4.133042,3.221974,4.68637,2.603842,5.084844,2.981869,3.64039,2.847505,8.544696,3.174515
1016,DATA.930299,3.112333,2.870468,9.873902,4.266828,3.230197,3.027742,3.407148,5.76061,5.834256,...,2.910977,3.116006,4.099547,2.53128,4.986124,2.992148,3.142641,2.83284,9.90055,3.243563


In [166]:
gdsc_expr.rename(columns=dict(zip(indices, new_indices)), inplace=True)
gdsc_expr

GENE_SYMBOLS,cosmic_id,expr_TSPAN6,expr_TNMD,expr_DPM1,expr_SCYL3,expr_C1orf112,expr_FGR,expr_CFH,expr_FUCA2,expr_GCLC,...,expr_LINC00514,expr_OR1D5,expr_ZNF234,expr_MYH4,expr_LINC00526,expr_PPY2,expr_KRT18P55,expr_POLRMTP1,expr_UBL5P2,expr_TBC1D3P5
0,DATA.906826,7.632023,2.964585,10.379553,3.614794,3.380681,3.324692,3.56635,8.20453,5.235118,...,3.665788,3.134197,4.841169,2.628932,6.786925,2.997054,3.331134,3.130696,9.986616,3.073724
1,DATA.687983,7.548671,2.777716,11.807341,4.066887,3.732485,3.152404,7.827172,6.616972,5.809264,...,3.053174,3.327528,4.570476,2.783441,5.317911,3.263745,2.992611,3.260982,9.002814,3.000182
2,DATA.910927,8.712338,2.643508,9.880733,3.95623,3.23662,3.241246,2.931034,8.191246,5.426841,...,3.226808,3.326309,4.214729,2.603604,3.143006,3.112145,2.886574,3.176239,9.113243,2.916274
3,DATA.1240138,7.797142,2.817923,9.883471,4.063701,3.558414,3.101247,7.211707,8.630643,5.617714,...,3.110801,2.921903,4.060761,2.61954,3.153896,3.151576,3.812119,3.074432,9.958284,3.2565
4,DATA.1240139,7.729268,2.957739,10.41884,4.3415,3.840373,3.001802,3.375422,8.29695,5.669418,...,3.285372,3.474086,4.869199,2.450375,3.65266,2.918475,3.412586,3.213545,9.938978,3.396126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,DATA.1298157,8.441628,2.639276,11.463742,4.425849,4.384732,3.229511,3.571204,8.193,5.6716,...,3.390231,3.402212,4.540545,2.595066,5.097882,3.102979,3.343723,3.007502,9.332193,3.435411
1014,DATA.1480372,8.422922,2.87989,10.557777,3.55039,4.247189,3.176336,3.321811,8.901706,4.684851,...,3.016188,3.841095,4.062441,2.443743,4.243448,3.034131,3.412558,3.088841,10.742651,3.317945
1015,DATA.1298533,8.089255,2.521169,10.79275,4.443337,3.071359,3.238305,5.209472,8.073389,5.643811,...,4.133042,3.221974,4.68637,2.603842,5.084844,2.981869,3.64039,2.847505,8.544696,3.174515
1016,DATA.930299,3.112333,2.870468,9.873902,4.266828,3.230197,3.027742,3.407148,5.76061,5.834256,...,2.910977,3.116006,4.099547,2.53128,4.986124,2.992148,3.142641,2.83284,9.90055,3.243563


In [167]:
[column for column in expression_cols if column == 'expr_nan']

['expr_nan']

In [168]:
[index for index in gdsc_expr.index if not pd.notna(index)]

[]

In [169]:
expression_cols.remove('expr_nan')

In [170]:
expr_df = gdsc_expr[['cosmic_id'] + expression_cols].copy()
expr_df

GENE_SYMBOLS,cosmic_id,expr_RPS4Y1,expr_KRT19,expr_VIM,expr_S100P,expr_TACSTD2,expr_TGFBI,expr_TM4SF1,expr_SRGN,expr_CAV1,...,expr_SMAP2,expr_TGFB1,expr_RHAG,expr_ACOT13,expr_MRPL41,expr_PRKCQ,expr_VMP1,expr_PLCG1,expr_IFFO1,expr_ZFYVE21
0,DATA.906826,3.29073,12.168602,12.774622,4.06024,11.081974,5.285864,11.902697,10.441504,12.289793,...,3.699497,4.056391,2.923542,8.729473,7.466002,2.858463,9.320376,4.635385,3.092527,6.276218
1,DATA.687983,8.145208,2.890861,13.093634,3.309209,2.833932,3.178296,2.88943,2.70639,3.422951,...,3.964057,3.275564,2.91981,9.535773,7.409235,3.353255,9.578953,5.470668,2.642293,5.525015
2,DATA.910927,3.33739,11.185326,11.258885,10.165595,2.866781,3.095703,9.812015,3.130309,10.439182,...,3.715675,3.558868,2.998421,8.723757,7.104237,4.547467,9.008119,4.845271,3.034252,6.297756
3,DATA.1240138,3.019026,12.884291,11.461745,3.435135,2.78187,8.536861,11.488132,9.099688,9.393353,...,3.417975,3.237272,3.01135,8.849614,7.975214,2.866926,9.816645,4.804883,3.591777,5.529628
4,DATA.1240139,11.226058,4.965551,12.880485,3.359441,2.849831,6.992837,8.813685,9.237848,10.577919,...,4.127631,4.6852,3.060413,7.753717,8.351571,2.939849,9.15957,5.579806,5.264455,6.072089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,DATA.1298157,4.026997,11.980189,7.736445,8.047832,11.510755,7.39026,9.855936,4.973477,11.441381,...,3.762667,2.729408,3.069429,8.585153,7.263554,2.947024,8.863493,4.775982,3.107198,5.436081
1014,DATA.1480372,2.693484,11.773619,11.222721,3.235943,11.48673,8.752779,9.918038,2.979506,11.652844,...,3.417305,3.516275,2.856744,6.129409,7.307694,4.886727,8.061265,4.046974,3.0366,6.943521
1015,DATA.1298533,2.936375,11.817423,4.295499,11.429297,11.824048,9.634719,10.244194,3.329379,8.472977,...,3.448687,3.488416,2.962836,9.011269,6.273418,3.384259,10.167021,4.279694,2.993828,6.501944
1016,DATA.930299,3.056752,8.954154,9.175388,3.058269,2.839616,3.249272,2.919157,3.47707,3.144733,...,4.004962,3.459313,2.947651,7.289031,7.290104,2.863723,8.430276,3.623459,4.033035,5.491749


In [171]:
# methexpr_df['comic_id'] = methexpr_df['cosmic_id'].astype(str)

# methexpr_df.drop('comic_id', inplace=True)
methexpr_df['cosmic_id'] = methexpr_df['cosmic_id'].astype(str)
methexpr_df

Unnamed: 0,primary site,primary histology,cosmic_id,cg00944421,cg14557185,cg00989853,cg24702147,cg06723863,cg27174108,cg14481208,...,expr_SMAP2,expr_TGFB1,expr_RHAG,expr_ACOT13,expr_MRPL41,expr_PRKCQ,expr_VMP1,expr_PLCG1,expr_IFFO1,expr_ZFYVE21
697,blood,lymphoblastic_leukemia,906800.0,0.995309,0.999933,0.992589,0.993804,1.000000,0.989910,0.991938,...,,,,,,,,,,
5637,urogenital_system,bladder,687452.0,0.010293,0.009700,0.000000,0.019619,0.001614,0.006393,0.037711,...,,,,,,,,,,
201T,lung,lung_NSCLC_adenocarcinoma,1287381.0,0.821831,0.004671,0.091291,0.013206,0.016155,0.003981,0.008285,...,,,,,,,,,,
22RV1,urogenital_system,prostate,924100.0,0.005373,0.995386,0.000000,0.005776,0.004595,0.985648,0.009981,...,,,,,,,,,,
23132-87,digestive_system,stomach,910924.0,0.000000,0.935833,0.000000,0.008962,0.002041,0.002005,0.026887,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,,,,,,,,,,,...,4.468082,3.159499,3.161347,9.481404,8.298121,3.017939,9.832539,3.654862,3.031856,7.945312
2,,,,,,,,,,,...,4.468082,3.159499,3.161347,9.481404,8.298121,3.017939,9.832539,3.654862,3.031856,7.945312
4,,,,,,,,,,,...,4.468082,3.159499,3.161347,9.481404,8.298121,3.017939,9.832539,3.654862,3.031856,7.945312
Q,,,,,,,,,,,...,4.468082,3.159499,3.161347,9.481404,8.298121,3.017939,9.832539,3.654862,3.031856,7.945312


In [172]:
expr_df['cosmic_id']= expr_df['cosmic_id'].map(lambda x: x.removeprefix('DATA.'))
expr_df

GENE_SYMBOLS,cosmic_id,expr_RPS4Y1,expr_KRT19,expr_VIM,expr_S100P,expr_TACSTD2,expr_TGFBI,expr_TM4SF1,expr_SRGN,expr_CAV1,...,expr_SMAP2,expr_TGFB1,expr_RHAG,expr_ACOT13,expr_MRPL41,expr_PRKCQ,expr_VMP1,expr_PLCG1,expr_IFFO1,expr_ZFYVE21
0,906826,3.29073,12.168602,12.774622,4.06024,11.081974,5.285864,11.902697,10.441504,12.289793,...,3.699497,4.056391,2.923542,8.729473,7.466002,2.858463,9.320376,4.635385,3.092527,6.276218
1,687983,8.145208,2.890861,13.093634,3.309209,2.833932,3.178296,2.88943,2.70639,3.422951,...,3.964057,3.275564,2.91981,9.535773,7.409235,3.353255,9.578953,5.470668,2.642293,5.525015
2,910927,3.33739,11.185326,11.258885,10.165595,2.866781,3.095703,9.812015,3.130309,10.439182,...,3.715675,3.558868,2.998421,8.723757,7.104237,4.547467,9.008119,4.845271,3.034252,6.297756
3,1240138,3.019026,12.884291,11.461745,3.435135,2.78187,8.536861,11.488132,9.099688,9.393353,...,3.417975,3.237272,3.01135,8.849614,7.975214,2.866926,9.816645,4.804883,3.591777,5.529628
4,1240139,11.226058,4.965551,12.880485,3.359441,2.849831,6.992837,8.813685,9.237848,10.577919,...,4.127631,4.6852,3.060413,7.753717,8.351571,2.939849,9.15957,5.579806,5.264455,6.072089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,1298157,4.026997,11.980189,7.736445,8.047832,11.510755,7.39026,9.855936,4.973477,11.441381,...,3.762667,2.729408,3.069429,8.585153,7.263554,2.947024,8.863493,4.775982,3.107198,5.436081
1014,1480372,2.693484,11.773619,11.222721,3.235943,11.48673,8.752779,9.918038,2.979506,11.652844,...,3.417305,3.516275,2.856744,6.129409,7.307694,4.886727,8.061265,4.046974,3.0366,6.943521
1015,1298533,2.936375,11.817423,4.295499,11.429297,11.824048,9.634719,10.244194,3.329379,8.472977,...,3.448687,3.488416,2.962836,9.011269,6.273418,3.384259,10.167021,4.279694,2.993828,6.501944
1016,930299,3.056752,8.954154,9.175388,3.058269,2.839616,3.249272,2.919157,3.47707,3.144733,...,4.004962,3.459313,2.947651,7.289031,7.290104,2.863723,8.430276,3.623459,4.033035,5.491749


In [173]:
methexpr_df = methexpr_df[['cosmic_id']+ methylation_cols].dropna(axis=0).copy()

In [174]:
expr_df.merge(methexpr_df, how='outer', on='cosmic_id')

Unnamed: 0,cosmic_id,expr_RPS4Y1,expr_KRT19,expr_VIM,expr_S100P,expr_TACSTD2,expr_TGFBI,expr_TM4SF1,expr_SRGN,expr_CAV1,...,cg16628641,cg25747192,cg25291653,cg24739382,cg20792735,cg04408104,cg12626589,cg01824410,cg17484699,cg27392850
0,11223344,3.291107,12.097008,3.081119,12.209577,3.638138,10.822322,10.088764,2.952313,3.345084,...,,,,,,,,,,
1,1240121,2.997926,12.154681,8.364776,9.834068,12.216048,10.447884,11.652129,3.006736,11.981777,...,,,,,,,,,,
2,1240122,3.066012,11.998865,3.713845,10.360103,11.597271,11.119625,10.198695,6.394594,12.719154,...,,,,,,,,,,
3,1240123,3.350448,10.71653,3.138711,10.170108,3.695209,3.366745,3.557953,3.108708,3.428797,...,,,,,,,,,,
4,1240123.0,,,,,,,,,,...,0.977038,0.427420,0.069099,0.916764,0.051502,0.032327,0.924428,0.040823,0.146464,0.373331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1188,971777,3.818962,2.989921,13.091291,3.576699,3.024598,3.223917,10.677009,3.058808,9.607691,...,,,,,,,,,,
1189,998179,3.326312,11.475558,12.784976,3.224896,9.075248,10.491229,8.415231,2.876391,11.708187,...,,,,,,,,,,
1190,998184,8.831393,3.338759,12.767885,3.987979,3.047339,3.073296,2.903974,7.907584,3.1916,...,,,,,,,,,,
1191,998184.0,,,,,,,,,,...,0.990338,0.837384,0.900297,0.362850,0.028776,0.049502,0.904091,0.863979,0.903518,0.031552


In [175]:
methexpr_df[methexpr_df['cosmic_id']=='998184.0']

Unnamed: 0,cosmic_id,cg00944421,cg14557185,cg00989853,cg24702147,cg06723863,cg27174108,cg14481208,cg24361265,cg00699993,...,cg16628641,cg25747192,cg25291653,cg24739382,cg20792735,cg04408104,cg12626589,cg01824410,cg17484699,cg27392850
JURKAT,998184.0,0.985339,0.997566,0.997974,0.989642,0.988557,0.992347,0.984474,0.996618,0.58006,...,0.990338,0.837384,0.900297,0.36285,0.028776,0.049502,0.904091,0.863979,0.903518,0.031552


In [176]:
expr_df[expr_df['cosmic_id']=='998184.0']

GENE_SYMBOLS,cosmic_id,expr_RPS4Y1,expr_KRT19,expr_VIM,expr_S100P,expr_TACSTD2,expr_TGFBI,expr_TM4SF1,expr_SRGN,expr_CAV1,...,expr_SMAP2,expr_TGFB1,expr_RHAG,expr_ACOT13,expr_MRPL41,expr_PRKCQ,expr_VMP1,expr_PLCG1,expr_IFFO1,expr_ZFYVE21


In [177]:
test[(test[expression_cols].notna().any(axis=1)) & (test[methylation_cols].notna().any(axis=1))]


Unnamed: 0,cosmic_id,expr_RPS4Y1,expr_KRT19,expr_VIM,expr_S100P,expr_TACSTD2,expr_TGFBI,expr_TM4SF1,expr_SRGN,expr_CAV1,...,cg16628641,cg25747192,cg25291653,cg24739382,cg20792735,cg04408104,cg12626589,cg01824410,cg17484699,cg27392850


In [178]:
test[expression_cols]

Unnamed: 0,expr_RPS4Y1,expr_KRT19,expr_VIM,expr_S100P,expr_TACSTD2,expr_TGFBI,expr_TM4SF1,expr_SRGN,expr_CAV1,expr_DKK1,...,expr_SMAP2,expr_TGFB1,expr_RHAG,expr_ACOT13,expr_MRPL41,expr_PRKCQ,expr_VMP1,expr_PLCG1,expr_IFFO1,expr_ZFYVE21
0,3.291107,12.097008,3.081119,12.209577,3.638138,10.822322,10.088764,2.952313,3.345084,3.071455,...,3.785883,3.453036,3.226879,8.218489,6.114585,3.03523,8.58969,4.01786,2.927121,6.351078
1,2.997926,12.154681,8.364776,9.834068,12.216048,10.447884,11.652129,3.006736,11.981777,9.705061,...,3.545905,4.738681,3.297113,7.111671,8.719293,3.133279,8.842407,4.308111,3.13225,7.320619
2,3.066012,11.998865,3.713845,10.360103,11.597271,11.119625,10.198695,6.394594,12.719154,11.106554,...,3.583002,3.62042,3.220278,7.798386,8.387211,3.492424,9.224213,4.251668,3.446303,6.603626
3,3.350448,10.71653,3.138711,10.170108,3.695209,3.366745,3.557953,3.108708,3.428797,2.766948,...,3.641497,3.205927,3.207659,8.62983,7.731554,3.086153,8.98989,4.148559,3.158408,7.01867
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1188,3.818962,2.989921,13.091291,3.576699,3.024598,3.223917,10.677009,3.058808,9.607691,2.871009,...,3.993483,3.482156,3.131498,7.987438,7.45415,2.900158,8.952722,4.748946,5.038103,4.936197
1189,3.326312,11.475558,12.784976,3.224896,9.075248,10.491229,8.415231,2.876391,11.708187,10.083113,...,4.050341,3.333828,2.989167,7.178404,7.512015,4.403471,9.732099,4.166703,3.802915,5.983956
1190,8.831393,3.338759,12.767885,3.987979,3.047339,3.073296,2.903974,7.907584,3.1916,2.599691,...,4.820916,3.524263,3.195193,8.708099,6.661997,6.890094,8.035265,5.919422,4.964062,5.108294
1191,,,,,,,,,,,...,,,,,,,,,,


In [179]:
test.dropna()

Unnamed: 0,cosmic_id,expr_RPS4Y1,expr_KRT19,expr_VIM,expr_S100P,expr_TACSTD2,expr_TGFBI,expr_TM4SF1,expr_SRGN,expr_CAV1,...,cg16628641,cg25747192,cg25291653,cg24739382,cg20792735,cg04408104,cg12626589,cg01824410,cg17484699,cg27392850


In [182]:
list(set(methexpr_df['cosmic_id']) and set(expr_df['cosmic_id']))[-1]

'905938'

In [183]:
methexpr_df[methexpr_df['cosmic_id'] == '905938']

Unnamed: 0,cosmic_id,cg00944421,cg14557185,cg00989853,cg24702147,cg06723863,cg27174108,cg14481208,cg24361265,cg00699993,...,cg16628641,cg25747192,cg25291653,cg24739382,cg20792735,cg04408104,cg12626589,cg01824410,cg17484699,cg27392850


In [184]:
expr_df[expr_df['cosmic_id'] == '905938']

GENE_SYMBOLS,cosmic_id,expr_RPS4Y1,expr_KRT19,expr_VIM,expr_S100P,expr_TACSTD2,expr_TGFBI,expr_TM4SF1,expr_SRGN,expr_CAV1,...,expr_SMAP2,expr_TGFB1,expr_RHAG,expr_ACOT13,expr_MRPL41,expr_PRKCQ,expr_VMP1,expr_PLCG1,expr_IFFO1,expr_ZFYVE21
565,905938,3.666526,3.55439,11.249756,10.990172,3.225838,3.587067,3.077106,12.690685,3.352894,...,5.217329,5.102099,3.277109,8.247541,7.266314,3.151913,7.926412,3.891563,4.659412,5.708338


In [None]:
x_response.notna().sum(axis=1)

In [None]:
x_response.to_csv('data/processed/ml_with_methylation_and_expression.csv')

In [None]:
x_response[methylation_cols]

In [63]:
expr_df['expr_SMAP2'].isna().sum()

np.int64(0)

* Adds even more dimensionalities -> bayesian might be the next step?

## Traditional Regression method

In [None]:
df.dropna()

In [None]:
df.index