# Classification Train-test data processing

In [1]:
# Author: Shirley Zhou

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Exclude non-coding genes from the dataset

In [3]:
# Load Genomewide_coding.csv and drop the listed index/annotation columns.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the recorded output of this cell is
# the tail of a warning, presumably pandas' mixed-type DtypeWarning -- confirm.
gwide = (pd.read_csv('../../data/gwide_hema_classification/Genomewide_coding.csv',
                     low_memory=False)
         .drop(columns=["Unnamed: 0", "GeneID", "Aliases", "X", "X.1"]))

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Preview the first five rows of the loaded table.
gwide.head(n=5)

Unnamed: 0,Gene,chromosome,HSC.logCPM,Brummelkamp.hap1.GTS.ratio,Brummelkamp.hap1.GTS.threshold,Brummelkamp.kbm7.GTS.ratio,Brummelkamp.kbm7.GTS.threshold,Chen2019.pos.score,Chen2019.pos.p.value,Chen2019.pos.p.value.threshold,...,Sabbatini2017.TF.1.threshold,Sabbatini2017.THP.1,Sabbatini2017.THP.1.threshold,Wallace2016.DEseq2.Log2.Fold.Change,Wallace2016.DEseq2.P.Value,Wallace2016.threshold,Rep1_2.Mann.Whitney.p.value,average.phenotype.of.strongest.3.2,Rep1_2.Mann.Whitney.p.value.1,average.phenotype.of.strongest.3.2.1
0,IGDCC3,15,,0.503403,0,0.534752,0,0.89178,0.89137,0,...,0,-0.13,0,,,0,2.32e-07,-0.211542,0.692807,0.004401
1,HSF2BP,21,,0.514247,0,0.521775,0,0.62112,0.62094,0,...,0,-0.221,0,,,0,0.04682615,-0.006646,0.789091,-0.007323
2,ADGRA2,8,,,0,,0,0.18968,0.26085,0,...,0,,0,,,0,,,,
3,TKFC,11,,,0,,0,0.94001,0.93984,0,...,0,,0,,,0,,,,
4,ERMARD,6,,,0,,0,0.28938,0.32906,0,...,0,-0.07,0,,,0,0.4053098,0.010147,0.462927,-0.012278


## Deletion of column data

The column `Weissman2014.CRISPRi.Growth.phenotype..mean.of.top.3.gammas.` will be deleted because most of the hits in this CRISPRi screen are underpowered.

In [5]:
# Remove the CRISPRi growth-phenotype column (described above as underpowered).
# errors='ignore' keeps the cell re-runnable: `gwide` is rebound here, so a
# second execution would otherwise raise KeyError for the already-missing column.
gwide = gwide.drop(
    columns=['Weissman2014.CRISPRi.Growth.phenotype..mean.of.top.3.gammas.'],
    errors='ignore',
)

In [6]:
# List the remaining column names.
list(gwide.columns)

['Gene',
 'chromosome',
 'HSC.logCPM',
 'Brummelkamp.hap1.GTS.ratio',
 'Brummelkamp.hap1.GTS.threshold',
 'Brummelkamp.kbm7.GTS.ratio',
 'Brummelkamp.kbm7.GTS.threshold',
 'Chen2019.pos.score',
 'Chen2019.pos.p.value',
 'Chen2019.pos.p.value.threshold',
 'Chen2019.pos.fdr',
 'Chen2019.pos.rank',
 'Weissman2014.CRISPRa.Growth.phenotype..mean.of.top.3.gammas.',
 'Weissman2014.CRISPRa.Growth.phenotype.threshold',
 'Doench2018.Average.LFC',
 'Doench2018.Average.nlog.p.values.',
 'Doench2018.Average.nlog.p.values.threshold',
 'Elledge2013.TUSON_p_value_TSG',
 'Elledge2013.TUSON_p_value_TSG.threshold',
 'Elledge2013.TUSON_q_value_TSG',
 'Elledge2013.TSG_Probability_LASSO',
 'Elledge2019.HMEC.Average.Log2FC.Drop',
 'Elledge2019.HMEC.Combined.pvalue.drop',
 'Elledge2019.HMEC.FDR.drop',
 'Elledge2019.HMEC.Average.threshold',
 'Elledge2019.HPNE.Average.Log2.Drop',
 'Elledge2019.HPNE.Combined.pvalue.drop',
 'Elledge2019.HPNE.FDR.drop',
 'Elledge2019.HPNE.Average.threshold',
 'Sabbatini2015.KBM7.C

## Cleaning of gwide data set

In [7]:
def percent_nan(list):
    """Return the fraction of entries in ``list`` that are NaN.

    Despite the name, the result is a fraction between 0 and 1, not a
    percentage.

    Parameters
    ----------
    list : array-like of numbers
        Numeric sequence such as a pandas Series, a NumPy array or a Python
        list. The name shadows the ``list`` builtin; it is kept unchanged so
        existing callers are unaffected.

    Returns
    -------
    float
        Number of NaN entries divided by ``len(list)``. NaN is returned for
        empty input.
    """
    total = len(list)
    if total == 0:
        # Nothing to measure: report NaN instead of raising ZeroDivisionError.
        return np.nan
    # Vectorised count of NaNs; the builtin sum() would walk the values one
    # by one in Python.
    return np.isnan(list).sum(axis=0) / total

def percent_zero(list):
    """Return the fraction of entries in ``list`` that are equal to zero.

    Despite the name, the result is a fraction between 0 and 1, not a
    percentage. NaN entries count as non-zero, because ``np.count_nonzero``
    treats NaN as a non-zero value.

    Parameters
    ----------
    list : array-like of numbers
        Numeric sequence such as a pandas Series, a NumPy array or a Python
        list. The name shadows the ``list`` builtin; it is kept unchanged so
        existing callers are unaffected.

    Returns
    -------
    float
        ``(len(list) - number of non-zero entries) / len(list)``. NaN is
        returned for empty input.
    """
    total = len(list)
    if total == 0:
        # Nothing to measure: report NaN instead of raising ZeroDivisionError.
        return np.nan
    return (total - np.count_nonzero(list)) / total

### Check NaN columns

In [8]:
# NaN fraction of every feature column (identifier columns excluded),
# then show the 30 columns with the largest fraction.
nan_check = gwide.drop(columns=['Gene', 'chromosome']).apply(percent_nan)
nan_check.sort_values(ascending=False).iloc[:30]

Wallace2016.DEseq2.P.Value                                      0.950480
Wallace2016.DEseq2.Log2.Fold.Change                             0.950480
Sabbatini2015.GTS                                               0.765200
HSC.logCPM                                                      0.532756
Weissman2014.CRISPRa.Growth.phenotype..mean.of.top.3.gammas.    0.413495
Elledge2019.HPNE.FDR.drop                                       0.294387
Elledge2019.HPNE.Combined.pvalue.drop                           0.294387
Elledge2019.HPNE.Average.Log2.Drop                              0.294387
Elledge2019.HMEC.Combined.pvalue.drop                           0.292933
Elledge2019.HMEC.FDR.drop                                       0.292933
Elledge2019.HMEC.Average.Log2FC.Drop                            0.292933
Brummelkamp.kbm7.GTS.ratio                                      0.221694
Brummelkamp.hap1.GTS.ratio                                      0.174861
Sabbatini2015.Jiyoye.CS                            

In [9]:
# Discard every column whose NaN fraction is above 0.75.
gwide_cleaned_nan = gwide.drop(columns=nan_check.index[nan_check > 0.75])

### Check Zero Variance Column

In [10]:
# Variance of every feature column (identifier columns excluded);
# columns whose variance is exactly zero are then removed.
col_var = gwide_cleaned_nan.drop(columns=['Gene', 'chromosome']).apply(np.var)
gwide_cleaned_var = gwide_cleaned_nan.drop(columns=col_var.index[col_var == 0])

### Check Zero Value Column

In [11]:
# Zero fraction of every feature column (identifier columns excluded),
# then show the 30 columns with the largest fraction.
zeros_check = gwide_cleaned_var.drop(columns=['Gene', 'chromosome']).apply(percent_zero)
zeros_check.sort_values(ascending=False).iloc[:30]

Sabbatini2017.NB4.rep2.threshold                   0.999978
Sabbatini2017.HEL.threshold                        0.999978
Sabbatini2017.OCI.AML2.threshold                   0.999956
Sabbatini2017.TF.1.threshold                       0.999890
Sabbatini2017.MonoMac1.threshold                   0.999868
Sabbatini2017.PL.21.threshold                      0.999846
Sabbatini2017.THP.1.threshold                      0.999802
Sabbatini2017.MV412                                0.999802
Sabbatini2017.OCI.AML5.threshold                   0.999802
Sabbatini2017.EOL.1.threshold                      0.999537
Sabbatini2017.OCI.AML3.threshold                   0.999141
Sabbatini2017.SKM.threshold                        0.999097
Sabbatini2017.MOLM.13.threshold                    0.999097
Sabbatini2015.Jiyoye.threshold                     0.998039
Sabbatini2015.KBM7.threshold                       0.997423
Sabbatini2015.Raji.threshold                       0.993656
Weissman2014.CRISPRa.Growth.phenotype.th

In [12]:
# Discard every column whose zero fraction is above 0.6 and list what is left.
gwide_cleaned_zero = gwide_cleaned_var.drop(columns=zeros_check.index[zeros_check > 0.6])
list(gwide_cleaned_zero.columns)

['Gene',
 'chromosome',
 'HSC.logCPM',
 'Brummelkamp.hap1.GTS.ratio',
 'Brummelkamp.kbm7.GTS.ratio',
 'Chen2019.pos.score',
 'Chen2019.pos.p.value',
 'Chen2019.pos.fdr',
 'Chen2019.pos.rank',
 'Weissman2014.CRISPRa.Growth.phenotype..mean.of.top.3.gammas.',
 'Doench2018.Average.LFC',
 'Doench2018.Average.nlog.p.values.',
 'Elledge2013.TUSON_p_value_TSG',
 'Elledge2013.TUSON_q_value_TSG',
 'Elledge2013.TSG_Probability_LASSO',
 'Elledge2019.HMEC.Average.Log2FC.Drop',
 'Elledge2019.HMEC.Combined.pvalue.drop',
 'Elledge2019.HMEC.FDR.drop',
 'Elledge2019.HPNE.Average.Log2.Drop',
 'Elledge2019.HPNE.Combined.pvalue.drop',
 'Elledge2019.HPNE.FDR.drop',
 'Sabbatini2015.KBM7.CS',
 'Sabbatini2015.KBM7.adjusted.p.value',
 'Sabbatini2015.K562.CS',
 'Sabbatini2015.K562.adjusted.p.value',
 'Sabbatini2015.Jiyoye.CS',
 'Sabbatini2015.Jiyoye.adjusted.p.value',
 'Sabbatini2015.Raji.CS',
 'Sabbatini2015.Raji.adjusted.p.value',
 'Sabbatini2017.EOL.1',
 'Sabbatini2017.HEL',
 'Sabbatini2017.MOLM.13',
 'Sabbat

### Remove Threshold Column

In [13]:
# Drop the one threshold column that is still present after the zero filter.
gwide_clean = gwide_cleaned_zero.drop(columns=['Sabbatini2017.MV411.threshold'])

In [14]:
# Mapping from the original column names (keys) to shortened names (values),
# passed to DataFrame.rename in the following cell. 'Gene', 'chromosome' and
# 'HSC.logCPM' have no entry and therefore keep their names.
# NOTE(review): 'Elle18.HPNE.Combined.pvalue' capitalises "Combined" while the
# HMEC counterpart uses 'combined', and the Elledge2019 columns get an 'Elle18'
# prefix. Left untouched because the metadata table appears to be keyed on
# these short names -- confirm before normalising.
gwide_col_names = {'Brummelkamp.hap1.GTS.ratio': 'Brum.HAP1.GTS',
                   'Brummelkamp.kbm7.GTS.ratio': 'Brum.KBM7.GTS',
                   'Chen2019.pos.score': 'Chen.cs',
                   'Chen2019.pos.fdr': 'Chen.fdr', 
                   'Chen2019.pos.p.value': 'Chen.pvalue',
                   'Chen2019.pos.rank': 'Chen.ts.rank',
                   'Doench2018.Average.LFC': 'Doen.avg.LFC',
                   'Doench2018.Average.nlog.p.values.': 'Doen.avg.neg.log.pvalue',
                   'Elledge2013.TUSON_p_value_TSG': 'Elle13.pvalue',
                   'Elledge2013.TUSON_q_value_TSG': 'Elle13.fdr',
                   'Elledge2013.TSG_Probability_LASSO': 'Elle13.lasso.prob',
                   'Elledge2019.HMEC.Average.Log2FC.Drop': 'Elle18.HMEC.avg.LFC',
                   'Elledge2019.HMEC.Combined.pvalue.drop': 'Elle18.HMEC.combined.pvalue', 
                   'Elledge2019.HMEC.FDR.drop': 'Elle18.HMEC.fdr',
                   'Elledge2019.HPNE.Average.Log2.Drop': 'Elle18.HPNE.avg.LFC',
                   'Elledge2019.HPNE.Combined.pvalue.drop': 'Elle18.HPNE.Combined.pvalue',
                   'Elledge2019.HPNE.FDR.drop': 'Elle18.HPNE.fdr',
                   'Sabbatini2015.KBM7.CS': 'Sabb15.KBM7.cs',
                   'Sabbatini2015.KBM7.adjusted.p.value': 'Sabb15.KBM7.fdr',
                   'Sabbatini2015.K562.CS': 'Sabb15.K562.cs',
                   'Sabbatini2015.K562.adjusted.p.value': 'Sabb15.K562.fdr',
                   'Sabbatini2015.Jiyoye.CS': 'Sabb15.Jiyoye.cs', 
                   'Sabbatini2015.Jiyoye.adjusted.p.value': 'Sabb15.Jiyoye.fdr',
                   'Sabbatini2015.Raji.CS': 'Sabb15.Raji.cs',
                   'Sabbatini2015.Raji.adjusted.p.value': 'Sabb15.Raji.fdr',
                   'Sabbatini2017.EOL.1': 'Sabb17.EOL', 
                   'Sabbatini2017.HEL': 'Sabb17.HEL',
                   'Sabbatini2017.MOLM.13': 'Sabb17.MOLM13',
                   'Sabbatini2017.MonoMac1': 'Sabb17.MonoMac1',
                   'Sabbatini2017.NB4.rep1': 'Sabb17.NB4.rep1',
                   'Sabbatini2017.NB4.rep2': 'Sabb17.NB4.rep2', 
                   'Sabbatini2017.OCI.AML2': 'Sabb17.OCI.AML2',
                   'Sabbatini2017.OCI.AML3': 'Sabb17.OCI.AML3', 
                   'Sabbatini2017.OCI.AML5': 'Sabb17.OCI.AML5',
                   'Sabbatini2017.P31.FUJ': 'Sabb17.P31.FUJ', 
                   'Sabbatini2017.PL.21': 'Sabb17.PL21', 
                   'Sabbatini2017.SKM.1': 'Sabb17.SKM1',
                   'Sabbatini2017.TF.1': 'Sabb17.TF1', 
                   'Sabbatini2017.THP.1': 'Sabb17.THP1',
                   'Weissman2014.CRISPRa.Growth.phenotype..mean.of.top.3.gammas.': 'Weis14.csa.avg',
                   'Rep1_2.Mann.Whitney.p.value': 'Weis16.csa.MW.pvalue',
                   'average.phenotype.of.strongest.3.2': 'Weis16.csa.avg',
                   'Rep1_2.Mann.Whitney.p.value.1': 'Weis16.csi.MW.pvalue',
                   'average.phenotype.of.strongest.3.2.1': 'Weis16.csi.avg'}

In [15]:
# Apply the short column names and list the resulting columns.
gwide_clean_renamed = gwide_clean.rename(gwide_col_names, axis='columns')
list(gwide_clean_renamed.columns)

['Gene',
 'chromosome',
 'HSC.logCPM',
 'Brum.HAP1.GTS',
 'Brum.KBM7.GTS',
 'Chen.cs',
 'Chen.pvalue',
 'Chen.fdr',
 'Chen.ts.rank',
 'Weis14.csa.avg',
 'Doen.avg.LFC',
 'Doen.avg.neg.log.pvalue',
 'Elle13.pvalue',
 'Elle13.fdr',
 'Elle13.lasso.prob',
 'Elle18.HMEC.avg.LFC',
 'Elle18.HMEC.combined.pvalue',
 'Elle18.HMEC.fdr',
 'Elle18.HPNE.avg.LFC',
 'Elle18.HPNE.Combined.pvalue',
 'Elle18.HPNE.fdr',
 'Sabb15.KBM7.cs',
 'Sabb15.KBM7.fdr',
 'Sabb15.K562.cs',
 'Sabb15.K562.fdr',
 'Sabb15.Jiyoye.cs',
 'Sabb15.Jiyoye.fdr',
 'Sabb15.Raji.cs',
 'Sabb15.Raji.fdr',
 'Sabb17.EOL',
 'Sabb17.HEL',
 'Sabb17.MOLM13',
 'Sabb17.MonoMac1',
 'Sabb17.NB4.rep1',
 'Sabb17.NB4.rep2',
 'Sabb17.OCI.AML2',
 'Sabb17.OCI.AML3',
 'Sabb17.OCI.AML5',
 'Sabb17.P31.FUJ',
 'Sabb17.PL21',
 'Sabb17.SKM1',
 'Sabb17.TF1',
 'Sabb17.THP1',
 'Weis16.csa.MW.pvalue',
 'Weis16.csa.avg',
 'Weis16.csi.MW.pvalue',
 'Weis16.csi.avg']

In [16]:
# Persist the cleaned, renamed table (the row index is written as the first column).
gwide_clean_renamed.to_csv(
    path_or_buf='../../data/Genomewide_master_list_clean_renamed.csv',
)

### Gwide data median imputation 

In [17]:
def nan_to_median(df, id_cols=('Gene', 'chromosome')):
    """Return a copy of ``df`` with NaNs replaced by the column median.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the identifier columns named in ``id_cols`` plus
        numeric feature columns.
    id_cols : sequence of str, optional
        Columns that are neither used for the median computation nor filled.
        Defaults to ``('Gene', 'chromosome')``, the columns the original
        implementation hard-coded.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns; ``df`` itself is not modified.
        A column that is entirely NaN has a NaN median and stays NaN.

    Raises
    ------
    KeyError
        If one of ``id_cols`` is not a column of ``df``.
    """
    # Medians are computed once over the non-identifier columns; NaNs are
    # skipped by DataFrame.median.
    medians = df.drop(columns=list(id_cols)).median()
    # fillna with a Series fills column by column and returns a new frame;
    # columns without an entry in `medians` (the identifiers) are left as-is.
    return df.fillna(medians)

In [18]:
# Impute missing feature values with the per-column median and preview the result.
gwide_median_imputation = nan_to_median(df=gwide_clean_renamed)
gwide_median_imputation.head(n=5)

Unnamed: 0,Gene,chromosome,HSC.logCPM,Brum.HAP1.GTS,Brum.KBM7.GTS,Chen.cs,Chen.pvalue,Chen.fdr,Chen.ts.rank,Weis14.csa.avg,...,Sabb17.OCI.AML5,Sabb17.P31.FUJ,Sabb17.PL21,Sabb17.SKM1,Sabb17.TF1,Sabb17.THP1,Weis16.csa.MW.pvalue,Weis16.csa.avg,Weis16.csi.MW.pvalue,Weis16.csi.avg
0,IGDCC3,15,5.220124,0.503403,0.534752,0.89178,0.89137,0.999997,16865.0,-0.249164,...,0.004,0.028,-0.387,-0.292,-0.053,-0.13,2.32e-07,-0.211542,0.692807,0.004401
1,HSF2BP,21,5.220124,0.514247,0.521775,0.62112,0.62094,0.999997,11749.0,0.082008,...,0.184,0.19,0.148,-0.169,-0.555,-0.221,0.04682615,-0.006646,0.789091,-0.007323
2,ADGRA2,8,5.220124,0.494457,0.5,0.18968,0.26085,0.997942,4975.0,-0.008099,...,-0.21,-0.104,-0.231,-0.128,-0.201,-0.138,0.4652979,-0.004572,0.404651,-0.007496
3,TKFC,11,5.220124,0.494457,0.5,0.94001,0.93984,0.999997,17799.0,-0.008099,...,-0.21,-0.104,-0.231,-0.128,-0.201,-0.138,0.4652979,-0.004572,0.404651,-0.007496
4,ERMARD,6,5.220124,0.494457,0.5,0.28938,0.32906,0.999997,6263.0,-0.005987,...,0.107,-0.245,-0.079,0.095,-0.349,-0.07,0.4053098,0.010147,0.462927,-0.012278


In [19]:
# Persist the imputed table (the row index is written as the first column).
gwide_median_imputation.to_csv(
    path_or_buf='../../data/Genomewide_master_list_imputed.csv',
)

## Create Design Matrix for Train and Test Set

### load cancer type data

In [21]:
# Read the cancer-type metadata sheet and index it by the feature (variable) name,
# which is stored in the sheet's unnamed first column.
metadata = (
    pd.read_excel('../../data/cancer_type_metadata.xlsx')
    .rename(columns={"Unnamed: 0": 'variable'})
    .set_index("variable")
)

In [22]:
# Show the metadata table (indexed by `variable`).
metadata

Unnamed: 0_level_0,AML,Burkitt's Lymphoma,CML,breast,pancreatic,colon,melanoma,pan cancer
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Brum.HAP1.GTS,0,0,1,0,0,0,0,0
Brum.KBM7.GTS,0,0,1,0,0,0,0,0
Chen.cs,0,0,0,0,1,0,0,0
Chen.pvalue,0,0,0,0,1,0,0,0
Chen.fdr,0,0,0,0,1,0,0,0
Chen.ts.rank,0,0,0,0,1,0,0,0
Weis14.csa.avg,0,0,1,0,0,0,0,0
Doen.avg.LFC,0,0,0,0,0,1,1,0
Doen.avg.neg.log.pvalue,0,0,0,0,0,1,1,0
Elle13.pvalue,0,0,0,0,0,0,0,1


### load classification labels from TSGene

In [23]:
# Keep only the gene-symbol column of the TSGene table, renamed to 'Gene'
# so it matches the key column of the genome-wide table.
ts_list = (
    pd.read_csv('../../data/TSGene_all.csv')
    .loc[:, ['GeneSymbol']]
    .rename(columns={'GeneSymbol': 'Gene'})
)

In [24]:
# Every gene listed in ts_list gets the label 1.
ts_list['TSGene'] = 1

In [25]:
# Preview the first five labelled genes.
ts_list.head(n=5)

Unnamed: 0,Gene,TSGene
0,ACHE,1
1,ACY1,1
2,ADARB1,1
3,ADPRH,1
4,PARP1,1


### load Elledge 2013 supplement

In [None]:
# Read the Elledge 2013 supplementary spreadsheet.
elledge = pd.read_excel(
    io='../../data/gwide_hema_classification/elledge2013supplement.xlsx',
)

In [None]:
# Preview only the first rows instead of rendering the whole spreadsheet.
elledge.head()

In [26]:
# Features flagged for AML or CML in the metadata table, preceded by the gene key.
is_aml_or_cml = metadata["AML"].eq(1) | metadata["CML"].eq(1)
var = metadata.loc[is_aml_or_cml].index.tolist()
cols = ['Gene', *var]

In [27]:
# Build the training matrix: the gene key, the AML/CML feature columns and a
# binary TSGene label (1 when the gene symbol occurs in ts_list, 0 otherwise).
# A membership test replaces the left merge so that a gene symbol repeated in
# ts_list cannot silently duplicate rows of the training matrix.
train = gwide_median_imputation[cols].reset_index(drop=True)
train['TSGene'] = train['Gene'].isin(ts_list['Gene']).astype(int)
train.head()

Unnamed: 0,Gene,Brum.HAP1.GTS,Brum.KBM7.GTS,Weis14.csa.avg,Sabb15.KBM7.cs,Sabb15.KBM7.fdr,Sabb15.K562.cs,Sabb15.K562.fdr,Sabb17.EOL,Sabb17.HEL,...,Sabb17.P31.FUJ,Sabb17.PL21,Sabb17.SKM1,Sabb17.TF1,Sabb17.THP1,Weis16.csa.MW.pvalue,Weis16.csa.avg,Weis16.csi.MW.pvalue,Weis16.csi.avg,TSGene
0,IGDCC3,0.503403,0.534752,-0.249164,-0.471,0.284774,-0.137,0.731552,-0.143,-0.011,...,0.028,-0.387,-0.292,-0.053,-0.13,2.32e-07,-0.211542,0.692807,0.004401,0
1,HSF2BP,0.514247,0.521775,0.082008,0.064,0.715829,-0.046,0.532607,-0.163,-0.314,...,0.19,0.148,-0.169,-0.555,-0.221,0.04682615,-0.006646,0.789091,-0.007323,0
2,ADGRA2,0.494457,0.5,-0.008099,-0.014,0.496703,-0.225,0.435504,-0.198,-0.074,...,-0.104,-0.231,-0.128,-0.201,-0.138,0.4652979,-0.004572,0.404651,-0.007496,0
3,TKFC,0.494457,0.5,-0.008099,-0.014,0.496703,-0.225,0.435504,-0.198,-0.074,...,-0.104,-0.231,-0.128,-0.201,-0.138,0.4652979,-0.004572,0.404651,-0.007496,0
4,ERMARD,0.494457,0.5,-0.005987,0.05,0.5884,-0.259,0.978208,-0.65,0.095,...,-0.245,-0.079,0.095,-0.349,-0.07,0.4053098,0.010147,0.462927,-0.012278,0


In [29]:
# Load the imputed chromosome-7 table and keep the same columns as the training matrix.
chr7 = pd.read_csv(filepath_or_buffer='../../data/chr7_imputed.csv')
test = chr7.loc[:, cols]

In [28]:
# `train` is already built by the identical cell earlier in this section;
# recomputing it here was a copy-paste duplicate, so only the preview is kept.
train.head()

Unnamed: 0,Gene,Brum.HAP1.GTS,Brum.KBM7.GTS,Weis14.csa.avg,Sabb15.KBM7.cs,Sabb15.KBM7.fdr,Sabb15.K562.cs,Sabb15.K562.fdr,Sabb17.EOL,Sabb17.HEL,...,Sabb17.P31.FUJ,Sabb17.PL21,Sabb17.SKM1,Sabb17.TF1,Sabb17.THP1,Weis16.csa.MW.pvalue,Weis16.csa.avg,Weis16.csi.MW.pvalue,Weis16.csi.avg,TSGene
0,IGDCC3,0.503403,0.534752,-0.249164,-0.471,0.284774,-0.137,0.731552,-0.143,-0.011,...,0.028,-0.387,-0.292,-0.053,-0.13,2.32e-07,-0.211542,0.692807,0.004401,0
1,HSF2BP,0.514247,0.521775,0.082008,0.064,0.715829,-0.046,0.532607,-0.163,-0.314,...,0.19,0.148,-0.169,-0.555,-0.221,0.04682615,-0.006646,0.789091,-0.007323,0
2,ADGRA2,0.494457,0.5,-0.008099,-0.014,0.496703,-0.225,0.435504,-0.198,-0.074,...,-0.104,-0.231,-0.128,-0.201,-0.138,0.4652979,-0.004572,0.404651,-0.007496,0
3,TKFC,0.494457,0.5,-0.008099,-0.014,0.496703,-0.225,0.435504,-0.198,-0.074,...,-0.104,-0.231,-0.128,-0.201,-0.138,0.4652979,-0.004572,0.404651,-0.007496,0
4,ERMARD,0.494457,0.5,-0.005987,0.05,0.5884,-0.259,0.978208,-0.65,0.095,...,-0.245,-0.079,0.095,-0.349,-0.07,0.4053098,0.010147,0.462927,-0.012278,0


In [None]:
# Preview the first five rows of the test matrix.
test.head(n=5)