# Classification Train-test data processing

In [1]:
# Author: Shirley Zhou

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Exclude non-coding genes from the dataset

__Since `biomaRt` does not function well, the lists of protein-coding genes in the train and test sets were obtained from Weihan.__

In [3]:
# Load the genome-wide protein-coding gene table and drop the index/identifier
# columns listed below.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the warning fragment left in this cell's output looks like
# pandas' mixed-type DtypeWarning; confirm it disappears after this change.
gwide = (pd.read_csv('../../data/gwide_hema_classification/Genomewide_coding.csv',
                     low_memory=False)
         .drop(columns=["Unnamed: 0", "GeneID", "Aliases", "X", "X.1"]))

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Preview the first five rows of the loaded table.
gwide.head(n=5)

Unnamed: 0,Gene,chromosome,HSC.logCPM,Brummelkamp.hap1.GTS.ratio,Brummelkamp.hap1.GTS.threshold,Brummelkamp.kbm7.GTS.ratio,Brummelkamp.kbm7.GTS.threshold,Chen2019.pos.score,Chen2019.pos.p.value,Chen2019.pos.p.value.threshold,...,Sabbatini2017.TF.1.threshold,Sabbatini2017.THP.1,Sabbatini2017.THP.1.threshold,Wallace2016.DEseq2.Log2.Fold.Change,Wallace2016.DEseq2.P.Value,Wallace2016.threshold,Rep1_2.Mann.Whitney.p.value,average.phenotype.of.strongest.3.2,Rep1_2.Mann.Whitney.p.value.1,average.phenotype.of.strongest.3.2.1
0,IGDCC3,15,,0.503403,0,0.534752,0,0.89178,0.89137,0,...,0,-0.13,0,,,0,2.32e-07,-0.211542,0.692807,0.004401
1,HSF2BP,21,,0.514247,0,0.521775,0,0.62112,0.62094,0,...,0,-0.221,0,,,0,0.04682615,-0.006646,0.789091,-0.007323
2,ADGRA2,8,,,0,,0,0.18968,0.26085,0,...,0,,0,,,0,,,,
3,TKFC,11,,,0,,0,0.94001,0.93984,0,...,0,,0,,,0,,,,
4,ERMARD,6,,,0,,0,0.28938,0.32906,0,...,0,-0.07,0,,,0,0.4053098,0.010147,0.462927,-0.012278


## Deletion of column data

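The column `Weissman2014.CRISPRi.Growth.phenotype..mean.of.top.3.gammas.` will be deleted because most of the hits in this CRISPRi screen are underpowered. Its CRISPRa counterpart, `Weissman2014.CRISPRa.Growth.phenotype..mean.of.top.3.gammas.`, is deleted in the same step.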

In [5]:
# Remove the two Weissman2014 growth-phenotype columns (CRISPRi and CRISPRa).
# errors="ignore" keeps this cell re-runnable: `gwide` is reassigned to itself,
# so a second execution would otherwise raise KeyError because the columns
# are already gone.
gwide = gwide.drop(columns=['Weissman2014.CRISPRi.Growth.phenotype..mean.of.top.3.gammas.',
                            'Weissman2014.CRISPRa.Growth.phenotype..mean.of.top.3.gammas.'],
                   errors="ignore")

In [6]:
# Show the column names that remain after the drops above.
list(gwide.columns)

['Gene',
 'chromosome',
 'HSC.logCPM',
 'Brummelkamp.hap1.GTS.ratio',
 'Brummelkamp.hap1.GTS.threshold',
 'Brummelkamp.kbm7.GTS.ratio',
 'Brummelkamp.kbm7.GTS.threshold',
 'Chen2019.pos.score',
 'Chen2019.pos.p.value',
 'Chen2019.pos.p.value.threshold',
 'Chen2019.pos.fdr',
 'Chen2019.pos.rank',
 'Weissman2014.CRISPRa.Growth.phenotype.threshold',
 'Doench2018.Average.LFC',
 'Doench2018.Average.nlog.p.values.',
 'Doench2018.Average.nlog.p.values.threshold',
 'Elledge2013.TUSON_p_value_TSG',
 'Elledge2013.TUSON_p_value_TSG.threshold',
 'Elledge2013.TUSON_q_value_TSG',
 'Elledge2013.TSG_Probability_LASSO',
 'Elledge2019.HMEC.Average.Log2FC.Drop',
 'Elledge2019.HMEC.Combined.pvalue.drop',
 'Elledge2019.HMEC.FDR.drop',
 'Elledge2019.HMEC.Average.threshold',
 'Elledge2019.HPNE.Average.Log2.Drop',
 'Elledge2019.HPNE.Combined.pvalue.drop',
 'Elledge2019.HPNE.FDR.drop',
 'Elledge2019.HPNE.Average.threshold',
 'Sabbatini2015.KBM7.CS',
 'Sabbatini2015.KBM7.adjusted.p.value',
 'Sabbatini2015.KBM7.

## Cleaning of gwide data set

In [8]:
def percent_nan(list):
    """Return the fraction of entries in a 1-D numeric sequence that are NaN.

    Parameters
    ----------
    list : array-like
        Numeric values, e.g. one DataFrame column. The name shadows the
        built-in ``list``; it is kept so keyword callers keep working.

    Returns
    -------
    float
        Number of NaN entries divided by ``len(list)``. NaN is returned for
        an empty input, where the fraction is undefined.
    """
    total = len(list)
    if total == 0:
        # Guard against 0/0 instead of raising ZeroDivisionError.
        return float("nan")
    # Summing the boolean mask with .sum() counts the NaNs in vectorized code
    # rather than looping element by element with the built-in sum().
    return np.isnan(list).sum() / total

def percent_zero(list):
    """Return the fraction of entries in a 1-D sequence that are zero.

    Parameters
    ----------
    list : array-like
        Values to inspect, e.g. one DataFrame column. The name shadows the
        built-in ``list``; it is kept so keyword callers keep working.

    Returns
    -------
    float
        ``(len(list) - number of non-zero entries) / len(list)``. NaN entries
        count as non-zero, so they do not add to the zero fraction. NaN is
        returned for an empty input, where the fraction is undefined.
    """
    total = len(list)
    if total == 0:
        # Guard against 0/0 instead of raising ZeroDivisionError.
        return float("nan")
    return (total - np.count_nonzero(list)) / total

### Check NaN columns

In [9]:
# Fraction of NaN values per column, computed on everything except the
# 'Gene' and 'chromosome' columns.
nan_check = gwide.drop(columns=['Gene', 'chromosome']).apply(percent_nan)
# Display the 30 columns with the highest NaN fraction.
nan_check.sort_values(ascending=False).head(30)

Wallace2016.DEseq2.P.Value               0.950480
Wallace2016.DEseq2.Log2.Fold.Change      0.950480
Sabbatini2015.GTS                        0.765200
HSC.logCPM                               0.532756
Elledge2019.HPNE.FDR.drop                0.294387
Elledge2019.HPNE.Combined.pvalue.drop    0.294387
Elledge2019.HPNE.Average.Log2.Drop       0.294387
Elledge2019.HMEC.Average.Log2FC.Drop     0.292933
Elledge2019.HMEC.FDR.drop                0.292933
Elledge2019.HMEC.Combined.pvalue.drop    0.292933
Brummelkamp.kbm7.GTS.ratio               0.221694
Brummelkamp.hap1.GTS.ratio               0.174861
Sabbatini2015.Jiyoye.CS                  0.101727
Sabbatini2015.Raji.CS                    0.101727
Sabbatini2015.Raji.adjusted.p.value      0.101727
Sabbatini2015.K562.adjusted.p.value      0.101727
Sabbatini2015.K562.CS                    0.101727
Sabbatini2015.Jiyoye.adjusted.p.value    0.101727
Sabbatini2015.KBM7.adjusted.p.value      0.101727
Sabbatini2015.KBM7.CS                    0.101727


In [10]:
# Drop every column whose NaN fraction is above 0.75.
gwide_cleaned_nan = gwide.drop(columns=nan_check.loc[nan_check.gt(0.75)].index)

### Check Zero Variance Column

In [12]:
# Variance of each column, leaving out 'Gene' and 'chromosome'.
col_var = gwide_cleaned_nan.drop(columns=['Gene', 'chromosome']).apply(np.var)
# Remove the columns whose variance is exactly zero.
gwide_cleaned_var = gwide_cleaned_nan.drop(columns=col_var.loc[col_var.eq(0)].index)

### Check Zero Value Column

In [14]:
# Fraction of zero values per column, computed on everything except the
# 'Gene' and 'chromosome' columns.
zeros_check = gwide_cleaned_var.drop(columns=['Gene', 'chromosome']).apply(percent_zero)
# Display the 30 columns with the highest zero fraction.
zeros_check.sort_values(ascending=False).head(30)

Sabbatini2017.NB4.rep2.threshold                   0.999978
Sabbatini2017.HEL.threshold                        0.999978
Sabbatini2017.OCI.AML2.threshold                   0.999956
Sabbatini2017.TF.1.threshold                       0.999890
Sabbatini2017.MonoMac1.threshold                   0.999868
Sabbatini2017.PL.21.threshold                      0.999846
Sabbatini2017.THP.1.threshold                      0.999802
Sabbatini2017.MV412                                0.999802
Sabbatini2017.OCI.AML5.threshold                   0.999802
Sabbatini2017.EOL.1.threshold                      0.999537
Sabbatini2017.OCI.AML3.threshold                   0.999141
Sabbatini2017.SKM.threshold                        0.999097
Sabbatini2017.MOLM.13.threshold                    0.999097
Sabbatini2015.Jiyoye.threshold                     0.998039
Sabbatini2015.KBM7.threshold                       0.997423
Sabbatini2015.Raji.threshold                       0.993656
Weissman2014.CRISPRa.Growth.phenotype.th

In [15]:
# Drop every column whose zero fraction is above 0.6, then list what is left.
gwide_cleaned_zero = gwide_cleaned_var.drop(columns=zeros_check.loc[zeros_check.gt(0.6)].index)
list(gwide_cleaned_zero.columns)

['Gene',
 'chromosome',
 'HSC.logCPM',
 'Brummelkamp.hap1.GTS.ratio',
 'Brummelkamp.kbm7.GTS.ratio',
 'Chen2019.pos.score',
 'Chen2019.pos.p.value',
 'Chen2019.pos.fdr',
 'Chen2019.pos.rank',
 'Doench2018.Average.LFC',
 'Doench2018.Average.nlog.p.values.',
 'Elledge2013.TUSON_p_value_TSG',
 'Elledge2013.TUSON_q_value_TSG',
 'Elledge2013.TSG_Probability_LASSO',
 'Elledge2019.HMEC.Average.Log2FC.Drop',
 'Elledge2019.HMEC.Combined.pvalue.drop',
 'Elledge2019.HMEC.FDR.drop',
 'Elledge2019.HPNE.Average.Log2.Drop',
 'Elledge2019.HPNE.Combined.pvalue.drop',
 'Elledge2019.HPNE.FDR.drop',
 'Sabbatini2015.KBM7.CS',
 'Sabbatini2015.KBM7.adjusted.p.value',
 'Sabbatini2015.K562.CS',
 'Sabbatini2015.K562.adjusted.p.value',
 'Sabbatini2015.Jiyoye.CS',
 'Sabbatini2015.Jiyoye.adjusted.p.value',
 'Sabbatini2015.Raji.CS',
 'Sabbatini2015.Raji.adjusted.p.value',
 'Sabbatini2017.EOL.1',
 'Sabbatini2017.HEL',
 'Sabbatini2017.MOLM.13',
 'Sabbatini2017.MonoMac1',
 'Sabbatini2017.MV411.threshold',
 'Sabbatini2