In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob
# NOTE(review): `files` is not referenced by any other cell visible in this
# notebook, and "path/" looks like a placeholder directory - confirm whether
# this line is still needed.
files = glob.glob("path/*.csv")

import gc
import itertools
from copy import deepcopy

from tqdm import tqdm

from scipy.stats import ks_2samp

from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection

from sklearn import manifold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
%matplotlib inline

In [11]:
# Load every training fold into a dict keyed by the file name without its
# extension, e.g. 'train_fold_1' -> DataFrame.
#
# The glob is restricted to 'train_fold_*.csv' on purpose: a plain substring
# check for "train" also matches 'imputed_training_data.csv' and the
# 'cleaned_imputed_training_data.csv' file written by the last cell, so those
# unrelated files would be parsed and kept in this dict as well.
train_folds = {
    filename[:-4]: pd.read_csv(filename)  # [:-4] strips the '.csv' suffix
    for filename in sorted(glob.glob('train_fold_*.csv'))
}


In [12]:
# Load every validation fold into a dict keyed by the file name without its
# extension, e.g. 'val_fold_1' -> DataFrame.
#
# The glob is restricted to 'val_fold_*.csv' so that only the fold files are
# read; a bare substring check for "val" would also pick up any other CSV in
# the working directory whose name happens to contain those three letters.
test_folds = {
    filename[:-4]: pd.read_csv(filename)  # [:-4] strips the '.csv' suffix
    for filename in sorted(glob.glob('val_fold_*.csv'))
}

In [13]:
# Unpack the ten training folds from `train_folds` into the individual
# variables train1 ... train10 that the cells below refer to by name.
(train1, train2, train3, train4, train5,
 train6, train7, train8, train9, train10) = (
    pd.DataFrame.from_dict(train_folds['train_fold_%d' % fold_no])
    for fold_no in range(1, 11)
)


In [14]:
# Unpack the ten validation folds from `test_folds` into the individual
# variables test1 ... test10 that the cells below refer to by name.
(test1, test2, test3, test4, test5,
 test6, test7, test8, test9, test10) = (
    pd.DataFrame.from_dict(test_folds['val_fold_%d' % fold_no])
    for fold_no in range(1, 11)
)

In [16]:
# Every fold frame (training and validation) has the same two columns removed
# by position: the first column (position 0) and the one at position 59.
# NOTE(review): the drop is positional and in place, so this cell is meant to
# run once per kernel; running it again on the already-trimmed frames would
# address positions that no longer exist.
groupDF = [
    train1, train2, train3, train4, train5,
    train6, train7, train8, train9, train10,
    test1, test2, test3, test4, test5,
    test6, test7, test8, test9, test10,
]
for fold_frame in groupDF:
    unwanted_labels = fold_frame.columns[[0, 59]]
    # inplace=True so the change is visible through train1 ... test10.
    fold_frame.drop(unwanted_labels, axis=1, inplace=True)

In [17]:
# Find the columns where the distributions are very different between the
# training and validation parts of fold 1.
threshold = 0.1
diff_data1 = []
for col in tqdm(train1.columns):
    # Two-sample Kolmogorov-Smirnov test on this feature: train vs. validation.
    statistic, pvalue = ks_2samp(
        train1[col].values,
        test1[col].values
    )
    # A feature is flagged only when the test is significant (p <= 0.01) AND
    # the KS statistic exceeds `threshold`.
    if pvalue <= 0.01 and np.abs(statistic) > threshold:
        diff_data1.append({'feature': col, 'p': np.round(pvalue, 5), 'statistic': np.round(np.abs(statistic), 2)})

# Put the differences into a dataframe. The column list is passed explicitly
# so that a fold with no flagged feature yields an empty frame that still has
# a 'statistic' column; without it sort_values raises KeyError: 'statistic'.
diff_df1 = pd.DataFrame(
    diff_data1, columns=['feature', 'p', 'statistic']
).sort_values(by='statistic', ascending=False)

diff_df1

100%|██████████| 58/58 [00:00<00:00, 489.56it/s]


Unnamed: 0,feature,p,statistic
0,Brain - Amygdala_GTExTPM,0.00881,0.31


In [18]:
# Flag the features whose distribution differs between the training and
# validation parts of fold 2.
threshold = 0.1
diff_data2 = []
# Iterate over this fold's own columns (train2) so the loop never depends on
# another fold's schema.
for col in tqdm(train2.columns):
    # Two-sample Kolmogorov-Smirnov test on this feature: train vs. validation.
    statistic, pvalue = ks_2samp(
        train2[col].values,
        test2[col].values
    )
    # A feature is flagged only when the test is significant (p <= 0.01) AND
    # the KS statistic exceeds `threshold`.
    if pvalue <= 0.01 and np.abs(statistic) > threshold:
        diff_data2.append({'feature': col, 'p': np.round(pvalue, 5), 'statistic': np.round(np.abs(statistic), 2)})

# Put the differences into a dataframe. The column list is passed explicitly
# so that a fold with no flagged feature yields an empty frame that still has
# a 'statistic' column; without it sort_values raises KeyError: 'statistic'.
diff_df2 = pd.DataFrame(
    diff_data2, columns=['feature', 'p', 'statistic']
).sort_values(by='statistic', ascending=False)

diff_df2

100%|██████████| 58/58 [00:00<00:00, 498.93it/s]


Unnamed: 0,feature,p,statistic
0,Kidney - Cortex_GTExTPM,0.00123,0.36
2,Liver_GTExTPM,0.00228,0.35
5,Adrenal Gland_GTExTPM,0.00174,0.35
3,Testis_GTExTPM,0.00313,0.34
4,Pancreas_GTExTPM,0.00396,0.33
1,Kidney - Medulla_GTExTPM,0.00794,0.31


In [19]:
# Compare the training and validation parts of fold 3 feature by feature.
# Flagged features get their p-value and KS statistic; every other feature
# gets a 'No significant difference' placeholder row.
threshold = 0.1
diff_data3 = []
for col in tqdm(train3.columns):
    # Two-sample Kolmogorov-Smirnov test on this feature: train vs. validation.
    statistic, pvalue = ks_2samp(
        train3[col].values,
        test3[col].values
    )
    if pvalue <= 0.01 and np.abs(statistic) > threshold:
        diff_data3.append({'feature': col, 'p': np.round(pvalue, 5), 'statistic': np.round(np.abs(statistic), 2)})
    else:
        diff_data3.append({'feature': col,'statistic':'No significant difference'})

# Put the differences into a dataframe
diff_df3 = pd.DataFrame(diff_data3)
# The 'statistic' column can hold both floats and the placeholder string, and
# sorting such a column directly raises TypeError (str vs. float comparison).
# Order by the numeric view instead: placeholders become NaN and go last,
# flagged features come first with the largest statistic on top.
diff_df3 = diff_df3.loc[
    pd.to_numeric(diff_df3['statistic'], errors='coerce')
    .sort_values(ascending=False)
    .index
]

diff_df3

100%|██████████| 58/58 [00:00<00:00, 499.11it/s]


Unnamed: 0,feature,statistic
0,wgEncodeBroadHmmHuvecHMM.count,No significant difference
43,Cervix - Endocervix_GTExTPM,No significant difference
31,Adipose - Subcutaneous_GTExTPM,No significant difference
32,Adrenal Gland_GTExTPM,No significant difference
33,Artery - Aorta_GTExTPM,No significant difference
34,Artery - Coronary_GTExTPM,No significant difference
35,Artery - Tibial_GTExTPM,No significant difference
36,Bladder_GTExTPM,No significant difference
37,Brain - Amygdala_GTExTPM,No significant difference
38,Brain - Anterior cingulate cortex (BA24)_GTExTPM,No significant difference


In [20]:
# Compare the training and validation parts of fold 4 feature by feature.
# Flagged features get their p-value and KS statistic; every other feature
# gets a 'No significant difference' placeholder row.
threshold = 0.1
diff_data4 = []
# NOTE(review): nondiff_data4 is created here but nothing in this cell ever
# appends to it - confirm whether it can be removed.
nondiff_data4 = []
for col in tqdm(train4.columns):
    # Two-sample Kolmogorov-Smirnov test on this feature: train vs. validation.
    statistic, pvalue = ks_2samp(
        train4[col].values,
        test4[col].values
    )
    if pvalue <= 0.01 and np.abs(statistic) > threshold:
        diff_data4.append({'feature': col, 'p': np.round(pvalue, 5), 'statistic': np.round(np.abs(statistic), 2)})
    else:
        diff_data4.append({'feature': col,'statistic':'No significant difference'})

# Put the differences into a dataframe
diff_df4 = pd.DataFrame(diff_data4)
# The 'statistic' column can hold both floats and the placeholder string, and
# sorting such a column directly raises TypeError (str vs. float comparison).
# Order by the numeric view instead: placeholders become NaN and go last,
# flagged features come first with the largest statistic on top.
diff_df4 = diff_df4.loc[
    pd.to_numeric(diff_df4['statistic'], errors='coerce')
    .sort_values(ascending=False)
    .index
]
diff_df4

100%|██████████| 58/58 [00:00<00:00, 498.39it/s]


Unnamed: 0,feature,statistic
0,wgEncodeBroadHmmHuvecHMM.count,No significant difference
43,Cervix - Endocervix_GTExTPM,No significant difference
31,Adipose - Subcutaneous_GTExTPM,No significant difference
32,Adrenal Gland_GTExTPM,No significant difference
33,Artery - Aorta_GTExTPM,No significant difference
34,Artery - Coronary_GTExTPM,No significant difference
35,Artery - Tibial_GTExTPM,No significant difference
36,Bladder_GTExTPM,No significant difference
37,Brain - Amygdala_GTExTPM,No significant difference
38,Brain - Anterior cingulate cortex (BA24)_GTExTPM,No significant difference


In [21]:
# Compare the training and validation parts of fold 5 feature by feature.
# Flagged features get their p-value and KS statistic; every other feature
# gets a 'No significant difference' placeholder row.
threshold = 0.1
diff_data5 = []
for col in tqdm(train5.columns):
    # Two-sample Kolmogorov-Smirnov test on this feature: train vs. validation.
    statistic, pvalue = ks_2samp(
        train5[col].values,
        test5[col].values
    )
    if pvalue <= 0.01 and np.abs(statistic) > threshold:
        diff_data5.append({'feature': col, 'p': np.round(pvalue, 5), 'statistic': np.round(np.abs(statistic), 2)})
    else:
        diff_data5.append({'feature': col,'statistic':'No significant difference'})

# Put the differences into a dataframe
diff_df5 = pd.DataFrame(diff_data5)
# The 'statistic' column can hold both floats and the placeholder string, and
# sorting such a column directly raises TypeError (str vs. float comparison).
# Order by the numeric view instead: placeholders become NaN and go last,
# flagged features come first with the largest statistic on top.
diff_df5 = diff_df5.loc[
    pd.to_numeric(diff_df5['statistic'], errors='coerce')
    .sort_values(ascending=False)
    .index
]

diff_df5

100%|██████████| 58/58 [00:00<00:00, 482.38it/s]


Unnamed: 0,feature,statistic
0,wgEncodeBroadHmmHuvecHMM.count,No significant difference
43,Cervix - Endocervix_GTExTPM,No significant difference
31,Adipose - Subcutaneous_GTExTPM,No significant difference
32,Adrenal Gland_GTExTPM,No significant difference
33,Artery - Aorta_GTExTPM,No significant difference
34,Artery - Coronary_GTExTPM,No significant difference
35,Artery - Tibial_GTExTPM,No significant difference
36,Bladder_GTExTPM,No significant difference
37,Brain - Amygdala_GTExTPM,No significant difference
38,Brain - Anterior cingulate cortex (BA24)_GTExTPM,No significant difference


In [22]:
# Compare the training and validation parts of fold 6 feature by feature.
# Flagged features get their p-value and KS statistic; every other feature
# gets a 'No significant difference' placeholder row.
threshold = 0.1
diff_data6 = []
# Iterate over this fold's own columns (train6) so the loop never depends on
# another fold's schema.
for col in tqdm(train6.columns):
    # Two-sample Kolmogorov-Smirnov test on this feature: train vs. validation.
    statistic, pvalue = ks_2samp(
        train6[col].values,
        test6[col].values
    )
    if pvalue <= 0.01 and np.abs(statistic) > threshold:
        diff_data6.append({'feature': col, 'p': np.round(pvalue, 5), 'statistic': np.round(np.abs(statistic), 2)})
    else:
        diff_data6.append({'feature': col,'statistic':'No significant difference'})

# Put the differences into a dataframe
diff_df6 = pd.DataFrame(diff_data6)
# The 'statistic' column can hold both floats and the placeholder string, and
# sorting such a column directly raises TypeError (str vs. float comparison).
# Order by the numeric view instead: placeholders become NaN and go last,
# flagged features come first with the largest statistic on top.
diff_df6 = diff_df6.loc[
    pd.to_numeric(diff_df6['statistic'], errors='coerce')
    .sort_values(ascending=False)
    .index
]

diff_df6

100%|██████████| 58/58 [00:00<00:00, 475.98it/s]


Unnamed: 0,feature,statistic
0,wgEncodeBroadHmmHuvecHMM.count,No significant difference
43,Cervix - Endocervix_GTExTPM,No significant difference
31,Adipose - Subcutaneous_GTExTPM,No significant difference
32,Adrenal Gland_GTExTPM,No significant difference
33,Artery - Aorta_GTExTPM,No significant difference
34,Artery - Coronary_GTExTPM,No significant difference
35,Artery - Tibial_GTExTPM,No significant difference
36,Bladder_GTExTPM,No significant difference
37,Brain - Amygdala_GTExTPM,No significant difference
38,Brain - Anterior cingulate cortex (BA24)_GTExTPM,No significant difference


In [23]:
# Flag the features whose distribution differs between the training and
# validation parts of fold 7.
threshold = 0.1
diff_data7 = []
for col in tqdm(train7.columns):
    # Two-sample Kolmogorov-Smirnov test on this feature: train vs. validation.
    statistic, pvalue = ks_2samp(
        train7[col].values,
        test7[col].values
    )
    # A feature is flagged only when the test is significant (p <= 0.01) AND
    # the KS statistic exceeds `threshold`.
    if pvalue <= 0.01 and np.abs(statistic) > threshold:
        diff_data7.append({'feature': col, 'p': np.round(pvalue, 5), 'statistic': np.round(np.abs(statistic), 2)})

# The column list is passed explicitly so that a fold with no flagged feature
# yields an empty frame that still has a 'statistic' column; without it
# sort_values raises KeyError: 'statistic'.
diff_df7 = pd.DataFrame(
    diff_data7, columns=['feature', 'p', 'statistic']
).sort_values(by='statistic', ascending=False)

diff_df7

100%|██████████| 58/58 [00:00<00:00, 505.23it/s]


Unnamed: 0,feature,p,statistic
0,Muscle - Skeletal_GTExTPM,0.00509,0.33
2,Brain - Hypothalamus_GTExTPM,0.00742,0.32
1,Brain - Caudate (basal ganglia)_GTExTPM,0.00965,0.31


In [24]:
# Compare the training and validation parts of fold 8 feature by feature.
# Flagged features get their p-value and KS statistic; every other feature
# gets a 'No significant difference' placeholder row.
threshold = 0.1
diff_data8 = []
# Iterate over this fold's own columns (train8) so the loop never depends on
# another fold's schema.
for col in tqdm(train8.columns):
    # Two-sample Kolmogorov-Smirnov test on this feature: train vs. validation.
    statistic, pvalue = ks_2samp(
        train8[col].values,
        test8[col].values
    )
    if pvalue <= 0.01 and np.abs(statistic) > threshold:
        diff_data8.append({'feature': col, 'p': np.round(pvalue, 5), 'statistic': np.round(np.abs(statistic), 2)})
    else:
        diff_data8.append({'feature': col,'statistic':'No significant difference'})

# Put the differences into a dataframe
diff_df8 = pd.DataFrame(diff_data8)
# The 'statistic' column can hold both floats and the placeholder string, and
# sorting such a column directly raises TypeError (str vs. float comparison).
# Order by the numeric view instead: placeholders become NaN and go last,
# flagged features come first with the largest statistic on top.
diff_df8 = diff_df8.loc[
    pd.to_numeric(diff_df8['statistic'], errors='coerce')
    .sort_values(ascending=False)
    .index
]

diff_df8

100%|██████████| 58/58 [00:00<00:00, 496.87it/s]


Unnamed: 0,feature,statistic
0,wgEncodeBroadHmmHuvecHMM.count,No significant difference
43,Cervix - Endocervix_GTExTPM,No significant difference
31,Adipose - Subcutaneous_GTExTPM,No significant difference
32,Adrenal Gland_GTExTPM,No significant difference
33,Artery - Aorta_GTExTPM,No significant difference
34,Artery - Coronary_GTExTPM,No significant difference
35,Artery - Tibial_GTExTPM,No significant difference
36,Bladder_GTExTPM,No significant difference
37,Brain - Amygdala_GTExTPM,No significant difference
38,Brain - Anterior cingulate cortex (BA24)_GTExTPM,No significant difference


In [25]:
# Compare the training and validation parts of fold 9 feature by feature.
# Flagged features get their p-value and KS statistic; every other feature
# gets a 'No significant difference' placeholder row.
threshold = 0.1
diff_data9 = []
for col in tqdm(train9.columns):
    # Two-sample Kolmogorov-Smirnov test on this feature: train vs. validation.
    statistic, pvalue = ks_2samp(
        train9[col].values,
        test9[col].values
    )
    if pvalue <= 0.01 and np.abs(statistic) > threshold:
        diff_data9.append({'feature': col, 'p': np.round(pvalue, 5), 'statistic': np.round(np.abs(statistic), 2)})
    else:
        diff_data9.append({'feature': col,'statistic':'No significant difference'})


# Put the differences into a dataframe
diff_df9 = pd.DataFrame(diff_data9)
# The 'statistic' column can hold both floats and the placeholder string, and
# sorting such a column directly raises TypeError (str vs. float comparison).
# Order by the numeric view instead: placeholders become NaN and go last,
# flagged features come first with the largest statistic on top.
diff_df9 = diff_df9.loc[
    pd.to_numeric(diff_df9['statistic'], errors='coerce')
    .sort_values(ascending=False)
    .index
]

diff_df9

100%|██████████| 58/58 [00:00<00:00, 509.34it/s]


Unnamed: 0,feature,statistic
0,wgEncodeBroadHmmHuvecHMM.count,No significant difference
43,Cervix - Endocervix_GTExTPM,No significant difference
31,Adipose - Subcutaneous_GTExTPM,No significant difference
32,Adrenal Gland_GTExTPM,No significant difference
33,Artery - Aorta_GTExTPM,No significant difference
34,Artery - Coronary_GTExTPM,No significant difference
35,Artery - Tibial_GTExTPM,No significant difference
36,Bladder_GTExTPM,No significant difference
37,Brain - Amygdala_GTExTPM,No significant difference
38,Brain - Anterior cingulate cortex (BA24)_GTExTPM,No significant difference


In [26]:
# Compare the training and validation parts of fold 10 feature by feature.
# Flagged features get their p-value and KS statistic; every other feature
# gets a 'No significant difference' placeholder row.
threshold = 0.1
diff_data10 = []
# Iterate over this fold's own columns (train10) so the loop never depends on
# another fold's schema.
for col in tqdm(train10.columns):
    # Two-sample Kolmogorov-Smirnov test on this feature: train vs. validation.
    statistic, pvalue = ks_2samp(
        train10[col].values,
        test10[col].values
    )
    if pvalue <= 0.01 and np.abs(statistic) > threshold:
        diff_data10.append({'feature': col, 'p': np.round(pvalue, 5), 'statistic': np.round(np.abs(statistic), 2)})
    else:
        diff_data10.append({'feature': col,'statistic':'No significant difference'})

# Put the differences into a dataframe
diff_df10 = pd.DataFrame(diff_data10)
# The 'statistic' column can hold both floats and the placeholder string, and
# sorting such a column directly raises TypeError (str vs. float comparison).
# Order by the numeric view instead: placeholders become NaN and go last,
# flagged features come first with the largest statistic on top.
diff_df10 = diff_df10.loc[
    pd.to_numeric(diff_df10['statistic'], errors='coerce')
    .sort_values(ascending=False)
    .index
]

diff_df10

100%|██████████| 58/58 [00:00<00:00, 514.01it/s]


Unnamed: 0,feature,statistic
0,wgEncodeBroadHmmHuvecHMM.count,No significant difference
43,Cervix - Endocervix_GTExTPM,No significant difference
31,Adipose - Subcutaneous_GTExTPM,No significant difference
32,Adrenal Gland_GTExTPM,No significant difference
33,Artery - Aorta_GTExTPM,No significant difference
34,Artery - Coronary_GTExTPM,No significant difference
35,Artery - Tibial_GTExTPM,No significant difference
36,Bladder_GTExTPM,No significant difference
37,Brain - Amygdala_GTExTPM,No significant difference
38,Brain - Anterior cingulate cortex (BA24)_GTExTPM,No significant difference


In [27]:
# Stack the flagged-feature tables of folds 1, 2 and 7 row-wise into a single
# frame; the original row indices of each table are kept.
frames = [
    diff_df1,
    diff_df2,
    diff_df7,
]
result = pd.concat(frames, axis=0)

In [28]:
# Keep one row per feature name (the first occurrence in `result` wins) and
# show the surviving feature names as a plain Python list.
result_df = result.drop_duplicates(subset='feature', keep='first')
result_df['feature'].tolist()

['Brain - Amygdala_GTExTPM',
 'Kidney - Cortex_GTExTPM',
 'Liver_GTExTPM',
 'Adrenal Gland_GTExTPM',
 'Testis_GTExTPM',
 'Pancreas_GTExTPM',
 'Kidney - Medulla_GTExTPM',
 'Muscle - Skeletal_GTExTPM',
 'Brain - Hypothalamus_GTExTPM',
 'Brain - Caudate (basal ganglia)_GTExTPM']

In [30]:
# Remove the features listed below from the imputed training data and write
# the result to a new CSV file.
df = pd.read_csv('imputed_training_data.csv')

# Same feature names as the list displayed by the previous cell.
drifting_features = [
    'Brain - Amygdala_GTExTPM',
    'Kidney - Cortex_GTExTPM',
    'Liver_GTExTPM',
    'Adrenal Gland_GTExTPM',
    'Testis_GTExTPM',
    'Pancreas_GTExTPM',
    'Kidney - Medulla_GTExTPM',
    'Muscle - Skeletal_GTExTPM',
    'Brain - Hypothalamus_GTExTPM',
    'Brain - Caudate (basal ganglia)_GTExTPM',
]

# `axis` is passed by keyword: a positional `1` is deprecated in pandas 1.3+
# and rejected by pandas 2.0, where every argument after `labels` is
# keyword-only. errors='ignore' skips names that are not present in the file.
df = df.drop(drifting_features, axis=1, errors='ignore')
df.to_csv('cleaned_imputed_training_data.csv', index=False)