## Imports

In [28]:
import pandas as pd
import numpy as np
import scipy as sp
import statsmodels.stats.multitest as smm

## Data preprocessing

In [4]:
data = pd.read_csv("gene_high_throughput_sequencing (1).csv")
data

Unnamed: 0,Patient_id,Diagnosis,LOC643837,LOC100130417,SAMD11,NOC2L,KLHL17,PLEKHN1,C1orf170,HES4,...,CLIC2,RPS4Y1,ZFY,PRKY,USP9Y,DDX3Y,CD24,CYorf15B,KDM5D,EIF1AY
0,STT5425_Breast_001_normal,normal,1.257614,2.408148,13.368622,9.494779,20.880435,12.722017,9.494779,54.349694,...,4.761250,1.257614,1.257614,1.257614,1.257614,1.257614,23.268694,1.257614,1.257614,1.257614
1,STT5427_Breast_023_normal,normal,4.567931,16.602734,42.477752,25.562376,23.221137,11.622386,14.330573,72.445474,...,6.871902,1.815112,1.815112,1.815112,1.815112,1.815112,10.427023,1.815112,1.815112,1.815112
2,STT5430_Breast_002_normal,normal,2.077597,3.978294,12.863214,13.728915,14.543176,14.141907,6.232790,57.011005,...,7.096343,2.077597,2.077597,2.077597,2.077597,2.077597,22.344226,2.077597,2.077597,2.077597
3,STT5439_Breast_003_normal,normal,2.066576,8.520713,14.466035,7.823932,8.520713,2.066576,10.870009,53.292034,...,5.200770,2.066576,2.066576,2.066576,2.066576,2.066576,49.295538,2.066576,2.066576,2.066576
4,STT5441_Breast_004_normal,normal,2.613616,3.434965,12.682222,10.543189,26.688686,12.484822,1.364917,67.140393,...,11.227770,1.364917,1.364917,1.364917,1.364917,1.364917,23.627911,1.364917,1.364917,1.364917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,STT5750_Breast_021_DCIS,cancer,1.466240,3.689955,21.331981,19.359546,23.505609,11.576176,11.325851,80.572057,...,2.807635,1.466240,2.807635,1.466240,2.807635,3.689955,18.449777,1.466240,1.466240,1.466240
68,STT5751_Breast_021_IDC,cancer,2.492410,12.279444,30.087759,17.703926,26.064890,10.707081,11.520120,65.041865,...,2.492410,2.492410,1.301619,3.275668,1.301619,1.301619,8.601194,1.301619,1.301619,1.301619
69,STT5758_Breast_022_IDC,cancer,3.530477,20.006038,53.924651,25.449565,30.949995,16.794696,17.121366,70.958462,...,1.176826,1.176826,1.176826,2.961613,1.176826,1.176826,10.764365,1.176826,1.176826,1.176826
70,STT5763_Breast_022_IDC,cancer,3.733734,8.860505,32.538666,21.585069,24.987992,11.460224,11.201202,67.767125,...,2.840946,1.483635,1.483635,1.483635,1.483635,1.483635,8.522837,1.483635,1.483635,1.483635


In [5]:
COLS = data.columns.tolist()[2:]
ALPHA = 0.05

## Split data to 3 datasets for each diagnosis

In [31]:
normal = {}
data_normal = data[data['Diagnosis'] == 'normal'].drop(['Patient_id', 'Diagnosis'], axis=1)

for column in data_normal.columns:
    normal[column] = data_normal[column].tolist()


early_neoplasia = {}
data_en = data[data['Diagnosis'] == 'early neoplasia'].drop(['Patient_id', 'Diagnosis'], axis=1)

for column in data_en.columns:
    early_neoplasia[column] = data_en[column].tolist()


cancer = {}
data_cancer = data[data['Diagnosis'] == 'cancer'].drop(['Patient_id', 'Diagnosis'], axis=1)

for column in data_cancer.columns:
    cancer[column] = data_cancer[column].tolist()

In [102]:
normal['LOC643837']

[1.257614161,
 4.567931059,
 2.077596519,
 2.066576443,
 2.613616279,
 3.94227485,
 1.084113356,
 3.15389961,
 2.551799926,
 3.693128348,
 3.55822218,
 0.938061115,
 1.003451076,
 7.364879258,
 2.561870939,
 2.971204691,
 2.871766653,
 1.045382387,
 1.801865438,
 3.515833847,
 2.23457622,
 4.717821882,
 1.474173506,
 1.282995362]

### Make t-tests for each group

In [81]:
ttest_pvalue_normal_early = []
for item in COLS:
    ttest_pvalue_normal_early.append(float(sp.stats.ttest_ind(normal[item], early_neoplasia[item], alternative='two-sided').pvalue))

ttest_pvalue_normal_early

[0.6905974705672382,
 3.270551648843383e-05,
 0.05845490875510862,
 0.8259844327287174,
 0.05039072452778555,
 0.14377106032135595,
 0.328719363345765,
 0.023921314374425896,
 0.2400002541058592,
 0.03866311260288654,
 0.02044628965233119,
 0.009818670687346799,
 0.7118113326560942,
 0.17519972600209976,
 0.38061939592111094,
 0.15775892653695853,
 0.09761248046263993,
 0.0063701077127594746,
 0.748325148992405,
 0.13481698630693562,
 0.17742185039907635,
 0.5827319134696256,
 0.8101157602385761,
 0.9612724307882259,
 0.6368685748218723,
 0.6201522577392664,
 0.43211341879733745,
 0.5842580027685232,
 0.7334189750757707,
 0.24506590444548845,
 0.22233741652321393,
 0.010012567807576023,
 0.4863190224529448,
 0.17905071969583883,
 0.4297351902725317,
 0.4015786923571075,
 0.9696117294129436,
 0.697392417904596,
 0.09757839979280732,
 0.4656123628900103,
 0.10642536446157672,
 0.31964229347221157,
 0.4147186915632016,
 0.4572647317888636,
 0.6958211447926623,
 0.023428533589542763,
 0.00

In [82]:
ttest_pvalue_early_cancer = []
for item in COLS:
    ttest_pvalue_early_cancer.append(float(sp.stats.ttest_ind(early_neoplasia[item], cancer[item], alternative='two-sided').pvalue))

ttest_pvalue_early_cancer

[0.41676374995320964,
 0.6500720940141129,
 0.06748410223471249,
 0.28764410481581915,
 0.47072018491497103,
 0.006795973978412702,
 0.48002310194383513,
 0.5688998464954522,
 0.0001823416669113017,
 0.7143951350735523,
 0.21336363512896586,
 0.13128953561911996,
 0.0074914347843940444,
 0.0019602677786270052,
 0.08773888981775227,
 0.9000166196382243,
 0.02769985877552699,
 0.00030346546200162017,
 0.21675516717050117,
 0.2699929727105205,
 0.7029029915001161,
 0.0329683401367385,
 0.8620132714485453,
 0.1041513194224889,
 0.29993190871168496,
 0.7340173802186681,
 0.3068660867313154,
 0.02769629020478724,
 0.034778677512099745,
 0.003222538590589672,
 0.16694872687945272,
 0.1523660035841487,
 0.3882545146433585,
 0.5810175371517001,
 0.23738701453105493,
 0.003902806183389724,
 0.05030331806890033,
 0.012714669377723961,
 0.04488517567640983,
 0.6289734816872184,
 0.7422623617632984,
 0.6432477086848707,
 0.7468118361040811,
 0.004423784554065988,
 0.0012585617012045142,
 0.09924076

### Count elements with statistical significance level less than p-value


In [83]:
counter_pval_normal_early = 0
for i in range(len(ttest_pvalue_normal_early)):
    if ttest_pvalue_normal_early[i] <= 0.025:
        counter_pval_normal_early += 1

counter_pval_normal_early

1583

In [84]:
counter_pval_early_cancer = 0
for i in range(len(ttest_pvalue_early_cancer)):
    if ttest_pvalue_early_cancer[i] <= 0.025:
        counter_pval_early_cancer += 1

counter_pval_early_cancer

3529

In [99]:
holm_normal_early = smm.multipletests(ttest_pvalue_normal_early, alpha=0.975, method='holm')
holm_early_cancer = smm.multipletests(ttest_pvalue_early_cancer, alpha=0.975, method='holm')
hne = holm_normal_early[0].tolist()
hec = holm_early_cancer[0].tolist()

доделать

In [100]:
for i in hne:
    if hne[i] == True:
        print(123)

123
123
123
123
123
123
123
123
123
123
123
123
123
123
123
123
123
123
123
123
123
123
123


In [98]:
for i in hec:
    if hec[i] == True:
        print(123)