In [1]:
import numpy as np
import pandas as pd

In [3]:
# Read the sequencing table from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='gene_high_throughput_sequencing.csv')
df.head(n=5)

Unnamed: 0,Patient_id,Diagnosis,LOC643837,LOC100130417,SAMD11,NOC2L,KLHL17,PLEKHN1,C1orf170,HES4,...,CLIC2,RPS4Y1,ZFY,PRKY,USP9Y,DDX3Y,CD24,CYorf15B,KDM5D,EIF1AY
0,STT5425_Breast_001_normal,normal,1.257614,2.408148,13.368622,9.494779,20.880435,12.722017,9.494779,54.349694,...,4.76125,1.257614,1.257614,1.257614,1.257614,1.257614,23.268694,1.257614,1.257614,1.257614
1,STT5427_Breast_023_normal,normal,4.567931,16.602734,42.477752,25.562376,23.221137,11.622386,14.330573,72.445474,...,6.871902,1.815112,1.815112,1.815112,1.815112,1.815112,10.427023,1.815112,1.815112,1.815112
2,STT5430_Breast_002_normal,normal,2.077597,3.978294,12.863214,13.728915,14.543176,14.141907,6.23279,57.011005,...,7.096343,2.077597,2.077597,2.077597,2.077597,2.077597,22.344226,2.077597,2.077597,2.077597
3,STT5439_Breast_003_normal,normal,2.066576,8.520713,14.466035,7.823932,8.520713,2.066576,10.870009,53.292034,...,5.20077,2.066576,2.066576,2.066576,2.066576,2.066576,49.295538,2.066576,2.066576,2.066576
4,STT5441_Breast_004_normal,normal,2.613616,3.434965,12.682222,10.543189,26.688686,12.484822,1.364917,67.140393,...,11.22777,1.364917,1.364917,1.364917,1.364917,1.364917,23.627911,1.364917,1.364917,1.364917


In [4]:
# Dimensions of the loaded table as a (rows, columns) tuple.
df.shape

(72, 15750)

# Часть 1: применение t-критерия Стьюдента
В первой части вам нужно будет применить критерий Стьюдента для проверки гипотезы о равенстве средних в двух независимых выборках. Применить критерий для каждого гена нужно будет дважды:
1. для групп normal (control) и early neoplasia (treatment)
1. для групп early neoplasia (control) и cancer (treatment)

В качестве ответа в этой части задания необходимо указать количество статистически значимых отличий, которые вы нашли с помощью t-критерия Стьюдента, то есть число генов, у которых p-value этого теста оказался меньше уровня значимости α = 0.05.

In [5]:
# Number of samples per diagnosis label.
df.loc[:, 'Diagnosis'].value_counts()

early neoplasia    25
normal             24
cancer             23
Name: Diagnosis, dtype: int64

In [87]:
# One frame per diagnosis group, with the two non-numeric descriptor columns
# removed so that only the expression columns remain.
normal_df = df.loc[df['Diagnosis'] == 'normal'].drop(columns=['Patient_id', 'Diagnosis'])
early_neoplasia_df = df.loc[df['Diagnosis'] == 'early neoplasia'].drop(columns=['Patient_id', 'Diagnosis'])
cancer_df = df.loc[df['Diagnosis'] == 'cancer'].drop(columns=['Patient_id', 'Diagnosis'])
early_neoplasia_df.shape, normal_df.shape, cancer_df.shape

((25, 15748), (24, 15748), (23, 15748))

In [94]:
from scipy.stats import ttest_ind

# Two-sample t-test without the equal-variance assumption, run column-wise:
# normal samples against early-neoplasia samples.
normal_early_ttest_statistic, normal_early_ttest_pvalue = ttest_ind(
    normal_df,
    early_neoplasia_df,
    equal_var=False,
)

In [95]:
# Count of columns whose uncorrected p-value is below 0.05, kept as text
# because it is written to an answer file afterwards.
ans_1 = str(np.count_nonzero(normal_early_ttest_pvalue < 0.05))
ans_1

'1575'

In [96]:
# Persist the first answer as plain text (no trailing newline).
with open('ans_1', mode='w') as answer_file:
    answer_file.write(ans_1)

In [97]:
# Two-sample t-test without the equal-variance assumption, run column-wise:
# early-neoplasia samples against cancer samples.
early_cancer_ttest_statistic, early_cancer_ttest_pvalue = ttest_ind(
    early_neoplasia_df,
    cancer_df,
    equal_var=False,
)
# Count of columns whose uncorrected p-value is below 0.05, stored as text
# and written to the answer file (no trailing newline).
ans_2 = str(np.count_nonzero(early_cancer_ttest_pvalue < 0.05))
with open('ans_2', mode='w') as answer_file:
    answer_file.write(ans_2)
ans_2

'3490'

In [98]:
import statsmodels.stats.multitest as smm

In [78]:
def fold_change(treatment, control):
    """Return the signed fold change of ``treatment`` relative to ``control``.

    For each pair ``(t, c)`` the result is ``t / c`` when ``t > c`` and
    ``-c / t`` otherwise, so equal values yield ``-1.0`` and the magnitude is
    the ratio of the larger to the smaller value when both are positive.

    Parameters
    ----------
    treatment, control : array-like of numbers
        Values to compare element-wise. They are converted to float arrays and
        combined under NumPy broadcasting rules, so a scalar may be passed for
        either argument. Shapes that cannot be broadcast raise ``ValueError``
        instead of being silently truncated to the shorter input.

    Returns
    -------
    numpy.ndarray
        Float array of signed fold changes. Division by zero follows IEEE
        rules (``inf`` / ``-inf`` / ``nan``) rather than raising.
    """
    t = np.asarray(treatment, dtype=float)
    c = np.asarray(control, dtype=float)
    # np.where evaluates both ratios for every element, including the one that
    # is discarded, so divide/invalid warnings are suppressed here.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(t > c, t / c, -c / t)

In [100]:
# Holm correction of the normal vs. early-neoplasia p-values.
# NOTE(review): alpha=0.025 is presumably 0.05 split over the two group
# comparisons — confirm against the assignment text.
normal_early_reject, normal_early_pvalues_corrected, _, _ = smm.multipletests(
    pvals=normal_early_ttest_pvalue,
    alpha=0.025,
    method='holm',
)

In [101]:
# Fold change of the per-column means (early neoplasia over normal), restricted
# to the columns selected by the `normal_early_reject` mask.
normal_early_fc = fold_change(
    treatment=early_neoplasia_df.loc[:, normal_early_reject].mean(axis=0),
    control=normal_df.loc[:, normal_early_reject].mean(axis=0),
)
# Count the selected columns whose fold change exceeds 1.5 in absolute value
# and write that count to the answer file.
normal_early_num = np.sum(np.abs(normal_early_fc) > 1.5)
with open('ans_3', mode='w') as answer_file:
    answer_file.write(str(normal_early_num))
normal_early_num

2

In [102]:
# Holm correction of the early-neoplasia vs. cancer p-values.
# NOTE(review): alpha=0.025 is presumably 0.05 split over the two group
# comparisons — confirm against the assignment text.
early_cancer_reject, early_cancer_pvalues_corrected, _, _ = smm.multipletests(
    pvals=early_cancer_ttest_pvalue,
    alpha=0.025,
    method='holm',
)

In [103]:
# Fold change of the per-column means (cancer over early neoplasia), restricted
# to the columns selected by the `early_cancer_reject` mask.
early_cancer_fc = fold_change(
    treatment=cancer_df.loc[:, early_cancer_reject].mean(axis=0),
    control=early_neoplasia_df.loc[:, early_cancer_reject].mean(axis=0),
)
# Count the selected columns whose fold change exceeds 1.5 in absolute value
# and write that count to the answer file.
early_cancer_num = np.sum(np.abs(early_cancer_fc) > 1.5)
with open('ans_4', mode='w') as answer_file:
    answer_file.write(str(early_cancer_num))
early_cancer_num

77

In [105]:
# Benjamini-Hochberg (FDR) correction of the normal vs. early-neoplasia
# p-values. This rebinds `normal_early_reject` and
# `normal_early_pvalues_corrected`, so any earlier values under those names
# are replaced.
normal_early_reject, normal_early_pvalues_corrected, _, _ = smm.multipletests(
    pvals=normal_early_ttest_pvalue,
    alpha=0.025,
    method='fdr_bh',
)

In [106]:
# Fold change of the per-column means (early neoplasia over normal), restricted
# to the columns selected by the current `normal_early_reject` mask.
normal_early_fc = fold_change(
    treatment=early_neoplasia_df.loc[:, normal_early_reject].mean(axis=0),
    control=normal_df.loc[:, normal_early_reject].mean(axis=0),
)
# Count the selected columns whose fold change exceeds 1.5 in absolute value
# and write that count to the answer file.
normal_early_num = np.sum(np.abs(normal_early_fc) > 1.5)
with open('ans_5', mode='w') as answer_file:
    answer_file.write(str(normal_early_num))
normal_early_num

4

In [107]:
# Benjamini-Hochberg (FDR) correction of the early-neoplasia vs. cancer
# p-values. This rebinds `early_cancer_reject` and
# `early_cancer_pvalues_corrected`, so any earlier values under those names
# are replaced.
early_cancer_reject, early_cancer_pvalues_corrected, _, _ = smm.multipletests(
    pvals=early_cancer_ttest_pvalue,
    alpha=0.025,
    method='fdr_bh',
)

In [108]:
# Fold change of the per-column means (cancer over early neoplasia), restricted
# to the columns selected by the current `early_cancer_reject` mask.
early_cancer_fc = fold_change(
    treatment=cancer_df.loc[:, early_cancer_reject].mean(axis=0),
    control=early_neoplasia_df.loc[:, early_cancer_reject].mean(axis=0),
)
# Count the selected columns whose fold change exceeds 1.5 in absolute value
# and write that count to the answer file.
early_cancer_num = np.sum(np.abs(early_cancer_fc) > 1.5)
with open('ans_6', mode='w') as answer_file:
    answer_file.write(str(early_cancer_num))
early_cancer_num

524