In [1]:
import numpy as np
import pandas as pd
import sys

from statsmodels.stats import multitest as smm
from scipy.stats import ttest_ind

sys.path.append('.\\..\\..\\..\\..')

from src.utils.Utils import save_answer

In [2]:
data = pd.read_csv('data/gene_high_throughput_sequencing.csv')
data.head()

Unnamed: 0,Patient_id,Diagnosis,LOC643837,LOC100130417,SAMD11,NOC2L,KLHL17,PLEKHN1,C1orf170,HES4,...,CLIC2,RPS4Y1,ZFY,PRKY,USP9Y,DDX3Y,CD24,CYorf15B,KDM5D,EIF1AY
0,STT5425_Breast_001_normal,normal,1.257614,2.408148,13.368622,9.494779,20.880435,12.722017,9.494779,54.349694,...,4.76125,1.257614,1.257614,1.257614,1.257614,1.257614,23.268694,1.257614,1.257614,1.257614
1,STT5427_Breast_023_normal,normal,4.567931,16.602734,42.477752,25.562376,23.221137,11.622386,14.330573,72.445474,...,6.871902,1.815112,1.815112,1.815112,1.815112,1.815112,10.427023,1.815112,1.815112,1.815112
2,STT5430_Breast_002_normal,normal,2.077597,3.978294,12.863214,13.728915,14.543176,14.141907,6.23279,57.011005,...,7.096343,2.077597,2.077597,2.077597,2.077597,2.077597,22.344226,2.077597,2.077597,2.077597
3,STT5439_Breast_003_normal,normal,2.066576,8.520713,14.466035,7.823932,8.520713,2.066576,10.870009,53.292034,...,5.20077,2.066576,2.066576,2.066576,2.066576,2.066576,49.295538,2.066576,2.066576,2.066576
4,STT5441_Breast_004_normal,normal,2.613616,3.434965,12.682222,10.543189,26.688686,12.484822,1.364917,67.140393,...,11.22777,1.364917,1.364917,1.364917,1.364917,1.364917,23.627911,1.364917,1.364917,1.364917


In [3]:
data_normal = data[data['Diagnosis'] == 'normal']
data_neoplasia  = data[data['Diagnosis'] == 'early neoplasia']
data_cancer  = data[data['Diagnosis'] == 'cancer']
data.shape

(72, 15750)

In [4]:
def count_stat(data1, data2):
    stat = []
    for c in data.columns[2:]:
        stat.append(ttest_ind(data1[c], data2[c], equal_var = False))
    
    return stat

In [5]:
answer1_1 = len([t for t in count_stat(data_normal, data_neoplasia) if t.pvalue < 0.05])
answer1_2 = len([t for t in count_stat(data_neoplasia, data_cancer) if t.pvalue < 0.05])

save_answer("1_1", answer1_1)
save_answer("1_2", answer1_2)


Task №1_1 answer 1575


Task №1_2 answer 3490



In [6]:
def fc(c, t) :
    if t > c :
        r = t/c
    else :
        r =  c/t
    return r

In [7]:
def getPreparedDF(data1, data2, method):
    stat = {"pvalue": [t.pvalue for t in count_stat(data1, data2)], "column": data.columns[2:]}

    df = pd.DataFrame(stat)

    rej, pval_corr = smm.multipletests(df['pvalue'], alpha=0.025, method=method)[:2]
    df['rejected'] = rej
    df['pval_corr'] = pval_corr
    
    return df

In [8]:
def addFC(df, mean1, mean2):
    fc_list = []

    for c in df.column:
        fc_list.append(fc(mean1[c], mean2[c]))

    df['FC'] = fc_list
    return df

In [9]:
def getAnswers(data1, data2):
    len1 = data1[data1['rejected']][data1['FC'] > 1.5].shape[0]
    len2 = data2[data2['rejected']][data2['FC'] > 1.5].shape[0]
    
    return len1, len2

In [10]:
mean_norm = data_normal.mean(axis=0)
mean_neoplasia = data_neoplasia.mean(axis=0)
mean_canser = data_cancer.mean(axis=0)

In [11]:
data_norm_neoplasia = getPreparedDF(data_normal, data_neoplasia, 'h')
data_norm_neoplasia = addFC(data_norm_neoplasia, mean_neoplasia, mean_norm)
data_norm_neoplasia.head()

Unnamed: 0,column,pvalue,rejected,pval_corr,FC
0,LOC643837,0.690766,False,1.0,1.067858
1,LOC100130417,3.2e-05,False,0.500174,1.996517
2,SAMD11,0.060273,False,1.0,1.222418
3,NOC2L,0.826429,False,1.0,1.02007
4,KLHL17,0.049876,False,1.0,1.125471


In [12]:
data_neoplasia_canser = getPreparedDF(data_neoplasia, data_cancer, 'h')
data_neoplasia_canser = addFC(data_neoplasia_canser, mean_canser, mean_neoplasia)
data_neoplasia_canser.head()

Unnamed: 0,column,pvalue,rejected,pval_corr,FC
0,LOC643837,0.413735,False,1.0,1.148593
1,LOC100130417,0.653429,False,1.0,1.064871
2,SAMD11,0.079556,False,1.0,1.278517
3,NOC2L,0.287581,False,1.0,1.092635
4,KLHL17,0.463292,False,1.0,1.036574


In [13]:
ans1, ans2 = getAnswers(data_norm_neoplasia, data_neoplasia_canser)

save_answer("3_1", ans1)
save_answer("3_2", ans2)


Task №3_1 answer 2


Task №3_2 answer 77



  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [14]:
data_norm_neoplasia = getPreparedDF(data_normal, data_neoplasia, 'fdr_bh')
data_norm_neoplasia = addFC(data_norm_neoplasia, mean_neoplasia, mean_norm)
data_norm_neoplasia.head()

Unnamed: 0,column,pvalue,rejected,pval_corr,FC
0,LOC643837,0.690766,False,0.966511,1.067858
1,LOC100130417,3.2e-05,False,0.035698,1.996517
2,SAMD11,0.060273,False,0.536103,1.222418
3,NOC2L,0.826429,False,0.980777,1.02007
4,KLHL17,0.049876,False,0.499016,1.125471


In [15]:
data_neoplasia_canser = getPreparedDF(data_neoplasia, data_cancer, 'fdr_bh')
data_neoplasia_canser = addFC(data_neoplasia_canser, mean_canser, mean_neoplasia)
data_neoplasia_canser.head()

Unnamed: 0,column,pvalue,rejected,pval_corr,FC
0,LOC643837,0.413735,False,0.675195,1.148593
1,LOC100130417,0.653429,False,0.836406,1.064871
2,SAMD11,0.079556,False,0.288873,1.278517
3,NOC2L,0.287581,False,0.563007,1.092635
4,KLHL17,0.463292,False,0.712214,1.036574


In [16]:
ans1, ans2 = getAnswers(data_norm_neoplasia, data_neoplasia_canser)

save_answer("3_1", ans1)
save_answer("3_2", ans2)


Task №3_1 answer 4


Task №3_2 answer 524



  from ipykernel import kernelapp as app
  app.launch_new_instance()
