## z检验

In [1]:
import pandas as pd
import statsmodels.stats.weightstats as sw

In [6]:
# 针对df里面的患病和对照组进行z检验
def generate_z_value(series, cols):
    result = []
    for col in cols:
        key = col.split('|')[0]
        val = col.split('|')[1]
        if str(series[key]) != val:
            return 0
    return 1
        
        
def z_test(df, cols):
    case_df = df[df['caco'] == 1]
    control_df = df[df['caco'] == 0]
    case_arr = case_df.apply(generate_z_value, axis=1, args=(cols,)).tolist()
    control_arr = control_df.apply(generate_z_value, axis=1, args=(cols,)).tolist()
    x, p_value = sw.ztest(case_arr, control_arr, value=0)
    return p_value

def get_all_df():
    df = pd.read_csv('1.csv')
    df = df.dropna()
    df1 = pd.read_csv('2.csv')
    df1 = df1.dropna()
    final_df = pd.concat([df, df1], ignore_index=True)
    #######开始剔除错误数据###########
    final_case_df = final_df[(final_df['caco'] == 1) & (final_df['HL'] >= 40)]
    final_control_df = final_df[(final_df['caco'] == 0) & (final_df['HL']) < 40]
    final_df = pd.concat([final_case_df, final_control_df], ignore_index=True)
    final_df = final_df[final_df['age']>10]
    return final_df
def get_all_cols():
    result = []
    result.append(['rs1200137|C/C', 'rs1200130|C/C'])
    result.append(['rs1678690|G/G'])
    result.append(['smoke|1'])
    result.append(['rs1200130|C/C', 'rs1358714|A/A'])
    result.append(['rs1200130|C/C', 'rs17412009|C/C'])
    result.append(['rs2070703|C/C', 'rs1200130|C/C'])
    result.append(['rs34996498|A/A', 'rs1200130|C/C'])
    result.append(['rs3766031|C/C', 'rs1200130|C/C'])
    result.append(['rs6458080|T/T', 'rs1200130|C/C'])
    result.append(['rs1200135|G/G', 'rs17412009|C/C'])
    result.append(['rs1200135|G/G', 'rs34996498|A/A'])
    result.append(['rs1200135|G/G', 'rs34996498|A/A'])
    result.append(['rs1200137|C/C', 'rs6458080|T/T'])
    result.append(['rs159153|T/T', 'rs1358714|A/A'])
    result.append(['rs17412009|C/C', 'rs1358714|A/A'])
    result.append(['rs34996498|A/A', 'rs1358714|A/A'])
    result.append(['rs6458080|T/T', 'rs159153|T/T'])
    result.append(['rs1678690|G/G', 'rs6458080|T/T'])
    result.append(['rs1678690|G/G', 'sex|1'])
    result.append(['rs34996498|A/A', 'rs17412009|C/C'])
    result.append(['rs6458080|T/T', 'rs17412009|C/C'])
    result.append(['rs6458080|T/T', 'rs34996498|A/A'])
    result.append(['smoke|1', 'sex|1'])
    result.append(['rs1200137|C/C', 'rs1200130|C/C', 'sex|1'])
    result.append(['rs159153|T/T', 'rs1200130|C/C', 'sex|1'])
    result.append(['rs17412009|C/C', 'rs1200130|C/C', 'sex|1'])
    result.append(['rs34996498|A/A', 'rs1200130|C/C', 'sex|1'])
    result.append(['rs3766031|C/C', 'rs1200130|C/C', 'sex|1'])
    result.append(['rs1200137|C/C', 'rs1200135|G/G', 'rs1358714|A/A'])
    result.append(['rs1200137|C/C', 'rs1200135|G/G', 'rs17412009|C/C'])
    result.append(['rs1200137|C/C', 'rs1200135|G/G', 'rs34996498|A/A'])
    result.append(['rs1200135|G/G', 'rs17412009|C/C', 'rs1358714|A/A'])
    result.append(['rs1200135|G/G', 'rs34996498|A/A', 'rs1358714|A/A'])
    result.append(['rs1200137|C/C', 'rs17412009|C/C', 'rs1358714|A/A'])
    result.append(['rs1200137|C/C', 'rs34996498|A/A', 'rs1358714|A/A'])
    result.append(['rs1200137|C/C', 'rs17412009|C/C', 'sex|1'])
    result.append(['rs1200137|C/C', 'rs34996498|A/A', 'sex|1'])
    result.append(['rs17412009|C/C', 'rs159153|T/T', 'sex|1'])
    result.append(['rs1678690|G/G', 'rs6458080|T/T', 'sex|1'])
    result.append(['rs34996498|A/A', 'rs17412009|C/C', 'sex|1'])
    result.append(['rs6458080|T/T', 'rs17412009|C/C', 'sex|1'])
    
    return result

def check_all_col():
    df = get_all_df()
    cols = get_all_cols()
    result = {}
    for col in cols:
        p_val = z_test(df, col)
        result[','.join(col)]=p_val
    return result
        
    
    
    
    

In [7]:
result = check_all_col()

In [8]:
result

{'rs1200137|C/C,rs1200130|C/C': 0.018860482871986328,
 'rs1678690|G/G': 0.02226742509689742,
 'smoke|1': 0.014546056299886078,
 'rs1200130|C/C,rs1358714|A/A': 0.0030657986843534387,
 'rs1200130|C/C,rs17412009|C/C': 0.011045033466155336,
 'rs2070703|C/C,rs1200130|C/C': 0.04515570938890023,
 'rs34996498|A/A,rs1200130|C/C': 0.15384892091312313,
 'rs3766031|C/C,rs1200130|C/C': 0.4200920160907612,
 'rs6458080|T/T,rs1200130|C/C': 0.0014769630356689701,
 'rs1200135|G/G,rs17412009|C/C': 0.0771160783156328,
 'rs1200135|G/G,rs34996498|A/A': 0.06874008871431125,
 'rs1200137|C/C,rs6458080|T/T': 0.01747913632774771,
 'rs159153|T/T,rs1358714|A/A': 0.07923526868204167,
 'rs17412009|C/C,rs1358714|A/A': 0.01393554393429058,
 'rs34996498|A/A,rs1358714|A/A': 0.04202475804249598,
 'rs6458080|T/T,rs159153|T/T': 0.1094964342377722,
 'rs1678690|G/G,rs6458080|T/T': 0.010428545708606154,
 'rs1678690|G/G,sex|1': 0.005245973944072859,
 'rs34996498|A/A,rs17412009|C/C': 0.026233885767950226,
 'rs6458080|T/T,rs1741

In [9]:
keys = result.keys()
vals = result.values()

In [11]:
vals

dict_values([0.018860482871986328, 0.02226742509689742, 0.014546056299886078, 0.0030657986843534387, 0.011045033466155336, 0.04515570938890023, 0.15384892091312313, 0.4200920160907612, 0.0014769630356689701, 0.0771160783156328, 0.06874008871431125, 0.01747913632774771, 0.07923526868204167, 0.01393554393429058, 0.04202475804249598, 0.1094964342377722, 0.010428545708606154, 0.005245973944072859, 0.026233885767950226, 3.705923419296605e-05, 0.05824294690138939, 0.019449528307333534, 0.020023932096742452, 0.09865893936882028, 0.008883463550096785, 0.15827121014713497, 0.24114486262097934, 0.3224246322230421, 0.03564179203038194, 0.05185877446986129, 0.0737835141193394, 0.0740677046865437, 0.024036500438045477, 0.059148407939953386, 0.02185352926243806, 0.11958623545161766, 0.09067174394555386, 0.003092685191573518, 0.027933480205552096, 2.710742336842838e-05])

In [12]:
df = pd.DataFrame.from_dict({'interaction_snp':keys, 'p_value':vals})

In [15]:
df.head()

Unnamed: 0,interaction_snp,p_value
0,"rs1200137|C/C,rs1200130|C/C",0.01886
1,rs1678690|G/G,0.022267
2,smoke|1,0.014546
3,"rs1200130|C/C,rs1358714|A/A",0.003066
4,"rs1200130|C/C,rs17412009|C/C",0.011045


In [None]:
df.to_csv('snp_ztest_pvalue.csv',index=False)