In [1]:
import numpy as np
import csv
import sys
from scipy.special import comb
import pandas as pd

from mutation_info import *



In [4]:
# Analysis settings and per-antigen bookkeeping of how many model terms exist at each order.

# epistasis flavour: 'biochem' (biochemical) or 'stat' (statistical)
ep_type = 'biochem'
#ep_type = 'stat'

# highest interaction order considered for each antigen
order_H1, order_H3, order_B = 5, 4, 1

# For every antigen the same three quantities are derived:
#   num_term_list_*       -- C(num_muts, k) for k = 1..order, i.e. the number of terms per order
#   total_params_*        -- the sum of those counts
#   order_start_indices_* -- a leading 1 followed by (cumulative term counts + 1)
# NOTE(review): num_muts_H1 / num_muts_H3 / num_muts_B are not defined in the visible cells;
# presumably they come from `mutation_info` -- confirm.

# --- H1
num_term_list_H1 = np.array([int(comb(num_muts_H1, k)) for k in range(1, order_H1 + 1)])
total_params_H1 = sum(num_term_list_H1)
order_start_indices_H1 = [1, *(np.cumsum(num_term_list_H1) + 1)]
print(num_term_list_H1, total_params_H1)
print(order_start_indices_H1)

# --- H3
num_term_list_H3 = np.array([int(comb(num_muts_H3, k)) for k in range(1, order_H3 + 1)])
total_params_H3 = sum(num_term_list_H3)
order_start_indices_H3 = [1, *(np.cumsum(num_term_list_H3) + 1)]
print(num_term_list_H3, total_params_H3)
print(order_start_indices_H3)

# --- FluB
num_term_list_B = np.array([int(comb(num_muts_B, k)) for k in range(1, order_B + 1)])
total_params_B = sum(num_term_list_B)
order_start_indices_B = [1, *(np.cumsum(num_term_list_B) + 1)]
print(num_term_list_B, total_params_B)
print(order_start_indices_B)


[  16  120  560 1820 4368] 6884
[1, 17, 137, 697, 2517, 6885]
[ 13  78 286 715] 1092
[1, 14, 92, 378, 1093]
[8] 8
[1, 9]


In [5]:
# Load the Kd table and build, per antigen, a float genotype matrix and a phenotype vector.
# NOTE(review): H1_mutations, H3_mutations, B_mutations and the *_required_mutations lists
# are not defined in the visible cells; presumably they come from `mutation_info` -- confirm.

df = pd.read_csv(
    '../Kd_meanbin/kd_processed/20210427_HA_unadj_fil_merg.csv',
    dtype={"variant": str},
)

# 'variant' column for every row of the table, as a flat array
all_genos = df[['variant']].to_numpy().flatten()

# ---- H1: every row that has an h1_mean value
df_H1 = df.dropna(subset=['h1_mean'])
genos_H1 = np.array(df_H1[['pos'+site for site in H1_mutations]], dtype=np.float64)
phenos_H1 = df_H1['h1_mean'].to_numpy().flatten()
N_H1 = len(phenos_H1)
print(genos_H1.shape, phenos_H1.shape)

# ---- H3: rows with an h3_mean value that also carry every required mutation
# (the genotype matrix is then built from the H3_mutations columns only)
df_H3 = df.dropna(subset=['h3_mean'])
for mut in H3_required_mutations:
    has_required = df_H3['pos'+mut] == 1
    df_H3 = df_H3[has_required]

genos_H3 = np.array(df_H3[['pos'+site for site in H3_mutations]], dtype=np.float64)
phenos_H3 = df_H3['h3_mean'].to_numpy().flatten()
N_H3 = len(phenos_H3)
print(genos_H3.shape, phenos_H3.shape)

# ---- FluB: rows with a fluB_mean value that also carry every required mutation
# (the genotype matrix is then built from the B_mutations columns only)
df_B = df.dropna(subset=['fluB_mean'])
for mut in B_required_mutations:
    has_required = df_B['pos'+mut] == 1
    df_B = df_B[has_required]

genos_B = np.array(df_B[['pos'+site for site in B_mutations]], dtype=np.float64)
phenos_B = df_B['fluB_mean'].to_numpy().flatten()
N_B = len(phenos_B)
print(genos_B.shape, phenos_B.shape)




(65094, 16) (65094,)
(8192, 13) (8192,)
(254, 8) (254,)


In [23]:
# Read the fitted H1 model coefficients from file and assemble them into a dataframe.
#
# File layout as consumed below (tab-delimited):
#   line 1 : last field is parsed as an int into num_params_H1
#   line 2 : last field is parsed as a float into r2_train
#   line 3 : column header (read and kept in `header`, not otherwise used here)
#   then total_params_H1+1 rows of: term, coefficient, std. error, p-value, CI lower, CI upper
# The first of those rows is treated as the intercept: only its label and coefficient are read.

coefs_H1 = np.zeros(total_params_H1+1)
# Uncertainty columns start out as NaN so the intercept row (index 0) stays NaN.
errs_H1 = np.full(total_params_H1+1, np.nan)
pvals_H1 = np.full(total_params_H1+1, np.nan)
cis_lower_H1 = np.full(total_params_H1+1, np.nan)
cis_upper_H1 = np.full(total_params_H1+1, np.nan)

names_H1 = []
# Not used anywhere in this cell; kept in case other code reads it.
sig_H1 = np.full((total_params_H1+1),0)

with open('model_coefs/H1_'+str(order_H1)+'order_'+ep_type+'.txt','r') as readfile:
    coef_reader = csv.reader(readfile,delimiter='\t')
    num_params_H1 = int(next(coef_reader)[-1])
    r2_train = float(next(coef_reader)[-1])
    header = next(coef_reader)
    for i in range(total_params_H1+1):
        row = next(coef_reader)
        coefs_H1[i] = float(row[1])

        if i == 0:
            # intercept row: keep the label exactly as written in the file
            names_H1.append(row[0])
            continue

        # The term column holds comma-separated 1-based indices into mut_names.
        # NOTE(review): mut_names is not defined in the visible cells (presumably from
        # `mutation_info`) -- confirm its ordering matches the indices stored in this file.
        # A distinct comprehension variable is used so the row index `i` is not shadowed.
        full_name = ','.join([mut_names[int(mut_idx)-1] for mut_idx in row[0].split(',')])
        names_H1.append(full_name)
        errs_H1[i] = float(row[2])
        pvals_H1[i] = float(row[3])
        cis_lower_H1[i] = float(row[4])
        cis_upper_H1[i] = float(row[5])
# No explicit close(): the with-statement closes the file on exit.

# Constant columns, one entry per term row.
antigen_H1 = ['H1']*(total_params_H1+1)
params_H1 = [num_params_H1]*(total_params_H1+1)
seqs_H1 = [N_H1]*(total_params_H1+1)

H1_df = pd.DataFrame(
    {'Antigen': antigen_H1,
     'Num_Sequences': seqs_H1,
     'Num_Parameters': params_H1,
     'Term':names_H1,
     'Coefficient':coefs_H1,
     'StdError':errs_H1,
     'pValue':pvals_H1,
     'CI_Lower':cis_lower_H1,
     'CI_Upper':cis_upper_H1
    })


H1_df

Unnamed: 0,Antigen,Num_Sequences,Num_Parameters,Term,Coefficient,StdError,pValue,CI_Lower,CI_Upper
0,H1,65094,6885,Intercept,8.535212,,,,
1,H1,65094,6885,30,-0.872758,0.038527,4.034908e-113,-1.045602,-0.699914
2,H1,65094,6885,35,0.031795,0.038005,4.028223e-01,-0.138705,0.202294
3,H1,65094,6885,36,0.204205,0.038007,7.782075e-08,0.033695,0.374714
4,H1,65094,6885,57,-0.112460,0.037995,3.078748e-03,-0.282914,0.057994
...,...,...,...,...,...,...,...,...,...
6880,H1,65094,6885,84859295113,-0.000556,0.013807,9.678729e-01,-0.062498,0.061386
6881,H1,65094,6885,848592103113,0.005036,0.013807,7.153311e-01,-0.056907,0.066978
6882,H1,65094,6885,848595103113,-0.009825,0.013807,4.767375e-01,-0.071768,0.052118
6883,H1,65094,6885,849295103113,-0.019008,0.013807,1.686251e-01,-0.080950,0.042935


In [16]:
# Read the fitted H3 model coefficients from file and assemble them into a dataframe.
#
# File layout as consumed below (tab-delimited):
#   line 1 : last field is parsed as an int into num_params_H3
#   line 2 : last field is parsed as a float into r2_train
#   line 3 : column header (read and kept in `header`, not otherwise used here)
#   then total_params_H3+1 rows of: term, coefficient, std. error, p-value, CI lower, CI upper
# The first of those rows is treated as the intercept: only its label and coefficient are read.

coefs_H3 = np.zeros(total_params_H3+1)
# Uncertainty columns start out as NaN so the intercept row (index 0) stays NaN.
errs_H3 = np.full(total_params_H3+1, np.nan)
pvals_H3 = np.full(total_params_H3+1, np.nan)
cis_lower_H3 = np.full(total_params_H3+1, np.nan)
cis_upper_H3 = np.full(total_params_H3+1, np.nan)

names_H3 = []
# Not used anywhere in this cell; kept in case other code reads it.
sig_H3 = np.full((total_params_H3+1),0)

with open('model_coefs/H3_'+str(order_H3)+'order_'+ep_type+'.txt','r') as readfile:
    coef_reader = csv.reader(readfile,delimiter='\t')
    num_params_H3 = int(next(coef_reader)[-1])
    r2_train = float(next(coef_reader)[-1])
    header = next(coef_reader)
    for i in range(total_params_H3+1):
        row = next(coef_reader)
        coefs_H3[i] = float(row[1])

        if i == 0:
            # intercept row: keep the label exactly as written in the file
            names_H3.append(row[0])
            continue

        # The term column holds comma-separated 1-based indices into mut_names.
        # NOTE(review): mut_names is not defined in the visible cells (presumably from
        # `mutation_info`) -- confirm its ordering matches the indices stored in this file.
        # A distinct comprehension variable is used so the row index `i` is not shadowed.
        full_name = ','.join([mut_names[int(mut_idx)-1] for mut_idx in row[0].split(',')])
        names_H3.append(full_name)
        errs_H3[i] = float(row[2])
        pvals_H3[i] = float(row[3])
        cis_lower_H3[i] = float(row[4])
        cis_upper_H3[i] = float(row[5])
# No explicit close(): the with-statement closes the file on exit.

# Constant columns, one entry per term row.
antigen_H3 = ['H3']*(total_params_H3+1)
params_H3 = [num_params_H3]*(total_params_H3+1)
seqs_H3 = [N_H3]*(total_params_H3+1)

H3_df = pd.DataFrame(
    {'Antigen': antigen_H3,
     'Num_Sequences': seqs_H3,
     'Num_Parameters': params_H3,
     'Term':names_H3,
     'Coefficient':coefs_H3,
     'StdError':errs_H3,
     'pValue':pvals_H3,
     'CI_Lower':cis_lower_H3,
     'CI_Upper':cis_upper_H3
    })

H3_df

Unnamed: 0,Antigen,Num_Sequences,Num_Parameters,Term,Coefficient,StdError,pValue,CI_Lower,CI_Upper
0,H3,8192,1093,Intercept,5.960326,,,,
1,H3,8192,1093,30,0.049518,0.042891,0.248329,-0.125429,0.224464
2,H3,8192,1093,35,-0.007709,0.042891,0.857369,-0.182655,0.167238
3,H3,8192,1093,36,-0.033065,0.042891,0.440779,-0.208012,0.141881
4,H3,8192,1093,64,0.105024,0.042891,0.014363,-0.069922,0.279971
...,...,...,...,...,...,...,...,...,...
1088,H3,8192,1093,859295103,-0.016395,0.019843,0.408694,-0.097335,0.064544
1089,H3,8192,1093,859295113,-0.018736,0.019843,0.345093,-0.099676,0.062203
1090,H3,8192,1093,8592103113,0.008180,0.019843,0.680183,-0.072759,0.089119
1091,H3,8192,1093,8595103113,-0.005371,0.019843,0.786635,-0.086311,0.075568


In [18]:
# Read the fitted FluB model coefficients from file and assemble them into a dataframe.
#
# File layout as consumed below (tab-delimited):
#   line 1 : last field is parsed as an int into num_params_B
#   line 2 : last field is parsed as a float into r2_train
#   line 3 : column header (read and kept in `header`, not otherwise used here)
#   then total_params_B+1 rows of: term, coefficient, std. error, p-value, CI lower, CI upper
# The first of those rows is treated as the intercept: only its label and coefficient are read.

coefs_B = np.zeros(total_params_B+1)
# Uncertainty columns start out as NaN so the intercept row (index 0) stays NaN.
errs_B = np.full(total_params_B+1, np.nan)
pvals_B = np.full(total_params_B+1, np.nan)
cis_lower_B = np.full(total_params_B+1, np.nan)
cis_upper_B = np.full(total_params_B+1, np.nan)

names_B = []
# Not used anywhere in this cell; kept in case other code reads it.
sig_B = np.full((total_params_B+1),0)

with open('model_coefs/B_'+str(order_B)+'order_'+ep_type+'.txt','r') as readfile:
    coef_reader = csv.reader(readfile,delimiter='\t')
    num_params_B = int(next(coef_reader)[-1])
    r2_train = float(next(coef_reader)[-1])
    header = next(coef_reader)
    for i in range(total_params_B+1):
        row = next(coef_reader)
        coefs_B[i] = float(row[1])

        if i == 0:
            # intercept row: keep the label exactly as written in the file
            names_B.append(row[0])
            continue

        # The term column holds comma-separated 1-based indices into mut_names.
        # NOTE(review): mut_names is not defined in the visible cells (presumably from
        # `mutation_info`) -- confirm its ordering matches the indices stored in this file.
        # A distinct comprehension variable is used so the row index `i` is not shadowed.
        full_name = ','.join([mut_names[int(mut_idx)-1] for mut_idx in row[0].split(',')])
        names_B.append(full_name)
        errs_B[i] = float(row[2])
        pvals_B[i] = float(row[3])
        cis_lower_B[i] = float(row[4])
        cis_upper_B[i] = float(row[5])
# No explicit close(): the with-statement closes the file on exit.

# Constant columns, one entry per term row.
antigen_B = ['FluB']*(total_params_B+1)
params_B = [num_params_B]*(total_params_B+1)
seqs_B = [N_B]*(total_params_B+1)

B_df = pd.DataFrame(
    {'Antigen': antigen_B,
     'Num_Sequences': seqs_B,
     'Num_Parameters': params_B,
     'Term':names_B,
     'Coefficient':coefs_B,
     'StdError':errs_B,
     'pValue':pvals_B,
     'CI_Lower':cis_lower_B,
     'CI_Upper':cis_upper_B
    })

B_df

Unnamed: 0,Antigen,Num_Sequences,Num_Parameters,Term,Coefficient,StdError,pValue,CI_Lower,CI_Upper
0,FluB,254,9,Intercept,6.334264,,,,
1,FluB,254,9,35,-0.317117,0.031821,7.393247e-20,-0.406143,-0.22809
2,FluB,254,9,66,0.027491,0.031821,0.3884778,-0.061536,0.116518
3,FluB,254,9,79,0.130174,0.031821,5.836405e-05,0.041148,0.219201
4,FluB,254,9,84,-0.199128,0.031818,1.726155e-09,-0.288146,-0.110109
5,FluB,254,9,92,0.119757,0.031821,0.0002098133,0.03073,0.208784
6,FluB,254,9,95,0.019369,0.031818,0.543268,-0.069649,0.108387
7,FluB,254,9,103,-0.100113,0.031821,0.001859225,-0.18914,-0.011086
8,FluB,254,9,113,1.230509,0.031818,2.6694880000000003e-106,1.141491,1.319527


In [28]:
# Combine the per-antigen coefficient tables (H1, then H3, then FluB) into a single
# dataframe with a fresh 0..n-1 index.
# pd.concat is used instead of DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
df_final = pd.concat([H1_df, H3_df, B_df], ignore_index=True)

# write to file
df_final.to_csv('model_coefs/9114_coefs.csv',index=False)

# Show the combined table; this has to be the last expression of the cell to be rendered.
df_final
    