In [1]:
import numpy as np
import csv
import sys
from scipy.special import comb
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

from mutation_info import *



In [2]:
# Notebook configuration and per-antigen term bookkeeping.

# Epistasis flavour whose coefficient files are loaded further down:
# 'biochem' (biochemical) or 'stat' (statistical).
ep_type = 'biochem'

# Highest interaction order considered for each antigen.
order_H1 = 4
order_H9 = 4


# --- H1 ---
# Number of terms at each order k = 1..order_H1, i.e. "num_muts_H1 choose k".
num_term_list_H1 = np.array(
    [int(comb(num_muts_H1, k)) for k in range(1, order_H1 + 1)]
)
# Total number of non-intercept terms across all orders.
total_params_H1 = sum(num_term_list_H1)
# Position at which each order's run of terms begins when terms are laid out
# order by order starting at position 1; the last entry is one past the end.
order_start_indices_H1 = [1] + list(np.cumsum(num_term_list_H1) + 1)
print(num_term_list_H1, total_params_H1)
print(order_start_indices_H1)

# --- H9 ---
# Same bookkeeping as above, using num_muts_H9 and order_H9.
num_term_list_H9 = np.array(
    [int(comb(num_muts_H9, k)) for k in range(1, order_H9 + 1)]
)
total_params_H9 = sum(num_term_list_H9)
order_start_indices_H9 = [1] + list(np.cumsum(num_term_list_H9) + 1)
print(num_term_list_H9, total_params_H9)
print(order_start_indices_H9)



[ 11  55 165 330] 561
[1, 12, 67, 232, 562]
[ 11  55 165 330] 561
[1, 12, 67, 232, 562]


In [7]:
# read in KD data, filter, and transform to numpy arrays
#
# For each antigen, rows lacking a measurement in that antigen's mean column
# are dropped, and the remaining rows are split into a genotype matrix (one
# 'pos<mutation>' column per entry of `mutations`) and a phenotype vector.

# "variant" is forced to str when the CSV is parsed.
df = pd.read_csv('../Kd_meanbin/kd_processed/20210323_6261_HA_unadj_fil_merg.csv',dtype={"variant": str})

# Variant identifiers for every row, before any per-antigen filtering.
all_genos = df[['variant']].values.flatten()

# Genotype columns shared by both antigens.
geno_columns = ['pos'+x for x in mutations]

# H1
df_H1 = df.dropna(subset=['h1_mean'])
genos_H1 = np.array(df_H1[geno_columns].copy(),dtype=np.float64)
phenos_H1 = df_H1[['h1_mean']].values.flatten()
print(genos_H1.shape,phenos_H1.shape)
N_H1 = len(phenos_H1)

# H9
# `df_H9` was previously used without being defined in this cell, so the cell
# failed with NameError on a fresh kernel. Filter on 'h9_mean' exactly as the
# H1 branch filters on 'h1_mean'.
df_H9 = df.dropna(subset=['h9_mean'])
genos_H9 = np.array(df_H9[geno_columns].copy(),dtype=np.float64)
phenos_H9 = df_H9[['h9_mean']].values.flatten()
print(genos_H9.shape,phenos_H9.shape)
N_H9 = len(phenos_H9)






(1887, 11) (1887,)
(1842, 11) (1842,)


In [8]:
# read model coefficients from file & make dataframes
#
# Layout of the tab-delimited coefficient file, as consumed below:
#   line 1 : last field is parsed as an int  -> num_params_H1
#   line 2 : last field is parsed as a float -> r2_train
#   line 3 : column header (read and kept in `header`, not otherwise used here)
#   then total_params_H1 + 1 data rows with columns
#       [term, coefficient, std error, p-value, CI lower, CI upper]
# The first data row keeps its term label verbatim and only its coefficient is
# read; its error, p-value and CI entries are set to NaN. For every later row
# the term field is a comma-separated list of 1-based indices into `mut_names`.

n_rows_H1 = total_params_H1 + 1

coefs_H1 = np.zeros(n_rows_H1)
errs_H1 = np.zeros(n_rows_H1)
pvals_H1 = np.zeros(n_rows_H1)
cis_lower_H1 = np.zeros(n_rows_H1)
cis_upper_H1 = np.zeros(n_rows_H1)

names_H1 = []
# Integer zeros, one per row; not filled in by this cell.
sig_H1 = np.full((n_rows_H1),0)

# The `with` statement closes the file on exit, so no explicit close() call
# is needed inside the block.
with open('model_coefs/H1_'+str(order_H1)+'order_'+ep_type+'.txt','r') as readfile:
    coef_reader = csv.reader(readfile,delimiter='\t')
    num_params_H1 = int(next(coef_reader)[-1])
    r2_train = float(next(coef_reader)[-1])
    header = next(coef_reader)
    for i in range(n_rows_H1):
        row = next(coef_reader)
        coefs_H1[i] = float(row[1])

        if i == 0:
            # First row: label used as-is, no uncertainty columns read.
            names_H1.append(row[0])
            errs_H1[i] = np.nan
            pvals_H1[i] = np.nan
            cis_lower_H1[i] = np.nan
            cis_upper_H1[i] = np.nan
        else:
            # Translate 1-based mutation indices into names. The comprehension
            # variable is deliberately not called `i`, to avoid shadowing the
            # row index.
            full_name = ','.join([mut_names[int(mut_idx)-1] for mut_idx in row[0].split(',')])
            names_H1.append(full_name)
            errs_H1[i] = float(row[2])
            pvals_H1[i] = float(row[3])
            cis_lower_H1[i] = float(row[4])
            cis_upper_H1[i] = float(row[5])

# Constant columns repeated for every row of the table.
antigen_H1 = ['H1']*n_rows_H1
params_H1 = [num_params_H1]*n_rows_H1
seqs_H1 = [N_H1]*n_rows_H1

H1_df = pd.DataFrame(
    {'Antigen': antigen_H1,
     'Num_Sequences': seqs_H1,
     'Num_Parameters': params_H1,
     'Term':names_H1,
     'Coefficient':coefs_H1,
     'StdError':errs_H1,
     'pValue':pvals_H1,
     'CI_Lower':cis_lower_H1,
     'CI_Upper':cis_upper_H1
    })


H1_df

Unnamed: 0,Antigen,Num_Sequences,Num_Parameters,Term,Coefficient,StdError,pValue,CI_Lower,CI_Upper
0,H1,1887,562,Intercept,6.868700,,,,
1,H1,1887,562,29,0.807303,0.079287,1.702347e-23,0.495627,1.118979
2,H1,1887,562,35,0.573613,0.086393,4.576728e-11,0.234003,0.913223
3,H1,1887,562,65,0.073511,0.079230,3.536715e-01,-0.237941,0.384963
4,H1,1887,562,66,-0.054930,0.079087,4.874615e-01,-0.365822,0.255962
...,...,...,...,...,...,...,...,...,...
557,H1,1887,562,83848587,0.232800,0.053257,1.331838e-05,0.023448,0.442153
558,H1,1887,562,838485112.1,-0.082969,0.051194,1.053231e-01,-0.284212,0.118274
559,H1,1887,562,838487112.1,-0.059191,0.048085,2.185534e-01,-0.248212,0.129830
560,H1,1887,562,838587112.1,-0.051652,0.048877,2.908043e-01,-0.243788,0.140483


In [9]:
# read H9 model coefficients from file & make dataframe
#
# Layout of the tab-delimited coefficient file, as consumed below:
#   line 1 : last field is parsed as an int  -> num_params_H9
#   line 2 : last field is parsed as a float -> r2_train
#   line 3 : column header (read and kept in `header`, not otherwise used here)
#   then total_params_H9 + 1 data rows with columns
#       [term, coefficient, std error, p-value, CI lower, CI upper]
# The first data row keeps its term label verbatim and only its coefficient is
# read; its error, p-value and CI entries are set to NaN. For every later row
# the term field is a comma-separated list of 1-based indices into `mut_names`.

n_rows_H9 = total_params_H9 + 1

coefs_H9 = np.zeros(n_rows_H9)
errs_H9 = np.zeros(n_rows_H9)
pvals_H9 = np.zeros(n_rows_H9)
cis_lower_H9 = np.zeros(n_rows_H9)
cis_upper_H9 = np.zeros(n_rows_H9)

names_H9 = []
# Integer zeros, one per row; not filled in by this cell.
sig_H9 = np.full((n_rows_H9),0)

# The `with` statement closes the file on exit, so no explicit close() call
# is needed inside the block.
with open('model_coefs/H9_'+str(order_H9)+'order_'+ep_type+'.txt','r') as readfile:
    coef_reader = csv.reader(readfile,delimiter='\t')
    num_params_H9 = int(next(coef_reader)[-1])
    # NOTE(review): `r2_train` carries no antigen suffix, so this assignment
    # replaces any value previously stored under that name.
    r2_train = float(next(coef_reader)[-1])
    header = next(coef_reader)
    for i in range(n_rows_H9):
        row = next(coef_reader)
        coefs_H9[i] = float(row[1])

        if i == 0:
            # First row: label used as-is, no uncertainty columns read.
            names_H9.append(row[0])
            errs_H9[i] = np.nan
            pvals_H9[i] = np.nan
            cis_lower_H9[i] = np.nan
            cis_upper_H9[i] = np.nan
        else:
            # Translate 1-based mutation indices into names. The comprehension
            # variable is deliberately not called `i`, to avoid shadowing the
            # row index.
            full_name = ','.join([mut_names[int(mut_idx)-1] for mut_idx in row[0].split(',')])
            names_H9.append(full_name)
            errs_H9[i] = float(row[2])
            pvals_H9[i] = float(row[3])
            cis_lower_H9[i] = float(row[4])
            cis_upper_H9[i] = float(row[5])

# Constant columns repeated for every row of the table.
antigen_H9 = ['H9']*n_rows_H9
params_H9 = [num_params_H9]*n_rows_H9
seqs_H9 = [N_H9]*n_rows_H9

H9_df = pd.DataFrame(
    {'Antigen': antigen_H9,
     'Num_Sequences': seqs_H9,
     'Num_Parameters': params_H9,
     'Term':names_H9,
     'Coefficient':coefs_H9,
     'StdError':errs_H9,
     'pValue':pvals_H9,
     'CI_Lower':cis_lower_H9,
     'CI_Upper':cis_upper_H9
    })

H9_df

Unnamed: 0,Antigen,Num_Sequences,Num_Parameters,Term,Coefficient,StdError,pValue,CI_Lower,CI_Upper
0,H9,1842,562,Intercept,7.026744,,,,
1,H9,1842,562,29,-0.234604,0.134466,0.081277,-0.763246,0.294039
2,H9,1842,562,35,0.410971,0.136966,0.002747,-0.127500,0.949443
3,H9,1842,562,65,-0.054637,0.134341,0.684290,-0.582787,0.473512
4,H9,1842,562,66,0.081023,0.134359,0.546595,-0.447200,0.609245
...,...,...,...,...,...,...,...,...,...
557,H9,1842,562,83848587,-0.309542,0.098131,0.001646,-0.695338,0.076253
558,H9,1842,562,838485112.1,-0.027278,0.094353,0.772552,-0.398220,0.343665
559,H9,1842,562,838487112.1,-0.180573,0.087270,0.038734,-0.523669,0.162522
560,H9,1842,562,838587112.1,-0.060625,0.088429,0.493107,-0.408277,0.287028


In [12]:
# combine dataframes
# Stack the H1 rows on top of the H9 rows and renumber the index from 0.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; with ignore_index=True the result is the same table.
df_final = pd.concat([H1_df, H9_df], ignore_index=True)



# write to file (index column omitted)
df_final.to_csv('model_coefs/6261_coefs.csv',index=False)
df_final

Unnamed: 0,Antigen,Num_Sequences,Num_Parameters,Term,Coefficient,StdError,pValue,CI_Lower,CI_Upper
0,H1,1887,562,Intercept,6.868700,,,,
1,H1,1887,562,29,0.807303,0.079287,1.702347e-23,0.495627,1.118979
2,H1,1887,562,35,0.573613,0.086393,4.576728e-11,0.234003,0.913223
3,H1,1887,562,65,0.073511,0.079230,3.536715e-01,-0.237941,0.384963
4,H1,1887,562,66,-0.054930,0.079087,4.874615e-01,-0.365822,0.255962
...,...,...,...,...,...,...,...,...,...
1119,H9,1842,562,83848587,-0.309542,0.098131,1.645880e-03,-0.695338,0.076253
1120,H9,1842,562,838485112.1,-0.027278,0.094353,7.725524e-01,-0.398220,0.343665
1121,H9,1842,562,838487112.1,-0.180573,0.087270,3.873444e-02,-0.523669,0.162522
1122,H9,1842,562,838587112.1,-0.060625,0.088429,4.931071e-01,-0.408277,0.287028
