In [1]:
import numpy as np
import csv
import sys
from scipy.special import comb
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

from mutation_info import *



In [14]:
# analysis settings

# which epistasis model to use: biochemical ('biochem') or statistical ('stat')
ep_type = 'biochem'
#ep_type = 'stat'

# highest interaction order used for each antigen
order_H1 = 5
order_H3 = 4
order_B = 1


def _order_layout(num_muts, max_order):
    """Count interaction terms per order and locate where each order starts.

    Returns a tuple (terms_per_order, total_terms, start_indices):
      * terms_per_order[k] is C(num_muts, k+1), the number of terms of order k+1
      * total_terms is the sum over all orders
      * start_indices[k] is the 1-based position of the first term of order k+1
        (the numbering starts at 1, presumably leaving index 0 for the
        intercept); the final entry is one past the last term
    """
    terms_per_order = np.array([int(comb(num_muts, k)) for k in range(1, max_order+1)])
    total_terms = sum(terms_per_order)
    start_indices = [1] + list(np.cumsum(terms_per_order)+1)
    return terms_per_order, total_terms, start_indices


num_term_list_H1, total_params_H1, order_start_indices_H1 = _order_layout(num_muts_H1, order_H1)
print(num_term_list_H1,total_params_H1)
print(order_start_indices_H1)

num_term_list_H3, total_params_H3, order_start_indices_H3 = _order_layout(num_muts_H3, order_H3)
print(num_term_list_H3,total_params_H3)
print(order_start_indices_H3)

num_term_list_B, total_params_B, order_start_indices_B = _order_layout(num_muts_B, order_B)
print(num_term_list_B,total_params_B)
print(order_start_indices_B)


[  16  120  560 1820 4368] 6884
[1, 17, 137, 697, 2517, 6885]
[ 13  78 286 715] 1092
[1, 14, 92, 378, 1093]
[8] 8
[1, 9]


In [15]:
# load the Kd table, filter it per antigen, and convert to numpy arrays

df = pd.read_csv('../Kd_meanbin/kd_processed/20210427_HA_unadj_fil_merg.csv',dtype={"variant": str})

# genotype string of every variant in the table, measured or not
all_genos = df[['variant']].to_numpy().ravel()


def _subset_for_antigen(table, pheno_col, mutations, required=()):
    """Select the rows usable for one antigen and extract its arrays.

    Rows with a missing value in `pheno_col` are dropped, then only rows whose
    'pos'+<mutation> column equals 1 for every entry of `required` are kept.

    Returns (filtered_table, genos, phenos): `genos` is a float64 array of the
    'pos'+<mutation> columns named by `mutations`, and `phenos` is the 1-D
    array of `pheno_col` values for the same rows.
    """
    measured = table.dropna(subset=[pheno_col])
    for req in required:
        measured = measured.loc[measured['pos'+req] == 1]
    geno_cols = ['pos'+m for m in mutations]
    genos = measured[geno_cols].to_numpy(dtype=np.float64, copy=True)
    phenos = measured[[pheno_col]].to_numpy().ravel()
    return measured, genos, phenos


# H1: every variant with a measurement
df_H1, genos_H1, phenos_H1 = _subset_for_antigen(df, 'h1_mean', H1_mutations)
print(genos_H1.shape,phenos_H1.shape)


# H3: keep only variants carrying the three required mutations; the genotype
# array is built from the H3_mutations columns only
df_H3, genos_H3, phenos_H3 = _subset_for_antigen(df, 'h3_mean', H3_mutations, H3_required_mutations)
print(genos_H3.shape,phenos_H3.shape)


# FluB: keep only variants carrying the eight required mutations; the genotype
# array is built from the B_mutations columns only
df_B, genos_B, phenos_B = _subset_for_antigen(df, 'fluB_mean', B_mutations, B_required_mutations)
print(genos_B.shape,phenos_B.shape)


# statistical epistasis works on a -1/+1 encoding rather than 0/1
if ep_type == 'stat':
    genos_H1, genos_H3, genos_B = [2*(g-0.5) for g in (genos_H1, genos_H3, genos_B)]



(65094, 16) (65094,)
(8192, 13) (8192,)
(254, 8) (254,)


In [16]:
# read model coefficients from file

def _read_model_coefs(path, n_terms):
    """Read one tab-delimited model-coefficient file.

    The file is read as: one line whose last field is the parameter count, one
    line whose last field is the R^2, one header line, then `n_terms + 1`
    coefficient rows (row 0 lines up with the intercept column of the
    polynomial feature matrix).

    Parameters
    ----------
    path : str
        Location of the coefficient file.
    n_terms : int
        Number of non-intercept terms expected in the file.

    Returns
    -------
    (coefs, names, sig, num_params, r2, header)
        coefs  : float array of length n_terms + 1, taken from column 1
        names  : list of term names, taken from column 0
        sig    : int array of length n_terms + 1; 1 where columns 3 and 4 have
                 the same sign (presumably confidence-interval bounds that
                 exclude zero -- TODO confirm against the file writer)
        num_params, r2, header : values parsed from the three leading lines

    Raises
    ------
    ValueError
        If the file ends before all expected rows have been read.
    """
    coefs = np.zeros(n_terms+1)
    names = []
    sig = np.full((n_terms+1),0)

    # newline='' is what the csv module recommends for files it reads
    with open(path,'r',newline='') as readfile:
        coef_reader = csv.reader(readfile,delimiter='\t')
        try:
            num_params = int(next(coef_reader)[-1])
            r2 = float(next(coef_reader)[-1])
            header = next(coef_reader)
            for i in range(n_terms+1):
                row = next(coef_reader)
                names.append(row[0])
                coefs[i] = float(row[1])
                # NOTE(review): `i > 1` skips index 1 (the first first-order
                # term) as well as the intercept at index 0; confirm whether
                # `i > 0` was intended. Behaviour is kept as-is here.
                if i > 1 and float(row[3])*float(row[4]) > 0:
                    sig[i] = 1
        except StopIteration:
            raise ValueError(
                'coefficient file '+path+' ended early; expected 3 leading lines and '
                +str(n_terms+1)+' coefficient rows') from None
    # the `with` block has already closed the file; no explicit close() needed

    return coefs, names, sig, num_params, r2, header


# num_params, r2 and header are overwritten by each read, so after this cell
# they hold the values from the FluB file (same as before the refactor)
coefs_H1, names_H1, sig_H1, num_params, r2, header = _read_model_coefs(
    'model_coefs/H1_'+str(order_H1)+'order_'+ep_type+'.txt', total_params_H1)
print(len(coefs_H1))

coefs_H3, names_H3, sig_H3, num_params, r2, header = _read_model_coefs(
    'model_coefs/H3_'+str(order_H3)+'order_'+ep_type+'.txt', total_params_H3)
print(len(coefs_H3))

coefs_B, names_B, sig_B, num_params, r2, header = _read_model_coefs(
    'model_coefs/B_'+str(order_B)+'order_'+ep_type+'.txt', total_params_B)
print(len(coefs_B))

6885
1093
9


In [17]:
def _predict_from_coefs(genos, coefs, order):
    """Expand genotypes into interaction features and apply the coefficients.

    Returns (expander, features, predictions): the fitted PolynomialFeatures
    object, the expanded feature matrix (bias column plus all interaction
    terms up to `order`), and the product of that matrix with `coefs`.
    """
    expander = PolynomialFeatures(order,interaction_only=True)
    features = expander.fit_transform(genos)
    predictions = np.tensordot(features,coefs,axes=1)
    return expander, features, predictions


poly_current_H1, genos_current_H1, predicted_phenos_H1 = _predict_from_coefs(genos_H1, coefs_H1, order_H1)

poly_current_H3, genos_current_H3, predicted_phenos_H3 = _predict_from_coefs(genos_H3, coefs_H3, order_H3)

poly_current_B, genos_current_B, predicted_phenos_B = _predict_from_coefs(genos_B, coefs_B, order_B)

print('done')

done


In [18]:
# write predicted phenotypes to file

def _unique_row_index(rows):
    """Map each row of a 2-D genotype array (as a tuple) to its row index.

    A genotype that occurs in more than one row maps to None, so that only
    genotypes with exactly one matching row yield a prediction.
    """
    index_by_geno = {}
    for row_idx, row in enumerate(rows.tolist()):
        key = tuple(row)
        index_by_geno[key] = None if key in index_by_geno else row_idx
    return index_by_geno


def _predicted_or_nan(index_by_geno, predictions, geno_values):
    """Look up the prediction for one genotype vector.

    Returns (prediction, True) when `geno_values` matches exactly one row of
    the array the lookup was built from, otherwise (np.nan, False).
    """
    row_idx = index_by_geno.get(tuple(geno_values.tolist()))
    if row_idx is None:
        return np.nan, False
    return predictions[row_idx], True


# Build the genotype -> row lookups once. Scanning the full genotype arrays
# with np.where for every variant is quadratic in the number of variants.
H1_lookup = _unique_row_index(genos_H1)
H3_lookup = _unique_row_index(genos_H3)
B_lookup = _unique_row_index(genos_B)

# counters for how many variants received an H3 / FluB prediction
num_H3 = 0
num_B = 0

# newline='' lets the csv module control line endings (avoids blank rows on Windows)
with open('model_coefs/predicted_phenos_'+ep_type+'.csv','w',newline='') as writefile:
    pheno_writer = csv.writer(writefile,delimiter=',')
    pheno_writer.writerow(['Genotype','H1_Phenotype_'+str(order_H1),
                          'H3_Phenotype_'+str(order_H3),'FluB_Phenotype_'+str(order_B)])

    for i, geno_string in enumerate(all_genos):
        if i%5000 == 0: print(i)
        # one character of the variant string per site
        geno = np.array([float(x) for x in geno_string])
        if ep_type == 'stat':
            # same -1/+1 encoding that was applied to the genotype arrays
            geno = 2.0*(geno-0.5)

        # H1 prediction
        H1_pred_pheno, H1_found = _predicted_or_nan(H1_lookup, predicted_phenos_H1, geno)

        # H3 prediction, only for genotypes carrying every required mutation
        H3_pred_pheno = np.nan
        if np.sum(geno[H3_required_indices]) == float(len(H3_required_indices)):
            H3_pred_pheno, H3_found = _predicted_or_nan(
                H3_lookup, predicted_phenos_H3, geno[H3_remaining_indices])
            if H3_found:
                num_H3 += 1

        # FluB prediction, only for genotypes carrying every required mutation
        B_pred_pheno = np.nan
        if np.sum(geno[B_required_indices]) == float(len(B_required_indices)):
            B_pred_pheno, B_found = _predicted_or_nan(
                B_lookup, predicted_phenos_B, geno[B_remaining_indices])
            if B_found:
                num_B += 1

        pheno_writer.writerow([geno_string,H1_pred_pheno,H3_pred_pheno,B_pred_pheno])
    # the `with` block closes the file on exit; no explicit close() needed

print(num_H3,num_B)
print('done')
    

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
8192 254
done
