In [1]:
import numpy as np
import csv
import sys
from scipy.special import comb
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

from mutation_info import *



In [4]:
# set some things

# choose statistical or biochemical epistasis
ep_type = 'biochem' 
#ep_type = 'stat'


order_H1 = 4
order_H9 = 4

num_term_list_H1 = np.array([int(comb(num_muts_total,i)) for i in range(1,order_H1+1)])
total_params_H1 = sum(num_term_list_H1)
order_start_indices_H1 = list(np.cumsum(num_term_list_H1)+1)
order_start_indices_H1.insert(0,1)
print(num_term_list_H1,total_params_H1)
print(order_start_indices_H1)

num_term_list_H9 = np.array([int(comb(num_muts_total,i)) for i in range(1,order_H9+1)])
total_params_H9 = sum(num_term_list_H9)
order_start_indices_H9 = list(np.cumsum(num_term_list_H9)+1)
order_start_indices_H9.insert(0,1)
print(num_term_list_H9,total_params_H9)
print(order_start_indices_H9)



[ 11  55 165 330] 561
[1, 12, 67, 232, 562]
[ 11  55 165 330] 561
[1, 12, 67, 232, 562]


In [5]:
# read in KD data, filter, and transform to numpy arrays

df = pd.read_csv('../Kd_meanbin/kd_processed/20210323_6261_HA_unadj_fil_merg.csv',dtype={"variant": str})

all_genos = df[['variant']].values.flatten()

# H1
df_H1 = df.dropna(subset=['h1_mean'])
genos_H1 = np.array(df_H1[['pos'+x for x in mutations]].copy(),dtype=np.float64)
phenos_H1 = df_H1[['h1_mean']].values.flatten()
print(genos_H1.shape,phenos_H1.shape)

# H9
df_H9 = df.dropna(subset=['h9_mean'])
genos_H9 = np.array(df_H9[['pos'+x for x in mutations]].copy(),dtype=np.float64)
phenos_H9 = df_H9[['h9_mean']].values.flatten()
print(genos_H9.shape,phenos_H9.shape)


# for statistical epistasis, shift genotypes from 0,1 to -1,1
if ep_type == 'stat':
    genos_H1 = 2*(genos_H1-0.5)
    genos_H9 = 2*(genos_H9-0.5)



(1887, 11) (1887,)
(1842, 11) (1842,)


In [7]:
# read model coefficients from file

coefs_H1 = np.zeros(total_params_H1+1)
names_H1 = []
sig_H1 = np.full((total_params_H1+1),0)

with open('model_coefs/H1_'+str(order_H1)+'order_'+ep_type+'.txt','r') as readfile:
    coef_reader = csv.reader(readfile,delimiter='\t')
    num_params = int(next(coef_reader)[-1])
    r2_train = float(next(coef_reader)[-1])
    header = next(coef_reader)
    for i in range(total_params_H1+1):
        row = next(coef_reader)
        names_H1.append(row[0])
        coefs_H1[i] = float(row[1])
        if i > 1:
            if float(row[3])*float(row[4]) > 0:
                sig_H1[i] = 1
    readfile.close()
            
print(len(coefs_H1))  

coefs_H9 = np.zeros(total_params_H9+1)
names_H9 = []
sig_H9 = np.full((total_params_H9+1),0)

with open('model_coefs/H9_'+str(order_H9)+'order_'+ep_type+'.txt','r') as readfile:
    coef_reader = csv.reader(readfile,delimiter='\t')
    num_params = int(next(coef_reader)[-1])
    r2_train = float(next(coef_reader)[-1])
    header = next(coef_reader)
    for i in range(total_params_H9+1):
        row = next(coef_reader)
        names_H9.append(row[0])
        coefs_H9[i] = float(row[1])
        if i > 1:
            if float(row[3])*float(row[4]) > 0:
                sig_H9[i] = 1
    readfile.close()
            
print(len(coefs_H9)) 



562
562


In [8]:
poly_current_H1 = PolynomialFeatures(order_H1,interaction_only=True)
genos_current_H1 = poly_current_H1.fit_transform(genos_H1)
predicted_phenos_H1 = np.tensordot(genos_current_H1,coefs_H1,axes=1)

poly_current_H9 = PolynomialFeatures(order_H9,interaction_only=True)
genos_current_H9 = poly_current_H9.fit_transform(genos_H9)
predicted_phenos_H9 = np.tensordot(genos_current_H9,coefs_H9,axes=1)

print('done')

done


In [11]:
# write predicted phenotypes to file
num_H1 = 0
num_H9 = 0
with open('model_coefs/predicted_phenos_'+ep_type+'.csv','w') as writefile:
    pheno_writer = csv.writer(writefile,delimiter=',')
    pheno_writer.writerow(['Genotype','H1_Phenotype_'+str(order_H1),
                          'H9_Phenotype_'+str(order_H9)])
    for i in range(len(all_genos)):
        if i%500 == 0: print(i)
        geno_string = all_genos[i]
        geno = np.array([float(x) for x in geno_string])
        
        # H1 prediction
        H1_index = np.where((genos_H1 == geno).all(axis=1))[0]
        if len(H1_index) == 1:
            H1_pred_pheno = predicted_phenos_H1[H1_index[0]]
        else:
            H1_pred_pheno = np.nan
            
            
        # H9 prediction
        H9_index = np.where((genos_H9 == geno).all(axis=1))[0]
        if len(H9_index) == 1:
            H9_pred_pheno = predicted_phenos_H9[H9_index[0]]
        else:
            H9_pred_pheno = np.nan

        pheno_writer.writerow([geno_string,H1_pred_pheno,H9_pred_pheno])
        
    writefile.close()

print(num_H1,num_H9)    
print('done')
    

0
500
1000
1500
0 0
done
