In [1]:
import numpy as np
import matplotlib.pyplot as plt
import csv
import sys
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm
import pandas as pd

from mutation_info import *

In [24]:
# Epistasis convention for this run: 'biochem' (biochemical) or 'stat' (statistical).
# Later cells branch on this value when encoding genotypes and embed it in
# the names of the output files.
ep_type = 'biochem'


## Read in data

In [25]:
# Load the processed KD table and build, per antigen, a genotype matrix and a
# phenotype vector restricted to variants that have a measurement.

kd_table_path = '../Kd_meanbin/kd_processed/20210323_6261_HA_unadj_fil_merg.csv'
df = pd.read_csv(kd_table_path, dtype={"variant": str})

all_genos = df[['variant']].to_numpy().flatten()

# one 'pos<mutation>' column per entry of `mutations` (imported from mutation_info)
geno_columns = ['pos'+x for x in mutations]

# H1: keep only rows with an h1_mean value
df_H1 = df.dropna(subset=['h1_mean'])
genos_H1 = df_H1[geno_columns].to_numpy(dtype=np.float64, copy=True)
phenos_H1 = df_H1[['h1_mean']].to_numpy().flatten()
print(genos_H1.shape,phenos_H1.shape)

# H9: keep only rows with an h9_mean value
df_H9 = df.dropna(subset=['h9_mean'])
genos_H9 = df_H9[geno_columns].to_numpy(dtype=np.float64, copy=True)
phenos_H9 = df_H9[['h9_mean']].to_numpy().flatten()
print(genos_H9.shape,phenos_H9.shape)


# statistical epistasis uses a -1/+1 genotype encoding instead of 0/1
if ep_type == 'stat':
    genos_H1 = (genos_H1-0.5)*2
    genos_H9 = (genos_H9-0.5)*2



(1887, 11) (1887,)
(1842, 11) (1842,)


## CV to choose optimal order of interaction

### H1

In [17]:
# K-fold cross-validation over interaction order for H1.
# For each fold, an intercept-only model (order 0) and interaction models of
# order 1..max_order are fit by OLS on the training part; R^2 is recorded on
# both the training and the held-out part. The order with the highest mean
# held-out R^2 is stored in optimal_H1_order.
num_folds = 10
max_order = 5

# set up permutation (fixed seed so the fold assignment is reproducible)
np.random.seed(2112)
indices_permuted_H1 = np.random.permutation(np.arange(len(genos_H1)))
size_test_H1 = int(1.0/float(num_folds)*len(genos_H1))
size_train_H1 = len(genos_H1)-size_test_H1
print(size_test_H1,size_train_H1)

# arrays to store r squared values: one row per order (0..max_order), one column per fold
rsq_train_list_H1 = np.zeros((max_order+1,num_folds))
rsq_test_list_H1 = np.zeros((max_order+1,num_folds))


# loop over CV folds
for f in range(num_folds):

    # Fold f holds out the f-th contiguous slice of the permutation.
    # NOTE: size_test_H1 is floored, so when len(genos_H1) is not a multiple of
    # num_folds the last few permuted samples are never held out (they are
    # always part of the training set).
    start = f*size_test_H1
    stop = (f+1)*size_test_H1
    test_idx_H1 = indices_permuted_H1[start:stop]
    train_idx_H1 = np.concatenate((indices_permuted_H1[:start],indices_permuted_H1[stop:]))
    genos_train_H1 = genos_H1[train_idx_H1]
    genos_test_H1 = genos_H1[test_idx_H1]
    phenos_train_H1 = phenos_H1[train_idx_H1]
    phenos_test_H1 = phenos_H1[test_idx_H1]

    print('Fold: ',f)

    # total sum of squares of the held-out phenotypes; shared by every model in this fold
    sst_test_H1 = np.sum((phenos_test_H1-np.mean(phenos_test_H1))**2)

    # zero-order (intercept-only) model: the design matrix is a column of ones
    genos_train_H1_previous = np.full(len(genos_train_H1),1.0)
    genos_test_H1_previous = np.full(len(genos_test_H1),1.0)
    reg_H1_previous = sm.OLS(phenos_train_H1,genos_train_H1_previous).fit()

    resid_test_H1_previous = phenos_test_H1-reg_H1_previous.predict(genos_test_H1_previous)
    rsq_train_list_H1[0,f] = reg_H1_previous.rsquared
    rsq_test_list_H1[0,f] = 1-np.sum(resid_test_H1_previous**2)/sst_test_H1

    # fit models of increasing order
    for order in range(1,max_order+1):
        poly_H1_current = PolynomialFeatures(order,interaction_only=True)
        # fit the feature expansion on the training fold only, then apply it
        # unchanged to the held-out fold
        genos_train_H1_current = poly_H1_current.fit_transform(genos_train_H1)
        genos_test_H1_current = poly_H1_current.transform(genos_test_H1)

        reg_H1_current = sm.OLS(phenos_train_H1, genos_train_H1_current).fit()

        resid_test_H1_current = phenos_test_H1-reg_H1_current.predict(genos_test_H1_current)
        rsq_train_list_H1[order,f] = reg_H1_current.rsquared
        rsq_test_list_H1[order,f] = 1-np.sum(resid_test_H1_current**2)/sst_test_H1


# average over folds
mean_rsq_train_H1 = np.mean(rsq_train_list_H1,axis=1)
stdev_rsq_train_H1 = np.std(rsq_train_list_H1,axis=1)
mean_rsq_test_H1 = np.mean(rsq_test_list_H1,axis=1)
stdev_rsq_test_H1 = np.std(rsq_test_list_H1,axis=1)

# row index of the best mean held-out R^2 is the interaction order itself
optimal_H1_order = np.argmax(mean_rsq_test_H1)
print('Optimal order, H1: ',optimal_H1_order)


188 1699
Fold:  0
Fold:  1
Fold:  2
Fold:  3
Fold:  4
Fold:  5
Fold:  6
Fold:  7
Fold:  8
Fold:  9
Optimal order, H1:  4


In [18]:
# Write the H1 cross-validation summary to CSV: a first row with the selected
# order, a header row, then one row per order for the training R^2 followed by
# one row per order for the held-out R^2 (mean and standard deviation over folds).
# newline='' is what the csv module expects for files it writes to; without it
# rows end in '\r\r\n' on Windows. The `with` block closes the file on exit.
with open('model_coefs/H1_CV_rsquared_'+ep_type+'.csv','w',newline='') as writefile:
    rsq_writer = csv.writer(writefile)
    rsq_writer.writerow(['Optimal order: '+str(optimal_H1_order)])
    rsq_writer.writerow(['Type','Order','Mean','Std'])
    for i in range(len(mean_rsq_train_H1)):
        rsq_writer.writerow(['Train',str(i),mean_rsq_train_H1[i],stdev_rsq_train_H1[i]])
    for i in range(len(mean_rsq_test_H1)):
        rsq_writer.writerow(['Test',str(i),mean_rsq_test_H1[i],stdev_rsq_test_H1[i]])



### H9

In [18]:
# K-fold cross-validation over interaction order for H9.
# For each fold, an intercept-only model (order 0) and interaction models of
# order 1..max_order are fit by OLS on the training part; R^2 is recorded on
# both the training and the held-out part. The order with the highest mean
# held-out R^2 is stored in optimal_H9_order.
num_folds = 10
max_order = 5

# set up permutation (fixed seed so the fold assignment is reproducible)
np.random.seed(2112)
indices_permuted_H9 = np.random.permutation(np.arange(len(genos_H9)))
size_test_H9 = int(1.0/float(num_folds)*len(genos_H9))
size_train_H9 = len(genos_H9)-size_test_H9
print(size_test_H9,size_train_H9)

# arrays to store r squared values: one row per order (0..max_order), one column per fold
rsq_train_list_H9 = np.zeros((max_order+1,num_folds))
rsq_test_list_H9 = np.zeros((max_order+1,num_folds))


# loop over CV folds
for f in range(num_folds):

    # Fold f holds out the f-th contiguous slice of the permutation.
    # NOTE: size_test_H9 is floored, so when len(genos_H9) is not a multiple of
    # num_folds the last few permuted samples are never held out (they are
    # always part of the training set).
    start = f*size_test_H9
    stop = (f+1)*size_test_H9
    test_idx_H9 = indices_permuted_H9[start:stop]
    train_idx_H9 = np.concatenate((indices_permuted_H9[:start],indices_permuted_H9[stop:]))
    genos_train_H9 = genos_H9[train_idx_H9]
    genos_test_H9 = genos_H9[test_idx_H9]
    phenos_train_H9 = phenos_H9[train_idx_H9]
    phenos_test_H9 = phenos_H9[test_idx_H9]

    print('Fold: ',f)

    # total sum of squares of the held-out phenotypes; shared by every model in this fold
    sst_test_H9 = np.sum((phenos_test_H9-np.mean(phenos_test_H9))**2)

    # zero-order (intercept-only) model: the design matrix is a column of ones
    genos_train_H9_previous = np.full(len(genos_train_H9),1.0)
    genos_test_H9_previous = np.full(len(genos_test_H9),1.0)
    reg_H9_previous = sm.OLS(phenos_train_H9,genos_train_H9_previous).fit()

    resid_test_H9_previous = phenos_test_H9-reg_H9_previous.predict(genos_test_H9_previous)
    rsq_train_list_H9[0,f] = reg_H9_previous.rsquared
    rsq_test_list_H9[0,f] = 1-np.sum(resid_test_H9_previous**2)/sst_test_H9

    # fit models of increasing order
    for order in range(1,max_order+1):
        poly_H9_current = PolynomialFeatures(order,interaction_only=True)
        # fit the feature expansion on the training fold only, then apply it
        # unchanged to the held-out fold
        genos_train_H9_current = poly_H9_current.fit_transform(genos_train_H9)
        genos_test_H9_current = poly_H9_current.transform(genos_test_H9)

        reg_H9_current = sm.OLS(phenos_train_H9, genos_train_H9_current).fit()

        resid_test_H9_current = phenos_test_H9-reg_H9_current.predict(genos_test_H9_current)
        rsq_train_list_H9[order,f] = reg_H9_current.rsquared
        rsq_test_list_H9[order,f] = 1-np.sum(resid_test_H9_current**2)/sst_test_H9


# average over folds
mean_rsq_train_H9 = np.mean(rsq_train_list_H9,axis=1)
stdev_rsq_train_H9 = np.std(rsq_train_list_H9,axis=1)
mean_rsq_test_H9 = np.mean(rsq_test_list_H9,axis=1)
stdev_rsq_test_H9 = np.std(rsq_test_list_H9,axis=1)

# row index of the best mean held-out R^2 is the interaction order itself
optimal_H9_order = np.argmax(mean_rsq_test_H9)
print('Optimal order, H9: ',optimal_H9_order)


184 1658
Fold:  0
Fold:  1
Fold:  2
Fold:  3
Fold:  4
Fold:  5
Fold:  6
Fold:  7
Fold:  8
Fold:  9
Optimal order, H9:  4


In [20]:
# Write the H9 cross-validation summary to CSV: a first row with the selected
# order, a header row, then one row per order for the training R^2 followed by
# one row per order for the held-out R^2 (mean and standard deviation over folds).
# newline='' is what the csv module expects for files it writes to; without it
# rows end in '\r\r\n' on Windows. The `with` block closes the file on exit.
with open('model_coefs/H9_CV_rsquared_'+ep_type+'.csv','w',newline='') as writefile:
    rsq_writer = csv.writer(writefile)
    rsq_writer.writerow(['Optimal order: '+str(optimal_H9_order)])
    rsq_writer.writerow(['Type','Order','Mean','Std'])
    for i in range(len(mean_rsq_train_H9)):
        rsq_writer.writerow(['Train',str(i),mean_rsq_train_H9[i],stdev_rsq_train_H9[i]])
    for i in range(len(mean_rsq_test_H9)):
        rsq_writer.writerow(['Test',str(i),mean_rsq_test_H9[i],stdev_rsq_test_H9[i]])
    

## Fit final models

### H1

In [26]:
# Refit the H1 model on the full data set at every interaction order from 1 up
# to the CV-selected order, print summary statistics, and write one
# tab-separated coefficient table per order.

# the permuted data do not depend on the order, so build them once
genos_H1_permuted = genos_H1[indices_permuted_H1]
phenos_H1_permuted = phenos_H1[indices_permuted_H1]

# fit models of increasing order
for order in range(1,optimal_H1_order+1):

    print('Order: ',str(order))
    poly_H1_current = PolynomialFeatures(order,interaction_only=True)
    genos_H1_current = poly_H1_current.fit_transform(genos_H1_permuted)

    # fit
    reg_H1_current = sm.OLS(phenos_H1_permuted,genos_H1_current).fit()
    reg_H1_coefs_current = reg_H1_current.params

    # Bonferroni-corrected level: 0.05 divided by the number of fitted coefficients
    alpha_bonferroni_H1 = 0.05/float(len(reg_H1_coefs_current))
    reg_H1_CIs_current = reg_H1_current.conf_int(alpha=alpha_bonferroni_H1, cols=None)
    reg_H1_stderr = reg_H1_current.bse
    reg_H1_pvalues = reg_H1_current.pvalues

    # number of coefficients significant at the corrected level
    num_sig = np.count_nonzero(reg_H1_pvalues < alpha_bonferroni_H1)
    print(num_sig)

    predicted_phenos_permuted_H1 = reg_H1_current.predict(genos_H1_current)
    rsquared_H1_current = reg_H1_current.rsquared
    print('Params: ',len(reg_H1_coefs_current))
    print('Performance: ',rsquared_H1_current)

    # write model to file
    # get_feature_names was removed in scikit-learn 1.2; prefer the replacement
    # and fall back to the old name on installations that predate it
    if hasattr(poly_H1_current,'get_feature_names_out'):
        coef_names = poly_H1_current.get_feature_names_out(input_features = mutations)
    else:
        coef_names = poly_H1_current.get_feature_names(input_features = mutations)

    # newline='' is what the csv module expects for files it writes to;
    # the `with` block closes the file on exit
    with open('model_coefs/H1_'+str(order)+'order_'+ep_type+'.txt','w',newline='') as writefile:
        coef_writer = csv.writer(writefile,delimiter='\t')
        coef_writer.writerow(['Params: ',len(reg_H1_coefs_current)])
        coef_writer.writerow(['Performance: ',rsquared_H1_current])
        # NOTE: the interval columns are labelled '95% CI' but hold the
        # Bonferroni-corrected interval computed above; the header text is
        # left unchanged so existing readers of these files keep working.
        coef_writer.writerow(['Term','Coefficient','Standard Error','p-value','95% CI lower','95% CI upper'])
        # the intercept row carries the coefficient only
        coef_writer.writerow(['Intercept',reg_H1_coefs_current[0]])
        for i in range(1,len(reg_H1_coefs_current)):
            # interaction terms are named 'a b c'; store them as 'a,b,c'
            coef_writer.writerow([','.join(coef_names[i].split(' ')),reg_H1_coefs_current[i],reg_H1_stderr[i],
                              reg_H1_pvalues[i],reg_H1_CIs_current[i][0],reg_H1_CIs_current[i][1]])


Order:  1
10
Params:  12
Performance:  0.7430261110294989
Order:  2
30
Params:  67
Performance:  0.9151429662861195
Order:  3
41
Params:  232
Performance:  0.957979520212946
Order:  4
69
Params:  562
Performance:  0.9827390892301666


### H9

In [27]:
# Refit the H9 model on the full data set at every interaction order from 1 up
# to the CV-selected order, print summary statistics, and write one
# tab-separated coefficient table per order.

# the permuted data do not depend on the order, so build them once
genos_H9_permuted = genos_H9[indices_permuted_H9]
phenos_H9_permuted = phenos_H9[indices_permuted_H9]

# fit models of increasing order
for order in range(1,optimal_H9_order+1):

    print('Order: ',str(order))
    poly_H9_current = PolynomialFeatures(order,interaction_only=True)
    genos_H9_current = poly_H9_current.fit_transform(genos_H9_permuted)

    # fit
    reg_H9_current = sm.OLS(phenos_H9_permuted,genos_H9_current).fit()
    reg_H9_coefs_current = reg_H9_current.params

    # Bonferroni-corrected level: 0.05 divided by the number of fitted coefficients
    alpha_bonferroni_H9 = 0.05/float(len(reg_H9_coefs_current))
    reg_H9_CIs_current = reg_H9_current.conf_int(alpha=alpha_bonferroni_H9, cols=None)
    reg_H9_stderr = reg_H9_current.bse
    reg_H9_pvalues = reg_H9_current.pvalues

    # number of coefficients significant at the corrected level
    num_sig = np.count_nonzero(reg_H9_pvalues < alpha_bonferroni_H9)
    print(num_sig)

    predicted_phenos_permuted_H9 = reg_H9_current.predict(genos_H9_current)
    rsquared_H9_current = reg_H9_current.rsquared
    print('Params: ',len(reg_H9_coefs_current))
    print('Performance: ',rsquared_H9_current)

    # write model to file
    # get_feature_names was removed in scikit-learn 1.2; prefer the replacement
    # and fall back to the old name on installations that predate it
    if hasattr(poly_H9_current,'get_feature_names_out'):
        coef_names = poly_H9_current.get_feature_names_out(input_features = mutations)
    else:
        coef_names = poly_H9_current.get_feature_names(input_features = mutations)

    # newline='' is what the csv module expects for files it writes to;
    # the `with` block closes the file on exit
    with open('model_coefs/H9_'+str(order)+'order_'+ep_type+'.txt','w',newline='') as writefile:
        coef_writer = csv.writer(writefile,delimiter='\t')
        coef_writer.writerow(['Params: ',len(reg_H9_coefs_current)])
        coef_writer.writerow(['Performance: ',rsquared_H9_current])
        # NOTE: the interval columns are labelled '95% CI' but hold the
        # Bonferroni-corrected interval computed above; the header text is
        # left unchanged so existing readers of these files keep working.
        coef_writer.writerow(['Term','Coefficient','Standard Error','p-value','95% CI lower','95% CI upper'])
        # the intercept row carries the coefficient only
        coef_writer.writerow(['Intercept',reg_H9_coefs_current[0]])
        for i in range(1,len(reg_H9_coefs_current)):
            # interaction terms are named 'a b c'; store them as 'a,b,c'
            coef_writer.writerow([','.join(coef_names[i].split(' ')),reg_H9_coefs_current[i],reg_H9_stderr[i],
                              reg_H9_pvalues[i],reg_H9_CIs_current[i][0],reg_H9_CIs_current[i][1]])


Order:  1
11
Params:  12
Performance:  0.7924868580010378
Order:  2
24
Params:  67
Performance:  0.8743026327570844
Order:  3
32
Params:  232
Performance:  0.9292041206369122
Order:  4
41
Params:  562
Performance:  0.9680505884789233
