# Getting a full list of pairwise variants terms from per-dataset MoCHI model results for DTS dataset

This will later be used as features in the joint MoCHI model for all DTS datasets together

28.02.2024

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from pymochi.models import MochiTask
import pickle

In [2]:
pd.set_option('display.max_rows', 500)

In [3]:
results_dir = '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/mochi_results/'

In [4]:
filedir = '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/classifiable_only/corrected_fitness_abundance_files_Anna/'

# l2 regularisation factor value = 10^(-5), 26.02.2024 runs
# Reading output for Sigmoid fit, others should be the same
l2_regularisation_factor_value = '1e-05'

transformation = 'Sigmoid'

datasets = ['DTS01','DTS02','DTS05','DTS10','DTS11','DTS13','DTS14','DTS15']

In [5]:
predictions = {}

mochi_tasks = {}

for dataset in datasets:
    
    curr_path = results_dir + '20240226_max_terms_order_2_' + transformation + '_l2_regularization_factor_' + l2_regularisation_factor_value + '_' + dataset + '_all_variants'

    curr_key = transformation + '_2nd_order_l2_' + l2_regularisation_factor_value + '_' + dataset

    predictions[curr_key] = pd.read_csv(curr_path + '/predictions/predicted_phenotypes_all.txt',
                                                                       sep='\t') 
    predictions[curr_key].phenotype = [str(i) for i in predictions[curr_key].phenotype]

    # adding a column with residual values
    observed_phenotype_col = 'fitness'
    predicted_phenotype_col = 'fold_1'
    residuals_col = 'residual_pred_vs_obs'
    predictions[curr_key][residuals_col] = predictions[curr_key][predicted_phenotype_col] - predictions[curr_key][observed_phenotype_col]


    # loading previous task / model to have some of the parameters here
    mochi_tasks[curr_key] = MochiTask(directory = curr_path)


In [6]:
# loading weights of the models

weights_Nucleation = {}

for dataset in datasets:    

    curr_path = results_dir + '20240226_max_terms_order_2_' + transformation + '_l2_regularization_factor_' + l2_regularisation_factor_value + '_' + dataset + '_all_variants'
    
    curr_key = transformation + '_2nd_order_l2_' + l2_regularisation_factor_value + '_' + dataset
    weights_Nucleation[curr_key] = pd.read_csv(curr_path + '/weights/weights_Nucleation.txt',
                                sep='\t')


In [7]:
interacting_mutations = {}

for dataset in datasets:
    print(dataset)
    curr_key = transformation + '_2nd_order_l2_' + l2_regularisation_factor_value + '_' + dataset
    
    # only take interaction ids from id column
    interacting_mutations[dataset] = [idx for idx in list(weights_Nucleation[curr_key]['id']) if (('_' in idx) | (idx == 'WT'))]
    print(len(interacting_mutations[dataset]), 'interactions here \n')

DTS01
449 interactions here 

DTS02
2075 interactions here 

DTS05
241 interactions here 

DTS10
377 interactions here 

DTS11
477 interactions here 

DTS13
2219 interactions here 

DTS14
177 interactions here 

DTS15
3739 interactions here 



In [8]:
# taking a union of these interaction terms
interacting_mutations_all = []

for dataset in datasets:
    print(dataset)
    interacting_mutations_all = interacting_mutations_all + interacting_mutations[dataset]

DTS01
DTS02
DTS05
DTS10
DTS11
DTS13
DTS14
DTS15


In [11]:
len(interacting_mutations_all)

9754

In [12]:
np.unique(interacting_mutations_all, return_counts=True)

(array(['A21F_A30F', 'A21F_A30I', 'A21F_A30L', ..., 'V40M_I41M',
        'V40M_I41V', 'WT'], dtype='<U9'),
 array([2, 2, 2, ..., 2, 2, 8]))

In [13]:
interacting_mutations_all_unique = list(np.unique(interacting_mutations_all))

In [14]:
len(interacting_mutations_all_unique)

4512

In [15]:
for inter in interacting_mutations_all_unique:
    with open('./interacting_mutations_all_unique_for_mochi_20240228.txt', 'a') as the_file:
        the_file.write(inter + '\n')

In [16]:
# save dictionary to interacting_mutations_dict.pkl file
with open('./interacting_mutations_dict.pkl', 'wb') as fp:
    pickle.dump(interacting_mutations, fp)
    print('dictionary saved successfully to file')


dictionary saved successfully to file


In [9]:
# taking a union of these interaction terms
interacting_mutations_DTS01_DTS05_DTS14 = []

for dataset in ['DTS01','DTS05','DTS14']:
    print(dataset)
    interacting_mutations_DTS01_DTS05_DTS14 = interacting_mutations_DTS01_DTS05_DTS14 + interacting_mutations[dataset]

DTS01
DTS05
DTS14


In [11]:
len(interacting_mutations_DTS01_DTS05_DTS14)

867

In [13]:
np.unique(interacting_mutations_DTS01_DTS05_DTS14, return_counts=True)

(array(['F19I_F20I', 'F19I_F20L', 'F19I_F20M', 'F19I_F20V', 'F19I_G33F',
        'F19I_G33I', 'F19I_G33L', 'F19I_G33M', 'F19I_G33V', 'F19I_I31F',
        'F19I_I31L', 'F19I_I31M', 'F19I_I31V', 'F19I_I32F', 'F19I_I32L',
        'F19I_I32M', 'F19I_I32V', 'F19I_L34F', 'F19I_L34I', 'F19I_L34M',
        'F19I_L34V', 'F19I_V24F', 'F19I_V24I', 'F19I_V24L', 'F19I_V24M',
        'F19L_F20I', 'F19L_F20L', 'F19L_F20M', 'F19L_F20V', 'F19L_G33F',
        'F19L_G33I', 'F19L_G33L', 'F19L_G33M', 'F19L_G33V', 'F19L_I31F',
        'F19L_I31L', 'F19L_I31M', 'F19L_I31V', 'F19L_I32F', 'F19L_I32L',
        'F19L_I32M', 'F19L_I32V', 'F19L_L34F', 'F19L_L34I', 'F19L_L34M',
        'F19L_L34V', 'F19L_V24F', 'F19L_V24I', 'F19L_V24L', 'F19L_V24M',
        'F19M_F20I', 'F19M_F20L', 'F19M_F20M', 'F19M_F20V', 'F19M_G33F',
        'F19M_G33I', 'F19M_G33L', 'F19M_G33M', 'F19M_G33V', 'F19M_I31F',
        'F19M_I31L', 'F19M_I31M', 'F19M_I31V', 'F19M_I32F', 'F19M_I32L',
        'F19M_I32M', 'F19M_I32V', 'F19M_L34F', 'F19

In [14]:
interacting_mutations_DTS01_DTS05_DTS14_unique = list(np.unique(interacting_mutations_DTS01_DTS05_DTS14))

In [15]:
len(interacting_mutations_DTS01_DTS05_DTS14_unique)

801

In [16]:
interacting_mutations_DTS01_DTS05_DTS14_unique

['F19I_F20I',
 'F19I_F20L',
 'F19I_F20M',
 'F19I_F20V',
 'F19I_G33F',
 'F19I_G33I',
 'F19I_G33L',
 'F19I_G33M',
 'F19I_G33V',
 'F19I_I31F',
 'F19I_I31L',
 'F19I_I31M',
 'F19I_I31V',
 'F19I_I32F',
 'F19I_I32L',
 'F19I_I32M',
 'F19I_I32V',
 'F19I_L34F',
 'F19I_L34I',
 'F19I_L34M',
 'F19I_L34V',
 'F19I_V24F',
 'F19I_V24I',
 'F19I_V24L',
 'F19I_V24M',
 'F19L_F20I',
 'F19L_F20L',
 'F19L_F20M',
 'F19L_F20V',
 'F19L_G33F',
 'F19L_G33I',
 'F19L_G33L',
 'F19L_G33M',
 'F19L_G33V',
 'F19L_I31F',
 'F19L_I31L',
 'F19L_I31M',
 'F19L_I31V',
 'F19L_I32F',
 'F19L_I32L',
 'F19L_I32M',
 'F19L_I32V',
 'F19L_L34F',
 'F19L_L34I',
 'F19L_L34M',
 'F19L_L34V',
 'F19L_V24F',
 'F19L_V24I',
 'F19L_V24L',
 'F19L_V24M',
 'F19M_F20I',
 'F19M_F20L',
 'F19M_F20M',
 'F19M_F20V',
 'F19M_G33F',
 'F19M_G33I',
 'F19M_G33L',
 'F19M_G33M',
 'F19M_G33V',
 'F19M_I31F',
 'F19M_I31L',
 'F19M_I31M',
 'F19M_I31V',
 'F19M_I32F',
 'F19M_I32L',
 'F19M_I32M',
 'F19M_I32V',
 'F19M_L34F',
 'F19M_L34I',
 'F19M_L34M',
 'F19M_L34V',
 'F19M

In [20]:
# reading in pairwise features from the run with subsampled dead variants

interacting_mutations_DTS01_DTS05_DTS14_subsampled = list(pd.read_csv('/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/DTS_joint_analysis/selected_DTS_datasets_analysis/DTS01_DTS05_DTS14/run_with_subsampled_dead/weights_Nucleation_2nd_order_DTS01_DTS05_DTS14_with_subsampling_20240424.csv',
                                                                index_col=0)['id'])


In [22]:
len(interacting_mutations_DTS01_DTS05_DTS14_subsampled)

1000

In [23]:
len(interacting_mutations_DTS01_DTS05_DTS14_subsampled) == len(np.unique(interacting_mutations_DTS01_DTS05_DTS14_subsampled))

True

In [29]:
not_overlapping_1 = list(set(interacting_mutations_DTS01_DTS05_DTS14_unique) - set(interacting_mutations_DTS01_DTS05_DTS14_subsampled))
not_overlapping_2 = list(set(interacting_mutations_DTS01_DTS05_DTS14_subsampled) - set(interacting_mutations_DTS01_DTS05_DTS14_unique))
not_overlapping = not_overlapping_1 + not_overlapping_2

In [36]:
'N27I_F20L' in interacting_mutations_DTS01_DTS05_DTS14_unique

False

In [34]:
# stuff that was in the subsampled run interactions but not in the full
not_overlapping_2

['F20L_N27I',
 'F20M_N27M',
 'F20M_A42I',
 'N27M_K28R',
 'L34M_A42T',
 'F20M_N27L',
 'N27F_A42M',
 'F20L_A42I',
 'N27I_A42T',
 'L17V_L34I',
 'N27F_L34V',
 'L34I_A42T',
 'F20I_N27L',
 'L34F_A42S',
 'N27L_A42F',
 'F20L_A42T',
 'H14L_N27L',
 'N27M_L34V',
 'F20I_N27V',
 'L34V_A42I',
 'L17I_N27F',
 'H14I_L17F',
 'F20M_A42M',
 'H14F_F20M',
 'H14F_F20V',
 'H14V_F20L',
 'H14I_A42M',
 'F20I_A42V',
 'F20L_N27M',
 'L17M_L34F',
 'H14I_L34M',
 'H14V_L34I',
 'L17F_A42L',
 'L17V_A42M',
 'L17I_A42M',
 'H14L_F20I',
 'H14M_F20L',
 'K28R_A42V',
 'H14M_L17F',
 'H14V_N27F',
 'K28R_A42M',
 'H14F_L17F',
 'L34F_A42T',
 'F20M_N27V',
 'L34F_A42L',
 'L17F_N27F',
 'H14M_L17I',
 'L17V_A42T',
 'H14F_L17I',
 'N27V_A42S',
 'H14V_F20M',
 'H14L_A42L',
 'N27M_A42I',
 'N27M_L34M',
 'L17I_N27L',
 'N27M_L34I',
 'L17F_A42T',
 'F20L_N27V',
 'N27M_A42V',
 'H14L_N27M',
 'F20M_N27I',
 'H14V_F20I',
 'H14V_A42T',
 'L17I_N27M',
 'L34F_A42I',
 'K28R_L34V',
 'F20L_N27L',
 'H14I_K28R',
 'H14V_A42F',
 'L17M_L34V',
 'L17M_A42I',
 'F20I

In [26]:
interacting_mutations_DTS01_DTS05_DTS14_subsampled

['L17I_N27I',
 'I32F_V36L',
 'V39L_V40F',
 'F20I_N27L',
 'I31M_L34F',
 'K28R_L34M',
 'V39I_I41M',
 'F20L_V24I',
 'L34M_I41V',
 'I31V_I41L',
 'F19V_L34V',
 'V36F_V39L',
 'L17V_F20M',
 'H14I_A42F',
 'H14M_F20I',
 'V39L_V40L',
 'I31F_I41V',
 'F19V_L34M',
 'M35I_V40M',
 'I32M_V40M',
 'V40L_I41V',
 'F20V_I31L',
 'I31V_L34F',
 'F20V_V24M',
 'N27F_K28R',
 'V24I_I31V',
 'L17I_N27V',
 'N27F_A42V',
 'M35I_I41V',
 'I32M_M35F',
 'I31V_I41V',
 'I32L_I41V',
 'I31L_M35F',
 'M35V_I41M',
 'V36M_V40M',
 'F20M_I32M',
 'I31F_M35F',
 'L34F_M35L',
 'L17V_K28R',
 'F20I_N27V',
 'F20I_A42I',
 'N27I_A42I',
 'I32M_I41V',
 'H14F_L17V',
 'F20V_N27I',
 'F20I_A42V',
 'I32F_V39M',
 'I31M_L34M',
 'I31F_L34F',
 'M35V_V40L',
 'L17V_A42M',
 'L34V_V36I',
 'I31L_V39I',
 'H14I_F20M',
 'I31M_V39L',
 'V36F_V40I',
 'V40L_I41F',
 'I32M_M35I',
 'V24F_I32V',
 'V24L_I31V',
 'I32F_L34V',
 'L34F_V36L',
 'H14I_L34V',
 'H14I_K28R',
 'I32F_L34M',
 'F19I_L34F',
 'H14F_L17M',
 'V24I_I31L',
 'L34M_V40I',
 'M35V_V36M',
 'M35L_I41V',
 'F20L

In [25]:
interacting_mutations_DTS01_DTS05_DTS14_unique

['F19I_F20I',
 'F19I_F20L',
 'F19I_F20M',
 'F19I_F20V',
 'F19I_G33F',
 'F19I_G33I',
 'F19I_G33L',
 'F19I_G33M',
 'F19I_G33V',
 'F19I_I31F',
 'F19I_I31L',
 'F19I_I31M',
 'F19I_I31V',
 'F19I_I32F',
 'F19I_I32L',
 'F19I_I32M',
 'F19I_I32V',
 'F19I_L34F',
 'F19I_L34I',
 'F19I_L34M',
 'F19I_L34V',
 'F19I_V24F',
 'F19I_V24I',
 'F19I_V24L',
 'F19I_V24M',
 'F19L_F20I',
 'F19L_F20L',
 'F19L_F20M',
 'F19L_F20V',
 'F19L_G33F',
 'F19L_G33I',
 'F19L_G33L',
 'F19L_G33M',
 'F19L_G33V',
 'F19L_I31F',
 'F19L_I31L',
 'F19L_I31M',
 'F19L_I31V',
 'F19L_I32F',
 'F19L_I32L',
 'F19L_I32M',
 'F19L_I32V',
 'F19L_L34F',
 'F19L_L34I',
 'F19L_L34M',
 'F19L_L34V',
 'F19L_V24F',
 'F19L_V24I',
 'F19L_V24L',
 'F19L_V24M',
 'F19M_F20I',
 'F19M_F20L',
 'F19M_F20M',
 'F19M_F20V',
 'F19M_G33F',
 'F19M_G33I',
 'F19M_G33L',
 'F19M_G33M',
 'F19M_G33V',
 'F19M_I31F',
 'F19M_I31L',
 'F19M_I31M',
 'F19M_I31V',
 'F19M_I32F',
 'F19M_I32L',
 'F19M_I32M',
 'F19M_I32V',
 'F19M_L34F',
 'F19M_L34I',
 'F19M_L34M',
 'F19M_L34V',
 'F19M