# Correcting Mireia's files to be input into MoCHI

Problem identified: the `.tsv` files (generated previously) contain `NA` and `1` in the `WT` column. `NA` can be mistaken for the string "NA" in amino acid (aa) sequence notation, so MoCHI raises an error.
The `WT` column should be empty (`""` in Python terms) instead of `NA` (and stay `1` where it is `1`).

So here I read the files in and write out corrected versions.


In [1]:
import pandas as pd
import numpy as np
import glob

### Doing this for files with *all* variants (straight out of DiMSum)

In [2]:
# Folder holding the DiMSum outputs; the corrected files are written to a subfolder of it.
original_files_dir = '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/dimsum_outputs/'
corrected_files_dir = ''.join([original_files_dir, 'corrected_fitness_abundance_files_Anna/'])


In [3]:
# Echo the input directory to confirm the path.
original_files_dir

'/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/dimsum_outputs/'

In [4]:
# Echo the output directory to confirm the path.
corrected_files_dir

'/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/dimsum_outputs/corrected_fitness_abundance_files_Anna/'

In [5]:
# Keys naming the three datasets; they index every per-dataset dict in this notebook.
datasets = [
    'eLife_doubles',
    'NNK_doubles',
    'Nicking_doubles',
]

In [6]:
# Input locations for the files that come straight out of DiMSum.

# Every dataset handled in this notebook.
datasets = ['eLife_doubles', 'NNK_doubles', 'Nicking_doubles']

# all_variants.csv and variant_data_merge.csv were pre-saved from the DiMSum output
# in notebook RData2csv.ipynb; each results folder is spelled out explicitly here.
lib_paths_dimsum = {
    'eLife_doubles': '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/dimsum_outputs/eLife_doubles/',
    'NNK_doubles': '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/dimsum_outputs/NNK_doubles/',
    'Nicking_doubles': '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/dimsum_outputs/Nicking_doubles/',
}

# Tables holding only the classifiable variants; used to check which row is labelled WT.
lib_paths_class_only = {
    'eLife_doubles': '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/classifiable_only/corrected_fitness_abundance_files_Anna/mochi_elife_classifiable_corrected.tsv',
    'NNK_doubles': '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/classifiable_only/corrected_fitness_abundance_files_Anna/mochi_classifiable_NNKfu_corrected.tsv',
    'Nicking_doubles': '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/classifiable_only/corrected_fitness_abundance_files_Anna/mochi_nicking_classifiable_corrected.tsv',
}


In [7]:
%%time
# Load, per dataset, the full DiMSum variant table and the classifiable-only table.
all_variants = {}
all_variants_class_only = {}

for dataset in datasets:
    print(dataset)

    # the eLife export is named differently from the other two
    variants_filename = 'all_variants.df.csv' if dataset == 'eLife_doubles' else 'all_variants.csv'

    # keep_default_na=False keeps 'NA' as a literal string instead of converting it to NaN
    all_variants[dataset] = pd.read_csv(
        lib_paths_dimsum[dataset] + variants_filename,
        index_col=0,
        keep_default_na=False,
    )

    # tab-separated; read with pandas' default NA handling and default index
    all_variants_class_only[dataset] = pd.read_csv(lib_paths_class_only[dataset], sep='\t')
        

eLife_doubles
NNK_doubles
Nicking_doubles
CPU times: user 462 ms, sys: 53.2 ms, total: 516 ms
Wall time: 525 ms


In [8]:
# Distinct values of the WT column in the eLife all-variants table.
np.unique(all_variants['eLife_doubles']['WT'])

array(['NA', 'TRUE'], dtype=object)

In [9]:
# Distinct values of the WT column in the eLife classifiable-only table; each value is cast to str first.
np.unique([str(elem) for elem in all_variants_class_only['eLife_doubles']['WT']])

array(['True', 'nan'], dtype='<U4')

In [10]:
def normalise_wt_flag(value):
    """Map a raw WT-column entry to the flag written to the corrected files.

    Returns 'True' when the entry, converted to str, is 'TRUE' or 'True',
    and '' (empty string) for anything else (e.g. 'NA' or NaN).

    Both spellings are accepted so that re-running this cell on already
    normalised tables leaves the flags intact; matching only 'TRUE' would
    turn an existing 'True' into '' on a second run and silently drop the WT.
    """
    return 'True' if str(value) in ('TRUE', 'True') else ''


# Rewrite the WT column of both tables of every dataset as 'True' / ''.
for dataset in datasets:
    print(dataset)
    all_variants[dataset]['WT'] = [normalise_wt_flag(value) for value in all_variants[dataset]['WT']]
    all_variants_class_only[dataset]['WT'] = [normalise_wt_flag(value) for value in all_variants_class_only[dataset]['WT']]
    

eLife_doubles
NNK_doubles
Nicking_doubles


In [11]:
# Re-check the distinct WT values in the eLife all-variants table.
np.unique(all_variants['eLife_doubles']['WT'])

array(['', 'True'], dtype=object)

In [12]:
# Show the row(s) of the eLife all-variants table whose WT flag is 'True'.
all_variants['eLife_doubles'][all_variants['eLife_doubles']['WT'] == 'True']

Unnamed: 0,merge_seq,aa_seq,Nham_nt,Nham_aa,Nmut_codons,WT,STOP,STOP_readthrough,mean_count,count_e1_s0,...,fitness3_uncorr,sigma3_uncorr,fitness,sigma,nscore_c,sigma_norm_first_toWT,category_sigma,zscore_mode,p.adjust_mode,category_dead
9270,gatgcagagttccgacatgactcaggatatgaagttcatcatcaaa...,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,0,0,0,True,False,False,5180649.0,6742700,...,0.148268232234454,0.309149356001861,-0.067406,0.052937,0.018891,0.015931,classifiable,0.356859,0.804018,dead


In [13]:
# WT aa sequence flagged in each classifiable-only file
wt_class_only_files = {}
# WT aa sequence flagged in each all-variants file, kept only to compare against the above
wt_all_vars_files = {}

for dataset in datasets:
    print('dataset:', dataset)

    class_only_table = all_variants_class_only[dataset]
    all_vars_table = all_variants[dataset]
    print(np.unique(class_only_table['WT'], return_counts=True))

    class_only_wt_rows = class_only_table[class_only_table['WT'] == 'True']
    all_vars_wt_rows = all_vars_table[all_vars_table['WT'] == 'True']

    # the first flagged row of each table supplies the WT sequence
    # (raises IndexError when a table has no flagged row)
    wt_class_only_files[dataset] = class_only_wt_rows['aa_seq'].iloc[0]
    wt_all_vars_files[dataset] = all_vars_wt_rows['aa_seq'].iloc[0]

    print(class_only_wt_rows)
    print(all_vars_wt_rows)

    print('WT for current dataset is the same for class only file and all vars file: ', wt_class_only_files[dataset] == wt_all_vars_files[dataset])

dataset: eLife_doubles
(array(['', 'True'], dtype=object), array([15714,     1]))
                                          aa_seq  Nham_aa    WT   fitness  \
7808  DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA        0  True  0.018891   

         sigma  
7808  0.052937  
                                              merge_seq  \
9270  gatgcagagttccgacatgactcaggatatgaagttcatcatcaaa...   

                                          aa_seq  Nham_nt  Nham_aa  \
9270  DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA        0        0   

      Nmut_codons    WT   STOP  STOP_readthrough  mean_count count_e1_s0  ...  \
9270            0  True  False             False   5180649.0     6742700  ...   

        fitness3_uncorr      sigma3_uncorr   fitness     sigma  nscore_c  \
9270  0.148268232234454  0.309149356001861 -0.067406  0.052937  0.018891   

     sigma_norm_first_toWT category_sigma zscore_mode p.adjust_mode  \
9270              0.015931   classifiable    0.356859      0.804018   

     categ

In [14]:
# WT aa sequence picked up from each classifiable-only table.
wt_class_only_files

{'eLife_doubles': 'DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA',
 'NNK_doubles': 'KGAIIGLMVGGVVIA',
 'Nicking_doubles': 'DAEFRHDSGYEVHHQKLVFFAEDVGSNK'}

In [15]:
# Reference WT amino acid sequence (42 residues) that the per-dataset WT sequences are compared with.
AB_WT = "DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA"

In [16]:
# Check that the eLife WT equals the reference sequence exactly.
wt_class_only_files['eLife_doubles'] == AB_WT

True

In [17]:
# Check that the NNK WT is a substring of the reference sequence.
wt_class_only_files['NNK_doubles'] in AB_WT

True

In [18]:
# Check that the Nicking WT is a substring of the reference sequence.
wt_class_only_files['Nicking_doubles'] in AB_WT

True

# Marking the above sequences as WT in `all_variants_for_mochi`

In [19]:
# Keep only the columns MoCHI needs.
# (original note: in eLife taking the fitness column too (uncentered))
mochi_columns = ['aa_seq','Nham_aa','WT','fitness','sigma']

all_variants_for_mochi = {}
all_variants_class_only_for_mochi = {}

for dataset in datasets:
    print(dataset)
    full_table = all_variants[dataset]
    class_only_table = all_variants_class_only[dataset]
    all_variants_for_mochi[dataset] = full_table.loc[:, mochi_columns]
    all_variants_class_only_for_mochi[dataset] = class_only_table.loc[:, mochi_columns]
    

eLife_doubles
NNK_doubles
Nicking_doubles


In [20]:
# Row labels, per dataset, of every row whose aa_seq equals that dataset's WT sequence.
wt_idx = {}

for dataset in datasets:
    print(dataset)
    mochi_table = all_variants_for_mochi[dataset]
    has_wt_seq = mochi_table['aa_seq'] == wt_class_only_files[dataset]
    wt_idx[dataset] = mochi_table.loc[has_wt_seq].index.tolist()

    print(len(wt_idx[dataset]), ' rows here with WT aa_seq')

eLife_doubles
2582  rows here with WT aa_seq
NNK_doubles
1  rows here with WT aa_seq
Nicking_doubles
61  rows here with WT aa_seq


## Assigning WT = True to the first occurrence of the WT sequence in aa_seq

In [21]:
# Flag one WT row per table: the first row whose aa_seq is the WT sequence.
for dataset in datasets:
    print(dataset)

    # fail with a clear message instead of a bare IndexError on wt_idx[dataset][0]
    if not wt_idx[dataset]:
        raise ValueError('no row with the WT aa_seq found in ' + dataset + '; cannot assign WT')
    first_wt_row = wt_idx[dataset][0]

    for table in (all_variants[dataset], all_variants_for_mochi[dataset]):
        # Only flag a row when the table has no WT yet. If a row is already
        # flagged (from the input file, or from an earlier run of this cell),
        # flagging the first occurrence as well could leave two WT rows.
        if not (table['WT'] == 'True').any():
            table.loc[first_wt_row, 'WT'] = 'True'

eLife_doubles
NNK_doubles
Nicking_doubles


In [22]:
# Columns available in the eLife all-variants table.
all_variants['eLife_doubles'].columns

Index(['merge_seq', 'aa_seq', 'Nham_nt', 'Nham_aa', 'Nmut_codons', 'WT',
       'STOP', 'STOP_readthrough', 'mean_count', 'count_e1_s0', 'count_e2_s0',
       'count_e3_s0', 'count_e1_s1', 'count_e2_s1', 'count_e3_s1',
       'fitness1_uncorr', 'sigma1_uncorr', 'fitness2_uncorr', 'sigma2_uncorr',
       'fitness3_uncorr', 'sigma3_uncorr', 'fitness', 'sigma', 'nscore_c',
       'sigma_norm_first_toWT', 'category_sigma', 'zscore_mode',
       'p.adjust_mode', 'category_dead'],
      dtype='object')

In [23]:
# Columns available in the NNK all-variants table.
all_variants['NNK_doubles'].columns

Index(['nt_seq', 'aa_seq', 'Nham_nt', 'Nham_aa', 'Nmut_codons', 'WT', 'indel',
       'STOP', 'STOP_readthrough', 'mean_count', 'count_e1_s0', 'count_e2_s0',
       'count_e3_s0', 'count_e1_s1', 'count_e2_s1', 'count_e3_s1',
       'fitness1_uncorr', 'fitness2_uncorr', 'fitness3_uncorr',
       'sigma1_uncorr', 'sigma2_uncorr', 'sigma3_uncorr', 'fitness', 'sigma'],
      dtype='object')

In [24]:
# Columns available in the Nicking all-variants table.
all_variants['Nicking_doubles'].columns

Index(['nt_seq', 'aa_seq', 'Nham_nt', 'Nham_aa', 'Nmut_codons', 'WT', 'indel',
       'STOP', 'STOP_readthrough', 'mean_count', 'count_e1_s0', 'count_e2_s0',
       'count_e3_s0', 'count_e1_s1', 'count_e2_s1', 'count_e3_s1',
       'fitness1_uncorr', 'sigma1_uncorr', 'fitness2_uncorr', 'sigma2_uncorr',
       'fitness3_uncorr', 'sigma3_uncorr', 'fitness', 'sigma'],
      dtype='object')

In [25]:
# Write the full corrected tables; they are used later for figure making (Fig.1d).
# NOTE(review): no `sep` is passed, so the files are comma-separated and include the
# index column despite the .tsv extension - confirm what the reader expects before changing.
for dataset in datasets:
    print(dataset)
    full_table_path = ''.join([corrected_files_dir, 'all_variants_corrected_', dataset, '_full_table_for_plotting.tsv'])
    all_variants[dataset].to_csv(full_table_path)

eLife_doubles
NNK_doubles
Nicking_doubles


In [26]:
# Echo the directory the corrected files are written to.
corrected_files_dir

'/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/dimsum_outputs/corrected_fitness_abundance_files_Anna/'

In [25]:
# Display the eLife table restricted to the MoCHI columns.
all_variants_for_mochi['eLife_doubles']

Unnamed: 0,aa_seq,Nham_aa,WT,fitness,sigma
1,KAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,1,,-0.117352,0.387033
2,NAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,1,,0.352500,0.062247
3,NTEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,2,,0.381524,0.420981
4,NEEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,2,,0.052856,0.450957
5,NAKFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,2,,0.495394,0.711069
...,...,...,...,...,...
18645,YAEFLHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,2,,-0.885994,0.894598
18646,YAVFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,2,,-2.245360,1.278978
18647,YVEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,2,,-3.576266,0.732316
18648,CAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,1,,0.066515,0.359042


In [35]:
# Count each distinct WT value in the NNK MoCHI table.
np.unique(all_variants_for_mochi['NNK_doubles']['WT'], return_counts=True)

(array(['', 'True'], dtype=object), array([28529,     1]))

In [36]:
# Count each distinct WT value in the Nicking MoCHI table.
np.unique(all_variants_for_mochi['Nicking_doubles']['WT'], return_counts=True)

(array(['', 'True'], dtype=object), array([54708,     1]))

In [37]:
# Count each distinct WT value in the eLife MoCHI table.
np.unique(all_variants_for_mochi['eLife_doubles']['WT'], return_counts=True)

(array(['', 'True'], dtype=object), array([18648,     1]))

In [38]:
# Write the corrected MoCHI input tables: tab-separated, without the index column.
for dataset in datasets:
    print(dataset)
    curr_path = ''.join([corrected_files_dir, 'mochi_', dataset, '_all_variants_corrected.tsv'])
    print('saving to', curr_path)
    mochi_table = all_variants_for_mochi[dataset]
    mochi_table.to_csv(curr_path, sep='\t', index=False)

eLife_doubles
saving to /lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/dimsum_outputs/corrected_fitness_abundance_files_Anna/mochi_eLife_doubles_all_variants_corrected.tsv
NNK_doubles
saving to /lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/dimsum_outputs/corrected_fitness_abundance_files_Anna/mochi_NNK_doubles_all_variants_corrected.tsv
Nicking_doubles
saving to /lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/dimsum_outputs/corrected_fitness_abundance_files_Anna/mochi_Nicking_doubles_all_variants_corrected.tsv
