In [1]:
from Bio import SeqIO
from tqdm import tqdm
from Bio.Seq import Seq, MutableSeq

import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import os, progressbar, time
import matplotlib.pyplot as plt

## 1. Collect protein IDs to retrieve their CDS

In [2]:
def collect_cds(fastaFile, cdsList, outFile):
    tmp = []
    for seqRecord in SeqIO.parse(fastaFile, format='fasta'):
        if seqRecord.id in cdsList :
            tmp.append(seqRecord)
    SeqIO.write(tmp, open(outFile, 'w'), 'fasta')

In [3]:
HG_fasta = '../../data/ortholog_dataset/uni_HG_orthologs.faa'
MM_fasta = '../../data/ortholog_dataset/uni_MM_orthologs.faa'

HG_cds_fasta = '/media/savvy/DATA3/savvy/Genomes/model/heterocephalus_glaber/h_glaber_cds_from_genomic.fna'
MM_cds_fasta = '/media/savvy/DATA3/savvy/Genomes/model/mus_musculus/m_musculus_cds_from_genomic.fna'

HG_IDs = [ seqRecord.id for seqRecord in SeqIO.parse(HG_fasta, format='fasta')]
MM_IDs = [ seqRecord.id for seqRecord in SeqIO.parse(MM_fasta, format='fasta')]

In [4]:
collect_cds(HG_cds_fasta, HG_IDs, '../../data/ortholog_dataset/uni_HG_cds_orthologs.faa')
collect_cds(MM_cds_fasta, MM_IDs, '../../data/ortholog_dataset/uni_MM_cds_orthologs.faa')

## 2. Collect the CDS 

In [5]:
HG_cds_fasta = '../../data/ortholog_dataset/uni_HG_cds_orthologs.faa'
MM_cds_fasta = '../../data/ortholog_dataset/uni_MM_cds_orthologs.faa'

#### All proteins

In [6]:
all_agg_scores = pd.read_csv('../../data/aggregation_propensity/HGMM_agg_scores.csv', sep=',') 
ortho_pairs = all_agg_scores[['proteinID_x', 'proteinID_y']]

#### Chaperone clients

In [7]:
uniprot_mapping = pd.read_csv('../../data/chaperone_clients/human_ensembl_to_uniprot.tab', sep='\t')
hs_mm_orthologs = pd.read_csv('../../data/chaperone_clients/HS_MM_uni_ortholog_groups.csv', sep='\t')
hs_mm_orthologs = hs_mm_orthologs[['proteinID_x', 'proteinID_y']]
mm_chap_clt = hs_mm_orthologs[hs_mm_orthologs['proteinID_x'].isin(uniprot_mapping['Entry'])]['proteinID_y']

chap_clt_sub = all_agg_scores[all_agg_scores['proteinID_y'].isin(list(mm_chap_clt))]

In [8]:
len(chap_clt_sub)

1298

In [9]:
#### Container of MM and HG sequences
HG_IDs = list(all_agg_scores['proteinID_x'])
MM_IDs = list(all_agg_scores['proteinID_y'])

In [10]:
HG_seq = {}
MM_seq = {}

for seqRecord in SeqIO.parse(HG_cds_fasta, format='fasta'):
    if seqRecord.id in HG_IDs :
        HG_seq[seqRecord.id] = seqRecord
        
for seqRecord in SeqIO.parse(MM_cds_fasta, format='fasta'):
    if seqRecord.id in MM_IDs :
        MM_seq[seqRecord.id] = seqRecord

In [11]:
print(f'MM: {len(MM_seq)}, HG: {len(HG_seq)}')

MM: 9518, HG: 9522


#### Check cds length / cds with non standard nucleotide

In [12]:
tmp = []
for key in MM_seq.keys() :
    cds_id = key
    cds = MM_seq[key].seq
    if len(cds) % 3 == 0 :
        if list(np.unique(cds)) != ['A', 'C', 'G', 'T']:
            tmp.append([cds_id, 'MM', 'non_standard_nucleotide', False])
        else: 
            tmp.append([cds_id,'MM', 'standard', True])
    else:
        tmp.append([cds_id, 'MM', 'non_standard_length', False])
        
for key in HG_seq.keys() :
    cds_id = key
    cds = HG_seq[key].seq
    if len(cds) % 3 == 0 :
        if list(np.unique(cds)) != ['A', 'C', 'G', 'T']:
            tmp.append([cds_id, 'HG', 'non_standard_nucleotide', False])
        else: 
            tmp.append([cds_id,'HG', 'standard', True])
    else:
        tmp.append([cds_id, 'HG', 'non_standard_length', False])

cds_validity = pd.DataFrame(tmp, columns=['proteinID', 'organism', 'description', 'valid_cds'])

In [13]:
len(cds_validity[cds_validity['valid_cds'] == True])

18944

In [14]:
HG_valids = cds_validity[(cds_validity['valid_cds'] == True) & (cds_validity['organism'] == 'HG')].rename(columns={'proteinID':'proteinID_x'})
MM_valids = cds_validity[(cds_validity['valid_cds'] == True) & (cds_validity['organism'] == 'MM')].rename(columns={'proteinID':'proteinID_y'})

In [15]:
HG_valids = HG_valids.merge(ortho_pairs, on='proteinID_x')
ortho_cds_valids = HG_valids.merge(MM_valids, on='proteinID_y')

In [16]:
ortho_cds_valids

Unnamed: 0,proteinID_x,organism_x,description_x,valid_cds_x,proteinID_y,organism_y,description_y,valid_cds_y
0,E3VX36,HG,standard,True,P51910,MM,standard,True
1,E3VX52,HG,standard,True,P23927,MM,standard,True
2,E3VX64,HG,standard,True,P34884,MM,standard,True
3,E3VX68,HG,standard,True,Q9D7M8,MM,standard,True
4,E3VX70,HG,standard,True,P61514,MM,standard,True
...,...,...,...,...,...,...,...,...
9417,G5CBQ5,HG,standard,True,Q3TYX3,MM,standard,True
9418,G5CBQ6,HG,standard,True,Q8R361,MM,standard,True
9419,G5CBQ8,HG,standard,True,Q04742,MM,standard,True
9420,G5CBQ9,HG,standard,True,Q64105,MM,standard,True


In [17]:
#### Number of valid cds pairs
ortho_cds_valids.to_csv('../../data/mutation_tolerance/ortho_valids_cds.csv', sep='\t', index=False)

In [18]:
#### Number of valid cds pairs (chaperone client proteins)
ortho_cds_valids[ortho_cds_valids['proteinID_y'].isin(mm_chap_clt)]

Unnamed: 0,proteinID_x,organism_x,description_x,valid_cds_x,proteinID_y,organism_y,description_y,valid_cds_y
0,E3VX36,HG,standard,True,P51910,MM,standard,True
2,E3VX64,HG,standard,True,P34884,MM,standard,True
21,G5AJR9,HG,standard,True,Q8BZ60,MM,standard,True
22,G5AJS0,HG,standard,True,Q99PM3,MM,standard,True
26,G5AJS7,HG,standard,True,Q62086,MM,standard,True
...,...,...,...,...,...,...,...,...
9397,G5CBL6,HG,standard,True,Q78XR0,MM,standard,True
9409,G5CBN2,HG,standard,True,Q9Z2F6,MM,standard,True
9412,G5CBP7,HG,standard,True,Q8C1M2,MM,standard,True
9413,G5CBQ1,HG,standard,True,Q8K4E0,MM,standard,True


#### Tango execution time

- Shortest protein (Q00LT2 - length=53 aa)
    - Time execution in Tango: 0.024s
- Longest protein (G5C996 - length=10159 aa)
    - Time execution in Tango: 5.116s


## 3. Check number of mutants per protein

In [19]:
HG_all_count = pd.read_csv('../../data/mutation_tolerance/HG_all_mutants_counts.csv').rename(columns={'proteinID':'proteinID_x'})
MM_all_count = pd.read_csv('../../data/mutation_tolerance/MM_all_mutants_counts.csv').rename(columns={'proteinID':'proteinID_y'})

In [20]:
HG_all_count[(HG_all_count['unique_MT'] > 15000)].merge(all_agg_scores[['proteinID_x','length_x']], on='proteinID_x').describe()

Unnamed: 0,all_MT,unique_MT,length_x
count,127.0,127.0,127.0
mean,29626.023622,20348.606299,3290.755906
std,9488.011921,6485.892454,1054.229951
min,21709.0,15025.0,2411.0
25%,23396.5,16056.5,2598.5
50%,26038.0,17923.0,2892.0
75%,33022.0,22818.0,3668.5
max,91441.0,61617.0,10159.0


In [21]:
MM_all_count[(MM_all_count['unique_MT'] > 10000)].merge(all_agg_scores[['proteinID_y','length_y']], on='proteinID_y').describe()

Unnamed: 0,all_MT,unique_MT,length_y
count,395.0,395.0,395.0
mean,21111.468354,14501.881013,2356.888608
std,7499.921152,5176.663823,843.934391
min,14473.0,10012.0,1607.0
25%,16201.0,11111.5,1804.0
50%,18325.0,12623.0,2055.0
75%,23338.0,15996.5,2602.5
max,66547.0,46086.0,7393.0


In [22]:
MM_all_count[(MM_all_count['unique_MT'] > 10000) & (MM_all_count['unique_MT'] < 20000)]

Unnamed: 0,proteinID_y,all_MT,unique_MT
4,A0A087WQ44,29449,19754
13,A0A0G2JG52,18595,12759
24,A0A140LI88,15895,10996
26,A0A140LIW3,15976,10879
27,A0A140LIY9,21043,14347
...,...,...,...
10579,Q9WVF7,20557,14203
10643,Q9Z0R4,15436,10618
10724,Q9Z1T6,18883,13047
10792,Q9Z2U2,24292,16872


In [23]:
HG_all_count[HG_all_count['unique_MT'] > 10000]


Unnamed: 0,proteinID_x,all_MT,unique_MT
6,G5AJK4,23347,16368
36,G5AJU2,14797,10217
43,G5AJV3,15634,10690
74,G5AK08,15625,10776
76,G5AK10,17758,12367
...,...,...,...
10808,G5CB46,18901,13108
10825,G5CB89,15589,10901
10881,G5CBI8,16264,11155
10890,G5CBK3,18073,12419


In [24]:
ALL_HG = sum(HG_all_count[HG_all_count['proteinID_x'].isin(ortho_cds_valids['proteinID_x'])]['unique_MT'])
ALL_MM = sum(MM_all_count[MM_all_count['proteinID_y'].isin(ortho_cds_valids['proteinID_y'])]['unique_MT'])
ALL_HG + ALL_MM

69896045

In [25]:
HG_chap_clt_count = pd.read_csv('../../data/mutation_tolerance/HG_chap_client_mutants_counts.csv').rename(columns={'proteinID':'proteinID_x'})
MM_chap_clt_count = pd.read_csv('../../data/mutation_tolerance/MM_chap_client_mutants_counts.csv').rename(columns={'proteinID':'proteinID_y'})

In [26]:
CHAP_CLT_HG = sum(HG_chap_clt_count[HG_chap_clt_count['proteinID_x'].isin(ortho_cds_valids['proteinID_x'])]['unique_MT'])
CHAP_CLT_MM = sum(MM_chap_clt_count[MM_chap_clt_count['proteinID_y'].isin(ortho_cds_valids['proteinID_y'])]['unique_MT'])
CHAP_CLT_HG + CHAP_CLT_MM

10071232

In [27]:
HG_others_count = HG_all_count[~HG_all_count['proteinID_x'].isin(HG_chap_clt_count['proteinID_x'])]
MM_others_count = MM_all_count[~MM_all_count['proteinID_y'].isin(MM_chap_clt_count['proteinID_y'])]


##### Directory list for sbatch

In [None]:
# f = open('../../data/mutation_tolerance/tango_directory_list/MM_chap_list', 'w')
# for ID in MM_chap_clt_count['proteinID_y'].values:
#     f.write(f'{ID}\n')
# f.close()    

# f = open('../../data/mutation_tolerance/tango_directory_list/HG_chap_list', 'w')
# for ID in HG_chap_clt_count['proteinID_x'].values:
#     f.write(f'{ID}\n')
# f.close()    

# f = open('../../data/mutation_tolerance/tango_directory_list/MM_others_list', 'w')
# for ID in MM_others_count['proteinID_y'].values:
#     f.write(f'{ID}\n')
# f.close()    

# f = open('../../data/mutation_tolerance/tango_directory_list//HG_others_list', 'w')
# for ID in HG_others_count['proteinID_x'].values:
#     f.write(f'{ID}\n')
# f.close()    

## 4. Calculation of mutation tolerance

In [29]:
MUT_RES = '/media/savvy/DATA3/savvy/project_2018/computational_mutagenesis/FINAL_RESULTS'

In [30]:
def count_final_agg_score(proteinID, MM_all_count, HG_all_count, MUT_RES):
    if proteinID in MM_all_count['proteinID_y'].values:
        exp = int(MM_all_count[ MM_all_count['proteinID_y'] == proteinID ]['unique_MT'].values)
    elif proteinID in HG_all_count['proteinID_x'].values:
        exp = int(HG_all_count[ HG_all_count['proteinID_x'] == proteinID ]['unique_MT'].values)

    if f'{proteinID}_aggregation.txt' in os.listdir(MUT_RES):
        agg_tmp = pd.read_csv(os.path.join(MUT_RES,f'{proteinID}_aggregation.txt'), sep='\t')
        real = len(agg_tmp)
            
    if exp == real :
        return True
    else:
        return False

def get_Seq(seqID, org):
    if 'MM' in org :
        for seqRecord in SeqIO.parse(MM_cds_fasta, format='fasta'):
            if seqRecord.id == seqID :
                return seqRecord
    elif 'HG' in org : 
        for seqRecord in SeqIO.parse(HG_cds_fasta, format='fasta'):
            if seqRecord.id == seqID :
                return seqRecord

def calculate_mutation_metrics(agg_table, seqRecord, threshold):
    ### Normalized delta Mutational Aggregation Propensity
    seq_length = len(seqRecord.seq.translate().replace('*', ''))
    agg_table['deltaAgg'] = agg_table['Aggregation'] / seq_length - agg_table['Aggregation'][0] / seq_length
    agg_table = agg_table[1:]
    mutTol = len(agg_table[agg_table['deltaMut'] == 0 ]) / len(agg_table)
    mutVul = len(agg_table[agg_table['deltaMut'] > threshold]) / len(agg_table)
    return mutTol, mutVul


def mutation_metrics_table(all_agg_scores, threshold):
    tmp = []
    bar = progressbar.ProgressBar()
    for X, Y in bar(all_agg_scores[['proteinID_x', 'proteinID_y']].values):
        try:
            if f'{X}_aggregation.txt' in os.listdir(MUT_RES):
                HG_tmp = pd.read_csv(os.path.join(MUT_RES,f'{X}_aggregation.txt'), sep='\t')
                MM_tmp = pd.read_csv(os.path.join(MUT_RES,f'{Y}_aggregation.txt'), sep='\t')
                mutTol_x, mutVul_x = calculate_mutation_metrics(HG_tmp, get_Seq(X, 'HG'), threshold)
                mutTol_y, mutVul_y = calculate_mutation_metrics(MM_tmp, get_Seq(Y, 'MM'),  threshold)
                tmp.append([X, mutTol_x, mutVul_x, Y, mutTol_y, mutVul_y])
        except:
            pass
    mutation_metrics_table = pd.DataFrame(tmp, columns=['proteinID_x', 'mutTol_x', 'mutVul_x' ,'proteinID_y', 'mutTol_y', 'mutVul_y'])
    
    aggregation_table = all_agg_scores[['proteinID_x', 'Aggregation_x', 'length_x', 'proteinID_y', 'Aggregation_y', 'length_y']]
    aggregation_table = aggregation_table[aggregation_table['proteinID_x'].isin(mutation_metrics_table['proteinID_x'])]
    
    TABLE = mutation_metrics_table.merge(aggregation_table, on=['proteinID_x', 'proteinID_y']) 
    return TABLE

In [31]:
tmp = []
bar = progressbar.ProgressBar()
for agg_table in tqdm(os.listdir(MUT_RES)):
    protein = agg_table.split('_')[0]
    tmp.append([protein, count_final_agg_score(protein, MM_all_count, HG_all_count, MUT_RES)])
check_final_mutants = pd.DataFrame(tmp, columns=['proteinID', 'all_tango_run'])

100%|██████████| 21651/21651 [04:55<00:00, 73.20it/s]


In [33]:
mutant_size = []
for ID in check_final_mutants[check_final_mutants['all_tango_run'] == False]['proteinID']:
    if ID in MM_all_count['proteinID_y'].values:
        size = MM_all_count[MM_all_count['proteinID_y'] == ID]['unique_MT'].values[0]
    elif ID in HG_all_count['proteinID_x'].values:
        size = HG_all_count[HG_all_count['proteinID_x'] == ID]['unique_MT'].values[0]
    mutant_size.append([ID, size])
check_bug_mutants = pd.DataFrame(mutant_size, columns=['proteinID', 'nb_mutants'])

In [34]:
check_bug_mutants

Unnamed: 0,proteinID,nb_mutants
0,Q91ZU6,46086
1,G5B601,36782
2,B1AR51,27783
3,G5B5V9,30563
4,G5AMU8,29212
5,G5C8Z7,34644
6,E9Q1W3,45464
7,A2ANY6,34546
8,G5AK10,12367
9,G5C996,61617


In [75]:
MM_ATX = ['P28658', 'Q9CVD2']
MT_2_ATX = mutation_metrics_table(all_agg_scores[all_agg_scores['proteinID_y'].isin(MM_ATX)], 2)

100% (2 of 2) |##########################| Elapsed Time: 0:00:00 Time:  0:00:00


In [41]:
%%time
MT_2 = mutation_metrics_table(all_agg_scores, 2)
MT_2 = MT_2[~(MT_2['proteinID_x'].isin(check_bug_mutants['proteinID'])) | (MT_2['proteinID_y'].isin(check_bug_mutants['proteinID']))]

100% (9522 of 9522) |####################| Elapsed Time: 0:33:22 Time:  0:33:22


CPU times: user 31min 38s, sys: 2min 1s, total: 33min 39s
Wall time: 33min 22s


In [None]:
MT_2.to_csv('../../data/mutation_tolerance/all_mt_scores.csv', sep='\t', index=False)

__________

## Data for revisions

#### Variations of definition in mutation tolerance

In [52]:
def get_Seq(seqID, org):
    if 'MM' in org :
        for seqRecord in SeqIO.parse(MM_cds_fasta, format='fasta'):
            if seqRecord.id == seqID :
                return seqRecord
    elif 'HG' in org : 
        for seqRecord in SeqIO.parse(HG_cds_fasta, format='fasta'):
            if seqRecord.id == seqID :
                return seqRecord

def isValid(seq_id, seqRecord):
    i = int(seq_id.split('_')[1])
    ALT = seq_id.split('_')[2][1]

    REF_SEQ = str(seqRecord.seq.translate(to_stop=True))
    mutant = MutableSeq(str(seqRecord.seq))
    mutant[i] = ALT
    ALT_SEQ = str(Seq(str(mutant)).translate(to_stop=True))

    if len(REF_SEQ) != len(ALT_SEQ) :
        return False 
    elif str(ALT_SEQ)[0] != 'M' :
        return False 
    else :
        return True 

def calculate_mutation_metrics(agg_table, seqRecord, threshold):
    ### Normalized delta Mutational Aggregation Propensity
    seq_length = len(seqRecord.seq.translate(to_stop=True))
    agg_table['deltaAgg'] = agg_table['Aggregation'] / seq_length - agg_table['Aggregation'][0] / seq_length
    
    agg_table = agg_table[1:].reset_index(drop=True)
    agg_table['valid_mutant'] = agg_table['Sequence'].apply(isValid, args=(seqRecord,))
    
    valid_mutants = agg_table[agg_table['valid_mutant'] == True]
    mutTol_v1 = len(valid_mutants[valid_mutants['deltaAgg'] == 0 ]) / len(valid_mutants)
    mutTol_v2 = len(valid_mutants[valid_mutants['deltaAgg'] <= 0 ]) / len(valid_mutants)
    mutTol_v3 = len(valid_mutants[valid_mutants['deltaAgg'] >= 0 ]) / len(valid_mutants)
    bm_r = len(valid_mutants[valid_mutants['deltaAgg'] < - threshold ]) / len(valid_mutants)
    dm_r = len(valid_mutants[valid_mutants['deltaAgg'] > threshold ]) / len(valid_mutants)
    return mutTol_v1, mutTol_v2, mutTol_v3, bm_r, dm_r


def get_mutTol_scores(proteinID, org, threshold):
     tmp = pd.read_csv(os.path.join(MUT_RES,f'{proteinID}_aggregation.txt'), sep='\t')
     seqRecord = get_Seq(proteinID, org)
     mutTol_v1, mutTol_v2, mutTol_v3, bm_r, dm_r = calculate_mutation_metrics(tmp, seqRecord,  threshold)
     return mutTol_v1, mutTol_v2, mutTol_v3, bm_r, dm_r


def mutation_metrics_table(all_agg_scores, threshold):
    tmp = []
    bar = progressbar.ProgressBar()
    for X, Y in tqdm(all_agg_scores[['proteinID_x', 'proteinID_y']].values):
        try:
            if f'{X}_aggregation.txt' in os.listdir(MUT_RES):
                mutTol_v1_x, mutTol_v2_x, mutTol_v3_x, bm_r_x, dm_r_x = get_mutTol_scores(X, 'HG', threshold)
                mutTol_v1_y, mutTol_v2_y, mutTol_v3_y, bm_r_y, dm_r_y = get_mutTol_scores(Y, 'MM', threshold)
                tmp.append([X, mutTol_v1_x, mutTol_v2_x, mutTol_v3_x, bm_r_x, dm_r_x, Y, mutTol_v1_y, mutTol_v2_y, mutTol_v3_y, bm_r_y, dm_r_y])
        except:
            pass
    cols=['proteinID_x', 'mutTol_0_x', 'mutTol_minus0_x', 'mutTol_plus0_x', 'bm_r_x', 'dm_r_x', 'proteinID_y', 'mutTol_0_y', 'mutTol_minus0_y', 'mutTol_plus0_y', 'bm_r_y', 'dm_r_y']
    mutation_metrics_table = pd.DataFrame(tmp, columns=cols)
    
    aggregation_table = all_agg_scores[['proteinID_x', 'Aggregation_x', 'length_x', 'proteinID_y', 'Aggregation_y', 'length_y']]
    aggregation_table = aggregation_table[aggregation_table['proteinID_x'].isin(mutation_metrics_table['proteinID_x'])]
    
    TABLE = mutation_metrics_table.merge(aggregation_table, on=['proteinID_x', 'proteinID_y']) 
    return TABLE

In [53]:
# MM_ATX = ['P28658', 'Q9CVD2']
MT_var = mutation_metrics_table(all_agg_scores, 1)

100%|██████████| 9522/9522 [14:41:21<00:00,  5.55s/it]   


In [54]:
MT_var = MT_var[~(MT_var['proteinID_x'].isin(check_bug_mutants['proteinID'])) | (MT_var['proteinID_y'].isin(check_bug_mutants['proteinID']))]

In [55]:
MT_var.to_csv('../../data/mutation_tolerance/all_mt_scores_for_revisions.csv', sep='\t', index=False)