In [1]:
from Bio import SeqIO
from Bio.Seq import Seq

import numpy as np
import pandas as pd
import seaborn as sns
import os, progressbar, time

## 1. Collect protein IDs to retrieve their CDS

In [2]:
def collect_cds(fastaFile, cdsList, outFile):
    """Copy the FASTA records whose ID is listed in ``cdsList`` into ``outFile``.

    Parameters
    ----------
    fastaFile : str or file handle
        FASTA input, parsed with ``SeqIO.parse``.
    cdsList : iterable of str
        Record IDs to keep.
    outFile : str
        Path of the FASTA file to write; it is opened in ``'w'`` mode, so an
        existing file is overwritten.
    """
    # A set makes each membership test O(1) instead of a scan of the whole list.
    wanted = set(cdsList)
    # All matching records are collected before the output is opened, exactly
    # as before, so the input is fully read before anything is written.
    selected = [
        seqRecord
        for seqRecord in SeqIO.parse(fastaFile, format='fasta')
        if seqRecord.id in wanted
    ]
    # The context manager closes (and therefore flushes) the output handle,
    # which the previous bare open() call never did.
    with open(outFile, 'w') as handle:
        SeqIO.write(selected, handle, 'fasta')

In [3]:
# Protein FASTA files of the ortholog dataset; only the record IDs are read from them below.
HG_fasta = '../../data/ortholog_dataset/uni_HG_orthologs.faa'
MM_fasta = '../../data/ortholog_dataset/uni_MM_orthologs.faa'

# Genome-wide CDS FASTA files that collect_cds() filters.
# NOTE(review): absolute, machine-specific paths; this cell cannot run elsewhere without editing them.
HG_cds_fasta = '/media/savvy/DATA3/savvy/Genomes/model/heterocephalus_glaber/h_glaber_cds_from_genomic.fna'
MM_cds_fasta = '/media/savvy/DATA3/savvy/Genomes/model/mus_musculus/m_musculus_cds_from_genomic.fna'

# IDs of every record in the protein FASTA files; used as the selection list for collect_cds().
HG_IDs = [ seqRecord.id for seqRecord in SeqIO.parse(HG_fasta, format='fasta')]
MM_IDs = [ seqRecord.id for seqRecord in SeqIO.parse(MM_fasta, format='fasta')]

In [5]:
collect_cds(HG_cds_fasta, HG_IDs, '../../data/ortholog_dataset/uni_HG_cds_orthologs.faa')
collect_cds(MM_cds_fasta, MM_IDs, '../../data/ortholog_dataset/uni_MM_cds_orthologs.faa')

## 2. Collect the CDS 

In [6]:
# Rebind the CDS path variables to the filtered files written by collect_cds().
# NOTE(review): these names previously held the genome-wide CDS inputs; re-running the
# collect_cds() cell after this one would use the same file as both input and output.
HG_cds_fasta = '../../data/ortholog_dataset/uni_HG_cds_orthologs.faa'
MM_cds_fasta = '../../data/ortholog_dataset/uni_MM_cds_orthologs.faa'

#### All proteins

In [7]:
all_agg_scores = pd.read_csv('../../data/aggregation_propensity/HGMM_agg_scores.csv', sep=',') 
ortho_pairs = all_agg_scores[['proteinID_x', 'proteinID_y']]

#### Chaperone clients

In [8]:
# Tab-separated mapping table; only its 'Entry' column is used below.
# NOTE(review): presumably this file lists the human chaperone clients — confirm.
uniprot_mapping = pd.read_csv('../../data/chaperone_clients/human_ensembl_to_uniprot.tab', sep='\t')
# Ortholog pairs; presumably proteinID_x is human (HS) and proteinID_y mouse (MM), per the file name — TODO confirm.
hs_mm_orthologs = pd.read_csv('../../data/chaperone_clients/HS_MM_uni_ortholog_groups.csv', sep='\t')
hs_mm_orthologs = hs_mm_orthologs[['proteinID_x', 'proteinID_y']]
# proteinID_y of every pair whose proteinID_x appears in uniprot_mapping['Entry'].
mm_chap_clt = hs_mm_orthologs[hs_mm_orthologs['proteinID_x'].isin(uniprot_mapping['Entry'])]['proteinID_y']

# Rows of the aggregation-score table whose proteinID_y is one of those IDs.
chap_clt_sub = all_agg_scores[all_agg_scores['proteinID_y'].isin(list(mm_chap_clt))]

In [9]:
len(chap_clt_sub)

1298

In [18]:
#### Container of MM and HG sequences
HG_IDs = list(all_agg_scores['proteinID_x'])
MM_IDs = list(all_agg_scores['proteinID_y'])

In [19]:
# Index the CDS records of each species by record ID, keeping only the IDs
# listed in HG_IDs / MM_IDs.
# Sets give O(1) membership tests; testing against the lists rescanned them
# for every record in the FASTA files.
HG_wanted = set(HG_IDs)
MM_wanted = set(MM_IDs)

# If an ID occurs more than once in a file, the last record wins (as before).
HG_seq = {
    seqRecord.id: seqRecord
    for seqRecord in SeqIO.parse(HG_cds_fasta, format='fasta')
    if seqRecord.id in HG_wanted
}

MM_seq = {
    seqRecord.id: seqRecord
    for seqRecord in SeqIO.parse(MM_cds_fasta, format='fasta')
    if seqRecord.id in MM_wanted
}

In [20]:
print(f'MM: {len(MM_seq)}, HG: {len(HG_seq)}')

MM: 9518, HG: 9522


#### Check cds length / cds with non standard nucleotide

In [21]:
def _classify_cds(cds):
    """Return ``(description, valid_cds)`` for one CDS sequence.

    A CDS is valid when its length is a multiple of three and it is made only
    of the letters A, C, G and T (case-sensitive).
    """
    if len(cds) % 3 != 0:
        return 'non_standard_length', False
    letters = set(str(cds))
    # Subset test: a sequence that simply lacks one of the four bases is still
    # standard. The previous equality test against ['A', 'C', 'G', 'T']
    # rejected such sequences. An empty sequence stays invalid.
    if not letters or not letters <= {'A', 'C', 'G', 'T'}:
        return 'non_standard_nucleotide', False
    return 'standard', True


# One row per CDS: mouse (MM) records first, then naked mole-rat (HG) records.
tmp = [
    [cds_id, organism, *_classify_cds(record.seq)]
    for organism, records in (('MM', MM_seq), ('HG', HG_seq))
    for cds_id, record in records.items()
]

cds_validity = pd.DataFrame(tmp, columns=['proteinID', 'organism', 'description', 'valid_cds'])

In [22]:
len(cds_validity[cds_validity['valid_cds'] == True])

18944

In [23]:
HG_valids = cds_validity[(cds_validity['valid_cds'] == True) & (cds_validity['organism'] == 'HG')].rename(columns={'proteinID':'proteinID_x'})
MM_valids = cds_validity[(cds_validity['valid_cds'] == True) & (cds_validity['organism'] == 'MM')].rename(columns={'proteinID':'proteinID_y'})

In [24]:
HG_valids = HG_valids.merge(ortho_pairs, on='proteinID_x')
ortho_cds_valids = HG_valids.merge(MM_valids, on='proteinID_y')

In [25]:
ortho_cds_valids

Unnamed: 0,proteinID_x,organism_x,description_x,valid_cds_x,proteinID_y,organism_y,description_y,valid_cds_y
0,E3VX36,HG,standard,True,P51910,MM,standard,True
1,E3VX52,HG,standard,True,P23927,MM,standard,True
2,E3VX64,HG,standard,True,P34884,MM,standard,True
3,E3VX68,HG,standard,True,Q9D7M8,MM,standard,True
4,E3VX70,HG,standard,True,P61514,MM,standard,True
...,...,...,...,...,...,...,...,...
9417,G5CBQ5,HG,standard,True,Q3TYX3,MM,standard,True
9418,G5CBQ6,HG,standard,True,Q8R361,MM,standard,True
9419,G5CBQ8,HG,standard,True,Q04742,MM,standard,True
9420,G5CBQ9,HG,standard,True,Q64105,MM,standard,True


In [40]:
#### Save the valid CDS ortholog pairs as a tab-separated file
ortho_cds_valids.to_csv('../../data/mutation_tolerance/ortho_valids_cds.csv', sep='\t', index=False)


Unnamed: 0,proteinID_x,organism_x,description_x,valid_cds_x,proteinID_y,organism_y,description_y,valid_cds_y
0,E3VX36,HG,standard,True,P51910,MM,standard,True
1,E3VX52,HG,standard,True,P23927,MM,standard,True
2,E3VX64,HG,standard,True,P34884,MM,standard,True
3,E3VX68,HG,standard,True,Q9D7M8,MM,standard,True
4,E3VX70,HG,standard,True,P61514,MM,standard,True
...,...,...,...,...,...,...,...,...
9417,G5CBQ5,HG,standard,True,Q3TYX3,MM,standard,True
9418,G5CBQ6,HG,standard,True,Q8R361,MM,standard,True
9419,G5CBQ8,HG,standard,True,Q04742,MM,standard,True
9420,G5CBQ9,HG,standard,True,Q64105,MM,standard,True


In [26]:
#### Number of valid cds pairs (chaperone client proteins)
ortho_cds_valids[ortho_cds_valids['proteinID_y'].isin(mm_chap_clt)]

Unnamed: 0,proteinID_x,organism_x,description_x,valid_cds_x,proteinID_y,organism_y,description_y,valid_cds_y
0,E3VX36,HG,standard,True,P51910,MM,standard,True
2,E3VX64,HG,standard,True,P34884,MM,standard,True
21,G5AJR9,HG,standard,True,Q8BZ60,MM,standard,True
22,G5AJS0,HG,standard,True,Q99PM3,MM,standard,True
26,G5AJS7,HG,standard,True,Q62086,MM,standard,True
...,...,...,...,...,...,...,...,...
9397,G5CBL6,HG,standard,True,Q78XR0,MM,standard,True
9409,G5CBN2,HG,standard,True,Q9Z2F6,MM,standard,True
9412,G5CBP7,HG,standard,True,Q8C1M2,MM,standard,True
9413,G5CBQ1,HG,standard,True,Q8K4E0,MM,standard,True


In [None]:
# Write the chaperone-client protein IDs, one per line, for each species.
# `with` closes each file even if iterating the IDs raises.
# NOTE(review): MM_chap_clt_stats / HG_chap_clt_stats are not assigned in any visible cell — confirm where they come from.
with open('../../data/mutation_tolerance/tango_directory_list/MM_chap_list', 'w') as f:
    f.writelines(f'{ID}\n' for ID in MM_chap_clt_stats['proteinID_y'].values)

with open('../../data/mutation_tolerance/tango_directory_list/HG_chap_list', 'w') as f:
    f.writelines(f'{ID}\n' for ID in HG_chap_clt_stats['proteinID_x'].values)

In [None]:
# Write the protein IDs of the "others" tables, one per line, for each species.
# `with` closes each file even if iterating the IDs raises.
# NOTE(review): MM_others_stats / HG_others_stats are only assigned in a later cell whose recorded run failed — confirm.
with open('../../data/mutation_tolerance/tango_directory_list/MM_others_list', 'w') as f:
    f.writelines(f'{ID}\n' for ID in MM_others_stats['proteinID_y'].values)

with open('../../data/mutation_tolerance/tango_directory_list//HG_others_list', 'w') as f:
    f.writelines(f'{ID}\n' for ID in HG_others_stats['proteinID_x'].values)

#### Tango execution time

- Shortest protein (Q00LT2 - length=53 aa)
    - Tango execution time: 0.024s
- Longest protein (G5C996 - length=10159 aa)
    - Tango execution time: 5.116s


## 3. Check number of mutants per protein

In [29]:
HG_all_count = pd.read_csv('../../data/mutation_tolerance/HG_all_mutants_counts.csv').rename(columns={'proteinID':'proteinID_x'})
MM_all_count = pd.read_csv('../../data/mutation_tolerance/MM_all_mutants_counts.csv').rename(columns={'proteinID':'proteinID_y'})

In [30]:
HG_all_count[HG_all_count['unique_MT'] > 10000].sort_values('unique_MT')

Unnamed: 0,proteinID_x,all_MT,unique_MT
2475,G5AYN3,14473,10007
8576,G5BYG3,14581,10015
2176,G5AX45,14788,10022
10502,G5C9E9,15058,10025
920,G5AQ52,14518,10046
...,...,...,...
10426,G5C8Z7,50365,34644
3747,G5B601,52138,36782
7260,G5BR15,56656,39544
8958,G5C0E1,79435,55187


In [31]:
MM_all_count[MM_all_count['unique_MT'] > 10000].sort_values('unique_MT')

Unnamed: 0,proteinID_y,all_MT,unique_MT
1295,O88572,14527,10012
65,A0A338P6S8,14743,10012
4022,Q5NCJ1,14491,10021
1404,P02468,14473,10048
1403,P02463,15031,10061
...,...,...,...
181,A2AN08,46630,31953
5541,Q80W93,46396,32033
183,A2ANY6,50311,34546
561,E9Q1W3,64378,45464


In [32]:
ALL_HG = sum(HG_all_count[HG_all_count['proteinID_x'].isin(ortho_cds_valids['proteinID_x'])]['unique_MT'])
ALL_MM = sum(MM_all_count[MM_all_count['proteinID_y'].isin(ortho_cds_valids['proteinID_y'])]['unique_MT'])

ALL_HG + ALL_MM


69896045

In [33]:
HG_chap_clt_counts = pd.read_csv('../../data/mutation_tolerance/HG_chap_client_mutants_counts.csv').rename(columns={'proteinID':'proteinID_x'})
MM_chap_clt_counts = pd.read_csv('../../data/mutation_tolerance/MM_chap_client_mutants_counts.csv').rename(columns={'proteinID':'proteinID_y'})

In [34]:
CHAP_CLT_HG = sum(HG_chap_clt_counts[HG_chap_clt_counts['proteinID_x'].isin(ortho_cds_valids['proteinID_x'])]['unique_MT'])
CHAP_CLT_MM = sum(MM_chap_clt_counts[MM_chap_clt_counts['proteinID_y'].isin(ortho_cds_valids['proteinID_y'])]['unique_MT'])
CHAP_CLT_HG + CHAP_CLT_MM

10071232

### Directory list for sbatch

In [35]:
# Rows of the *_all_stats tables whose protein ID does not occur in the matching *_chap_clt_stats table.
# NOTE(review): HG_all_stats, MM_all_stats, HG_chap_clt_stats and MM_chap_clt_stats are not assigned in any
# visible cell (the recorded run ends in a NameError); presumably earlier names of HG_all_count /
# MM_all_count / HG_chap_clt_counts / MM_chap_clt_counts — confirm before re-running.
HG_others_stats = HG_all_stats[~HG_all_stats['proteinID_x'].isin(HG_chap_clt_stats['proteinID_x'])]
MM_others_stats = MM_all_stats[~MM_all_stats['proteinID_y'].isin(MM_chap_clt_stats['proteinID_y'])]

NameError: name 'HG_all_stats' is not defined

### Checking number of mutants

In [26]:
sum(HG_all_stats['unique_MT'])

35014973

In [27]:
sum(MM_all_stats['unique_MT'])

34559953

In [31]:
MM_all_stats[MM_chap_clt_stats['proteinID_y'] == 'A2AN08']

Unnamed: 0,proteinID_y,all_MT,unique_MT
12,A2AN08,46630,31953


In [50]:
HG_all_stats[HG_chap_clt_stats['proteinID_x'] == 'G5AWV7']

Unnamed: 0,proteinID_x,all_MT,unique_MT
291,G5AWV7,40375,28247


## 4. Calculation of mutation tolerance

In [10]:
MUT_RES = '/media/savvy/DATA3/savvy/project_2018/computational_mutagenesis/FINAL_RESULTS'

In [12]:
  def count_final_agg_score(proteinID, MM_all_count, HG_all_count, MUT_RES):
    """Check that a protein's result file has as many rows as expected.

    Parameters
    ----------
    proteinID : str
        Protein to check.
    MM_all_count : pandas.DataFrame
        Count table with columns ``proteinID_y`` and ``unique_MT``; looked up first.
    HG_all_count : pandas.DataFrame
        Count table with columns ``proteinID_x`` and ``unique_MT``; looked up
        only when the protein is not in ``MM_all_count``.
    MUT_RES : str
        Directory containing the ``<proteinID>_aggregation.txt`` files.

    Returns
    -------
    bool
        True when the number of data rows in the result file equals the
        protein's ``unique_MT``. False when they differ, when the protein is
        in neither count table, or when the result file does not exist (the
        last two cases used to raise UnboundLocalError).
    """
    if proteinID in MM_all_count['proteinID_y'].values:
        counts, id_col = MM_all_count, 'proteinID_y'
    elif proteinID in HG_all_count['proteinID_x'].values:
        counts, id_col = HG_all_count, 'proteinID_x'
    else:
        return False
    # First matching row; int() on the whole array fails for duplicated IDs.
    exp = int(counts.loc[counts[id_col] == proteinID, 'unique_MT'].iloc[0])

    # Direct existence check instead of listing the whole directory per call.
    result_path = os.path.join(MUT_RES, f'{proteinID}_aggregation.txt')
    if not os.path.isfile(result_path):
        return False
    real = len(pd.read_csv(result_path, sep='\t'))

    return exp == real


def calculate_mutation_metrics(agg_table, threshold):
    """Compute the mutation tolerance and vulnerability of one protein.

    The first row of ``agg_table`` is the reference (presumably the unmutated
    protein — confirm); every following row is compared against it.

    Parameters
    ----------
    agg_table : pandas.DataFrame
        Must contain an ``Aggregation`` column. It is not modified; the
        previous version added a ``deltaMut`` column to the caller's frame.
    threshold : number
        Minimum increase in ``Aggregation`` (exclusive) for a row to count as
        vulnerable.

    Returns
    -------
    tuple of float
        ``(mutTol, mutVul)``: the fraction of non-reference rows whose score
        equals the reference, and the fraction whose score exceeds it by more
        than ``threshold``.

    Raises
    ------
    ValueError
        If there is no row after the reference row.
    """
    scores = agg_table['Aggregation']
    # Positional access, so the result does not depend on the index labels.
    deltaMut = (scores - scores.iloc[0]).iloc[1:]
    n_mutants = len(deltaMut)
    if n_mutants == 0:
        raise ValueError('agg_table has no mutant rows after the reference row')
    mutTol = int((deltaMut == 0).sum()) / n_mutants
    mutVul = int((deltaMut > threshold).sum()) / n_mutants
    return mutTol, mutVul


def mutation_metrics_table(all_agg_scores, threshold, mut_res=None):
    """Build one table of mutation metrics and aggregation scores per ortholog pair.

    For every (proteinID_x, proteinID_y) row of ``all_agg_scores`` whose
    ``<proteinID_x>_aggregation.txt`` file exists in the results directory,
    both result files are read and passed to ``calculate_mutation_metrics``.

    Parameters
    ----------
    all_agg_scores : pandas.DataFrame
        Must provide the columns proteinID_x, proteinID_y, Aggregation_x,
        length_x, Aggregation_y and length_y.
    threshold : number
        Forwarded to ``calculate_mutation_metrics``.
    mut_res : str, optional
        Directory holding the ``*_aggregation.txt`` files. Defaults to the
        notebook-level ``MUT_RES``.

    Returns
    -------
    pandas.DataFrame
        Columns proteinID_x, mutTol_x, mutVul_x, proteinID_y, mutTol_y,
        mutVul_y, followed by the aggregation and length columns. The join is
        an inner merge on both ID columns.

    Notes
    -----
    Pairs whose files cannot be read or scored are skipped and reported in a
    single warning; they used to be dropped silently by a bare ``except``.
    """
    import warnings

    if mut_res is None:
        mut_res = MUT_RES
    # List the directory once instead of once per ortholog pair.
    available = set(os.listdir(mut_res))

    tmp = []
    skipped = []
    bar = progressbar.ProgressBar()
    for X, Y in bar(all_agg_scores[['proteinID_x', 'proteinID_y']].values):
        if f'{X}_aggregation.txt' not in available:
            continue
        try:
            HG_tmp = pd.read_csv(os.path.join(mut_res, f'{X}_aggregation.txt'), sep='\t')
            MM_tmp = pd.read_csv(os.path.join(mut_res, f'{Y}_aggregation.txt'), sep='\t')
            mutTol_x, mutVul_x = calculate_mutation_metrics(HG_tmp, threshold)
            mutTol_y, mutVul_y = calculate_mutation_metrics(MM_tmp, threshold)
        except (OSError, ValueError, KeyError, IndexError, ZeroDivisionError) as err:
            # Missing partner file, empty or malformed table, no mutant rows.
            skipped.append(f'{X}/{Y} ({type(err).__name__})')
            continue
        tmp.append([X, mutTol_x, mutVul_x, Y, mutTol_y, mutVul_y])

    if skipped:
        preview = ', '.join(skipped[:10])
        warnings.warn(f'{len(skipped)} ortholog pair(s) skipped, e.g. {preview}')

    metrics = pd.DataFrame(tmp, columns=['proteinID_x', 'mutTol_x', 'mutVul_x' ,'proteinID_y', 'mutTol_y', 'mutVul_y'])

    aggregation_table = all_agg_scores[['proteinID_x', 'Aggregation_x', 'length_x', 'proteinID_y', 'Aggregation_y', 'length_y']]
    aggregation_table = aggregation_table[aggregation_table['proteinID_x'].isin(metrics['proteinID_x'])]

    TABLE = metrics.merge(aggregation_table, on=['proteinID_x', 'proteinID_y'])
    return TABLE

In [36]:
# For every file in the results directory, take the text before the first '_'
# as the protein ID and record whether its Tango output is complete.
bar = progressbar.ProgressBar()
proteins = (file_name.partition('_')[0] for file_name in bar(os.listdir(MUT_RES)))
tmp = [
    [protein, count_final_agg_score(protein, MM_all_count, HG_all_count, MUT_RES)]
    for protein in proteins
]
check_final_mutants = pd.DataFrame(tmp, columns=['proteinID', 'all_tango_run'])

100% (21651 of 21651) |##################| Elapsed Time: 0:09:13 Time:  0:09:13


In [37]:
check_final_mutants[check_final_mutants['all_tango_run'] == True]

Unnamed: 0,proteinID,all_tango_run
0,Q7TSI0,True
1,P39061,True
2,A2AWP8,True
3,G5AX95,True
4,G5B286,True
...,...,...
21646,Q61120,True
21647,G5BLD0,True
21648,G5B8Q4,True
21649,G5AVI8,True


In [38]:
# Expected number of unique mutants for every protein whose Tango run is incomplete.
mutant_size = []
for ID in check_final_mutants[check_final_mutants['all_tango_run'] == False]['proteinID']:
    if ID in MM_all_count['proteinID_y'].values:
        size = MM_all_count[MM_all_count['proteinID_y'] == ID]['unique_MT'].values[0]
    elif ID in HG_all_count['proteinID_x'].values:
        size = HG_all_count[HG_all_count['proteinID_x'] == ID]['unique_MT'].values[0]
    else:
        # ID in neither count table: record a missing value rather than
        # reusing the previous protein's size (or raising NameError on the
        # first iteration).
        size = None
    mutant_size.append([ID, size])
check_bug_mutants = pd.DataFrame(mutant_size, columns=['proteinID', 'nb_mutants'])

In [39]:
check_bug_mutants

Unnamed: 0,proteinID,nb_mutants
0,Q91ZU6,46086
1,G5B601,36782
2,B1AR51,27783
3,G5B5V9,30563
4,G5AMU8,29212
5,G5C8Z7,34644
6,E9Q1W3,45464
7,A2ANY6,34546
8,G5AK10,12367
9,G5C996,61617


In [41]:
%%time
MT_2 = mutation_metrics_table(all_agg_scores, 2)
# Drop every pair in which EITHER protein has an incomplete Tango run.
# The two tests must be OR-ed before negating: `~` binds tighter than `|`, so
# the previous `~(x) | (y)` kept the pairs whose proteinID_y was in the bug list.
MT_2 = MT_2[~(MT_2['proteinID_x'].isin(check_bug_mutants['proteinID']) | MT_2['proteinID_y'].isin(check_bug_mutants['proteinID']))]

CPU times: user 5.39 ms, sys: 896 µs, total: 6.28 ms
Wall time: 5.31 ms


In [42]:
MT_2

Unnamed: 0,proteinID_x,mutTol_x,mutVul_x,proteinID_y,mutTol_y,mutVul_y,Aggregation_x,length_x,Aggregation_y,length_y
0,G5B678,0.390741,0.114815,Q3UHI4,0.512232,0.094858,4.147276,261,3.305945,326
1,G5AQ71,0.362447,0.134984,A2CG49,0.355629,0.135774,3.033646,2400,3.007453,2964
2,G5C1Q7,0.199303,0.225087,A0A1W2P7S4,0.328440,0.174312,2.982352,230,1.082609,437
3,G5BQT4,0.256410,0.127739,Q9QYK9,0.303271,0.140187,4.179275,346,4.195598,343
4,G5CB95,0.499500,0.098420,Q8C120,0.462818,0.106876,2.197379,821,1.644278,878
...,...,...,...,...,...,...,...,...,...,...
9344,G5CAQ3,0.464617,0.088457,E9Q1A5,0.463676,0.087782,0.594638,555,0.563788,584
9345,G5AWC8,0.236297,0.111469,Q3UXZ6,0.382700,0.118829,0.408910,346,0.761354,364
9346,G5C5H8,0.509112,0.102506,Q62231,0.503127,0.104605,0.613761,284,0.619046,284
9347,G5AR65,0.411550,0.070175,Q80VJ2,0.529825,0.056140,0.573265,223,0.547302,232


In [None]:
MT_2.to_csv('../../data/mutation_tolerance/all_mt_scores.csv', sep='\t', index=False)

__________

## Data for revisions

#### Variations of definition in mutation tolerance

In [43]:
def calculate_mutation_metrics(agg_table, threshold):
    """Compute three alternative mutation-tolerance definitions for one protein.

    This redefines the two-metric function of the same name defined earlier in
    the notebook; cells run after this one get the three-value version.

    The first row of ``agg_table`` is the reference (presumably the unmutated
    protein — confirm); every following row is compared against it.

    Parameters
    ----------
    agg_table : pandas.DataFrame
        Must contain an ``Aggregation`` column. It is not modified; the
        previous version added a ``deltaMut`` column to the caller's frame.
    threshold : number
        Size of the decrease in ``Aggregation`` (exclusive) used by the third
        definition.

    Returns
    -------
    tuple of float
        Fractions of non-reference rows whose score, relative to the reference,
        is (v1) equal, (v2) equal or lower, (v3) lower by more than
        ``threshold``.

    Raises
    ------
    ValueError
        If there is no row after the reference row.
    """
    scores = agg_table['Aggregation']
    # Positional access, so the result does not depend on the index labels.
    deltaMut = (scores - scores.iloc[0]).iloc[1:]
    n_mutants = len(deltaMut)
    if n_mutants == 0:
        raise ValueError('agg_table has no mutant rows after the reference row')
    mutTol_v1 = int((deltaMut == 0).sum()) / n_mutants
    mutTol_v2 = int((deltaMut <= 0).sum()) / n_mutants
    mutTol_v3 = int((deltaMut < - threshold).sum()) / n_mutants
    return mutTol_v1, mutTol_v2, mutTol_v3


def mutation_metrics_table(all_agg_scores, threshold, mut_res=None):
    """Build one table of the three mutation-tolerance variants per ortholog pair.

    This redefines the earlier function of the same name for the revision
    analysis. For every (proteinID_x, proteinID_y) row of ``all_agg_scores``
    whose ``<proteinID_x>_aggregation.txt`` file exists in the results
    directory, both result files are read and passed to
    ``calculate_mutation_metrics``.

    Parameters
    ----------
    all_agg_scores : pandas.DataFrame
        Must provide the columns proteinID_x, proteinID_y, Aggregation_x,
        length_x, Aggregation_y and length_y.
    threshold : number
        Forwarded to ``calculate_mutation_metrics``.
    mut_res : str, optional
        Directory holding the ``*_aggregation.txt`` files. Defaults to the
        notebook-level ``MUT_RES``.

    Returns
    -------
    pandas.DataFrame
        Columns proteinID_x, mutTol_v1_x..mutTol_v3_x, proteinID_y,
        mutTol_v1_y..mutTol_v3_y, followed by the aggregation and length
        columns. The join is an inner merge on both ID columns.

    Notes
    -----
    Pairs whose files cannot be read or scored are skipped and reported in a
    single warning; they used to be dropped silently by a bare ``except``.
    """
    import warnings

    if mut_res is None:
        mut_res = MUT_RES
    # List the directory once instead of once per ortholog pair.
    available = set(os.listdir(mut_res))

    tmp = []
    skipped = []
    bar = progressbar.ProgressBar()
    for X, Y in bar(all_agg_scores[['proteinID_x', 'proteinID_y']].values):
        if f'{X}_aggregation.txt' not in available:
            continue
        try:
            HG_tmp = pd.read_csv(os.path.join(mut_res, f'{X}_aggregation.txt'), sep='\t')
            MM_tmp = pd.read_csv(os.path.join(mut_res, f'{Y}_aggregation.txt'), sep='\t')
            mutTol_v1_x, mutTol_v2_x, mutTol_v3_x = calculate_mutation_metrics(HG_tmp, threshold)
            mutTol_v1_y, mutTol_v2_y, mutTol_v3_y = calculate_mutation_metrics(MM_tmp, threshold)
        except (OSError, ValueError, KeyError, IndexError, ZeroDivisionError) as err:
            # Missing partner file, empty or malformed table, no mutant rows.
            skipped.append(f'{X}/{Y} ({type(err).__name__})')
            continue
        tmp.append([X, mutTol_v1_x, mutTol_v2_x, mutTol_v3_x, Y, mutTol_v1_y, mutTol_v2_y, mutTol_v3_y])

    if skipped:
        preview = ', '.join(skipped[:10])
        warnings.warn(f'{len(skipped)} ortholog pair(s) skipped, e.g. {preview}')

    metrics = pd.DataFrame(tmp, columns=['proteinID_x', 'mutTol_v1_x', 'mutTol_v2_x', 'mutTol_v3_x' ,'proteinID_y', 'mutTol_v1_y', 'mutTol_v2_y', 'mutTol_v3_y'])

    aggregation_table = all_agg_scores[['proteinID_x', 'Aggregation_x', 'length_x', 'proteinID_y', 'Aggregation_y', 'length_y']]
    aggregation_table = aggregation_table[aggregation_table['proteinID_x'].isin(metrics['proteinID_x'])]

    TABLE = metrics.merge(aggregation_table, on=['proteinID_x', 'proteinID_y'])
    return TABLE

In [44]:
MT_var = mutation_metrics_table(all_agg_scores, 2)
# Drop every pair in which EITHER protein has an incomplete Tango run.
# The two tests must be OR-ed before negating: `~` binds tighter than `|`, so
# the previous `~(x) | (y)` kept the pairs whose proteinID_y was in the bug list.
MT_var = MT_var[~(MT_var['proteinID_x'].isin(check_bug_mutants['proteinID']) | MT_var['proteinID_y'].isin(check_bug_mutants['proteinID']))]

100% (9522 of 9522) |####################| Elapsed Time: 0:02:43 Time:  0:02:43


In [45]:
MT_var

Unnamed: 0,proteinID_x,mutTol_v1_x,mutTol_v2_x,mutTol_v3_x,proteinID_y,mutTol_v1_y,mutTol_v2_y,mutTol_v3_y,Aggregation_x,length_x,Aggregation_y,length_y
0,G5B678,0.390741,0.716667,0.187037,Q3UHI4,0.512232,0.729406,0.098852,4.147276,261,3.305945,326
1,G5AQ71,0.362447,0.651965,0.121372,A2CG49,0.355629,0.651509,0.122526,3.033646,2400,3.007453,2964
2,G5C1Q7,0.199303,0.574216,0.238328,A0A1W2P7S4,0.328440,0.605138,0.142018,2.982352,230,1.082609,437
3,G5BQT4,0.256410,0.719347,0.310490,Q9QYK9,0.303271,0.635981,0.170093,4.179275,346,4.195598,343
4,G5CB95,0.499500,0.711942,0.078616,Q8C120,0.462818,0.722720,0.124626,2.197379,821,1.644278,878
...,...,...,...,...,...,...,...,...,...,...,...,...
9344,G5CAQ3,0.464617,0.747390,0.087587,E9Q1A5,0.463676,0.716841,0.082554,0.594638,555,0.563788,584
9345,G5AWC8,0.236297,0.652234,0.137725,Q3UXZ6,0.382700,0.676278,0.093927,0.408910,346,0.761354,364
9346,G5C5H8,0.509112,0.716970,0.091686,Q62231,0.503127,0.707789,0.085276,0.613761,284,0.619046,284
9347,G5AR65,0.411550,0.752193,0.046053,Q80VJ2,0.529825,0.755789,0.047719,0.573265,223,0.547302,232


In [47]:
MT_var.to_csv('../../data/mutation_tolerance/all_mt_scores_for_revisions.csv', sep='\t', index=False)