# Justifying Sampling - 2

In [1]:
import pandas as pd
# `scipy.stats.stats` is a deprecated private namespace; `pearsonr` is part of
# the public `scipy.stats` API.
from scipy.stats import pearsonr

In [2]:
# Load the pre-processed training table. The path is relative, so it is
# resolved against the kernel's current working directory.
training_data = pd.read_csv("../../processed_data_740.csv")

In [3]:
# Preview the first rows to check the load and inspect the column layout.
training_data.head()

Unnamed: 0,Mutation_Effect_Label,UniProt_ID,Mutation,Interactor_UniProt_ID,Template_sequence_identity,Alignment_score,Interactor_template_sequence_identity,Interactor_alignment_score,Final_ddG,ProtBert_score,...,number_of_residues_mut,IntraclashesEnergy1_wt,IntraclashesEnergy1_mut,IntraclashesEnergy2_wt,IntraclashesEnergy2_mut,Interface_hydrophobic_area,Interface_hydrophilic_area,Interface_total_area,Interface_contact_distance_wt,Interface_contact_distance_mut
0,0,Q9BPZ3,F118A,P11940,1.0,1.0,1.0,0.96875,1.2197,-0.092273,...,118.0,5.30636,3.79101,52.4675,52.4675,427.775,194.715,622.495,3.35293,3.35293
1,0,P01116,Y40C,P50749,0.9277,0.931315,0.3553,0.1725,2.07695,0.998749,...,329.0,47.6177,47.2116,70.2902,70.2461,464.47,400.03,864.505,3.07827,3.59123
2,0,Q96QK1,F534D,Q9UBQ0,1.0,0.360743,1.0,0.994505,2.69571,0.288297,...,453.0,68.2108,68.1971,59.2321,59.1774,510.69,298.82,809.51,3.03938,3.24435
3,0,Q96QK1,F534D,Q9UBQ0-2,1.0,0.360743,1.0,0.973118,2.69571,0.288297,...,453.0,68.2108,68.1971,59.2321,59.1774,510.69,298.82,809.51,3.03938,3.24435
4,0,O43521-3,G66A,Q07812,1.0,0.714286,0.3007,0.222028,0.683334,-0.051025,...,202.0,3.74828,3.76142,41.5937,41.5937,405.51,273.74,679.245,3.37301,3.22505


In [4]:
def convert_isoform(protein_param):
    """Return the part of ``protein_param`` that precedes its first ``'-'``.

    For an isoform-style identifier such as ``"Q9UBQ0-2"`` this yields
    ``"Q9UBQ0"``; a string without a hyphen is returned unchanged.
    """
    base_id, _, _ = protein_param.partition('-')
    return base_id

In [5]:
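# Disabled: in-place isoform normalisation of the interactor column. The raw
# interactor IDs are kept; is_same_interactor() strips isoform suffixes on the
# fly when comparing interactors.
# training_data["Interactor_UniProt_ID"] = training_data["Interactor_UniProt_ID"].apply(lambda x: convert_isoform(x))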

In [6]:
# Preview again: the normalisation cell above is commented out, so the frame
# (including isoform-suffixed interactor IDs) is unchanged from the first preview.
training_data.head()

Unnamed: 0,Mutation_Effect_Label,UniProt_ID,Mutation,Interactor_UniProt_ID,Template_sequence_identity,Alignment_score,Interactor_template_sequence_identity,Interactor_alignment_score,Final_ddG,ProtBert_score,...,number_of_residues_mut,IntraclashesEnergy1_wt,IntraclashesEnergy1_mut,IntraclashesEnergy2_wt,IntraclashesEnergy2_mut,Interface_hydrophobic_area,Interface_hydrophilic_area,Interface_total_area,Interface_contact_distance_wt,Interface_contact_distance_mut
0,0,Q9BPZ3,F118A,P11940,1.0,1.0,1.0,0.96875,1.2197,-0.092273,...,118.0,5.30636,3.79101,52.4675,52.4675,427.775,194.715,622.495,3.35293,3.35293
1,0,P01116,Y40C,P50749,0.9277,0.931315,0.3553,0.1725,2.07695,0.998749,...,329.0,47.6177,47.2116,70.2902,70.2461,464.47,400.03,864.505,3.07827,3.59123
2,0,Q96QK1,F534D,Q9UBQ0,1.0,0.360743,1.0,0.994505,2.69571,0.288297,...,453.0,68.2108,68.1971,59.2321,59.1774,510.69,298.82,809.51,3.03938,3.24435
3,0,Q96QK1,F534D,Q9UBQ0-2,1.0,0.360743,1.0,0.973118,2.69571,0.288297,...,453.0,68.2108,68.1971,59.2321,59.1774,510.69,298.82,809.51,3.03938,3.24435
4,0,O43521-3,G66A,Q07812,1.0,0.714286,0.3007,0.222028,0.683334,-0.051025,...,202.0,3.74828,3.76142,41.5937,41.5937,405.51,273.74,679.245,3.37301,3.22505


In [7]:
def get_entries_with_pair(protein_param, mutation_param, data):
    """Select the rows of ``data`` for one (protein, mutation) pair.

    Parameters
    ----------
    protein_param
        Value to match against the ``"UniProt_ID"`` column.
    mutation_param
        Value to match against the ``"Mutation"`` column.
    data
        Frame holding at least the ``"UniProt_ID"`` and ``"Mutation"`` columns.

    Returns
    -------
    A copy of the rows where both columns match, keeping the original index
    labels. The copy is independent of ``data``.
    """
    protein_matches = data["UniProt_ID"].eq(protein_param)
    mutation_matches = data["Mutation"].eq(mutation_param)
    return data.loc[protein_matches & mutation_matches].copy()

In [8]:
def is_same_interactor(data):
    """Tell whether every row of ``data`` refers to a single interactor.

    The distinct values of the ``"Interactor_UniProt_ID"`` column are passed
    through ``convert_isoform`` before being compared, so IDs that differ only
    by their isoform suffix count as the same interactor.

    Returns ``True`` when exactly one distinct converted ID remains.
    """
    base_ids = {
        convert_isoform(interactor_id)
        for interactor_id in data["Interactor_UniProt_ID"].unique()
    }
    return len(base_ids) == 1
    

In [10]:
# One (UniProt_ID, Mutation) tuple per row of `training_data`, in row order.
# Despite the name, the list is not deduplicated: a pair appears once for every
# row that carries it.
unique_pairs = list(
    training_data[["UniProt_ID", "Mutation"]].itertuples(index=False, name=None)
)
unique_pairs[:5]

[('Q9BPZ3', 'F118A'),
 ('P01116', 'Y40C'),
 ('Q96QK1', 'F534D'),
 ('Q96QK1', 'F534D'),
 ('O43521-3', 'G66A')]

In [11]:
# Tabulate the pairs as a two-column frame (one row per element of
# `unique_pairs`).
unique_pairs_correlation_data = pd.DataFrame(unique_pairs, columns=["PROTEIN", "MUTATION"])
unique_pairs_correlation_data.head()

Unnamed: 0,PROTEIN,MUTATION
0,Q9BPZ3,F118A
1,P01116,Y40C
2,Q96QK1,F534D
3,Q96QK1,F534D
4,O43521-3,G66A


In [12]:
from itertools import combinations

In [13]:
from tqdm.notebook import tqdm

In [14]:
# Score every element of `unique_pairs` (one per row of `training_data`):
#   * if all entries of the (protein, mutation) pair share a single interactor
#     (isoforms collapsed), the score is the string "NOT APPLICABLE";
#   * otherwise the score is the mean Pearson r over all pairs of entries,
#     rounded to two decimals.
# Exactly one score is appended per element, so `corr_scores` stays aligned
# with `unique_pairs`.
corr_scores = []
for pair in tqdm(unique_pairs):
    query = get_entries_with_pair(pair[0], pair[1], training_data)
    if is_same_interactor(query):
        corr_score = "NOT APPLICABLE"
    else:
        # Positional column slice: keep everything from the fifth column on.
        # Assumes the first four columns are the label/identifier columns shown
        # in the head() preview and the rest are numeric features - TODO confirm.
        # The float cast gives pearsonr a numeric vector, not an object-dtype row.
        feature_rows = query.iloc[:, 4:].astype(float)
        corr_score_values = [
            pearsonr(feature_rows.loc[a], feature_rows.loc[b])[0]
            for a, b in combinations(feature_rows.index, 2)
        ]
        # This branch needs more than one distinct interactor, hence at least
        # two rows, so there is at least one combination to average.
        corr_score = round(sum(corr_score_values) / len(corr_score_values), 2)

    corr_scores.append(corr_score)
    

  0%|          | 0/740 [00:00<?, ?it/s]

In [15]:
# Attach the scores as a new column. `corr_scores` holds one entry per element
# of `unique_pairs`, the list this frame was built from.
unique_pairs_correlation_data["PEARSON_CORR"] = corr_scores

In [16]:
# Tally how often each score occurs. The counts are per row, not per distinct
# (protein, mutation) pair, because `unique_pairs` keeps duplicates.
unique_pairs_correlation_data["PEARSON_CORR"].value_counts()

NOT APPLICABLE    567
1.0                77
0.99               51
0.48               14
0.66                5
0.88                5
0.77                5
0.94                4
0.5                 3
0.87                3
0.93                2
0.98                2
0.52                2
Name: PEARSON_CORR, dtype: int64