# IntAct - Elaspic Match

**Match**


<b><i class="fa fa-folder-o" aria-hidden="true" style="color:#1976D2"> </i>&nbsp; File Location</b><br>
<p style="background:#F5F5F5; text-indent: 1em;">
<code style="background:#F5F5F5; color:#404040; font-weight:bold; font-size:12px">C:\Users\ibrah\Documents\GitHub\Predicting-Mutation-Effects\src\helpers\helpers_training_data</code>
</p>

<b><i class="far fa-file" aria-hidden="true" style="color:#1976D2"> </i>&nbsp; File Name</b>
<p style="background:#F5F5F5; text-indent: 1em;">
<code style="background:#F5F5F5; color:#404040; font-weight:bold; font-size:12px">Match.ipynb</code>
</p>

<b><i class="far fa-calendar-alt" aria-hidden="true" style="color:#1976D2"> </i>&nbsp; Last Edited</b>
<p style="background:#F5F5F5; text-indent: 1em;">
<code style="background:#F5F5F5; color:#404040; font-weight:bold; font-size:12px">October 06th, 2021</code>
</p>


<div class="alert alert-block" style="background-color: #F5F5F5; border: 1px solid; padding: 10px; border-color: #E0E0E0">
    <b><i class="fa fa-compass" aria-hidden="true" style="color:#404040"></i></b>&nbsp; <b style="color: #404040">Purpose </b> <br>
<div>

- [x] Identify overlapping triplets of $(protein, mutation, interactor)$ between *ELASPIC* and *IntAct* datasets.
    - Isoform suffixes are ignored: the dash and everything after it are stripped from both $protein$ and $interactor$ (e.g. `P60953-1` → `P60953`).
    
    
- [x] Find associated labels of overlapping *valid* triplets.
    - If a triplet results in having both *disrupting* and *decreasing*, it is counted as *disrupting*.
    
    
- [x] Create *training_data* to be used in Machine Learning.
   
    
    
##### Dependent files:
* IntAct Processed File: *processed_data_v3.1.csv*
    
* ELASPIC_RESULTS: *data/allresults_mutations_merged_cleaned_v5.txt*

## Setup

In [1]:
# Imports
import pandas as pd
import sys
import os
import random
from collections import defaultdict
from IPython.display import display
from tqdm.notebook import tqdm

# Notebook Settings
# Let displayed DataFrame cells show up to 1000 characters before being truncated
# (the label-combination tuples displayed later are long).
pd.set_option('display.max_colwidth', 1000)

# Paths
# Input files, given relative to the notebook's working directory.
# NOTE(review): the "Dependent files" list in the header names these files without
# the `_rs` suffix — confirm which names are current.
INTACT_PROCESSED_FILE_PATH = "processed_data_v3.1_rs.csv"
ELASPIC_RESULTS_V5 = "elaspic_results_combined_rs/allresults_mutations_merged_cleaned_v5_rs.txt"

# Necessary functions
def progressbar(it, prefix="", size=20, file=sys.stdout):
    """
    Yield the items of `it` unchanged while drawing a text progress bar on `file`.

    Parameters
    ----------
    it : iterable
        Items to pass through.
    prefix : str
        Text written in front of the bar.
    size : int
        Expected total number of items; it is the denominator of the `done/total`
        counter. The bar itself is `size // 1_000` characters wide, so totals
        below 1000 show the counter only.
    file : file-like
        Stream providing `write` and `flush`. Defaults to `sys.stdout`.

    Yields
    ------
    The items of `it`, in order. The bar is redrawn after each item and a
    newline is written once the iterable is exhausted.
    """
    total = size
    bar_width = size // 1_000

    def show(done):
        # A zero total would divide by zero; draw an empty bar in that case.
        filled = int(bar_width * done / total) if total else 0
        file.write("%s[%s%s] %i/%i\r" % (prefix, "#" * filled, "-" * (bar_width - filled), done, total))
        file.flush()

    show(0)
    for index, item in enumerate(it):
        yield item
        show(index + 1)
    file.write("\n")
    file.flush()

def print_annotation(s):
    """Print `s` preceded by a blank line and underlined with a row of dashes of the same length."""
    underline = "-" * len(s)
    print("\n{}\n{}".format(s, underline))

## Read files

### 1. IntAct

Processed version of IntAct dataset.\
*(36170, 4)* 

In [2]:
# Read IntAct Data (processed CSV; path set in the Setup cell)
intact_data = pd.read_csv(INTACT_PROCESSED_FILE_PATH)

# Size of dataframe as (rows, columns)
print(intact_data.shape)

# First 5 entries (head() defaults to five rows)
intact_data.head()

(36170, 4)


Unnamed: 0,Mutation Effect Label,Affected Protein AC,Mutation,Interaction Participant
0,mutation causing(MI:2227),Q06124-2,D425A,Q6P1J9
1,mutation causing(MI:2227),Q06124-2,C459S,Q6P1J9
2,mutation causing(MI:2227),Q06124-2,Y279C,Q6P1J9
3,mutation causing(MI:2227),P04264,L161P,P37198
4,mutation causing(MI:2227),P45381,E285A,Q14145


### 2. ELASPIC Results

ELASPIC results v5.\
*(16481, 103)*

In [3]:
# Read the tab-separated ELASPIC results, drop fully duplicated rows (keeping the
# first occurrence) and renumber the index so it is contiguous again.
elaspic_data = (
    pd.read_table(ELASPIC_RESULTS_V5, delimiter="\t", low_memory=False)
    .drop_duplicates(keep="first")
    .reset_index(drop=True)
)

# Size of dataframe as (rows, columns)
print(elaspic_data.shape)

# First 3 entries
elaspic_data.head(3)

(16481, 103)


Unnamed: 0,Input_identifier,UniProt_ID,Mutation,Status,Type,COSMIC_mut_ID,ClinVar_mut_ID,UniProt_mut_ID,Domain_name,Domain_clan,...,number_of_residues_mut,IntraclashesEnergy1_wt,IntraclashesEnergy1_mut,IntraclashesEnergy2_wt,IntraclashesEnergy2_mut,Interface_hydrophobic_area,Interface_hydrophilic_area,Interface_total_area,Interface_contact_distance_wt,Interface_contact_distance_mut
0,Q9UBQ0,Q9UBQ0,A141D,done,interface,-,-,-,Metallophos_2,Metallophos_2,...,453.0,59.0065,59.008,68.3251,68.3251,510.69,298.82,809.51,3.66366,3.34983
1,P11474,P11474,Q262E,done,core,-,-,-,zf-C4+Hormone_recep,zf-C4+Hormone_recep,...,350.0,-,-,-,-,-,-,-,-,-
2,Q92783,Q92783,D31Q,done,interface,-,-,-,VHS,VHS,...,278.0,41.2761,41.2109,52.3361,52.3361,223.165,130.435,353.6,2.61438,2.91481


## Helper Functions

In [4]:
def get_intact_entries(uniprot_id_param, mutation_param, interactor_uniprot_id_param, intact_data_param=intact_data):
    """
    Return the IntAct rows that match a (protein, mutation, interactor) triplet.

    "Affected Protein AC" and "Interaction Participant" are compared after being
    cut at the first dash (e.g. "Q06124-2" is compared as "Q06124"); "Mutation"
    is compared as-is. The query arguments themselves are compared as given.
    """
    def before_dash(accession):
        return accession.split('-')[0]

    protein_matches = intact_data_param["Affected Protein AC"].apply(before_dash) == uniprot_id_param
    mutation_matches = intact_data_param["Mutation"] == mutation_param
    interactor_matches = intact_data_param["Interaction Participant"].apply(before_dash) == interactor_uniprot_id_param

    return intact_data_param[protein_matches & mutation_matches & interactor_matches]

In [5]:
def get_elaspic_entries(uniprot_id_param, mutation_param, interactor_uniprot_id_param, elaspic_data_param=elaspic_data):
    """
    Return the ELASPIC rows that match a (protein, mutation, interactor) triplet.

    "UniProt_ID" and "Interactor_UniProt_ID" are compared after being cut at the
    first dash (e.g. "P19878-3" is compared as "P19878"); "Mutation" is compared
    as-is. The query arguments themselves are compared as given.
    """
    def before_dash(accession):
        return accession.split('-')[0]

    protein_matches = elaspic_data_param["UniProt_ID"].apply(before_dash) == uniprot_id_param
    mutation_matches = elaspic_data_param["Mutation"] == mutation_param
    interactor_matches = elaspic_data_param["Interactor_UniProt_ID"].apply(before_dash) == interactor_uniprot_id_param

    return elaspic_data_param[protein_matches & mutation_matches & interactor_matches]

In [6]:
def get_elaspic_entries_triplet(uniprot_id_param, mutation_param, interactor_uniprot_id_param, elaspic_data_param=elaspic_data):
    """
    Same query as `get_elaspic_entries`, reduced to the three triplet columns
    ("UniProt_ID", "Mutation", "Interactor_UniProt_ID").
    """
    triplet_columns = ["UniProt_ID", "Mutation", "Interactor_UniProt_ID"]
    matching_rows = get_elaspic_entries(uniprot_id_param, mutation_param, interactor_uniprot_id_param, elaspic_data_param)

    return matching_rows[triplet_columns]

In [7]:
def count_labels(triplets_param, only_multiple=False):
    """
    Count, per triplet, the combination of IntAct mutation effect labels it carries.

    For each (protein, mutation, interactor) triplet the distinct labels of its
    IntAct rows are collected into a sorted tuple, and the counter for that
    tuple is incremented by one.

    Parameters
    ----------
    triplets_param : iterable of 3-tuples
    only_multiple : bool
        When True, only combinations holding more than one label are counted.

    Returns
    -------
    defaultdict(int) mapping label-combination tuple -> number of triplets.
    """
    label_combination_count = defaultdict(int)

    for triplet_protein, triplet_mutation, triplet_interactor in triplets_param:
        intact_rows = get_intact_entries(triplet_protein, triplet_mutation, triplet_interactor)
        label_combination = tuple(sorted(set(intact_rows["Mutation Effect Label"])))

        # In multiple-only mode, ignore triplets with fewer than two distinct labels.
        if only_multiple and len(label_combination) <= 1:
            continue

        label_combination_count[label_combination] += 1

    return label_combination_count

In [8]:
def get_valid_triplets_and_counts(triplets_param):
    """
    Split off the triplets whose IntAct label combination is acceptable and count those combinations.

    A triplet is rejected only when the sorted tuple of its distinct IntAct
    labels is one of the invalid combinations listed below; every other
    combination (including any single label) is treated as valid.

    Returns
    -------
    valid_triplets_unique : set
        The triplets that were not rejected.
    valid_label_combinations_to_count : dict
        Label-combination tuple -> number of accepted triplets carrying it.
    """
    # Label combinations where no label is a subset of another, i.e. they sit on different hierarchy levels.
    invalid_label_combinations = {
        ('mutation decreasing strength(MI:1133)', 'mutation disrupting strength(MI:1128)', 'mutation with no effect(MI:2226)'),
        ('mutation decreasing strength(MI:1133)', 'mutation with no effect(MI:2226)'),
        ('mutation disrupting strength(MI:1128)', 'mutation with no effect(MI:2226)'),
        ('mutation decreasing strength(MI:1133)', 'mutation increasing strength(MI:1132)'),
        ('mutation disrupting strength(MI:1128)', 'mutation disrupting(MI:0573)', 'mutation with no effect(MI:2226)'),
        ('mutation disrupting(MI:0573)', 'mutation increasing(MI:0382)'),
    }

    valid_triplets_unique = set()
    valid_label_combinations_to_count = {}

    for triplet in triplets_param:
        triplet_protein, triplet_mutation, triplet_interactor = triplet
        intact_labels = get_intact_entries(triplet_protein, triplet_mutation, triplet_interactor)["Mutation Effect Label"]
        label_combination = tuple(sorted(set(intact_labels)))

        # Skip triplets with contradictory labels.
        if label_combination in invalid_label_combinations:
            continue

        valid_triplets_unique.add(triplet)
        seen_so_far = valid_label_combinations_to_count.get(label_combination, 0)
        valid_label_combinations_to_count[label_combination] = seen_so_far + 1

    return valid_triplets_unique, valid_label_combinations_to_count


In [9]:
def check_multiple_versions(triplet_param, elaspic_data_param=elaspic_data):
    """
    Check whether the ELASPIC rows matching a triplet contain multiple versions of a protein.
    E.g:
        Search for (P14598, P363A, P19878)
        Found:
            P14598, P363A, P19878
            P14598, P363A, P19878-3
            P14598, P363A, P19878-4

    Parameters
    ----------
    triplet_param : tuple
        (protein, mutation, interactor).
    elaspic_data_param : DataFrame
        ELASPIC table to search; defaults to the module-level `elaspic_data`.

    Returns
    -------
    The (protein, mutation, interactor) tuple when the matching rows hold more
    than one distinct "UniProt_ID" or "Interactor_UniProt_ID" value, otherwise False.
    """

    uniprot_id_param, mutation_param, interactor_uniprot_id_param = triplet_param

    # Search the table supplied by the caller rather than always the default one.
    queried_data = get_elaspic_entries(uniprot_id_param, mutation_param, interactor_uniprot_id_param, elaspic_data_param)

    # Return triplet if there exists more than one protein (either in self or in interactor) for a queried data with a triplet.
    if len(set(queried_data["UniProt_ID"])) > 1 or len(set(queried_data["Interactor_UniProt_ID"])) > 1:
        return uniprot_id_param, mutation_param, interactor_uniprot_id_param

    return False

## Used with following:
# for triplet in valid_triplets_unique:
#     query_protein, query_mutation, query_interactor = triplet
#     if get_elaspic_entries(query_protein, query_mutation, query_interactor).shape[0] == 2:
#         print(triplet)

In [10]:
def find_match(elaspic_data_param):
    """
    Find the (protein, mutation, interactor) triplets of the given ELASPIC table that also occur in IntAct.

    Each ELASPIC row is looked up in IntAct via `get_intact_entries` (which
    searches its default IntAct table). Rows whose interactor is "-" are skipped,
    and accessions are cut at the first dash before the lookup.
    Prints the total and unique number of matches and returns the unique
    triplets as a set.
    """
    found_triplets = []

    for _, row in tqdm(elaspic_data_param.iterrows(), desc="Searching..: ", total=len(elaspic_data_param)):
        uniprot_id, mutation, interactor_uniprot_id = row["UniProt_ID"], row["Mutation"], row["Interactor_UniProt_ID"]

        # "-" marks a row without an interactor; it cannot form a triplet.
        if interactor_uniprot_id == "-":
            continue

        # Drop the isoform suffix, e.g. P60953-1 → P60953
        candidate = (uniprot_id.split('-')[0], mutation, interactor_uniprot_id.split('-')[0])

        if len(get_intact_entries(*candidate)) > 0:
            found_triplets.append(candidate)

    unique_triplets = set(found_triplets)
    print("Total found triplets:", len(found_triplets))
    print("Unique:", len(unique_triplets))

    return unique_triplets
    

## IntAct vs ELASPIC Matching

Go through the entries in ELASPIC and check whether each one has a match in IntAct.

<div class="alert alert-block" style="background-color: white; border: 2px solid; padding: 10px; border-color: #F57C00">
    <b style="color: #F57C00"><i class="fa fa-warning" aria-hidden="true"></i>&nbsp; Warning</b><br>
<div>
    
###### The process of searching for matched triplets takes approx. 6 minutes

In [11]:
# Searching for matched triplets.
# find_match prints the total and unique match counts and returns the unique triplets as a set.
found_triplets = find_match(elaspic_data)

Searching..:   0%|          | 0/16481 [00:00<?, ?it/s]

In [None]:
# A few samples (found_triplets is a set, so which five appear is not guaranteed to be the same between runs).
list(found_triplets)[:5]

[('O15111', 'K44A', 'P25963'),
 ('Q13241', 'F114A', 'P17693'),
 ('P42773', 'F37I', 'Q00534'),
 ('P07998', 'R67L', 'P13489'),
 ('P35225', 'K137A', 'P78552')]

## Check for "Bad Instances"

### Multiple Label Counts

By looking at the triplets with multiple labels (that is, triplets $t$ whose set of mutation effect labels contains more than one item), undesired combinations of labels can be identified. As shown in the table below, among the $652$ unique triplets, $17$ will be removed because they carry an *invalid* combination of mutation labels (rows in bold).

In [None]:
# Multiple label counts: only triplets whose IntAct rows carry more than one distinct label are counted.
multiple_label_combinations_to_count = count_labels(found_triplets, only_multiple=True)

# Number of distinct multiple-label combinations (dictionary keys), not the number of triplets.
print_annotation(f"Number of multiple-labeled combinations: {len(multiple_label_combinations_to_count)}")

## Printing each multiple-label combination and its count
# for key, value in multiple_label_combinations_to_count.items():
#     print("{} → {}".format(key, value))


Number of multiple-labeled combinations: 10
-------------------------------------------


  
    
| # | Multiple-label | Count | 
| --- | --- | --- |
|  1 | ('mutation decreasing(MI:0119)', 'mutation disrupting(MI:0573)') | 24 |
|  2 | **('mutation decreasing strength(MI:1133)', 'mutation with no effect(MI:2226)')** | **6** |
|  3 | ('mutation decreasing strength(MI:1133)', 'mutation disrupting strength(MI:1128)') | 8 |
|  4 | **('mutation disrupting strength(MI:1128)', 'mutation disrupting(MI:0573)', 'mutation with no effect(MI:2226)')** | **2** |
|  5 | ('mutation decreasing(MI:0119)', 'mutation disrupting strength(MI:1128)') | 1 |
|  6 | **('mutation disrupting strength(MI:1128)', 'mutation with no effect(MI:2226)')** | **4** |
|  7 | **('mutation disrupting(MI:0573)', 'mutation increasing(MI:0382)')** | **1** |
|  8 | ('mutation disrupting strength(MI:1128)', 'mutation disrupting(MI:0573)') | 1 |
|  9 | **('mutation decreasing strength(MI:1133)', 'mutation increasing strength(MI:1132)')** | **3** |
| 10 | **('mutation decreasing strength(MI:1133)', 'mutation disrupting strength(MI:1128)', 'mutation with no effect(MI:2226)')** | **1** |

$$\textit{Number of invalid triplets} : 6 + 2 + 4 + 1 + 3 + 1 = 17 $$

### All Label Counts

In [None]:
# Count every label combination (single- and multi-label) across the matched triplets.
label_combinations_to_count = count_labels(found_triplets)

# Number of matched unique triplets.
print("Length of found_triplets:", len(found_triplets))

# Sum of the per-combination counts.
total_label_count = sum(label_combinations_to_count.values())
print("Total label counted:", total_label_count)

# Each triplet adds exactly one to exactly one combination, so the two totals must agree.
total_label_count == len(found_triplets)

Length of valid_triplets_unique: 652
Total label counted: 652


True

Double checking if all labels are counted. \
$ Number\ of\ unique\ triplets\ = 652$ \
$Sum\ of\ values\ for\ all\ label\ combinations = 652$

### Valid Triplets

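A *valid triplet* is a triplet that has either a single label or multiple labels that do not contradict each other, i.e. its label combination is not one of the invalid combinations listed in `get_valid_triplets_and_counts`.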

In [None]:
# Keep only the triplets with an acceptable label combination, and count those combinations.
valid_triplets_unique, valid_label_combinations_to_count = get_valid_triplets_and_counts(found_triplets)

# One table, most frequent combination first; reused for the display and the total below.
valid_label_counts_table = pd.DataFrame(
    dict(sorted(valid_label_combinations_to_count.items(), key=lambda item: item[1], reverse=True)).items(),
    columns=['Mutation Effect Label', 'Value Count'],
)

print_annotation("Value counts of labels of valid_triplets in IntAct:")
display(valid_label_counts_table)

# The sum of the counts and the number of valid triplets are expected to agree.
print("Total label counted:", valid_label_counts_table['Value Count'].sum())
print("Length of valid_triplets_unique:", len(valid_triplets_unique))



Value counts of labels of valid_triplets in IntAct:
---------------------------------------------------


Unnamed: 0,Mutation Effect Label,Value Count
0,"(mutation disrupting(MI:0573),)",167
1,"(mutation decreasing(MI:0119),)",131
2,"(mutation disrupting strength(MI:1128),)",114
3,"(mutation with no effect(MI:2226),)",82
4,"(mutation decreasing strength(MI:1133),)",54
5,"(mutation increasing(MI:0382),)",27
6,"(mutation decreasing(MI:0119), mutation disrupting(MI:0573))",24
7,"(mutation increasing strength(MI:1132),)",11
8,"(mutation decreasing rate(MI:1130),)",10
9,"(mutation decreasing strength(MI:1133), mutation disrupting strength(MI:1128))",8


Total label counted: 635
Length of valid_triplets_unique: 635


Double checking if all labels are counted. \
$\textit{Number of unique}\ \underline{valid}\ \textit{triplets}  = 635$ \
$\textit{Sum of values for}\ \underline{valid\_label\_combinations}\  = 635$ 

In [None]:
# A few samples (valid_triplets_unique is a set, so which five appear is not guaranteed to be the same between runs).
list(valid_triplets_unique)[:5]

[('O15111', 'K44A', 'P25963'),
 ('Q13241', 'F114A', 'P17693'),
 ('P42773', 'F37I', 'Q00534'),
 ('P07998', 'R67L', 'P13489'),
 ('P35225', 'K137A', 'P78552')]

<div class="alert alert-block" style="background-color: white; border: 2px solid; padding: 10px; border-color: #0097A7">
    <b style="color: #0097A7"><i class="fa fa-info-circle" aria-hidden="true"></i>&nbsp; Info</b><br>
<div>
    
Number of `valid_triplets_unique` : **635**

## How many entries there will be in ELASPIC?

The *training data* will be formed by following:

$$
  \textit{training data} = \underbrace{[\textit{mutation label}]}_\text{IntAct} +
      \underbrace{[\textit{Feature } 1] \dots [\textit{Feature } 103]}_\text{ELASPIC}
$$

Therefore, value counts as well as the total number of entries of training data will be determined by ELASPIC.

In [None]:
# Total number of ELASPIC rows matched by the valid triplets.
elaspic_training_entry_count = 0
for triplet_protein, triplet_mutation, triplet_interactor in valid_triplets_unique:
    matching_rows = get_elaspic_entries(triplet_protein, triplet_mutation, triplet_interactor)
    elaspic_training_entry_count += len(matching_rows)

print(elaspic_training_entry_count)

1101


The count increases from $635$ triplets to $1101$ entries. This implies that a single triplet can correspond to more than one entry in ELASPIC.

# Training Data Label Values

**Note:** In the case of labels being *decreasing* and *disrupting*, corresponding label is decided to be *disrupting*. The behavior can be changed in `label_selector` function, below.

In [None]:
def label_selector(labels):
    """
    Collapse a (valid) multi-label combination into the single label to use.

    `labels` must be one of the sorted label tuples listed below; each of them
    maps to "disrupting". Any other combination raises KeyError.
    """
    disrupting_combinations = (
        ('mutation disrupting strength(MI:1128)', 'mutation disrupting(MI:0573)'),
        ('mutation decreasing(MI:0119)', 'mutation disrupting strength(MI:1128)'),
        ('mutation decreasing(MI:0119)', 'mutation disrupting(MI:0573)'),
        ('mutation decreasing strength(MI:1133)', 'mutation disrupting strength(MI:1128)'),
    )
    label_set_to_single_label = dict.fromkeys(disrupting_combinations, "disrupting")

    return label_set_to_single_label[labels]

The mutation effect labels in the training data will be as follows:

In [None]:
# Label -> number of ELASPIC rows (i.e. future training rows) that will carry it.
train_data_label_count = defaultdict(int)

for triplet in valid_triplets_unique:
    triplet_protein, triplet_mutation, triplet_interactor = triplet

    mutation_effect = get_intact_entries(triplet_protein, triplet_mutation, triplet_interactor)["Mutation Effect Label"]
    label_combination = tuple(sorted(set(mutation_effect)))

    # A single label is used directly; several labels are collapsed by label_selector.
    corresponding_label = (
        label_selector(label_combination) if len(label_combination) > 1 else label_combination[0]
    )

    # Every matching ELASPIC row counts once towards this label.
    queried_elaspic_data = get_elaspic_entries(triplet_protein, triplet_mutation, triplet_interactor)
    train_data_label_count[corresponding_label] += queried_elaspic_data.shape[0]

# One table, most frequent label first; reused for the display and the total below.
train_label_counts_table = pd.DataFrame(
    dict(sorted(train_data_label_count.items(), key=lambda item: item[1], reverse=True)).items(),
    columns=['Mutation Effect Label', 'Value Count'],
)

print_annotation("Training data label counts:")
display(train_label_counts_table)

print_annotation(f"Sum value counts: {train_label_counts_table['Value Count'].sum()}")


Training data label counts:
---------------------------


Unnamed: 0,Mutation Effect Label,Value Count
0,mutation disrupting(MI:0573),268
1,mutation decreasing(MI:0119),210
2,mutation disrupting strength(MI:1128),198
3,mutation decreasing strength(MI:1133),128
4,mutation with no effect(MI:2226),118
5,disrupting,69
6,mutation increasing(MI:0382),57
7,mutation increasing strength(MI:1132),24
8,mutation decreasing rate(MI:1130),21
9,mutation disrupting rate(MI:1129),5



Sum value counts: 1101
----------------------


# Construction of Training Data

The training data will be constructed as a dataframe `training_data`, then will be exported to a file.

In [None]:
def get_corresponding_label(triplet):
    """
    Return the mutation effect label to use for a (protein, mutation, interactor) triplet.

    The distinct labels of the triplet's IntAct rows are collected. A single
    label is returned as-is; several labels are collapsed with `label_selector`
    (so the triplet is assumed to be valid). A triplet without any IntAct
    label raises ValueError.
    """
    triplet_protein, triplet_mutation, triplet_interactor = triplet

    intact_labels = get_intact_entries(triplet_protein, triplet_mutation, triplet_interactor)["Mutation Effect Label"]
    label_combination = tuple(sorted(set(intact_labels)))

    # No IntAct row matched the triplet.
    if not label_combination:
        raise ValueError(f'Invalid Triplet: length={len(label_combination)}')

    if len(label_combination) == 1:
        return label_combination[0]

    return label_selector(label_combination)
    

In [None]:
def constuct_training_data(triplets_param):
    """
    Build the training data: the ELASPIC rows of every given triplet, each
    prefixed with its IntAct mutation effect label.

    For each triplet the matching ELASPIC rows are queried and the label from
    `get_corresponding_label` is inserted as column 0, "Mutation_Effect_Label".
    The per-triplet frames are gathered in a list and concatenated once at the
    end (with a fresh 0..N-1 index) instead of growing a frame row by row.
    """
    def labelled_elaspic_rows(triplet):
        triplet_protein, triplet_mutation, triplet_interactor = triplet
        rows = get_elaspic_entries(triplet_protein, triplet_mutation, triplet_interactor)
        rows.insert(0, "Mutation_Effect_Label", get_corresponding_label(triplet))
        return rows

    data_frames = [labelled_elaspic_rows(triplet) for triplet in triplets_param]

    return pd.concat(data_frames, ignore_index=True)
        

In [None]:
# Construct the training data from the valid triplets
# (ELASPIC rows of each triplet, with the IntAct label as the first column).
training_data = constuct_training_data(valid_triplets_unique)

### Training data at a glance

In [None]:
# (rows, columns) of the assembled training data
print(training_data.shape)

# Both views below show the same first five rows.
training_preview = training_data.head()

print_annotation("First 5 rows:")
display(training_preview)

print_annotation("First 5 rows (Simplified view):")
display(training_preview[["Mutation_Effect_Label", "UniProt_ID", "Mutation", "Interactor_UniProt_ID"]])

(1101, 104)

First 5 rows:
-------------


Unnamed: 0,Mutation_Effect_Label,Input_identifier,UniProt_ID,Mutation,Status,Type,COSMIC_mut_ID,ClinVar_mut_ID,UniProt_mut_ID,Domain_name,...,number_of_residues_mut,IntraclashesEnergy1_wt,IntraclashesEnergy1_mut,IntraclashesEnergy2_wt,IntraclashesEnergy2_mut,Interface_hydrophobic_area,Interface_hydrophilic_area,Interface_total_area,Interface_contact_distance_wt,Interface_contact_distance_mut
0,mutation disrupting(MI:0573),O15111,O15111,K44A,done,interface,-,-,-,Pkinase,...,563.0,231.514,231.178,155.166,155.166,1540.9,807.345,2348.25,2.75538,4.03856
1,mutation disrupting strength(MI:1128),Q13241,Q13241,F114A,done,interface,-,-,-,Lectin_C,...,387.0,71.9462,71.719,153.581,153.581,,388.64,,3.22741,6.73322
2,mutation disrupting(MI:0573),P42773,P42773,F37I,done,interface,-,-,-,Ank,...,455.0,53.3824,53.547,62.1929,62.274,315.075,253.54,568.615,3.02357,2.36604
3,mutation decreasing(MI:0119),P07998,P07998,R67L,done,interface,-,-,-,RnaseA,...,584.0,36.6283,35.0562,144.855,145.174,445.115,433.935,879.045,2.91678,2.96829
4,mutation decreasing(MI:0119),P35225,P35225,K137A,done,interface,-,-,-,IL13,...,332.0,97.5241,97.4666,164.758,164.758,,147.43,,2.63635,3.88839



First 5 rows (Simplified view):
-------------------------------


Unnamed: 0,Mutation_Effect_Label,UniProt_ID,Mutation,Interactor_UniProt_ID
0,mutation disrupting(MI:0573),O15111,K44A,P25963
1,mutation disrupting strength(MI:1128),Q13241,F114A,P17693
2,mutation disrupting(MI:0573),P42773,F37I,Q00534
3,mutation decreasing(MI:0119),P07998,R67L,P13489
4,mutation decreasing(MI:0119),P35225,K137A,P78552


### Exporting the `training_data`

In [None]:
# Name of the output file (relative to the notebook's working directory)
OUTPUT_FILE_PATH = "training_data_M1_rs_based_pandas_1.1.4.txt"

In [None]:
# Exporting as a tab-separated file, without writing the row index as a column
training_data.to_csv(OUTPUT_FILE_PATH, sep='\t', index=False)

---

## Tests

Ensuring everything is as expected.

In [39]:
def confirm_triplets(triplets_param, training_data_param=training_data):
    """
    Spot-check the training data against one randomly chosen triplet.

    For a triplet sampled from `triplets_param`, display its IntAct rows and
    its ELASPIC rows, rebuild the rows the training data is expected to hold
    for it (ELASPIC rows with the label inserted as column 0), and compare
    them with the rows actually found in `training_data_param`.
    Prints "Test OK" when both frames are equal and "Test FAILED" otherwise.

    Parameters
    ----------
    triplets_param : iterable of 3-tuples
        Triplets to sample from.
    training_data_param : DataFrame
        Training data to check; defaults to the module-level `training_data`.

    Returns
    -------
    None. The result is reported through the displayed/printed output only.
    """

    # Sample a triplet
    sample_triplet = random.sample(list(triplets_param), 1)[0]

    # Unpack triplet
    triplet_protein, triplet_mutation, triplet_interactor = sample_triplet

    # Display the IntAct entries
    print_annotation("Displaying the IntAct entries:")
    display(get_intact_entries(triplet_protein, triplet_mutation, triplet_interactor))

    # Display the ELASPIC entries
    print_annotation("Displaying the ELASPIC entries:")
    display(get_elaspic_entries_triplet(triplet_protein, triplet_mutation, triplet_interactor))

    # Rebuild the expected rows: ELASPIC entries with the label inserted as the first column.
    sample_training_data_expected = get_elaspic_entries(triplet_protein, triplet_mutation, triplet_interactor)
    sample_training_data_expected.insert(0, "Mutation_Effect_Label", get_corresponding_label(sample_triplet))
    sample_training_data_expected.reset_index(drop=True, inplace=True)

    # Display the expected training entries
    print_annotation("Displaying the expected training entries:")
    display(sample_training_data_expected)

    # Find entries in the training data supplied by the caller (not always the default one).
    sample_training_data_actual = get_training_entries(
        triplet_protein, triplet_mutation, triplet_interactor, training_data_param=training_data_param
    )
    # Indexes are reset on both sides so that only the row contents are compared.
    sample_training_data_actual.reset_index(drop=True, inplace=True)

    # Display the actual training entries
    print_annotation("Displaying the actual training entries:")
    display(sample_training_data_actual)

    if sample_training_data_expected.equals(sample_training_data_actual):
        print_annotation("Test OK")
    else:
        print_annotation("Test FAILED")
    

In [40]:
def get_training_entries(uniprot_id_param, mutation_param, interactor_uniprot_id_param, training_data_param=training_data):
    """
    Return the training-data rows that match a (protein, mutation, interactor) triplet.

    Runs the `get_elaspic_entries` query on `training_data_param`, which has
    the ELASPIC columns the query uses.
    """
    return get_elaspic_entries(
        uniprot_id_param,
        mutation_param,
        interactor_uniprot_id_param,
        elaspic_data_param=training_data_param,
    )

In [41]:
# Spot-check one randomly sampled valid triplet; prints "Test OK" or "Test FAILED".
confirm_triplets(valid_triplets_unique)


Displaying the IntAct entries:
------------------------------


Unnamed: 0,Mutation Effect Label,Affected Protein AC,Mutation,Interaction Participant
6309,mutation disrupting(MI:0573),Q9BV40,V20P,O43752



Displaying the ELASPIC entries:
-------------------------------


Unnamed: 0,UniProt_ID,Mutation,Interactor_UniProt_ID
14028,Q9BV40,V20P,O43752



Displaying the expected training entries:
-----------------------------------------


Unnamed: 0,Mutation_Effect_Label,Input_identifier,UniProt_ID,Mutation,Status,Type,COSMIC_mut_ID,ClinVar_mut_ID,UniProt_mut_ID,Domain_name,...,number_of_residues_mut,IntraclashesEnergy1_wt,IntraclashesEnergy1_mut,IntraclashesEnergy2_wt,IntraclashesEnergy2_mut,Interface_hydrophobic_area,Interface_hydrophilic_area,Interface_total_area,Interface_contact_distance_wt,Interface_contact_distance_mut
0,mutation disrupting(MI:0573),Q9BV40,Q9BV40,V20P,done,interface,-,-,-,Synaptobrevin,...,134.0,17.339,24.4837,6.14527,6.14527,666.82,255.485,922.305,3.78479,3.83218



Displaying the actual training entries:
---------------------------------------


Unnamed: 0,Mutation_Effect_Label,Input_identifier,UniProt_ID,Mutation,Status,Type,COSMIC_mut_ID,ClinVar_mut_ID,UniProt_mut_ID,Domain_name,...,number_of_residues_mut,IntraclashesEnergy1_wt,IntraclashesEnergy1_mut,IntraclashesEnergy2_wt,IntraclashesEnergy2_mut,Interface_hydrophobic_area,Interface_hydrophilic_area,Interface_total_area,Interface_contact_distance_wt,Interface_contact_distance_mut
0,mutation disrupting(MI:0573),Q9BV40,Q9BV40,V20P,done,interface,-,-,-,Synaptobrevin,...,134.0,17.339,24.4837,6.14527,6.14527,666.82,255.485,922.305,3.78479,3.83218



Test OK
-------


In [42]:
# Look up the training rows of one specific triplet.
get_training_entries("Q495A1", "P114A", "Q92692")

Unnamed: 0,Mutation_Effect_Label,Input_identifier,UniProt_ID,Mutation,Status,Type,COSMIC_mut_ID,ClinVar_mut_ID,UniProt_mut_ID,Domain_name,...,number_of_residues_mut,IntraclashesEnergy1_wt,IntraclashesEnergy1_mut,IntraclashesEnergy2_wt,IntraclashesEnergy2_mut,Interface_hydrophobic_area,Interface_hydrophilic_area,Interface_total_area,Interface_contact_distance_wt,Interface_contact_distance_mut
179,mutation decreasing strength(MI:1133),Q495A1,Q495A1,P114A,done,interface,-,-,-,V-set,...,233.0,20.8889,23.068,51.8571,51.8571,339.65,159.3,498.945,3.31469,3.31469
180,mutation decreasing strength(MI:1133),Q495A1,Q495A1,P114A,done,interface,-,-,-,V-set,...,233.0,24.1325,26.3301,49.4333,49.4333,319.135,155.845,474.98,3.39998,3.39998
