# Mutations IntAct TCGA

_The number of mutations predicted to influence an interaction in the TCGA datasets that were already present in the training set should be mentioned?
We also should give the list of the 164 proteins included in the training dataset_.

In [1]:
from datetime import datetime

# Stamp the notebook with the wall-clock time of this run (wrapped in ANSI colour codes).
run_timestamp = datetime.now().strftime("%B %d, %Y %H:%M:%S")
print("\033[32m{}\033[0m".format(run_timestamp))

[32mJune 21, 2022 14:56:24[0m


In [2]:
import pandas as pd
import os

from src.helpers.helpers_predator.common import load_predator
from src.helpers.helpers_analysis.convert_primary_isomer import convert_primary_isomer

# Path to the serialized Predator model (.pkl). It is a relative path, so it is
# presumably resolved against the working directory set by os.chdir below,
# since the model is loaded only after this cell has run — TODO confirm.
PREDATOR_PATH = "PredatorModels/PredatorModel_2022-06-16/cc84a54e/predator.pkl"

# Move the working directory two levels up.
# NOTE(review): the target is relative to the *current* directory, so every
# re-run of this cell climbs another two levels; run it once per kernel.
os.chdir("../../")

# Reflect changes in the modules immediately.
%load_ext autoreload
%autoreload 2

# Interactions in IntAct data

In [3]:
# Load the Predator object stored at PREDATOR_PATH.
# NOTE(review): the ".pkl" suffix suggests pickle deserialization inside
# load_predator — only load model files that come from a trusted source.
predator = load_predator(PREDATOR_PATH)

2022-06-21 14:56:26 |[32m INFO     [0m| src.helpers.helpers_predator.common | Predator object PredatorModels/PredatorModel_2022-06-16/cc84a54e/predator.pkl is loaded successfully.


In [4]:
# Show the first ten random seeds stored in the model's data materials.
predator.data_materials.random_seeds[:10]

[83811, 14593, 3279, 97197, 36049, 32099, 29257, 18290, 96531, 13435]

In [5]:
# Keep only the three columns that identify an interaction triplet in the
# processed training data; work on a copy so the model's frame stays untouched.
intact_data = (
    predator.data_materials["train_data_processed"]
    .loc[:, ["UniProt_ID", "Mutation", "Interactor_UniProt_ID"]]
    .copy()
)
intact_data

Unnamed: 0,UniProt_ID,Mutation,Interactor_UniProt_ID
0,Q9BPZ3,F118A,P11940
1,P01116,Y40C,P50749
2,Q96QK1,F534D,Q9UBQ0
3,Q96QK1,F534D,Q9UBQ0-2
4,O43521-3,G66A,Q07812
...,...,...,...
735,P84243,Q94A,Q9UER7-3
736,Q96QK1,L589D,Q9UBQ0
737,Q96QK1,L589D,Q9UBQ0-2
738,P23297,F72L,P25815


In [6]:
# Apply convert_primary_isomer to the protein column first, then to the
# interactor column, feeding each result into the next call.
for protein_column in ("UniProt_ID", "Interactor_UniProt_ID"):
    intact_data = convert_primary_isomer(protein_column, intact_data)

In [7]:
# Collapse repeated (protein, mutation, interactor) rows and renumber from 0.
intact_data_unique_interactions = (
    intact_data
    .drop_duplicates()
    .reset_index(drop=True)
)
intact_data_unique_interactions

Unnamed: 0,UniProt_ID,Mutation,Interactor_UniProt_ID
0,Q9BPZ3,F118A,P11940
1,P01116,Y40C,P50749
2,Q96QK1,F534D,Q9UBQ0
3,O43521,G66A,Q07812
4,Q13418,Q362H,Q9NVD7
...,...,...,...
434,P35908,L484P,O76015
435,P84243,Q94A,Q9UER7
436,Q96QK1,L589D,Q9UBQ0
437,P23297,F72L,P25815


# Interactions in TCGA data

In [8]:
# Only the predicted interactions are considered (remember that we do not have predictions for *all* interactions in a given TCGA cohort).

In [9]:
# Soft-prediction CSV files dated 2022-06-17, one per TCGA cohort
# (BRCA, COAD, ESCA, GBM, HNSC, OV).
# NOTE(review): these are relative paths; presumably they resolve against the
# working directory set by the earlier os.chdir call — confirm before moving
# or re-running the notebook from a different directory.
BRCA_PREDICTION_PATH = "../data/predictions_datasets/brca_prediction_2022-06-17/ed35a3a3/predictions_soft_2022-06-17.csv"
COAD_PREDICTION_PATH = "../data/predictions_datasets/coad_prediction_2022-06-17/84fd283a/predictions_soft_2022-06-17.csv"
ESCA_PREDICTION_PATH = "../data/predictions_datasets/esca_prediction_2022-06-17/f2d1e99a/predictions_soft_2022-06-17.csv"
GBM_PREDICTION_PATH = "../data/predictions_datasets/gbm_prediction_2022-06-17/8d7f7caa/predictions_soft_2022-06-17.csv"
HNSC_PREDICTION_PATH = "../data/predictions_datasets/hnsc_prediction_2022-06-17/76f498d9/predictions_soft_2022-06-17.csv"
OV_PREDICTION_PATH = "../data/predictions_datasets/ov_prediction_2022-06-17/865d1897/predictions_soft_2022-06-17.csv"

In [10]:
# Read the six cohort prediction files in one pass; the order of the paths
# matches the order of the names they are unpacked into.
(
    brca_predictions,
    coad_predictions,
    esca_predictions,
    gbm_predictions,
    hnsc_predictions,
    ov_predictions,
) = (
    pd.read_csv(prediction_path)
    for prediction_path in (
        BRCA_PREDICTION_PATH,
        COAD_PREDICTION_PATH,
        ESCA_PREDICTION_PATH,
        GBM_PREDICTION_PATH,
        HNSC_PREDICTION_PATH,
        OV_PREDICTION_PATH,
    )
)

In [11]:
# Stack the per-cohort prediction tables on top of each other; each table
# keeps its own row index.
cohort_prediction_frames = [
    brca_predictions,
    coad_predictions,
    esca_predictions,
    gbm_predictions,
    hnsc_predictions,
    ov_predictions,
]
tcga_predictions = pd.concat(cohort_prediction_frames)

In [12]:
# Reduce the pooled predictions to the interaction triplet columns (as a copy).
tcga_interactions = (
    tcga_predictions
    .loc[:, ["UniProt_ID", "Mutation", "Interactor_UniProt_ID"]]
    .copy()
)
tcga_interactions

Unnamed: 0,UniProt_ID,Mutation,Interactor_UniProt_ID
0,P28062,R216W,P40306
1,Q15842,E237K,Q14654
2,Q15842,E237K,P63252
3,Q9UKS6,R24H,Q9BY11
4,Q9UKS6,R24H,Q9UNF0
...,...,...,...
2496,P62942,F100L,Q5T7S2
2497,P42336,H1047R,Q13535
2498,P42336,H1047R,P27986
2499,P42336,H1047R,O00459


In [13]:
# The same triplet can be predicted in several cohorts; keep one row per
# triplet and renumber from 0.
tcga_interactions_unique = (
    tcga_interactions
    .drop_duplicates()
    .reset_index(drop=True)
)
tcga_interactions_unique

Unnamed: 0,UniProt_ID,Mutation,Interactor_UniProt_ID
0,P28062,R216W,P40306
1,Q15842,E237K,Q14654
2,Q15842,E237K,P63252
3,Q9UKS6,R24H,Q9BY11
4,Q9UKS6,R24H,Q9UNF0
...,...,...,...
21184,P24821,T959A,P02751
21185,P11802,T177I,Q14012
21186,P11802,T177I,Q16539
21187,Q86U06,D277H,Q9BWF3


In [14]:
# NOTE(review): hard-coded literal whose meaning is not stated. If it is meant
# to be the number of unique TCGA triplets, note that the preview of
# tcga_interactions_unique ends at index 21187 after reset_index, i.e. 21188
# rows — confirm, and prefer len(tcga_interactions_unique) over a typed number.
21189

21189

In [15]:
# Inner join on the columns both frames share: the triplets present in the
# IntAct training data as well as in the TCGA predictions.
intact_data_unique_interactions.merge(tcga_interactions_unique, how="inner")

Unnamed: 0,UniProt_ID,Mutation,Interactor_UniProt_ID
0,P42773,T69A,Q00534
1,Q8IYM1,D197N,Q8WYJ6
2,Q8IYM1,D197N,Q14141
3,P21860,G284R,P04626


In [16]:
# Same inner join as a markdown table, ready to paste into a report.
shared_triplets = intact_data_unique_interactions.merge(tcga_interactions_unique, how="inner")
print(shared_triplets.to_markdown())

|    | UniProt_ID   | Mutation   | Interactor_UniProt_ID   |
|---:|:-------------|:-----------|:------------------------|
|  0 | P42773       | T69A       | Q00534                  |
|  1 | Q8IYM1       | D197N      | Q8WYJ6                  |
|  2 | Q8IYM1       | D197N      | Q14141                  |
|  3 | P21860       | G284R      | P04626                  |


In [17]:
# Alternative way of finding the common triplets: compare the rows of both frames as plain tuples.
def get_array_of_tuples(df):
    """Return the rows of ``df`` as a list of tuples, one tuple per row.

    Each tuple holds the row's values in column order; the index is not
    included. A frame without rows (or without columns) yields ``[]``.
    """
    # Iterating a DataFrame yields its column labels; fetch each column...
    columns = [df.get(label) for label in df]
    # ...then zip the columns together so that each tuple is one row.
    return list(zip(*columns))

# Build the IntAct triplets once and keep them in a set, so the membership
# test below is a hash lookup instead of rebuilding and scanning a list for
# every TCGA row.
intact_triplets = set(get_array_of_tuples(intact_data_unique_interactions))

# Print, in TCGA row order, every triplet that also occurs in the IntAct data.
print(
    [
        interaction
        for interaction in get_array_of_tuples(tcga_interactions_unique)
        if interaction in intact_triplets
    ]
)

[('P21860', 'G284R', 'P04626'), ('P42773', 'T69A', 'Q00534'), ('Q8IYM1', 'D197N', 'Q14141'), ('Q8IYM1', 'D197N', 'Q8WYJ6')]
