In [1]:
# Common imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import sys
import random

from pprint import pprint
from IPython.display import display
from tqdm.notebook import tqdm

from helpers.helpers_predator.displayers import display_label_counts, display_labels, visualize_label_counts
from helpers.helpers_predator.visualizers import (
    visualize_sampled_train_datasets_label_counts
)

# Number of experiment repetitions; handed to Predator as `n_experiment` below.
# NOTE(review): the trailing values look like previously tried settings — confirm.
NUM_EXPERIMENT_REPEAT = 10_000 # 10, 25, 50

# Seed Python's built-in `random` module so `random_seeds` is identical on every run.
# This does NOT seed NumPy or pandas; `DataFrame.sample` needs its own `random_state`.
SEED = 42
random.seed(SEED)
# One distinct integer seed per experiment, drawn without replacement from 1..99998.
random_seeds = random.sample(range(1, 99999), NUM_EXPERIMENT_REPEAT)

# PATHS
PROJECT_COMMON_FILE_DIR = "../data/"
# NOTE(review): no directory component here — presumably resolved against
# PROJECT_COMMON_FILE_DIR inside Predator; confirm.
MUTATIONS_PATH = "training_data_M1.txt"
INITIAL_COLUMNS_PATH = "../data/initial_columns_59.csv"
BRCA_PATH = "../data/BRCA_INTERFACE_A2.txt"

# Reflect changes in the modules immediately.
# (IPython magics: reload imported project modules before each cell execution.)
%load_ext autoreload
%autoreload 2

In [2]:
from Predator import Predator

In [3]:
# Build the Predator instance from the notebook-level configuration constants.
predator_settings = {
    "project_common_file_dir": PROJECT_COMMON_FILE_DIR,
    "mutations_path": MUTATIONS_PATH,
    "tcga_code_path_pairs": [("brca", BRCA_PATH)],
    "initial_columns_path": INITIAL_COLUMNS_PATH,
    "n_experiment": NUM_EXPERIMENT_REPEAT,
    "eliminate_models": False,
}
predator = Predator(**predator_settings)

2021-09-24 14:21:20 |[36m DEBUG    [0m| Predator | Initializing Predator ..
2021-09-24 14:21:20 |[36m DEBUG    [0m| helpers.helpers_predator.data_materials | Initialize `train_data` ..
2021-09-24 14:21:20 |[36m DEBUG    [0m| helpers.helpers_predator.data_materials | Initialize `train_data_processed` ..
2021-09-24 14:21:20 |[36m DEBUG    [0m| helpers.helpers_predator.data_materials | Initialize `brca` ..
2021-09-24 14:21:20 |[36m DEBUG    [0m| helpers.helpers_predator.data_materials | Initialize `target_brca_data` ..


## Datasets

### Original Datasets

#### 1. Training Data: Mutations

In [4]:
# Peek at the first three rows of the raw training mutations table.
train_data_raw = predator.data_materials["train_data"]
train_data_raw.head(n=3)

Unnamed: 0,Mutation_Effect_Label,Input_identifier,UniProt_ID,Mutation,Status,Type,COSMIC_mut_ID,ClinVar_mut_ID,UniProt_mut_ID,Domain_name,...,number_of_residues_mut,IntraclashesEnergy1_wt,IntraclashesEnergy1_mut,IntraclashesEnergy2_wt,IntraclashesEnergy2_mut,Interface_hydrophobic_area,Interface_hydrophilic_area,Interface_total_area,Interface_contact_distance_wt,Interface_contact_distance_mut
0,mutation decreasing rate(MI:1130),P63000,P63000,Q61L,done,interface,-,-,-,Ras,...,378.0,40.0165,40.2638,61.9441,61.9441,,,,3.7681,3.21627
1,mutation decreasing rate(MI:1130),P63000,P63000,Q61L,done,interface,-,-,-,Ras,...,378.0,40.0165,40.2638,61.9441,61.9441,,,,3.7681,3.21627
2,mutation decreasing rate(MI:1130),P63000,P63000,Q61L,done,interface,-,-,-,Ras,...,378.0,40.0165,40.2638,61.9441,61.9441,,,,3.7681,3.21627


=====

In [5]:
# Narrow the processed training data down to the label plus the
# (protein, mutation, interactor) identifier columns used in the cells below.
identifier_columns = ["Mutation_Effect_Label", "UniProt_ID", "Mutation", "Interactor_UniProt_ID"]
df = predator.data_materials["train_data_processed"][identifier_columns]
df

Unnamed: 0,Mutation_Effect_Label,UniProt_ID,Mutation,Interactor_UniProt_ID
0,0,Q9BPZ3,F118A,P11940
1,0,P01116,Y40C,P50749
2,0,Q96QK1,F534D,Q9UBQ0
3,0,Q96QK1,F534D,Q9UBQ0-2
4,0,O43521-3,G66A,Q07812
...,...,...,...,...
735,0,P84243,Q94A,Q9UER7-3
736,0,Q96QK1,L589D,Q9UBQ0
737,0,Q96QK1,L589D,Q9UBQ0-2
738,0,P23297,F72L,P25815


In [6]:
# Number of distinct proteins (missing IDs are not counted).
df.loc[:, "UniProt_ID"].nunique(dropna=True)

164

In [7]:
# Independent copy holding only the (protein, mutation, interactor) triplet, without the label.
df2 = df.loc[:, ["UniProt_ID", "Mutation", "Interactor_UniProt_ID"]].copy()

In [8]:
# Independent copy holding the label together with the (protein, mutation, interactor) triplet.
label_and_triplet_columns = ["Mutation_Effect_Label", "UniProt_ID", "Mutation", "Interactor_UniProt_ID"]
df3 = df[label_and_triplet_columns].copy()

In [9]:
# Distinct proteins that have at least one entry labelled 0.
df3.loc[df3["Mutation_Effect_Label"].eq(0), "UniProt_ID"].nunique()

111

In [10]:
# Distinct proteins that have at least one entry labelled 1.
df3.loc[df3["Mutation_Effect_Label"].eq(1), "UniProt_ID"].nunique()

65

In [11]:
# Sum of the per-class distinct-protein counts, computed from `df3` rather than
# hand-copied from earlier cell outputs (which would silently go stale if the data changes).
# A protein that appears under both labels is counted twice here, so this total can
# exceed the overall number of distinct proteins.
n_proteins_class_0 = df3.loc[df3["Mutation_Effect_Label"] == 0, "UniProt_ID"].nunique()
n_proteins_class_1 = df3.loc[df3["Mutation_Effect_Label"] == 1, "UniProt_ID"].nunique()
n_proteins_class_0 + n_proteins_class_1

176

In [12]:
# Distinct protein IDs as a plain list; show how many there are and the first five.
protein_id_column = df['UniProt_ID']
unique_proteins = [*protein_id_column.unique()]
print(len(unique_proteins))
unique_proteins[:5]

164


['Q9BPZ3', 'P01116', 'Q96QK1', 'O43521-3', 'Q13418']

In [13]:
# Proteins whose entries carry more than one distinct mutation-effect label.
proteins_with_two_labels = [
    protein_id
    for protein_id in unique_proteins
    if df.loc[df['UniProt_ID'] == protein_id, 'Mutation_Effect_Label'].nunique() > 1
]

print(len(proteins_with_two_labels))
proteins_with_two_labels

12


['Q13418',
 'P11802',
 'Q13241',
 'P42773',
 'P61587',
 'Q71DI3',
 'Q12918',
 'P01903',
 'P63279',
 'O15111',
 'P51587',
 'P42771']

In [14]:
# Draw exactly one random entry per protein and report the resulting label balance.
#
# `DataFrame.sample` draws from NumPy's random state, which `random.seed(SEED)` does not
# touch, so an unseeded `.sample()` yields a different dataset on every run. A dedicated
# generator seeded with SEED makes this cell reproducible under Restart & Run All.
sampling_rng = np.random.RandomState(SEED)

# Group once instead of re-filtering the whole frame for every protein;
# `sort=False` keeps proteins in order of first appearance.
sampled_entries = [
    protein_entries.sample(n=1, random_state=sampling_rng)
    for _, protein_entries in df.groupby('UniProt_ID', sort=False)
]

# Stack the single-row frames and renumber rows 0..n-1 (no in-place mutation).
sampled_data = pd.concat(sampled_entries, axis=0).reset_index(drop=True)
print(sampled_data["Mutation_Effect_Label"].value_counts())
sampled_data

0    106
1     58
Name: Mutation_Effect_Label, dtype: int64


Unnamed: 0,Mutation_Effect_Label,UniProt_ID,Mutation,Interactor_UniProt_ID
0,0,Q9BPZ3,F118A,P11940
1,0,P01116,Y40C,P50749
2,0,Q96QK1,R637D,Q9UBQ0
3,0,O43521-3,G66A,P10415
4,1,Q13418,Q362H,Q9NVD7
...,...,...,...,...
159,0,Q0P5N6,G66D,O43924
160,1,Q7Z3B4,R385K,P37198
161,1,P19429,R145Q,P02585
162,1,P21860,G284R,P04626


In [15]:
# All entries for protein Q13418 (one of the proteins listed as having two labels).
df.loc[df["UniProt_ID"].eq("Q13418")]

Unnamed: 0,Mutation_Effect_Label,UniProt_ID,Mutation,Interactor_UniProt_ID
11,1,Q13418,Q362H,Q9NVD7
150,0,Q13418,M402A,Q9NVD7


In [16]:
# Show `df3`: the copy of `df` with the label and the (protein, mutation, interactor) columns.
df3

Unnamed: 0,Mutation_Effect_Label,UniProt_ID,Mutation,Interactor_UniProt_ID
0,0,Q9BPZ3,F118A,P11940
1,0,P01116,Y40C,P50749
2,0,Q96QK1,F534D,Q9UBQ0
3,0,Q96QK1,F534D,Q9UBQ0-2
4,0,O43521-3,G66A,Q07812
...,...,...,...,...
735,0,P84243,Q94A,Q9UER7-3
736,0,Q96QK1,L589D,Q9UBQ0
737,0,Q96QK1,L589D,Q9UBQ0-2
738,0,P23297,F72L,P25815


In [17]:
# Rows of `df3` that repeat an earlier row in every column (first occurrence is not flagged).
df3.loc[df3.duplicated(keep="first")]

Unnamed: 0,Mutation_Effect_Label,UniProt_ID,Mutation,Interactor_UniProt_ID
172,0,Q9HAU4,C716A,P0CG47
233,1,P01133,H980Y,P00533-3
334,1,Q14203,Q93R,P30622
336,1,Q14203,Q93R,P30622-1
338,1,Q14203,Q93R,P30622-2
372,0,Q13542,L59A,P06730-2
456,0,Q13542,Y54A,P06730-2
470,1,Q14203,Q93K,P30622
472,1,Q14203,Q93K,P30622-1
474,1,Q14203,Q93K,P30622-2


In [18]:
# Shape of the repeated (protein, mutation, interactor) rows in `df2` (label ignored).
repeated_triplets = df2.loc[df2.duplicated()]
repeated_triplets.shape

(13, 3)

=====

In [19]:
# Distinct proteins plus the number of repeated (protein, mutation, interactor) rows,
# computed from the data rather than hand-copied from earlier cell outputs.
# NOTE(review): the notebook does not state what this sum is meant to show — confirm.
df["UniProt_ID"].nunique() + int(df2.duplicated().sum())

177

In [41]:
# NOTE(review): this cell repeats an earlier cell that builds the same `unique_proteins` list.
# Distinct protein IDs as a plain list; print the count and show the first five.
unique_proteins = list(pd.unique(df['UniProt_ID']))
print(len(unique_proteins))
unique_proteins[:5]

164


['Q9BPZ3', 'P01116', 'Q96QK1', 'O43521-3', 'Q13418']

In [None]:
# NOTE(review): this cell repeats an earlier cell that builds the same list.
# Collect every protein whose entries show more than one distinct mutation-effect label.
def _has_multiple_labels(protein_id):
    """Return True when the protein's rows in `df` contain more than one distinct label."""
    labels = df.loc[df['UniProt_ID'] == protein_id, 'Mutation_Effect_Label']
    return labels.nunique() > 1


proteins_with_two_labels = list(filter(_has_multiple_labels, unique_proteins))

print(len(proteins_with_two_labels))
proteins_with_two_labels

12


['Q13418',
 'P11802',
 'Q13241',
 'P42773',
 'P61587',
 'Q71DI3',
 'Q12918',
 'P01903',
 'P63279',
 'O15111',
 'P51587',
 'P42771']

In [21]:
# Smallest and largest per-class label counts observed across all sampled training sets.
#
# NOTE(review): the recorded run of this cell failed with
# KeyError: 'sampled_train_data_list', i.e. the sampled datasets were not present on
# `predator` when it ran. Presumably a sampling step must be executed first — confirm
# against the Predator API and add that cell above.
sampled_train_data_list = predator.data_materials["sampled_train_data_list"]

min_val_count_class_0 = float('+inf')
max_val_count_class_0 = float('-inf')

min_val_count_class_1 = float('+inf')
max_val_count_class_1 = float('-inf')

for i in range(predator.n_experiment):
    label_counts = sampled_train_data_list[i]['Mutation_Effect_Label'].value_counts()
    # `.get(label, 0)`: a sample lacking one class contributes 0 instead of raising KeyError.
    count_class_0 = label_counts.get(0, 0)
    count_class_1 = label_counts.get(1, 0)

    # The original loop updated `min_val_count` / `max_val_count`, which were never
    # initialised (NameError); update the per-class trackers defined above instead.
    min_val_count_class_0 = min(min_val_count_class_0, count_class_0)
    max_val_count_class_0 = max(max_val_count_class_0, count_class_0)
    min_val_count_class_1 = min(min_val_count_class_1, count_class_1)
    max_val_count_class_1 = max(max_val_count_class_1, count_class_1)

print("class 0 (min, max):", min_val_count_class_0, max_val_count_class_0)
print("class 1 (min, max):", min_val_count_class_1, max_val_count_class_1)

KeyError: 'sampled_train_data_list'