# Unique Proteins in TCGA

_It is not clear how the mutations observed in the TCGA dataset are pre-processed to give the triplets and how many different proteins are represented in those triplets._

In [1]:
from datetime import datetime

# Stamp the notebook with the wall-clock time of this run; the ANSI escape
# codes around the placeholder render the text in green and then reset.
run_timestamp = datetime.now().strftime("%B %d, %Y %H:%M:%S")
print("\033[32m{}\033[0m".format(run_timestamp))

[32mJune 21, 2022 15:04:26[0m


In [2]:
import pandas as pd
import os

# Move the working directory two levels up (presumably so that the relative
# paths used later in the notebook resolve -- confirm against the repo layout).
# A bare os.chdir("../../") is relative to the *current* directory, so
# re-running this cell would climb two more levels each time and break every
# later relative read. Remember where the kernel started and always resolve
# from there, which makes the cell safe to re-run.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, "..", ".."))

# Load IPython's autoreload extension and select mode 2, so that edits made to
# imported modules are picked up automatically before each cell executes
# (no kernel restart needed).
%load_ext autoreload
%autoreload 2

# Interactions in TCGA data

In [3]:
# Every prediction file shares the same layout; only the lowercase cohort name
# and the hex-named sub-directory differ between cohorts.
_PREDICTION_PATH_TEMPLATE = (
    "../data/predictions_datasets/"
    "{cohort}_prediction_2022-06-17/{subdir}/predictions_soft_2022-06-17.csv"
)

BRCA_PREDICTION_PATH = _PREDICTION_PATH_TEMPLATE.format(cohort="brca", subdir="ed35a3a3")
COAD_PREDICTION_PATH = _PREDICTION_PATH_TEMPLATE.format(cohort="coad", subdir="84fd283a")
ESCA_PREDICTION_PATH = _PREDICTION_PATH_TEMPLATE.format(cohort="esca", subdir="f2d1e99a")
GBM_PREDICTION_PATH = _PREDICTION_PATH_TEMPLATE.format(cohort="gbm", subdir="8d7f7caa")
HNSC_PREDICTION_PATH = _PREDICTION_PATH_TEMPLATE.format(cohort="hnsc", subdir="76f498d9")
OV_PREDICTION_PATH = _PREDICTION_PATH_TEMPLATE.format(cohort="ov", subdir="865d1897")

In [4]:
# Read the prediction CSV of every cohort, in the same order as the path
# constants, and bind each resulting frame to its own name.
(
    brca_predictions,
    coad_predictions,
    esca_predictions,
    gbm_predictions,
    hnsc_predictions,
    ov_predictions,
) = (
    pd.read_csv(prediction_path)
    for prediction_path in (
        BRCA_PREDICTION_PATH,
        COAD_PREDICTION_PATH,
        ESCA_PREDICTION_PATH,
        GBM_PREDICTION_PATH,
        HNSC_PREDICTION_PATH,
        OV_PREDICTION_PATH,
    )
)

In [5]:
# Number of distinct UniProt IDs in the BRCA predictions; dropna=False keeps a
# missing value counted as its own entry, exactly like len(<column>.unique()).
brca_predictions["UniProt_ID"].nunique(dropna=False)

1198

In [6]:
print("# of unique proteins in predicted interactions in TCGA datasets")

# Cohort label -> predictions frame; insertion order fixes the column order.
cohort_frames = {
    "BRCA": brca_predictions,
    "COAD": coad_predictions,
    "ESCA": esca_predictions,
    "GBM": gbm_predictions,
    "HNSC": hnsc_predictions,
    "OV": ov_predictions,
}

# One-row summary table: the count of distinct UniProt IDs per cohort.
unique_protein_counts = {
    cohort: len(frame["UniProt_ID"].unique())
    for cohort, frame in cohort_frames.items()
}
df = pd.DataFrame(unique_protein_counts, index=["# of unique proteins"])

df

# of unique proteins in predicted interactions in TCGA datasets


Unnamed: 0,BRCA,COAD,ESCA,GBM,HNSC,OV
# of unique proteins,1198,1864,413,975,1074,820


In [7]:
# Render the summary as a LaTeX table with one row per cohort (the frame is
# transposed first so cohorts become the row labels).
latex_table = df.transpose().to_latex()
print(latex_table)

\begin{tabular}{lr}
\toprule
{} &  \# of unique proteins \\
\midrule
BRCA &                  1198 \\
COAD &                  1864 \\
ESCA &                   413 \\
GBM  &                   975 \\
HNSC &                  1074 \\
OV   &                   820 \\
\bottomrule
\end{tabular}



# Unique proteins in IntAct

In [8]:
# Table used for the IntAct comparison below; the path is relative to the
# current working directory.
INTACT_DATA_PATH = "processed_data_740.csv"
intact_data = pd.read_csv(INTACT_DATA_PATH)

In [9]:
# Distinct UniProt IDs present in the IntAct table.
intact_uniprot_column = intact_data["UniProt_ID"]
intact_unique_proteins = intact_uniprot_column.unique()

In [10]:
# Union of the distinct UniProt IDs seen in any of the six cohort prediction
# frames.
tcga_unique_proteins = set().union(
    *(
        cohort_predictions["UniProt_ID"].unique()
        for cohort_predictions in (
            brca_predictions,
            coad_predictions,
            esca_predictions,
            gbm_predictions,
            hnsc_predictions,
            ov_predictions,
        )
    )
)

In [11]:
# Total number of distinct UniProt IDs across all TCGA cohorts combined.
n_tcga_unique_proteins = len(tcga_unique_proteins)
n_tcga_unique_proteins

3035

In [12]:
# Count the UniProt IDs that occur both in the IntAct table and in the
# combined TCGA predictions.
shared_proteins = tcga_unique_proteins & set(intact_unique_proteins)
len(shared_proteins)

118