# Preliminary Data

### Setup

In [1]:
from datetime import datetime

# Stamp the notebook output with the wall-clock time of this run; the
# surrounding \033[32m ... \033[0m escape codes colour the text green.
print(
    "\033[32m{}\033[0m".format(
        format(datetime.now(), "%B %d, %Y %H:%M:%S")
    )
)

[32mMarch 04, 2022 12:38:27[0m


In [1]:
import pandas as pd
from pandas import DataFrame

# Preliminary Data Paths
# One Excel workbook per TCGA cohort (BRCA, ESCA, GBM, HNSC, OV, COAD).
# The paths are relative, so they resolve against the kernel's working directory.
# Each path embeds the prediction run date and a hash-like folder name
# (presumably a run identifier -- TODO confirm). Note that the COAD workbook
# comes from a later run (2022-01-06 / file dated 2022-02-16) than the other
# cohorts (2021-11-17).
BRCA_PRELIMINARY_DATA_PATH = r"../../data/predictions_datasets/brca_prediction_2021-11-17/558297b6/brca_preliminary_data_cgc_2021-11-17.xlsx"
ESCA_PRELIMINARY_DATA_PATH = r"../../data/predictions_datasets/esca_prediction_2021-11-17/59544312/esca_preliminary_data_cgc_2021-11-17.xlsx"
GBM_PRELIMINARY_DATA_PATH = r"../../data/predictions_datasets/gbm_prediction_2021-11-17/b9cfc7ea/gbm_preliminary_data_cgc_2021-11-17.xlsx"
HNSC_PRELIMINARY_DATA_PATH = r"../../data/predictions_datasets/hnsc_prediction_2021-11-17/53930f30/hnsc_preliminary_data_cgc_2021-11-17.xlsx"
OV_PRELIMINARY_DATA_PATH = r"../../data/predictions_datasets/ov_prediction_2021-11-17/37bf1637/ov_preliminary_data_cgc_2021-11-17.xlsx"
COAD_PRELIMINARY_DATA_PATH = r"../../data/predictions_datasets/coad_prediction_2022-01-06/9789393f/coad_preliminary_data_cgc_2022-02-16.xlsx"

# Reflect changes in the modules immediately.
%load_ext autoreload
%autoreload 2

### Filtering Condition

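### Updated:
Current rule applied by `filterer`: `PATIENT_INTERFACE_COUNT >= 5` and disruptive ratio `>= 0.7`.

[16:46, 23/12/2021] Hilal Kazan (ABU CS Prof. ): sonuç olarak LATS1'den bahsetmek istiyorum
(EN: "In the end, I would like to mention LATS1.")

[16:46, 23/12/2021] Hilal Kazan (ABU CS Prof. ): interface count >=5 yapmamız gerek en güncel halinde
(EN: "In the most recent version we need to use interface count >= 5.")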

<s>

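[11:21, 13/12/2021] Hilal Kazan (ABU CS Prof. ): BRCA ve bütün diğer kohortlar için şu filtreleme daha uygun olacak sanıyorum
(EN: "I think the following filtering will be more appropriate for BRCA and all the other cohorts.")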
[11:21, 13/12/2021] Hilal Kazan (ABU CS Prof. ): interaction count > 5
[11:21, 13/12/2021] Hilal Kazan (ABU CS Prof. ): disruptive ratio > 0.7

    data_filtered = data_filtered[
        (data_filtered["NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR"] >= 5) &
        (data_filtered["RATIO_(prev_two_col)"] >= 0.7)
    ].copy()

</s>

In [2]:
def round_decimals(x, n_decimals=2):
    """Round ``x`` to ``n_decimals`` decimal places when it is a float.

    Parameters
    ----------
    x : Any
        Value to round. Only ``float`` instances are rounded; any other value
        is returned untouched.
    n_decimals : int, default 2
        Number of decimal places handed to the built-in ``round``.

    Returns
    -------
    Any
        The rounded float, or ``x`` itself when it is not a float or when
        ``round`` rejects the arguments with a ``TypeError``.
    """
    if not isinstance(x, float):
        return x
    try:
        return round(x, n_decimals)
    except TypeError:
        # Report the offending value, then hand it back unchanged instead of
        # falling through and silently replacing it with None.
        print(f"VALUE X={x}")
        return x

In [3]:
def filterer(data, tcga, min_interface_count=5, min_ratio=0.7) -> DataFrame:
    """Select proteins with enough interface patients and a high disruptive ratio.

    The disruptive ratio is
    ``NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR / PATIENT_INTERFACE_COUNT``,
    rounded to two decimals and stored in the ``RATIO_(prev_two_col)`` column.

    Parameters
    ----------
    data : DataFrame
        Preliminary data of a single cohort. It must contain the columns
        ``PROTEIN``, ``GENE``, ``PATIENT_CORE_COUNT``,
        ``PATIENT_INTERFACE_COUNT``,
        ``NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR``,
        ``NUM_UNIQUE_INTERACTORS`` and ``CGC_STATUS``. It is not modified.
    tcga : str
        Cohort label written into the leading ``TCGA`` column of the result.
    min_interface_count : int, default 5
        Rows are kept only when ``PATIENT_INTERFACE_COUNT`` is at least this value.
    min_ratio : float, default 0.7
        Rows are kept only when the rounded ratio is at least this value.

    Returns
    -------
    DataFrame
        The rows that pass both thresholds, restricted to the ``TCGA`` column
        plus the relevant columns, sorted by ratio in descending order. The
        original row index is preserved.

    Raises
    ------
    KeyError
        If ``data`` lacks any of the required columns.
    """
    ratio_col = "RATIO_(prev_two_col)"
    interface_col = "PATIENT_INTERFACE_COUNT"
    disruptive_col = "NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR"
    output_columns = [
        "PROTEIN",
        "GENE",
        "PATIENT_CORE_COUNT",
        interface_col,
        disruptive_col,
        ratio_col,
        "NUM_UNIQUE_INTERACTORS",
        "CGC_STATUS",
    ]

    # Fail early with a readable message instead of a bare KeyError halfway through.
    missing = [
        column
        for column in output_columns
        if column != ratio_col and column not in data.columns
    ]
    if missing:
        raise KeyError(f"Input data is missing required column(s): {missing}")

    data_filtered = data.copy()

    # Disruptive ratio, rounded with Python's built-in round() so the values
    # stay identical to what the notebook produced before.
    raw_ratio = data_filtered[disruptive_col] / data_filtered[interface_col]
    data_filtered[ratio_col] = raw_ratio.map(
        lambda ratio: round(ratio, 2) if isinstance(ratio, float) else ratio
    )

    # NOTE: the threshold is tested against the *rounded* ratio, so a raw ratio
    # such as 0.696 is kept with min_ratio=0.7 because it is displayed as 0.70.
    passes_thresholds = (
        (data_filtered[interface_col] >= min_interface_count)
        & (data_filtered[ratio_col] >= min_ratio)
    )

    # Keep only relevant columns; the boolean selection already yields a new frame.
    data_filtered = data_filtered.loc[passes_thresholds, output_columns].copy()

    data_filtered.insert(0, "TCGA", tcga)

    return data_filtered.sort_values(ratio_col, ascending=False)


In [4]:
# Read each cohort's preliminary-data workbook, in the same order as the
# target names on the left-hand side.
(
    brca_preliminary_data,
    esca_preliminary_data,
    gbm_preliminary_data,
    hnsc_preliminary_data,
    ov_preliminary_data,
    coad_preliminary_data,
) = [
    pd.read_excel(workbook_path)
    for workbook_path in (
        BRCA_PRELIMINARY_DATA_PATH,
        ESCA_PRELIMINARY_DATA_PATH,
        GBM_PRELIMINARY_DATA_PATH,
        HNSC_PRELIMINARY_DATA_PATH,
        OV_PRELIMINARY_DATA_PATH,
        COAD_PRELIMINARY_DATA_PATH,
    )
]

In [5]:
# Apply the shared filter to every cohort, tagging each result with its
# TCGA cohort label.
(
    brca_preliminary_data_filtered,
    esca_preliminary_data_filtered,
    gbm_preliminary_data_filtered,
    hnsc_preliminary_data_filtered,
    ov_preliminary_data_filtered,
    coad_preliminary_data_filtered,
) = [
    filterer(cohort_frame, cohort_label)
    for cohort_frame, cohort_label in (
        (brca_preliminary_data, "BRCA"),
        (esca_preliminary_data, "ESCA"),
        (gbm_preliminary_data, "GBM"),
        (hnsc_preliminary_data, "HNSC"),
        (ov_preliminary_data, "OV"),
        (coad_preliminary_data, "COAD"),
    )
]

In [7]:
brca_preliminary_data_filtered

Unnamed: 0,TCGA,PROTEIN,GENE,PATIENT_CORE_COUNT,PATIENT_INTERFACE_COUNT,NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR,RATIO_(prev_two_col),NUM_UNIQUE_INTERACTORS,CGC_STATUS
2465,BRCA,P42336,PIK3CA,35,124,120,0.97,6,+
1204,BRCA,P04637,TP53,141,55,52,0.95,17,+
3126,BRCA,P68431,H3C1,1,13,12,0.92,11,-
3091,BRCA,P62805,H4C1,6,11,10,0.91,16,-
2534,BRCA,P45985,MAP2K4,3,8,7,0.88,7,+
1201,BRCA,P04626,ERBB2,12,14,12,0.86,16,+
1447,BRCA,P0CG48,UBC,0,7,6,0.86,7,-


In [8]:
esca_preliminary_data_filtered

Unnamed: 0,TCGA,PROTEIN,GENE,PATIENT_CORE_COUNT,PATIENT_INTERFACE_COUNT,NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR,RATIO_(prev_two_col),NUM_UNIQUE_INTERACTORS,CGC_STATUS
627,ESCA,P04637,TP53,56,35,34,0.97,17,+
1571,ESCA,P62805,H4C1,2,7,6,0.86,11,-


In [9]:
gbm_preliminary_data_filtered

Unnamed: 0,TCGA,PROTEIN,GENE,PATIENT_CORE_COUNT,PATIENT_INTERFACE_COUNT,NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR,RATIO_(prev_two_col),NUM_UNIQUE_INTERACTORS,CGC_STATUS
1050,GBM,P04637,TP53,52,35,35,1.0,19,+
1449,GBM,P15056,BRAF,1,5,4,0.8,5,+
2925,GBM,Q05655,PRKCD,2,5,4,0.8,10,-


In [10]:
hnsc_preliminary_data_filtered

Unnamed: 0,TCGA,PROTEIN,GENE,PATIENT_CORE_COUNT,PATIENT_INTERFACE_COUNT,NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR,RATIO_(prev_two_col),NUM_UNIQUE_INTERACTORS,CGC_STATUS
1012,HNSC,P01112,HRAS,1,26,26,1.0,26,+
3410,HNSC,Q14790,CASP8,13,5,5,1.0,5,+
1499,HNSC,P15056,BRAF,0,6,6,1.0,6,+
2792,HNSC,P63000,RAC1,0,11,11,1.0,8,+
2381,HNSC,P49336,CDK8,2,6,6,1.0,3,-
1099,HNSC,P04637,TP53,134,59,58,0.98,16,+
5467,HNSC,Q969H0,FBXW7,2,18,17,0.94,3,+
2220,HNSC,P42336,PIK3CA,13,14,13,0.93,5,+
2236,HNSC,P42771,CDKN2A,1,19,17,0.89,23,+
1754,HNSC,P22607,FGFR3,0,6,5,0.83,20,+


In [11]:
ov_preliminary_data_filtered

Unnamed: 0,TCGA,PROTEIN,GENE,PATIENT_CORE_COUNT,PATIENT_INTERFACE_COUNT,NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR,RATIO_(prev_two_col),NUM_UNIQUE_INTERACTORS,CGC_STATUS
927,OV,P04637,TP53,147,80,80,1.0,19,+
782,OV,O95835,LATS1,1,5,4,0.8,7,+
2436,OV,P62805,H4C1,8,8,6,0.75,18,-


In [12]:
coad_preliminary_data_filtered

Unnamed: 0,TCGA,PROTEIN,GENE,PATIENT_CORE_COUNT,PATIENT_INTERFACE_COUNT,NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR,RATIO_(prev_two_col),NUM_UNIQUE_INTERACTORS,CGC_STATUS
212,COAD,O00238,BMPR1B,2,6,6,1.0,16,-
3713,COAD,P61586,RHOA,0,5,5,1.0,41,+
9855,COAD,Q9UKV0,HDAC9,7,5,5,1.0,3,-
7578,COAD,Q96E17,RAB3C,1,6,6,1.0,4,-
2110,COAD,P16885,PLCG2,4,6,6,1.0,14,-
2228,COAD,P19784,CSNK2A2,2,9,9,1.0,10,-
4709,COAD,Q15365,PCBP1,4,9,9,1.0,5,+
2968,COAD,P41743,PRKCI,2,5,5,1.0,13,-
4337,COAD,Q13310,PABPC4,2,5,5,1.0,5,-
3870,COAD,P84022,SMAD3,5,6,6,1.0,5,+


In [13]:
# Stack the per-cohort filtered tables into one frame. ignore_index=True
# discards the per-cohort row labels and renumbers the rows from 0.
concated_preliminary_filtered_data = pd.concat(
    objs=(
        brca_preliminary_data_filtered,
        esca_preliminary_data_filtered,
        gbm_preliminary_data_filtered,
        hnsc_preliminary_data_filtered,
        ov_preliminary_data_filtered,
        coad_preliminary_data_filtered,
    ),
    ignore_index=True,
)

In [14]:
concated_preliminary_filtered_data

Unnamed: 0,TCGA,PROTEIN,GENE,PATIENT_CORE_COUNT,PATIENT_INTERFACE_COUNT,NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR,RATIO_(prev_two_col),NUM_UNIQUE_INTERACTORS,CGC_STATUS
0,BRCA,P42336,PIK3CA,35,124,120,0.97,6,+
1,BRCA,P04637,TP53,141,55,52,0.95,17,+
2,BRCA,P68431,H3C1,1,13,12,0.92,11,-
3,BRCA,P62805,H4C1,6,11,10,0.91,16,-
4,BRCA,P45985,MAP2K4,3,8,7,0.88,7,+
...,...,...,...,...,...,...,...,...,...
61,COAD,Q16478,GRIK5,13,5,4,0.80,4,-
62,COAD,Q16659,MAPK6,2,5,4,0.80,4,-
63,COAD,P07948,LYN,3,8,6,0.75,14,-
64,COAD,Q9BXA7,TSSK1B,7,7,5,0.71,3,-


In [14]:
concated_preliminary_filtered_data

Unnamed: 0,TCGA,PROTEIN,GENE,PATIENT_CORE_COUNT,PATIENT_INTERFACE_COUNT,NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR,RATIO_(prev_two_col),NUM_UNIQUE_INTERACTORS,CGC_STATUS
0,BRCA,P42336,PIK3CA,35,124,120,0.97,6,+
1,BRCA,P04637,TP53,141,55,52,0.95,17,+
2,BRCA,P68431,H3C1,1,13,12,0.92,11,-
3,BRCA,P62805,H4C1,6,11,10,0.91,16,-
4,BRCA,P45985,MAP2K4,3,8,7,0.88,7,+
...,...,...,...,...,...,...,...,...,...
61,COAD,Q16478,GRIK5,13,5,4,0.80,4,-
62,COAD,Q16659,MAPK6,2,5,4,0.80,4,-
63,COAD,P07948,LYN,3,8,6,0.75,14,-
64,COAD,Q9BXA7,TSSK1B,7,7,5,0.71,3,-


In [12]:
concated_preliminary_filtered_data.to_clipboard()

- - -

## Construct Table (Relatively simplified)

In [9]:
# Build the simplified BRCA table from the filtered BRCA cohort frame.
# The previous source name `preliminary_data_filtered` is not defined anywhere
# in this notebook, so the cell failed on a fresh kernel; the BRCA frame
# produced by `filterer` is the one whose rows match this table.
brca_data = brca_preliminary_data_filtered[
    ["GENE", "PATIENT_CORE_COUNT", "PATIENT_INTERFACE_COUNT", "NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR"]
].copy()
# Disruptive ratio, rounded to two decimals.
brca_data["RATIO_(prev_two_col)"] = brca_data["NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR"] / brca_data["PATIENT_INTERFACE_COUNT"]
brca_data["RATIO_(prev_two_col)"] = brca_data["RATIO_(prev_two_col)"].apply(lambda x: round(x, 2))
# Appended last so it appears after the ratio column; rows align on the index.
brca_data["NUM_UNIQUE_INTERACTORS"] = brca_preliminary_data_filtered["NUM_UNIQUE_INTERACTORS"]
brca_data = brca_data.sort_values("RATIO_(prev_two_col)", ascending=False)

In [10]:
brca_data

Unnamed: 0,GENE,PATIENT_CORE_COUNT,PATIENT_INTERFACE_COUNT,NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR,RATIO_(prev_two_col),NUM_UNIQUE_INTERACTORS
2465,PIK3CA,35,124,120,0.97,6
1204,TP53,141,55,52,0.95,17
3126,H3C1,1,13,12,0.92,11
3091,H4C1,6,11,10,0.91,16
2534,MAP2K4,3,8,7,0.88,7
1201,ERBB2,12,14,12,0.86,16
1447,UBC,0,7,6,0.86,7


In [11]:
brca_data.to_clipboard()

In [21]:
# Use PATIENT_CORE_COUNT as the row index of the table.
# The guard keeps the cell idempotent: without it, a second run would drop the
# PATIENT_CORE_COUNT index in reset_index(drop=True) and then raise KeyError
# in set_index because the column no longer exists.
if brca_data.index.name != "PATIENT_CORE_COUNT":
    brca_data = brca_data.reset_index(drop=True).set_index("PATIENT_CORE_COUNT")
brca_data

Unnamed: 0_level_0,PATIENT_INTERFACE_COUNT,NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR,RATIO_(prev_two_col),NUM_UNIQUE_INTERACTORS
PATIENT_CORE_COUNT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12,14,12,0.857143,16
141,55,52,0.945455,17
0,7,6,0.857143,7
35,124,120,0.967742,6
3,8,7,0.875,7
6,11,10,0.909091,16
1,13,12,0.923077,11


In [22]:
print(brca_data.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  PATIENT\_INTERFACE\_COUNT &  NUM\_INTERFACE\_PATIENTS\_DISRUPTIVE\_INTERACTOR &  RATIO\_(prev\_two\_col) &  NUM\_UNIQUE\_INTERACTORS \\
PATIENT\_CORE\_COUNT &                          &                                               &                       &                         \\
\midrule
12                 &                       14 &                                            12 &              0.857143 &                      16 \\
141                &                       55 &                                            52 &              0.945455 &                      17 \\
0                  &                        7 &                                             6 &              0.857143 &                       7 \\
35                 &                      124 &                                           120 &              0.967742 &                       6 \\
3                  &                        8 &                                 