# Preliminary Data

### Setup

In [1]:
import pandas as pd
from pandas import DataFrame

# Preliminary Data Paths
# One Excel workbook per TCGA cohort (BRCA, ESCA, GBM, HNSC, OV). The paths are
# relative ("../../data/..."), so they resolve against the notebook's working directory.
# NOTE(review): the short hex directory (e.g. "558297b6") looks like a per-run
# identifier of the prediction dataset -- confirm before pointing at a different run.
BRCA_PRELIMINARY_DATA_PATH = r"../../data/predictions_datasets/brca_prediction_2021-11-17/558297b6/brca_preliminary_data_cgc_2021-11-17.xlsx"
ESCA_PRELIMINARY_DATA_PATH = r"../../data/predictions_datasets/esca_prediction_2021-11-17/59544312/esca_preliminary_data_cgc_2021-11-17.xlsx"
GBM_PRELIMINARY_DATA_PATH = r"../../data/predictions_datasets/gbm_prediction_2021-11-17/b9cfc7ea/gbm_preliminary_data_cgc_2021-11-17.xlsx"
HNSC_PRELIMINARY_DATA_PATH = r"../../data/predictions_datasets/hnsc_prediction_2021-11-17/53930f30/hnsc_preliminary_data_cgc_2021-11-17.xlsx"
OV_PRELIMINARY_DATA_PATH = r"../../data/predictions_datasets/ov_prediction_2021-11-17/37bf1637/ov_preliminary_data_cgc_2021-11-17.xlsx"

# Reflect changes in the modules immediately.
# Mode 2 of the IPython autoreload extension reloads imported modules before each
# cell execution, so edits to local .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

### Filtering Condition

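Filtering criteria suggested by Hilal Kazan (ABU CS Prof.) in a chat message on 13/12/2021 at 11:21:

> BRCA ve bütün diğer kohortlar için şu filtreleme daha uygun olacak sanıyorum
>
> *(English: "I think the following filtering will be more suitable for BRCA and all the other cohorts.")*

- interaction count > 5
- disruptive ratio > 0.7

**Note:** the `filterer` function below implements these as inclusive thresholds: `NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR >= 5` and `RATIO_(prev_two_col) >= 0.7`.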

In [2]:
def round_decimals(x, n_decimals=2):
    """Round ``x`` to ``n_decimals`` decimal places when it is a float.

    Parameters
    ----------
    x : Any
        Value to round. Only ``float`` instances are rounded; every other
        value (ints, strings, ``None``, ...) is returned unchanged.
    n_decimals : int, default 2
        Number of decimal places passed to the built-in ``round``.

    Returns
    -------
    Any
        The rounded float, or ``x`` itself when it is not a float or when
        rounding fails.
    """
    # Non-float values pass straight through.
    if not isinstance(x, float):
        return x

    try:
        return round(x, n_decimals)
    except TypeError:
        # ``round`` rejects e.g. a non-integer ``n_decimals``. Report the value and
        # hand it back unchanged instead of silently turning it into ``None``.
        print(f"VALUE X={x}")
        return x

In [3]:
def filterer(data, tcga, min_disruptive_patients=5, min_disruptive_ratio=0.7, n_decimals=2) -> DataFrame:
    """Select proteins whose interface mutations are predominantly disruptive.

    The disruptive ratio is
    ``NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR / PATIENT_INTERFACE_COUNT``.
    A row is kept when its disruptive-patient count and its *exact* ratio both
    reach their (inclusive) thresholds. The ratio is rounded only for the value
    stored in the output column, so a ratio such as 16/23 = 0.6957 is not
    admitted merely because it displays as 0.70.

    Parameters
    ----------
    data : DataFrame
        Preliminary data for one cohort. Must contain the columns ``PROTEIN``,
        ``GENE``, ``PATIENT_CORE_COUNT``, ``PATIENT_INTERFACE_COUNT``,
        ``NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR``,
        ``NUM_UNIQUE_INTERACTORS`` and ``CGC_STATUS``. It is not modified.
    tcga : str
        Cohort label written into the leading ``TCGA`` column of the result.
    min_disruptive_patients : int, default 5
        Inclusive lower bound on ``NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR``.
    min_disruptive_ratio : float, default 0.7
        Inclusive lower bound on the exact disruptive ratio.
    n_decimals : int, default 2
        Decimal places used for the stored ``RATIO_(prev_two_col)`` values.

    Returns
    -------
    DataFrame
        Filtered rows restricted to the relevant columns, with ``TCGA`` as the
        first column, sorted by ``RATIO_(prev_two_col)`` in descending order.
        The original row labels are preserved.
    """
    ratio_column = "RATIO_(prev_two_col)"
    disruptive_column = "NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR"
    output_columns = [
        "PROTEIN",
        "GENE",
        "PATIENT_CORE_COUNT",
        "PATIENT_INTERFACE_COUNT",
        disruptive_column,
        ratio_column,
        "NUM_UNIQUE_INTERACTORS",
        "CGC_STATUS",
    ]

    # Work on a copy so the caller's frame never gains the ratio column.
    data_filtered = data.copy()

    # Exact (unrounded) disruptive ratio; this is what the threshold is tested against.
    # NOTE(review): a zero PATIENT_INTERFACE_COUNT yields NaN/inf here -- confirm the
    # input never contains such rows.
    exact_ratio = data_filtered[disruptive_column] / data_filtered["PATIENT_INTERFACE_COUNT"]

    # Filtering based on the given conditions (both bounds inclusive).
    keep = (data_filtered[disruptive_column] >= min_disruptive_patients) & (
        exact_ratio >= min_disruptive_ratio
    )

    # Round only the stored value; element-wise built-in round() keeps the same
    # rounding results the notebook has always shown.
    data_filtered[ratio_column] = exact_ratio.apply(
        lambda value: round(value, n_decimals) if isinstance(value, float) else value
    )

    # Keep only the surviving rows and the relevant columns.
    data_filtered = data_filtered.loc[keep, output_columns].copy()

    data_filtered.insert(0, "TCGA", tcga)

    return data_filtered.sort_values(ratio_column, ascending=False)


In [4]:
# Load each cohort's preliminary-data workbook into its own DataFrame.
# Reading is kept in a separate cell from filtering so the filter can be re-run cheaply.
brca_preliminary_data = pd.read_excel(io=BRCA_PRELIMINARY_DATA_PATH)
esca_preliminary_data = pd.read_excel(io=ESCA_PRELIMINARY_DATA_PATH)
gbm_preliminary_data = pd.read_excel(io=GBM_PRELIMINARY_DATA_PATH)
hnsc_preliminary_data = pd.read_excel(io=HNSC_PRELIMINARY_DATA_PATH)
ov_preliminary_data = pd.read_excel(io=OV_PRELIMINARY_DATA_PATH)

In [5]:
# Apply the shared filter to every cohort; `tcga` becomes the value of the
# leading "TCGA" column so rows stay attributable after concatenation.
brca_preliminary_data_filtered = filterer(data=brca_preliminary_data, tcga="BRCA")
esca_preliminary_data_filtered = filterer(data=esca_preliminary_data, tcga="ESCA")
gbm_preliminary_data_filtered = filterer(data=gbm_preliminary_data, tcga="GBM")
hnsc_preliminary_data_filtered = filterer(data=hnsc_preliminary_data, tcga="HNSC")
ov_preliminary_data_filtered = filterer(data=ov_preliminary_data, tcga="OV")

In [6]:
# Display the filtered BRCA rows (bare last expression -> rich DataFrame rendering).
brca_preliminary_data_filtered

Unnamed: 0,TCGA,PROTEIN,GENE,PATIENT_CORE_COUNT,PATIENT_INTERFACE_COUNT,NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR,RATIO_(prev_two_col),NUM_UNIQUE_INTERACTORS,CGC_STATUS
2465,BRCA,P42336,PIK3CA,35,124,120,0.97,6,+
1204,BRCA,P04637,TP53,141,55,52,0.95,17,+
3126,BRCA,P68431,H3C1,1,13,12,0.92,11,-
3091,BRCA,P62805,H4C1,6,11,10,0.91,16,-
2534,BRCA,P45985,MAP2K4,3,8,7,0.88,7,+
1201,BRCA,P04626,ERBB2,12,14,12,0.86,16,+
1447,BRCA,P0CG48,UBC,0,7,6,0.86,7,-


In [7]:
# Stack the per-cohort results into one table in the order BRCA, ESCA, GBM, HNSC, OV.
# ignore_index=True discards the original row labels and renumbers rows 0..n-1.
concated_preliminary_filtered_data = pd.concat(
    objs=[
        brca_preliminary_data_filtered,
        esca_preliminary_data_filtered,
        gbm_preliminary_data_filtered,
        hnsc_preliminary_data_filtered,
        ov_preliminary_data_filtered,
    ],
    ignore_index=True,
)

In [8]:
# Display the combined table of all cohorts that passed the filter.
concated_preliminary_filtered_data

Unnamed: 0,TCGA,PROTEIN,GENE,PATIENT_CORE_COUNT,PATIENT_INTERFACE_COUNT,NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR,RATIO_(prev_two_col),NUM_UNIQUE_INTERACTORS,CGC_STATUS
0,BRCA,P42336,PIK3CA,35,124,120,0.97,6,+
1,BRCA,P04637,TP53,141,55,52,0.95,17,+
2,BRCA,P68431,H3C1,1,13,12,0.92,11,-
3,BRCA,P62805,H4C1,6,11,10,0.91,16,-
4,BRCA,P45985,MAP2K4,3,8,7,0.88,7,+
5,BRCA,P04626,ERBB2,12,14,12,0.86,16,+
6,BRCA,P0CG48,UBC,0,7,6,0.86,7,-
7,ESCA,P04637,TP53,56,35,34,0.97,17,+
8,ESCA,P62805,H4C1,2,7,6,0.86,11,-
9,GBM,P04637,TP53,52,35,35,1.0,19,+


In [9]:
# Copy the combined table to the system clipboard (presumably for pasting into a
# spreadsheet or document). This is a side effect only and needs a clipboard
# backend, so it may fail on a headless machine.
concated_preliminary_filtered_data.to_clipboard()

- - -

## Construct Table (Relatively simplified)

In [9]:
# Build a simplified BRCA table: gene, patient counts, disruptive ratio, interactor count.
# The source frame is `brca_preliminary_data_filtered` (the BRCA output of `filterer`);
# the name `preliminary_data_filtered` is not defined anywhere in this notebook, so the
# cell would raise NameError on a fresh kernel.
brca_data = brca_preliminary_data_filtered[
    ["GENE", "PATIENT_CORE_COUNT", "PATIENT_INTERFACE_COUNT", "NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR"]
].copy()
# Disruptive ratio = disruptive-interactor patients / interface patients, rounded to 2 decimals.
brca_data["RATIO_(prev_two_col)"] = brca_data["NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR"] / brca_data["PATIENT_INTERFACE_COUNT"]
brca_data["RATIO_(prev_two_col)"] = brca_data["RATIO_(prev_two_col)"].apply(lambda x: round(x, 2))
# Added after the ratio so the column order is: counts, ratio, unique interactors.
# The assignment aligns on the shared row labels.
brca_data["NUM_UNIQUE_INTERACTORS"] = brca_preliminary_data_filtered["NUM_UNIQUE_INTERACTORS"]
brca_data = brca_data.sort_values("RATIO_(prev_two_col)", ascending=False)

In [10]:
# Display the simplified BRCA table.
brca_data

Unnamed: 0,GENE,PATIENT_CORE_COUNT,PATIENT_INTERFACE_COUNT,NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR,RATIO_(prev_two_col),NUM_UNIQUE_INTERACTORS
2465,PIK3CA,35,124,120,0.97,6
1204,TP53,141,55,52,0.95,17
3126,H3C1,1,13,12,0.92,11
3091,H4C1,6,11,10,0.91,16
2534,MAP2K4,3,8,7,0.88,7
1201,ERBB2,12,14,12,0.86,16
1447,UBC,0,7,6,0.86,7


In [11]:
# Copy the simplified BRCA table to the system clipboard (side effect only; needs a
# clipboard backend, so it may fail on a headless machine).
brca_data.to_clipboard()

In [21]:
# Drop the original row labels, then use PATIENT_CORE_COUNT as the index so the
# exported table has no separate row-number column.
# NOTE(review): this rebinds `brca_data` in place, so the cell is not idempotent --
# a second run raises KeyError because PATIENT_CORE_COUNT is no longer a column.
# NOTE(review): the saved output below lacks GENE and shows unrounded ratios, so it
# appears to come from an earlier kernel state -- re-run to refresh.
brca_data = brca_data.reset_index(drop=True).set_index("PATIENT_CORE_COUNT")
brca_data

Unnamed: 0_level_0,PATIENT_INTERFACE_COUNT,NUM_INTERFACE_PATIENTS_DISRUPTIVE_INTERACTOR,RATIO_(prev_two_col),NUM_UNIQUE_INTERACTORS
PATIENT_CORE_COUNT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12,14,12,0.857143,16
141,55,52,0.945455,17
0,7,6,0.857143,7
35,124,120,0.967742,6
3,8,7,0.875,7
6,11,10,0.909091,16
1,13,12,0.923077,11


In [22]:
# Emit the table as LaTeX `tabular` source. print() is used so the newlines render
# as-is instead of appearing as an escaped string repr.
print(brca_data.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  PATIENT\_INTERFACE\_COUNT &  NUM\_INTERFACE\_PATIENTS\_DISRUPTIVE\_INTERACTOR &  RATIO\_(prev\_two\_col) &  NUM\_UNIQUE\_INTERACTORS \\
PATIENT\_CORE\_COUNT &                          &                                               &                       &                         \\
\midrule
12                 &                       14 &                                            12 &              0.857143 &                      16 \\
141                &                       55 &                                            52 &              0.945455 &                      17 \\
0                  &                        7 &                                             6 &              0.857143 &                       7 \\
35                 &                      124 &                                           120 &              0.967742 &                       6 \\
3                  &                        8 &                                 