# Beautify the WExT Results (Add Gene IDs, etc.)

In [1]:
from datetime import datetime

# Stamp the notebook with the time of this run; "\033[32m" / "\033[0m" are the
# ANSI escape codes that switch the terminal colour to green and reset it.
print("\033[32m{}\033[0m".format(datetime.now().strftime("%B %d, %Y %H:%M:%S")))

[32mMarch 09, 2022 12:22:26[0m


In [2]:
import pandas as pd
import os.path as op

from src.helpers.helpers_analysis.gene_id_retrieval import GeneIDFetcher

# CSV handed to GeneIDFetcher below; the relative path resolves against the
# notebook's working directory.
UNIPROT_GENE_MAPPING_PATH = "../../../helpers/helpers_analysis/gene_retrieval/UNIPROT_GENE_MAPPING.csv"

# Raw WExT output tables (tab-separated), one file per TCGA cohort, expected in
# the current working directory.
# NOTE(review): the "t0_k2" suffix presumably encodes the WExT run settings — confirm.
BRCA_WEXT_OUTPUT_PATH = "BRCA_wext_raw_output_t0_k2.tsv"
COAD_WEXT_OUTPUT_PATH = "COAD_wext_raw_output_t0_k2.tsv"
ESCA_WEXT_OUTPUT_PATH = "ESCA_wext_raw_output_t0_k2.tsv"
GBM_WEXT_OUTPUT_PATH = "GBM_wext_raw_output_t0_k2.tsv"
HNSC_WEXT_OUTPUT_PATH = "HNSC_wext_raw_output_t0_k2.tsv"
OV_WEXT_OUTPUT_PATH = "OV_wext_raw_output_t0_k2.tsv"

# Reflect changes in the modules immediately.
%load_ext autoreload
%autoreload 2

In [3]:
# Build the fetcher once from the mapping CSV. _get_gene_id_pairs reads this
# module-level instance, so this cell must run before any conversion cell.
gene_id_fetcher = GeneIDFetcher(UNIPROT_GENE_MAPPING_PATH)

In [7]:
# Spot-check a single lookup: the fetcher should map this identifier to a gene name.
gene_id_fetcher.fetch("P22415")

'USF1'

In [6]:
# Second spot-check of the fetcher with a different identifier.
gene_id_fetcher.fetch("Q13748")



'TUBA3C'

In [8]:
def read_wext_datasets(data_path):
    """Load a tab-separated WExT output table and normalise its gene-set header.

    Parameters
    ----------
    data_path
        Location of the TSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table with the ``"#Gene set"`` column, when present,
        renamed to ``"UNIPROT_PAIRS"``. All other columns are left as read.
    """
    wext_table = pd.read_csv(data_path, sep="\t")
    return wext_table.rename(columns={"#Gene set": "UNIPROT_PAIRS"})

def _get_gene_id_pairs(protein_pairs):
    """Convert a ``"P1, P2"`` string into the matching ``"G1, G2"`` string.

    The input is split on the comma and must yield exactly two parts
    (otherwise the unpacking raises ``ValueError``). Each part is stripped of
    surrounding whitespace and looked up through the module-level
    ``gene_id_fetcher``; the two results are joined with ``", "``.
    """
    first_id, second_id = (part.strip() for part in protein_pairs.split(","))

    # Look the identifiers up in their original left-to-right order.
    gene_names = [gene_id_fetcher.fetch(uniprot_id) for uniprot_id in (first_id, second_id)]

    return "{}, {}".format(*gene_names)


def add_gene_id_column(data: pd.DataFrame):
    """Add a ``GENE_ID_PAIRS`` column derived from ``UNIPROT_PAIRS``, in place.

    Every value of ``data["UNIPROT_PAIRS"]`` is converted with
    ``_get_gene_id_pairs``. On the first call the new column is inserted at
    position 1. If ``GENE_ID_PAIRS`` already exists (for example when the
    notebook cell is re-run on the same frame) its values are refreshed where
    it stands, instead of ``DataFrame.insert`` raising ``ValueError`` for the
    duplicate column.

    Parameters
    ----------
    data
        Frame containing a ``UNIPROT_PAIRS`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenience.

    Raises
    ------
    KeyError
        If ``data`` has no ``UNIPROT_PAIRS`` column.
    """
    converted_pairs = data["UNIPROT_PAIRS"].apply(_get_gene_id_pairs)

    if "GENE_ID_PAIRS" in data.columns:
        # Already annotated: overwrite the values so repeated calls are harmless.
        data["GENE_ID_PAIRS"] = converted_pairs
    else:
        data.insert(loc=1, column="GENE_ID_PAIRS", value=converted_pairs)

    return data


def export_data(tcga: str, data: pd.DataFrame):
    """Write ``data`` to a dated, tab-separated file in the working directory.

    The file is named ``{tcga}_wext_output_gene_added_{YYYY-MM-DD}.tsv`` using
    today's date, and is written without the index. A confirmation line is
    printed after a successful write.

    Parameters
    ----------
    tcga
        Prefix for the output file name (the cohort label, e.g. ``"BRCA"``).
    data
        Table to export.

    Raises
    ------
    FileExistsError
        If a file with the target name already exists. Nothing is overwritten,
        and the message names the file that blocked the export.
    """
    file_date = datetime.today().strftime('%Y-%m-%d')
    file_name = f"{tcga}_wext_output_gene_added_{file_date}.tsv"
    if op.isfile(file_name):
        # Name the file in the error so the user knows what to move or delete.
        raise FileExistsError(f"Refusing to overwrite existing file: {file_name}")

    data.to_csv(file_name, sep="\t", index=False)
    print(f"Data is exported to {file_name} successfully.")


In [9]:
# Load each cohort's raw WExT table; read_wext_datasets also renames the
# "#Gene set" column to "UNIPROT_PAIRS".
brca_wext_data = read_wext_datasets(BRCA_WEXT_OUTPUT_PATH)
coad_wext_data = read_wext_datasets(COAD_WEXT_OUTPUT_PATH)
esca_wext_data = read_wext_datasets(ESCA_WEXT_OUTPUT_PATH)
gbm_wext_data = read_wext_datasets(GBM_WEXT_OUTPUT_PATH)
hnsc_wext_data = read_wext_datasets(HNSC_WEXT_OUTPUT_PATH)
ov_wext_data = read_wext_datasets(OV_WEXT_OUTPUT_PATH)

# Cohort label -> loaded table. The label is later used as the prefix of the
# exported file name. The dict holds references, not copies, so in-place
# changes to a table are visible through both the dict and the variable above.
TCGA_TO_WEXT_DATASETS = {
    "BRCA": brca_wext_data,
    "COAD": coad_wext_data,
    "ESCA": esca_wext_data,
    "GBM": gbm_wext_data,
    "HNSC": hnsc_wext_data,
    "OV": ov_wext_data,
}

In [10]:
# Annotate every cohort table with its GENE_ID_PAIRS column and write it to disk.
# add_gene_id_column modifies each frame in place, so the return value is not needed.
# NOTE: export_data raises FileExistsError when today's output file already exists,
# so re-running this cell on the same day stops at the first cohort already exported.
for tcga, tcga_wext_data in TCGA_TO_WEXT_DATASETS.items():
    add_gene_id_column(tcga_wext_data)
    export_data(tcga=tcga, data=tcga_wext_data)

Data is exported to BRCA_wext_output_gene_added_2022-03-09.tsv successfully.
Data is exported to COAD_wext_output_gene_added_2022-03-09.tsv successfully.
Data is exported to ESCA_wext_output_gene_added_2022-03-09.tsv successfully.
Data is exported to GBM_wext_output_gene_added_2022-03-09.tsv successfully.
Data is exported to HNSC_wext_output_gene_added_2022-03-09.tsv successfully.
Data is exported to OV_wext_output_gene_added_2022-03-09.tsv successfully.


---

In [6]:
# Scratch check: load the BRCA table by hand, repeating the read + rename steps
# that read_wext_datasets performs.
# NOTE(review): this could be read_wext_datasets(BRCA_WEXT_OUTPUT_PATH) — same path, same rename.
test_data = pd.read_csv("BRCA_wext_raw_output_t0_k2.tsv", sep="\t")
test_data.rename({"#Gene set": "UNIPROT_PAIRS"}, axis="columns", inplace=True)

In [7]:
test_data

Unnamed: 0,UNIPROT_PAIRS,WRE (Saddlepoint) P-value,WRE (Saddlepoint) FDR,WRE (Saddlepoint) Runtime,T,Z,t00,t01,t10,t11
0,"O75367, Q71DI3",0.000000,0.0,,,,,,,
1,"Q16514, Q9Y294",0.000000,0.0,,,,,,,
2,"O75367, Q9UER7",0.000000,0.0,,,,,,,
3,"P84243, Q9Y294",0.000000,0.0,,,,,,,
4,"O75367, P61077",0.000000,0.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
1706,"Q14289, Q9H9B1",0.998271,1.0,,,,,,,
1707,"O00459, P0C0S8",0.998446,1.0,,,,,,,
1708,"P61077, Q9H3D4",0.998625,1.0,,,,,,,
1709,"P21860, Q96SB4",0.999060,1.0,,,,,,,


In [8]:
# add_gene_id_column inserts the column into test_data itself and returns that
# same object, so `bar` is only a second name for test_data, not a copy.
bar = add_gene_id_column(test_data)

In [9]:
test_data

Unnamed: 0,UNIPROT_PAIRS,GENE_ID_PAIRS,WRE (Saddlepoint) P-value,WRE (Saddlepoint) FDR,WRE (Saddlepoint) Runtime,T,Z,t00,t01,t10,t11
0,"O75367, Q71DI3","MACROH2A1, H3C15",0.000000,0.0,,,,,,,
1,"Q16514, Q9Y294","TAF12, ASF1A",0.000000,0.0,,,,,,,
2,"O75367, Q9UER7","MACROH2A1, DAXX",0.000000,0.0,,,,,,,
3,"P84243, Q9Y294","H3-3A, ASF1A",0.000000,0.0,,,,,,,
4,"O75367, P61077","MACROH2A1, UBE2D3",0.000000,0.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1706,"Q14289, Q9H9B1","PTK2B, EHMT1",0.998271,1.0,,,,,,,
1707,"O00459, P0C0S8","PIK3R2, H2AC11",0.998446,1.0,,,,,,,
1708,"P61077, Q9H3D4","UBE2D3, TP63",0.998625,1.0,,,,,,,
1709,"P21860, Q96SB4","ERBB3, SRPK1",0.999060,1.0,,,,,,,


In [10]:
# Write the annotated scratch table under the "TEST" prefix; raises
# FileExistsError if a TEST file dated today is already present.
export_data(tcga="TEST", data=test_data)

Data is exported to TEST_wext_output_gene_added_2022-03-09.tsv successfully.


In [18]:
test_data["UNIPROT_PAIRS"][0]

'P22415, P84022'

In [24]:
# Scratch check of the pair conversion on the first row.
# NOTE(review): the recorded output below does not look like the single
# "G1, G2" string the current _get_gene_id_pairs returns — presumably left over
# from an earlier debugging version; re-run the cell to refresh it.
_get_gene_id_pairs(test_data["UNIPROT_PAIRS"][0])

`P22415`
`P84022`


In [11]:
test_data

Unnamed: 0,UNIPROT_PAIRS,WRE (Saddlepoint) P-value,WRE (Saddlepoint) FDR,WRE (Saddlepoint) Runtime,T,Z,t00,t01,t10,t11
0,"P22415, P84022",0.000000,0.0,,,,,,,
1,"P59998, Q16478",0.000000,0.0,,,,,,,
2,"P16885, P22415",0.000000,0.0,,,,,,,
3,"P19838, P22415",0.000000,0.0,,,,,,,
4,"P16234, P61019",0.000000,0.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
22150,"O14757, Q16539",0.999925,1.0,,,,,,,
22151,"Q13625, Q9Y3L3",0.999928,1.0,,,,,,,
22152,"P24941, Q8WUF5",0.999945,1.0,,,,,,,
22153,"O43683, Q99466",0.999954,1.0,,,,,,,
