In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import cancer_data

In [52]:
def process_tcga_splicing(tcga_splicing):
    """Shorten the sample identifiers of a splicing table and drop unwanted rows.

    Each index label is reduced to its first four "-"-separated fields with
    the final character removed. Rows whose shortened identifier ends in
    "11" are discarded, and when several rows share a shortened identifier
    only the first one is kept.

    Parameters
    ----------
    tcga_splicing : pandas.DataFrame
        Table indexed by "-"-delimited string identifiers.
        NOTE(review): presumably full TCGA barcodes, where a trailing vial
        letter is dropped and sample-type code "11" marks normal tissue;
        confirm against the data source.

    Returns
    -------
    pandas.DataFrame
        A new frame with the shortened, de-duplicated index. The frame
        passed in is not modified.
    """

    # Compute the shortened identifiers up front rather than assigning to
    # tcga_splicing.index, which would relabel the caller's frame in place.
    sample_ids = tcga_splicing.index.map(
        lambda barcode: "-".join(barcode.split("-")[:4])[:-1]
    )

    # A shallow copy shares the underlying data but owns its axes, so
    # relabelling it leaves the input untouched.
    relabelled = tcga_splicing.copy(deep=False)
    relabelled.index = sample_ids

    # Plain list of booleans: used as a row mask by .loc.
    keep = [sample_id[-2:] != "11" for sample_id in sample_ids]
    relabelled = relabelled.loc[keep]

    # Keep only the first row for each shortened identifier.
    return relabelled.loc[~relabelled.index.duplicated(keep="first")]

In [53]:
# Splicing tables: run each through process_tcga_splicing as soon as it is
# loaded, so only the processed form is ever bound to a name.
tcga_se = process_tcga_splicing(cancer_data.load("tcga_se"))
tcga_a3ss = process_tcga_splicing(cancer_data.load("tcga_a3ss"))

# Remaining per-sample tables, loaded as-is.
tcga_genex = cancer_data.load("tcga_normalized_gene_expression")
tcga_cn_continuous = cancer_data.load("tcga_cn_continuous")
tcga_cn_thresholded = cancer_data.load("tcga_cn_thresholded")
tcga_mutations = cancer_data.load("tcga_mutations")
tcga_msi = cancer_data.load("tcga_msi")

tcga_annotations = cancer_data.load("tcga_annotations")

In [54]:
# Read the RPL22 table; rows without a sample ID cannot be indexed, so they
# are dropped before "sampleid" becomes the index.
rpl22_tcga = (
    pd.read_csv("../data/raw/rpl22.tcga.data.csv")
    .dropna(subset=["sampleid"])
    .set_index("sampleid")
)
# Keep only the first 15 characters of every sample ID.
rpl22_tcga.index = rpl22_tcga.index.map(lambda sample_id: sample_id[:15])

# Non-missing values of the "rpl22mut.mc3.k15" column.
rpl22_mut = rpl22_tcga["rpl22mut.mc3.k15"].dropna()

# Aggregate attributes

## Tumor sample info

In [55]:
# Source annotation column -> output column name.
sample_info_names = {
    "sample_type": "Sample_type",
    "_primary_disease": "Primary_disease",
    "abbreviated_disease": "Abbreviated_disease",
}

select_sample_info = tcga_annotations[list(sample_info_names)].rename(
    columns=sample_info_names
)

## Mutations

In [56]:
# Source mutation column -> output column name.
mutation_names = {
    "TP53mut": "TP53_mutation_type",
    "rpl22mut.mc3.k15": "RPL22_k15fs_mutation",
    "rpl22mut.mc3.all": "RPL22_any_mutation",
}

select_mutations = rpl22_tcga[list(mutation_names)].rename(columns=mutation_names)

## MSI

In [57]:
# Keep the MANTIS score and derive a boolean MSI flag from it: a sample is
# flagged when its score is strictly above 0.4. A missing score compares as
# False, so such samples end up with MSI == False.
select_msi = (
    tcga_msi[["MANTIS Score"]]
    .rename(columns={"MANTIS Score": "MANTIS_score"})
    .assign(MSI=lambda scores: scores["MANTIS_score"] > 0.4)
)

## Exon usage

In [58]:
# Splicing events to export. The leading Ensembl gene ID of each event
# identifies the gene: ENSG00000198625 is MDM4, ENSG00000143569 is UBAP2L
# and ENSG00000163584 is RPL22L1.
select_se = [
    "ENSG00000198625.8_ES_1_204501318:204501374:204506557:204506625:204507336:204507436_204506557:204506625",
    "ENSG00000143569.14_ES_1_154241382:154241430:154241837:154241888:154242675:154243040_154241837:154241888",
]

select_a3ss = [
    "ENSG00000163584.13_A3_3_170586086:170586168:170585801:170585923:170585801:170585990_170585923:170585990",
]

# Bind every output name to its event ID. Assigning a positional list of
# names previously gave the UBAP2L event the RPL22L1 label and vice versa,
# so the inversion below was also applied to the wrong event.
exonusage_names = {
    select_se[0]: "MDM4_exon_6_inclusion",
    select_se[1]: "UBAP2L_exon_29_inclusion",
    select_a3ss[0]: "RPL22L1_exon_3A_inclusion",
}

select_exonusage = pd.concat(
    [tcga_se[select_se], tcga_a3ss[select_a3ss]], axis=1
).rename(columns=exonusage_names)

# Store the complement of the RPL22L1 A3SS value.
# NOTE(review): presumably the raw value measures the alternative isoform, so
# 1 - value expresses exon 3A inclusion; confirm against the splicing source.
select_exonusage["RPL22L1_exon_3A_inclusion"] = (
    1 - select_exonusage["RPL22L1_exon_3A_inclusion"]
)

# Keep the column order the exported table has always used.
select_exonusage = select_exonusage[
    [
        "MDM4_exon_6_inclusion",
        "RPL22L1_exon_3A_inclusion",
        "UBAP2L_exon_29_inclusion",
    ]
]

## Gene expression

In [59]:
# Expression column -> output column name.
genex_names = {
    "MDM2_10743": "MDM2_mRNA",
    "MDM4_10744": "MDM4_mRNA",
    "RPL22_15208": "RPL22_mRNA",
    "RPL22L1_15209": "RPL22L1_mRNA",
}
select_genex_genes = list(genex_names)

select_genex = tcga_genex[select_genex_genes].rename(columns=genex_names)

# Copy number

## Continuous

In [60]:
select_copynumber_genes = ["TP53", "MDM2", "MDM4", "RPL22", "RPL22L1"]

# Each selected gene column is exported as "<gene>_copy_number".
select_copynumber = tcga_cn_continuous[select_copynumber_genes].rename(
    columns={gene: gene + "_copy_number" for gene in select_copynumber_genes}
)

## Thresholded

In [61]:
select_copynumber_thresholded_genes = [
    "TP53",
    "MDM2",
    "MDM4",
    "RPL22",
    "RPL22L1",
]

# Each selected gene column is exported as "<gene>_copy_number_thresholded".
select_copynumber_thresholded = tcga_cn_thresholded[
    select_copynumber_thresholded_genes
].rename(
    columns={
        gene: gene + "_copy_number_thresholded"
        for gene in select_copynumber_thresholded_genes
    }
)

In [62]:
# Per-sample tables to combine, in the column order of the exported file.
merged_tcga_tables = [
    select_sample_info,
    select_mutations,
    select_msi,
    select_exonusage,
    select_genex,
    select_copynumber,
    select_copynumber_thresholded,
]

# Column-wise outer join on the index: a sample present in any table gets a
# row, and the resulting index is sorted.
merged_tcga_info = pd.concat(merged_tcga_tables, axis=1, join="outer", sort=True)

In [63]:
# Write the merged table as a tab-separated file.
merged_tcga_info.to_csv(
    path_or_buf="../data/supplementary/S2_merged-tcga-info.txt",
    sep="\t",
)