### Script to export labels for the top 50 most mutated TCGA genes

In [1]:
import os

import pandas as pd

import pancancer_evaluation.config as cfg
import pancancer_evaluation.utilities.data_utilities as du
from pancancer_evaluation.utilities.tcga_utilities import (
    process_y_matrix
)

In [2]:
# Sample metadata; its index becomes the row index of the label matrix.
sample_info_df = du.load_sample_info(verbose=True)

# The pan-cancer loader returns its tables as one sequence; unpack the five
# components into named variables.
pancan_data = du.load_pancancer_data(verbose=True)
sample_freeze_df, mutation_df, copy_loss_df, copy_gain_df, mut_burden_df = pancan_data

# Gene table; the `gene` and `classification` fields are read from each row.
genes_df = du.load_top_50()

Loading sample info...
Loading pan-cancer data from cached pickle file...


In [3]:
# Label matrix: one row per entry in the sample info index, one column per gene.
labels_df = pd.DataFrame(index=sample_info_df.index)

# The gene's classification decides which copy-number table accompanies the
# mutation calls: gains for 'Oncogene', losses for 'TSG' (presumably tumor
# suppressor genes -- confirm against the gene table).
copy_tables = {
    'Oncogene': copy_gain_df,
    'TSG': copy_loss_df,
}

for gene, classification in zip(genes_df['gene'], genes_df['classification']):

    mutation_calls = mutation_df.loc[:, gene]

    copy_table = copy_tables.get(classification)
    use_copy = copy_table is not None
    if use_copy:
        copy_calls = copy_table.loc[:, gene]
    else:
        # Any other classification: pass an empty placeholder and tell
        # process_y_matrix not to include copy-number information.
        copy_calls = pd.DataFrame()

    status_df = process_y_matrix(
        y_mutation=mutation_calls,
        y_copy=copy_calls,
        include_copy=use_copy,
        gene=gene,
        sample_freeze=sample_freeze_df,
        mutation_burden=mut_burden_df,
        filter_count=cfg.filter_count,
        filter_prop=cfg.filter_prop,
        output_directory=None,
        hyper_filter=5,
        test=True
    )

    # Column assignment aligns on the index, so samples that are absent from
    # the processed output end up as NaN in this gene's column.
    labels_df[gene] = status_df.status.rename(gene)

# Notebook display: last five samples, first twenty genes.
labels_df.iloc[-5:, :20]

Unnamed: 0_level_0,TP53,TTN,MUC16,PIK3CA,CSMD3,RYR2,LRP1B,SYNE1,FLG,USH2A,PCLO,ZFHX4,DNAH5,KMT2D,OBSCN,CSMD1,FAT4,SPTA1,KMT2C,FAT3
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
TCGA-ZS-A9CG-01,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,,1.0,,0.0
TCGA-ZT-A8OM-01,,,,,,,,,,,,,,,,,,,,
TCGA-ZU-A8S4-01,,,,,,,,,,,,,,,,,,,,
TCGA-ZU-A8S4-11,,,,,,,,,,,,,,,,,,,,
TCGA-ZX-AA5X-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,,0.0,


In [4]:
# Sanity check: the label matrix should have the same number of rows as the
# sample info table.
for frame in (labels_df, sample_info_df):
    print(frame.shape)

(11060, 50)
(11060, 3)


In [5]:
# Write the label matrix as a tab-separated file in the configured data
# directory; missing labels are written out as the literal string 'NaN'.
labels_path = os.path.join(cfg.data_dir, 'top50_labels.tsv')
labels_df.to_csv(labels_path, sep='\t', na_rep='NaN')