In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
# Numeric class labels stored in the exported y.npy files.
TUMOR, NORMAL = 0, 1

# Input table and root folder for everything this notebook writes.
# NOTE(review): both are absolute, machine-specific paths -- adjust them
# before running the notebook on another machine.
tcga_path = "/home/nanni/Data/TCGA/Xena/tcga.tsv"
output_path = "/home/nanni/Data/TCGA/CIBB/"

In [3]:
# Read the whole tab-separated TCGA table into memory.
tcga = pd.read_csv(tcga_path, sep="\t")

# Every column from this position onwards is treated as a gene column.
# Presumably the columns before it hold sample metadata (e.g. tumor_type,
# sample_type) -- TODO confirm against the file header.
FIRST_GENE_COLUMN = 7
gene_labels = tcga.columns[FIRST_GENE_COLUMN:]

# Display the gene labels as the cell output.
gene_labels

  interactivity=interactivity, compiler=compiler, result=result)


Index(['ARHGEF10L', 'HIF3A', 'RNF17', 'RNF10', 'RNF11', 'RNF13', 'GTF2IP1',
       'REM1', 'MTVR2', 'RTN4RL2',
       ...
       'TULP2', 'NPY5R', 'GNGT2', 'GNGT1', 'TULP3', 'PTRF', 'BCL6B', 'GSTK1',
       'SELP', 'SELS'],
      dtype='object', length=20530)

Saving the gene names in a pandas DataFrame (written to `gene_symbols.tsv`) to map between column indices and gene names

In [4]:
# Persist the gene names so that a column position in the exported X.npy
# matrices can be mapped back to its gene symbol: the row index written to
# the file is the position of the gene inside gene_labels.
gene_labels_df = pd.DataFrame(data=gene_labels.tolist(), columns=["gene_symbol"])

# to_csv does not create missing directories, so make sure the output root
# exists before writing (no-op when it is already there).
os.makedirs(output_path, exist_ok=True)
gene_labels_df.to_csv(os.path.join(output_path, "gene_symbols.tsv"), sep="\t", index=True, header=True)

In [30]:
# Export one dataset per tumor type into <output_path>/<tumor_type>/:
#   X.npy -- expression matrix (one row per sample, one column per gene label)
#   y.npy -- float label vector using the TUMOR / NORMAL constants
# The tumor types listed here are skipped by this loop.
excluded_types = ['GBMLGG', 'COADREAD', 'BRCA']

for tumor_type in tcga.tumor_type.unique():
    if tumor_type in excluded_types:
        continue
    print(tumor_type)

    # Build the row selector once and reuse it for features and labels.
    type_mask = tcga.tumor_type == tumor_type

    # .values replaces .as_matrix(), which was deprecated and later removed
    # from pandas; .values is available on both old and new versions.
    X_tt = tcga.loc[type_mask, gene_labels].values
    y_tt = tcga.loc[type_mask, "sample_type"].values

    # We consider TUMOR everything which is not "Solid Tissue Normal".
    # Start from an all-NORMAL float vector, then overwrite the tumor rows.
    y_tt_num = np.full(y_tt.shape[0], NORMAL, dtype=float)
    y_tt_num[y_tt != 'Solid Tissue Normal'] = TUMOR

    t_out_folder = os.path.join(output_path, tumor_type)
    os.makedirs(t_out_folder, exist_ok=True)
    np.save(os.path.join(t_out_folder, "X.npy"), X_tt)
    np.save(os.path.join(t_out_folder, "y.npy"), y_tt_num)

ESCA
SARC
OV
ACC
CESC
PAAD
DLBC
LGG
TGCT
HNSC
PCPG
KIRP
KICH
LUAD
BLCA
THYM
PRAD
CHOL
STAD
MESO
SKCM
UCEC
LUSC
UVM
KIRC
LIHC
LAML
READ
UCS
COAD
GBM
THCA


Aggregates

In [18]:
# Groups of tumor types that are exported together as a single dataset:
# aggregate name -> list of member tumor types.
aggregates = dict(
    LUNG=['LUAD', 'LUSC'],
    KIDNEY=['KIRP', 'KICH', 'KIRC'],
    COADREAD=['COAD', 'READ'],
    GBMLGG=['LGG', 'GBM'],
)

# Aggregated datasets are written to their own sub-folder of the output root.
aggregate_folder = os.path.join(output_path, 'aggregates')

In [19]:
# Export one dataset per aggregate into <aggregate_folder>/<agg_name>/:
#   X.npy -- expression matrix of all samples whose tumor type is a member
#   y.npy -- float label vector using the TUMOR / NORMAL constants
for agg_name, agg in aggregates.items():
    print("{:<10}-->{:<30}".format(agg_name, ", ".join(agg)))

    # Build the row selector once and reuse it for features and labels.
    agg_mask = tcga.tumor_type.isin(agg)

    # .values replaces .as_matrix(), which was deprecated and later removed
    # from pandas; .values is available on both old and new versions.
    X_tt = tcga.loc[agg_mask, gene_labels].values
    y_tt = tcga.loc[agg_mask, "sample_type"].values

    # We consider TUMOR everything which is not "Solid Tissue Normal".
    # Start from an all-NORMAL float vector, then overwrite the tumor rows.
    y_tt_num = np.full(y_tt.shape[0], NORMAL, dtype=float)
    y_tt_num[y_tt != 'Solid Tissue Normal'] = TUMOR

    t_out_folder = os.path.join(aggregate_folder, agg_name)
    os.makedirs(t_out_folder, exist_ok=True)
    np.save(os.path.join(t_out_folder, "X.npy"), X_tt)
    np.save(os.path.join(t_out_folder, "y.npy"), y_tt_num)

LUNG      -->LUAD, LUSC                    
KIDNEY    -->KIRP, KICH, KIRC              
COADREAD  -->COAD, READ                    
GBMLGG    -->LGG, GBM                      
