Data structure for holding TCGA and GTEx samples

`dict(str, dict(str, set))`

`samples.TISSUE.DATASET` = set of sample IDs, where DATASET is one of `tcga-t`, `tcga-n`, or `gtex`

In [16]:
import pandas as pd
import pickle
from collections import defaultdict

import rnaseq_lib as r

Inputs

In [10]:
# Read in metadata (the first column of the TSV becomes the index; the `id` column is used below)
met_path = '/mnt/rna-seq-analysis/metadata/tcga_gtex_metadata_intersect.tsv'
met = pd.read_csv(met_path, index_col=0, sep='\t')

# Read sample names from the first line of the counts file only
# (presumably the header row of sample IDs -- TODO confirm).
# The context manager closes the file handle as soon as the line is read.
df_path = '/mnt/rna-seq-analysis/data/xena/tcga-gtex-processed-counts.tsv'
with open(df_path) as f:
    # split() with no arguments splits on any whitespace run, so tokens need no further stripping
    samples = f.readline().split()

# Subset metadata to rows whose `id` appears among the samples read above
met = met[met.id.isin(samples)]

Group samples by tissue, then by dataset

In [30]:
# Build a nested mapping: tissue -> dataset label -> set of sample IDs.
# Dataset labels: 'tcga-t' (TCGA rows with tumor == 'yes'),
# 'tcga-n' (TCGA rows with tumor == 'no'), and 'gtex'.
samples = defaultdict(dict)
for tissue in met.tissue.unique():
    tissue_rows = met[met.tissue == tissue]
    is_tcga = tissue_rows.dataset == 'tcga'
    by_dataset = samples[tissue]
    by_dataset['tcga-t'] = set(tissue_rows[is_tcga & (tissue_rows.tumor == 'yes')].id)
    by_dataset['tcga-n'] = set(tissue_rows[is_tcga & (tissue_rows.tumor == 'no')].id)
    by_dataset['gtex'] = set(tissue_rows[(tissue_rows.dataset == 'gtex')].id)

Convert to Expando object

In [31]:
# Convert the nested dict with rnaseq_lib's rexpando helper. Per the notebook text this yields an
# Expando object -- presumably enabling attribute-style access to tissues/datasets; TODO confirm.
# NOTE(review): this rebinds `samples`, so re-running this cell alone feeds the already-converted
# object back into rexpando.
samples = r.utils.rexpando(samples)

Serialize

In [33]:
# Create the output directory via rnaseq_lib's mkdir_p helper
# (presumably a no-op if it already exists -- TODO confirm).
r.utils.mkdir_p('pickles')
# Pickle data is a byte stream, so the file must be opened in binary mode:
# text mode ('w') raises TypeError on Python 3 and can corrupt the data on Windows under Python 2.
with open('pickles/samples.pickle', 'wb') as f:
    pickle.dump(samples, f)