Combine the sample metadata and the gene expression data into a single DataFrame for convenience.

In [21]:
import pandas as pd
import os

import rnaseq_lib as r

## Inputs

In [4]:
# Expression dataframe
df_path = '/mnt/rnaseq-cancer/Expression/tcga-gtex-processed-counts.tsv'
%time df = pd.read_csv(df_path, index_col=0, sep='\t')

# Sample Metadata
met_path = '/mnt/rnaseq-cancer/Metadata/tcga_gtex_metadata_intersect.tsv'
met = pd.read_csv(met_path, index_col=0, sep='\t')
# Subset by samples in dataframe
met = met[met.id.isin(df.columns)]

CPU times: user 2min 16s, sys: 3.67 s, total: 2min 19s
Wall time: 2min 19s


Drop duplicate samples: when an `id` appears more than once in the metadata, keep only its first row.

In [13]:
# Flag every row whose `id` has already appeared earlier in the table, then
# keep the unflagged rows (i.e. the first occurrence of each id).
is_repeat = met['id'].duplicated()
met = met.loc[~is_repeat]
met.shape

(10957, 16)

Map genes

In [6]:
# Replace the expression matrix's row labels with the result of
# r.tissues.map_genes applied to the current labels.
# NOTE(review): presumably this converts gene IDs to gene names - confirm
# against rnaseq_lib.
# NOTE(review): df.index is overwritten in place, so re-running this cell
# passes the already-mapped labels back into map_genes - confirm that is safe.
mapped_genes = r.tissues.map_genes(df.index)
df.index = mapped_genes

## Create combined object

In [17]:
# Transpose the expression matrix so its column labels become row labels,
# then place it to the right of the metadata columns.
# NOTE(review): pd.concat aligns the two frames on their row index, whereas the
# earlier metadata filter matched on the `id` column - confirm that met's index
# and met.id hold the same labels.
expression_by_sample = df.T
combined = pd.concat([met, expression_by_sample], axis='columns')
combined.shape

(10957, 19813)

In [20]:
# Preview the first two rows of the combined table.
combined.iloc[:2]

Unnamed: 0,id,reads,size_MB,platform,sex,tissue,seq_site,weight,height,mapped_reads,...,COX2,ATP8,ATP6,COX3,ND3,ND4L,ND4,ND5,ND6,CYTB
GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2426-SM-5EGGH,33263462,2329.0,Illumina HiSeq,female,Uterus,BI,199.0,66.0,53976291.0,...,32248.487527,6679.429698,59892.670463,59552.612717,6200.25742,5393.586347,92219.409914,13941.401501,2619.024286,67593.239471
GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2826-SM-5GZXL,38328619,2695.0,Illumina HiSeq,female,Breast,BI,199.0,66.0,64086947.0,...,138250.268953,31721.599785,290067.651987,389815.411184,46011.363867,22887.446158,446571.230577,47535.193603,6440.320348,222327.376092


Save the combined table as a tab-separated file.

In [22]:
# Destination directory for the combined table.
# NOTE(review): r.utils.mkdir_p presumably creates the directory when it does
# not exist yet - confirm against rnaseq_lib.
output_dir = '/mnt/rnaseq-cancer/Objects'
r.utils.mkdir_p(output_dir)

# Write metadata + expression as a tab-separated file (row index included).
output_path = os.path.join(output_dir, 'tcga-gtex-metadata-expression.tsv')
combined.to_csv(output_path, sep='\t')