# Process Gene Expression Dataframe

Processing gene expression dataframe from Xena for downstream analysis

In [12]:
import rnaseq_lib as r
import pandas as pd
import os

- Expected counts downloaded from [Xena](https://xenabrowser.net/datapages/?host=https://toil.xenahubs.net)
- Metadata table processed from [this notebook](https://github.com/jvivian/ipython_notebooks/tree/master/tcga-gtex-metadata)

## Inputs

In [2]:
%%time
df_path = '/mnt/rna-seq-analysis/data/xena/TcgaTargetGtex_gene_expected_count'
met_path = '/mnt/rna-seq-analysis/metadata/tcga_gtex_metadata_intersect.tsv'
df = pd.read_csv(df_path, sep='\t', index_col=0)
met = pd.read_csv(met_path, sep='\t', index_col=0)

CPU times: user 16min 37s, sys: 2min 9s, total: 18min 47s
Wall time: 18min 57s


These files are too big for github, so set directory where all intermediate files will be written

In [3]:
workdir = '/mnt/gene-expression-processing'

## Process Dataframe

### I. Subset TCGA and GTEx samples

In [4]:
tcga_samples = [x for x in df.columns if x.startswith('TCGA')]
gtex_samples = [x for x in df.columns if x.startswith('GTEX')]
print 'TCGA Samples: {}\tGTEx Samples: {}'.format(len(tcga_samples), len(gtex_samples))

TCGA Samples: 10830	GTEx Samples: 7776


In [6]:
df = df[tcga_samples + gtex_samples]
df.shape

(60498, 18606)

### II. Reverse Xena normalization

Xena normalizes data: log2(x + 1). Reverse normalization to get expected counts

In [8]:
%%time
df = df.apply(lambda x: 2**x - 1)

CPU times: user 1min 4s, sys: 23.2 s, total: 1min 28s
Wall time: 1min 28s


save intermediate

In [13]:
r.utils.mkdir_p(workdir)
out = os.path.join(workdir, 'TCGA-GTEx-RSEM-gene-exp-counts.tsv')
df.to_csv(out, sep='\t')

### III. Subset Protein-Coding Genes

In [26]:
genes = []
gtf_path = '/mnt/rna-seq-analysis/metadata/gencode.v23.annotation.gtf'
with open(gtf_path, 'r') as f:
    for line in f:
        if line.startswith('#'):
            continue
        line = r.gtf.parse(line)
        if line['feature'] == 'gene':
            if line['gene_type'] == 'protein_coding':
                genes.append(line['gene_id'])
print '{} protein-coding genes found'.format(len(genes))

19797 protein-coding genes found


Save intermediate

In [29]:
df = df.loc[genes]
out = os.path.join(workdir, 'TCGA-GTEx-RSEM-counts-protein-coding.tsv')
df.to_csv(out, sep='\t')

Save TCGA and GTEx dataframes for normalizing

In [33]:
tcga_path = os.path.join(workdir, 'TCGA-RSEM-counts-protein-coding.tsv')
gtex_path = os.path.join(workdir, 'GTEx-RSEM-counts-protein-coding.tsv')
%time df[tcga_samples].to_csv(tcga_path, sep='\t')
%time df[gtex_samples].to_csv(gtex_path, sep='\t')

CPU times: user 3min 17s, sys: 33.2 s, total: 3min 50s
Wall time: 3min 52s
CPU times: user 2min 13s, sys: 19.5 s, total: 2min 32s
Wall time: 2min 33s


### IV. Run DESeq2 Normalization

In [34]:
%time tcga_norm = r.R.deseq2_normalize(df_path=gtex_path, output_dir=workdir, map_gene_names=False)


Calling: docker run --rm -v /mnt/gene-expression-processing:/data -v /mnt/gene-expression-processing:/df jvivian/deseq2 /data/work_dir/deseq2.R

CPU times: user 124 ms, sys: 1.7 s, total: 1.82 s
Wall time: 25min 27s


'/mnt/gene-expression-processing/GTEx-RSEM-counts-protein-coding.deseq2-normalized.tsv'

In [35]:
%time gtex_norm = r.R.deseq2_normalize(df_path=tcga_path, output_dir=workdir, map_gene_names=False)


Calling: docker run --rm -v /mnt/gene-expression-processing:/data -v /mnt/gene-expression-processing:/df jvivian/deseq2 /data/work_dir/deseq2.R

CPU times: user 256 ms, sys: 1.54 s, total: 1.79 s
Wall time: 47min 36s


'/mnt/gene-expression-processing/TCGA-RSEM-counts-protein-coding.deseq2-normalized.tsv'

### V. Combine and Upload Final Table

In [38]:
df = pd.concat([pd.read_csv(tcga_norm, index_col=0, sep='\t'), 
                pd.read_csv(gtex_norm, index_col=0, sep='\t')], axis=1)
df.shape

(19797, 18606)

In [40]:
df_path = os.path.join(workdir, 'TCGA-GTEx-RSEM-counts-protein-coding.deseq2-normalized.tsv')
df.to_csv(df_path, sep='\t')

In [43]:
os.environ['SYNAPSE_PASS
           ']

{'LESSOPEN': '| /usr/bin/lesspipe %s', 'SSH_CLIENT': '10.60.0.54 57372 22', 'LOGNAME': 'ubuntu', 'USER': 'ubuntu', 'PATH': '/home/ubuntu/anaconda2/bin:/home/ubuntu/anaconda2/bin:/home/ubuntu/anaconda3/bin:/home/ubuntu/anaconda2/bin:/home/ubuntu/anaconda3/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games', 'HOME': '/home/ubuntu', 'LANG': 'en_US.UTF-8', 'TERM': 'xterm-color', 'SHELL': '/bin/bash', 'SHLVL': '2', 'WINDOW': '0', 'XDG_RUNTIME_DIR': '/run/user/1000', 'JPY_PARENT_PID': '29896', 'GIT_PAGER': 'cat', 'SYNAPSE_PASS': 'Ripper!05', 'XDG_SESSION_ID': '35489', 'STY': '7677.jupyter', '_': '/usr/bin/env', 'TERMCAP': 'SC|screen|VT 100/ANSI X3.64 virtual terminal:\\\n\t:DO=\\E[%dB:LE=\\E[%dD:RI=\\E[%dC:UP=\\E[%dA:bs:bt=\\E[Z:\\\n\t:cd=\\E[J:ce=\\E[K:cl=\\E[H\\E[J:cm=\\E[%i%d;%dH:ct=\\E[3g:\\\n\t:do=^J:nd=\\E[C:pt:rc=\\E8:rs=\\Ec:sc=\\E7:st=\\EH:up=\\EM:\\\n\t:le=^H:bl=^G:cr=^M:it#8:ho=\\E[H:nw=\\EE:ta=^I:is=\\E)0:\\\n\t:li#23:co#110:am:xn:xv:LP:s

In [42]:
r.web.synapse.upload_file(df_path, login='jtvivian@gmail.com', parent=r.web.synapse.expression)

TypeError: {'LESSOPEN': '| /usr/bin/lesspipe %s', 'SSH_CLIENT': '10.60.0.54 57372 22', 'LOGNAME': 'ubuntu', 'USER': 'ubuntu', 'PATH': '/home/ubuntu/anaconda2/bin:/home/ubuntu/anaconda2/bin:/home/ubuntu/anaconda3/bin:/home/ubuntu/anaconda2/bin:/home/ubuntu/anaconda3/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games', 'HOME': '/home/ubuntu', 'LANG': 'en_US.UTF-8', 'TERM': 'xterm-color', 'SHELL': '/bin/bash', 'SHLVL': '2', 'WINDOW': '0', 'XDG_RUNTIME_DIR': '/run/user/1000', 'JPY_PARENT_PID': '29896', 'GIT_PAGER': 'cat', 'SYNAPSE_PASS': 'Ripper!05', 'XDG_SESSION_ID': '35489', 'STY': '7677.jupyter', '_': '/usr/bin/env', 'TERMCAP': 'SC|screen|VT 100/ANSI X3.64 virtual terminal:\\\n\t:DO=\\E[%dB:LE=\\E[%dD:RI=\\E[%dC:UP=\\E[%dA:bs:bt=\\E[Z:\\\n\t:cd=\\E[J:ce=\\E[K:cl=\\E[H\\E[J:cm=\\E[%i%d;%dH:ct=\\E[3g:\\\n\t:do=^J:nd=\\E[C:pt:rc=\\E8:rs=\\Ec:sc=\\E7:st=\\EH:up=\\EM:\\\n\t:le=^H:bl=^G:cr=^M:it#8:ho=\\E[H:nw=\\EE:ta=^I:is=\\E)0:\\\n\t:li#23:co#110:am:xn:xv:LP:sr=\\EM:al=\\E[L:AL=\\E[%dL:\\\n\t:cs=\\E[%i%d;%dr:dl=\\E[M:DL=\\E[%dM:dc=\\E[P:DC=\\E[%dP:\\\n\t:im=\\E[4h:ei=\\E[4l:mi:IC=\\E[%d@:ks=\\E[?1h\\E=:\\\n\t:ke=\\E[?1l\\E>:vi=\\E[?25l:ve=\\E[34h\\E[?25h:vs=\\E[34l:\\\n\t:ti=\\E[?1049h:te=\\E[?1049l:us=\\E[4m:ue=\\E[24m:so=\\E[3m:\\\n\t:se=\\E[23m:mb=\\E[5m:md=\\E[1m:mh=\\E[2m:mr=\\E[7m:\\\n\t:me=\\E[m:ms:\\\n\t:Co#8:pa#64:AF=\\E[3%dm:AB=\\E[4%dm:op=\\E[39;49m:AX:\\\n\t:vb=\\Eg:G0:as=\\E(0:ae=\\E(B:\\\n\t:ac=\\140\\140aaffggjjkkllmmnnooppqqrrssttuuvvwwxxyyzz{{||}}~~..--++,,hhII00:\\\n\t:po=\\E[5i:pf=\\E[4i:Km=\\E[M:k0=\\E[10~:k1=\\EOP:k2=\\EOQ:\\\n\t:k3=\\EOR:k4=\\EOS:k5=\\E[15~:k6=\\E[17~:k7=\\E[18~:\\\n\t:k8=\\E[19~:k9=\\E[20~:k;=\\E[21~:F1=\\E[23~:F2=\\E[24~:\\\n\t:F3=\\E[1;2P:F4=\\E[1;2Q:F5=\\E[1;2R:F6=\\E[1;2S:\\\n\t:F7=\\E[15;2~:F8=\\E[17;2~:F9=\\E[18;2~:FA=\\E[19;2~:kb=\x7f:\\\n\t:K2=\\EOE:kB=\\E[Z:kF=\\E[1;2B:kR=\\E[1;2A:*4=\\E[3;2~:\\\n\t:*7=\\E[1;2F:#2=\\E[1;2H:#3=\\E[2;2~:#4=\\E[1;2D:%c=\\E[6;2~:\\\n\t:%e=\\E[5;2~:%i=\\E[1;2C:kh=\\E[1~:@1=\\E[1~:kH=\\E[4~:\\\n\t:@7=\\E[4~:kN=\\E[6~:kP=\\E[5~:kI=\\E[2~:kD=\\E[3~:ku=\\EOA:\\\n\t:kd=\\EOB:kr=\\EOC:kl=\\EOD:km:', 'SSH_CONNECTION': '10.60.0.54 57372 10.101.0.20 22', 'LESSCLOSE': '/usr/bin/lesspipe %s %s', 'MPLBACKEND': 'module://ipykernel.pylab.backend_inline', 'SSH_TTY': '/dev/pts/0', 'OLDPWD': '/home/ubuntu/rnaseq-notebooks/DESeq2-Runs', 'CLICOLOR': '1', 'PWD': '/home/ubuntu/rnaseq-notebooks', 'MAIL': '/var/mail/ubuntu', 'LS_COLORS': 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.axa=00;36:*.oga=00;36:*.spx=00;36:*.xspf=00;36:', 'PAGER': 'cat'} is not JSON serializable