In [81]:
from bokeh.charts import HeatMap, output_file, show
from bokeh.resources import CDN
from bokeh.embed import file_html, autoload_static
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from notebooks import log_progress
import pickle
from bokeh.plotting import reset_output

import os

# Heatmap of Tissues

- Combine dataframes of tissues
- Drop duplicate samples
- Quantile Normalize Data (genes by samples)
- For each tissue:
    - Compute the cdist against every other tissue
    - Collapse cdist matrix via np.median
    
- Create dataframe
    - Tissues A (a,a,a...,b,b,b...c,c,c)
    - Tissues B (a,b,c....a,b,c...a,b,c)
    - Median distance between all pairs of points
   
   
### Read in quantile normalized dataset

I tried to find a function that could quantile-normalize our rather large dataset (12,000 x 20,000) on my laptop, but that failed, so we'll compute it remotely and just read the result in.

In [82]:
# Read the normalized counts table (tab-separated; the first column becomes the index).
# NOTE(review): `df` is rebound by the pd.concat cell a few cells below, so this
# frame is discarded before any later cell uses it -- confirm this load is still needed.
df = pd.read_csv('/mnt/rna-seq-analysis/data/xena/deseq2_normalized_tcga_gtex_counts.tsv', sep='\t', index_col=0)

We'll try simple log-normalization to see how significantly QN changed things.

In [83]:
# Build one wide expression frame by concatenating every tissue's table column-wise.
exp_dir = '/mnt/rna-seq-analysis/data/tissue-pairs'
# Sort the listing: os.listdir returns entries in arbitrary order, and the
# concatenation order decides which copy of a duplicated column is kept when
# duplicates are collapsed with .first() in the next cell.
tissues = sorted(os.listdir(exp_dir))
tsv = 'tcga-gtex-exp.tsv'
df = pd.concat([pd.read_csv(os.path.join(exp_dir, t, tsv),
                             sep='\t', index_col=0) for t in tissues], axis=1)

In [84]:
# Collapse duplicated column labels: transpose so the column labels become the
# row index, group rows that share a label, keep the first entry of each group,
# then transpose back to the original orientation.
df = df.T.groupby(level=0).first().T

In [85]:
# Keep columns whose label starts with 'GTEX' or ends in '01' / '11';
# every other column is dropped.
samples = [x for x in df.columns
           if x.startswith('GTEX') or x.endswith(('01', '11'))]
df = df.loc[:, samples]

In [86]:
# log2(x + 1) transform of every value; the +1 keeps zeros finite.
# Applied to the whole frame in one vectorised call instead of column by
# column through DataFrame.apply with a lambda.
df = np.log2(df + 1)

For each pair of tissues, take the median of the Euclidean distances between every combination of their samples.

In [87]:
# Lookup keyed by column label of the expression frame; heatmap_values compares
# its values against tissue names to group columns by tissue.
# The context manager closes the file handle, which the bare open() call leaked.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('../../data/tissue_map.pickle', 'rb') as f:
    tissue_map = pickle.load(f)


## Relational Heatmap of TCGA Normals to GTEx

In [110]:
def filter_by_tissue(s, t):
    """Return the sample IDs from ``s`` that belong to sample group ``t``.

    ``'normal'`` keeps IDs ending in ``'-11'``, ``'tumor'`` keeps IDs ending
    in ``'-01'``, and any other value of ``t`` keeps IDs starting with
    ``'GTEX'``. The input order is preserved and a new list is returned.
    """
    if t == 'normal':
        keep = lambda name: name.endswith('-11')
    elif t == 'tumor':
        keep = lambda name: name.endswith('-01')
    else:
        # Every other group label is treated as GTEx.
        keep = lambda name: name.startswith('GTEX')
    return [name for name in s if keep(name)]

def heatmap_values(df, t1, t2):
    """Build the long-form table of tissue-to-tissue similarity scores.

    For every ordered pair of tissues (a, b), the columns of tissue ``a`` that
    fall in group ``t1`` are compared with the columns of tissue ``b`` that
    fall in group ``t2``: the Euclidean distance between every pair of samples
    is computed with ``cdist`` and the median of those distances is kept.
    The medians are then min-max scaled to [0, 1] and inverted, so 1 marks the
    closest pair of tissues and 0 the most distant one.

    Relies on the notebook-level ``tissue_map`` (column label -> tissue name)
    and on the entries of ``../../data/expression-PC/`` for the tissue list.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per sample; every column label must be a key of
        ``tissue_map`` (a missing label raises ``KeyError``).
    t1, t2 : str
        Sample groups as understood by ``filter_by_tissue``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'t1'`` (tissue a), ``'t2'`` (tissue b) and ``'dist'``
        (inverted, normalized median distance). Pairs where either side has
        no samples are omitted.

    Raises
    ------
    ValueError
        If no tissue pair has samples on both sides.
    """
    tissues = sorted(os.listdir('../../data/expression-PC/'))

    # Bucket the columns by tissue once instead of rescanning every column for
    # every (a, b) pair. Column order inside each bucket is preserved.
    samples_by_tissue = {}
    for sample in df.columns:
        samples_by_tissue.setdefault(tissue_map[sample], []).append(sample)
    group_a = dict((name, filter_by_tissue(samples_by_tissue.get(name, []), t1))
                   for name in tissues)
    group_b = dict((name, filter_by_tissue(samples_by_tissue.get(name, []), t2))
                   for name in tissues)

    tissue_a = []
    tissue_b = []
    dist = []
    for a in log_progress(tissues):
        samples_a = group_a[a]
        if not samples_a:
            continue
        df_a = df[samples_a].T  # Samples by genes, the layout cdist expects
        for b in tissues:
            samples_b = group_b[b]
            if not samples_b:
                continue
            df_b = df[samples_b].T  # Samples by genes
            # Median over all pairwise distances between the two sample sets
            dist.append(np.median(cdist(df_a, df_b)))
            tissue_a.append(a)
            tissue_b.append(b)

    if not dist:
        raise ValueError('No tissue pair has samples for both {!r} and {!r}'.format(t1, t2))

    dist = np.asarray(dist, dtype=float)
    lowest = dist.min()
    span = dist.max() - lowest
    if span == 0:
        # A single pair, or all medians identical: avoid 0/0 = NaN and treat
        # every pair as equally (maximally) close after the inversion below.
        scaled = np.zeros_like(dist)
    else:
        scaled = (dist - lowest) / span

    # Invert so that dark cells indicate strong relationships, i.e. small
    # multidimensional distance.
    return pd.DataFrame({'t1': tissue_a, 't2': tissue_b, 'dist': 1 - scaled},
                        columns=['t1', 't2', 'dist'])

In [109]:
def plot_heatmap(hm, xlabel, ylabel, title, output_name):
    """Render ``hm`` as a 512x512 Bokeh heatmap and export it for static embedding.

    ``hm`` must provide the columns 't1' (x axis), 't2' (y axis) and 'dist'
    (cell value). The JavaScript returned by ``autoload_static`` is written to
    ``output_name`` in the working directory, and the matching tag is appended
    to the file ``tags``. The tag refers to the script as
    ``js/bokeh-heatmaps/<output_name>``. Bokeh's output state is reset at the end.
    """
    chart_options = dict(
        x='t1', y='t2', values='dist', stat=None,
        xlabel=xlabel, ylabel=ylabel,
        legend=False, responsive=False,
        title=title,
        plot_width=512, plot_height=512,
    )
    chart = HeatMap(hm, **chart_options)
    chart.title.align = 'center'

    script_path = "js/bokeh-heatmaps/{}".format(output_name)
    script, embed_tag = autoload_static(chart, CDN, script_path)

    with open(output_name, 'w') as js_file:
        js_file.write(script)
    # Opened in append mode: each call adds one more tag to the shared file.
    with open("tags", 'a') as tag_file:
        tag_file.write(embed_tag)

    reset_output()

In [111]:
hm = heatmap_values(df, 'normal', 'gtex')

In [112]:
plot_heatmap(hm, xlabel='Normal', ylabel='GTEx', 
             title='Average Euclidean Distance Protein-Coding', 
             output_name='hm-pc-normal-gtex.js')

## Relational Heatmap of TCGA Tumors to GTEx

In [113]:
hm = heatmap_values(df, 'tumor', 'gtex')

In [114]:
plot_heatmap(hm, xlabel='Tumor', ylabel='GTEx', 
             title='Average Euclidean Distance Protein-coding', 
             output_name='hm-pc-tumor-gtex.js')

## Heatmap of TCGA Normals and Tumors

In [115]:
hm = heatmap_values(df, 'normal', 'tumor')

In [116]:
plot_heatmap(hm, xlabel='Normal', ylabel='Tumor', 
             title='Average Euclidean Distance Protein-coding', 
             output_name='hm-pc-normal-tumor.js')

# Reduced Gene Space

Let's examine heatmaps that only contain genes from the UCSF RNA panel

In [117]:
temp = df.T  # Setting genes to columns

In [118]:
# Each line of the panel file becomes one entry, with surrounding whitespace
# (including the trailing newline) removed.
with open('../../data/UCSF-RNAPanel-Final-412-genes.csv', 'r') as f:
    ucsf_genes = [x.strip() for x in f]

In [119]:
# Lookup used by the next cell to translate the expression frame's gene labels.
# Pickle data is binary, so the file is opened with 'rb' (as for tissue_map)
# rather than text mode, and the context manager closes the handle.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('../../data/gene_map.pickle', 'rb') as f:
    gene_map = pickle.load(f)

In [120]:
# Translate every gene label through gene_map; labels without an entry are
# kept unchanged.
genes = [gene_map.get(x, x) for x in temp.columns]

In [121]:
temp.columns = genes

In [122]:
# Keep only the panel genes that are present in the expression frame.
# Membership is checked against a set so each lookup is constant time instead
# of a scan over the full gene list.
available_genes = set(genes)
ucsf_genes = [x for x in ucsf_genes if x in available_genes]
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print('Total reduced gene space: {}'.format(len(ucsf_genes)))

Total reduced gene space: 409


In [123]:
sub = temp[ucsf_genes].T

#### Normal vs. GTEx

In [124]:
hm = heatmap_values(sub, 'normal', 'gtex')

In [125]:
plot_heatmap(hm, xlabel='Normal', ylabel='GTEx', 
             title='Average Euclidean Distance UCSF-409 Genes', 
             output_name='hm-ucsf-normal-gtex.js')

#### Tumor vs. GTEx

In [126]:
hm = heatmap_values(sub, 'tumor', 'gtex')

In [127]:
plot_heatmap(hm, xlabel='Tumor', ylabel='GTEx', 
             title='Average Euclidean Distance UCSF-409 Genes', 
             output_name='hm-ucsf-tumor-gtex.js')

#### Tumor vs. Normal

In [128]:
hm = heatmap_values(sub, 'normal', 'tumor')

In [129]:
plot_heatmap(hm, xlabel='Normal', ylabel='Tumor', 
             title='Average Euclidean Distance UCSF-409 Genes', 
             output_name='hm-ucsf-tumor-normal.js')

# ComBat Corrected Dataset

In [None]:
# Build the ComBat-corrected expression frame: read the per-tissue table found
# in each entry of exp_dir (visited in sorted order) and concatenate them
# column-wise.
exp_dir = '../../data/expression-CBNT-PC/'
tissues = sorted(os.listdir(exp_dir))
tsv = 'expression-CBNT-PC.tsv'
exp_df = pd.concat([pd.read_csv(os.path.join(exp_dir, t, tsv), sep='\t', index_col=0) for t in tissues], axis=1)

In [None]:
# Collapse duplicated column labels (keep the first entry per label), then keep
# only columns whose label starts with 'GTEX' or ends in '01' / '11'.
exp_df = exp_df.T.groupby(level=0).first().T
samples = [x for x in exp_df.columns if x.startswith('GTEX') or (x.endswith('01') or x.endswith('11'))]
# NOTE(review): this rebinds `df`, so the log-normalized frame from the earlier
# sections is no longer reachable; re-running earlier cells after this one
# would operate on the ComBat data instead.
df = exp_df[samples]

### Normal vs. GTEx

In [None]:
hm = heatmap_values(df, 'normal', 'gtex')

In [None]:
# Render the heatmap inline, then export it for static embedding.
h = HeatMap(hm, x='t1', y='t2', values='dist', stat=None,
            xlabel='TCGA Normal', ylabel='GTEx',
            legend=False, responsive=True,
            title='Average Euclidean Distance Across all Protein Coding Genes')
show(h)

# NOTE(review): the script path given to autoload_static omits the "-combat"
# part of the file name written below -- confirm the tag will point at the
# file as it is actually deployed.
js, tag = autoload_static(h, CDN, "js/bokeh-heatmaps/heatmap-normal-gtex.js")
with open("heatmap-normal-gtex-combat.js", 'w') as f:
    f.write(js)
with open("heatmap-normal-gtex-combat.tag", 'w') as f:
    f.write(tag)

### Tumor vs. GTEx

In [None]:
hm = heatmap_values(df, 'tumor', 'gtex')

In [None]:
# Render the heatmap inline, then export it for static embedding.
h = HeatMap(hm, x='t1', y='t2', values='dist', stat=None,
            xlabel='TCGA Tumor', ylabel='GTEx',
            legend=False, responsive=True,
            title='Average Euclidean Distance Across all Protein Coding Genes')
show(h)

# NOTE(review): the script path given to autoload_static omits the "-combat"
# part of the file name written below -- confirm the tag will point at the
# file as it is actually deployed.
js, tag = autoload_static(h, CDN, "js/bokeh-heatmaps/heatmap-tumor-gtex.js")
with open("heatmap-tumor-gtex-combat.js", 'w') as f:
    f.write(js)
with open("heatmap-tumor-gtex-combat.tag", 'w') as f:
    f.write(tag)

### Normal vs. Tumor

In [None]:
hm = heatmap_values(df, 'normal', 'tumor')

In [None]:
# Render the normal-vs-tumor heatmap inline, then export it for static embedding.
# hm comes from heatmap_values(df, 'normal', 'tumor'): 't1' (x axis) holds the
# normal side and 't2' (y axis) the tumor side, so the y axis is labelled as
# tumor rather than GTEx.
h = HeatMap(hm, x='t1', y='t2', values='dist', stat=None,
            xlabel='TCGA Normal', ylabel='TCGA Tumor',
            legend=False, responsive=True,
            title='Average Euclidean Distance Across all Protein Coding Genes')
show(h)

# The embed tag must reference this heatmap's own script. The path follows the
# naming used by the two sibling export cells (file name without "-combat")
# instead of pointing at the normal-vs-GTEx script.
js, tag = autoload_static(h, CDN, "js/bokeh-heatmaps/heatmap-normal-tumor.js")
with open("heatmap-normal-tumor-combat.js", 'w') as f:
    f.write(js)
with open("heatmap-normal-tumor-combat.tag", 'w') as f:
    f.write(tag)