In [59]:
%matplotlib inline
import bokeh
from bokeh.io import output_notebook
from bokeh.charts import Scatter, output_file, show, Bar
from bokeh.palettes import Accent, Category20c
from bokeh.resources import CDN
from bokeh.embed import file_html, autoload_static
from bokeh.plotting import reset_output

import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
import pickle

from notebooks import log_progress

# Apply seaborn's 'whitegrid' style.
sns.set_style('whitegrid')

# Clustering in Reduced Gene Space

We'll examine how tissues cluster in a reduced gene space, using genes selected from the UCSF-500 panel (the panel file actually lists 412 genes).

In [25]:
# Root of the expression data and the table file name found under each entry of that root
exp_dir = '../../data/expression-PC/'
tsv = 'combined-gtex-tcga-counts-protein-coding.tsv'
tissues = os.listdir(exp_dir)

# Read one tab-separated table per tissue, then join them column-wise
tissue_frames = [
    pd.read_csv(os.path.join(exp_dir, name, tsv), sep='\t', index_col=0)
    for name in tissues
]
df = pd.concat(tissue_frames, axis=1)

In [26]:
# Collapse columns that share a label into one column (groupby on the label, then first())
df = df.transpose().groupby(level=0).first().transpose()

In [27]:
# Keep columns named GTEX* plus those ending in '01' or '11'
# (labelled TCGA-Tumor / TCGA-Normal further down in this notebook)
samples = [
    col for col in df.columns
    if col.startswith('GTEX') or col.endswith(('01', '11'))
]
df = df[samples]

In [30]:
# Transpose so that samples become rows
df = df.transpose()

In [31]:
# log2(value + 1) transform, applied column by column; the +1 keeps zero values defined
df = df.apply(lambda col: np.log2(col + 1))

## Subset Gene Space

Read in UCSF gene list

In [34]:
# One entry per line; strip surrounding whitespace (including the newline)
with open('../../data/UCSF-RNAPanel-Final-412-genes.csv', 'r') as gene_file:
    ucsf_genes = [line.strip() for line in gene_file]

Load the gene map, which associates each gene ID with a gene name

In [8]:
# Pickles are binary data: open in 'rb' mode (text mode breaks on Python 3) and
# let the `with` block close the handle once the object is loaded.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('../../data/gene_map.pickle', 'rb') as f:
    gene_map = pickle.load(f)

In [36]:
# Translate each column label through gene_map; labels without an entry are kept as-is
genes = [gene_map.get(col, col) for col in df.columns]

In [38]:
# Relabel the columns with the mapped names held in `genes`
df.columns = genes

Keep only the UCSF genes that are present in our gene list

In [43]:
# Keep only the panel genes that appear among the column names.
# A set makes each membership test O(1) instead of rescanning the whole list.
known_genes = set(genes)
ucsf_genes = [gene for gene in ucsf_genes if gene in known_genes]

In [45]:
# Select only the UCSF gene columns from the expression frame
sub = df[ucsf_genes]

Reduce down to 50 components with TruncatedSVD

In [53]:
# Compress the selected gene columns to 50 SVD components (fixed seed)
svd = TruncatedSVD(n_components=50, random_state=0)
y = svd.fit_transform(np.array(sub))

In [64]:
# Embed the SVD-reduced samples in two dimensions with t-SNE (fixed seed)
model = TSNE(n_components=2, random_state=1, perplexity=50, learning_rate=1000, verbose=2)
z = model.fit_transform(np.array(y))

# Pickles are binary; the `with` block closes the handle once the map is loaded.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('../../data/tissue_map.pickle', 'rb') as f:
    tissue_map = pickle.load(f)

# Derive every column from df.index so each row lines up with the matching row of z,
# independent of whatever the notebook-level `samples` list currently holds.
sample_ids = list(df.index)
pdf = pd.DataFrame()
pdf['sample'] = sample_ids
pdf['tissue'] = [tissue_map[s].capitalize() for s in sample_ids]
pdf['x'] = z[:, 0]
pdf['y'] = z[:, 1]

# Sample type from the sample name: GTEX prefix, '01' suffix (tumor), '11' suffix (normal).
# Names matching none of these are skipped, so the assignment below fails loudly
# rather than mislabelling them.
types = [
    'GTEX' if s.startswith('GTEX')
    else 'TCGA-Tumor' if s.endswith('01')
    else 'TCGA-Normal'
    for s in sample_ids
    if s.startswith('GTEX') or s.endswith(('01', '11'))
]
pdf['type'] = types

# Hover fields shown for each point ('@name' refers to a pdf column)
tooltips = [
    ('Tissue', '@tissue'),
    ('Type', '@type'),
    ('Sample', '@sample'),
]

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 10864
[t-SNE] Computed conditional probabilities for sample 2000 / 10864
[t-SNE] Computed conditional probabilities for sample 3000 / 10864
[t-SNE] Computed conditional probabilities for sample 4000 / 10864
[t-SNE] Computed conditional probabilities for sample 5000 / 10864
[t-SNE] Computed conditional probabilities for sample 6000 / 10864
[t-SNE] Computed conditional probabilities for sample 7000 / 10864
[t-SNE] Computed conditional probabilities for sample 8000 / 10864
[t-SNE] Computed conditional probabilities for sample 9000 / 10864
[t-SNE] Computed conditional probabilities for sample 10000 / 10864
[t-SNE] Computed conditional probabilities for sample 10864 / 10864
[t-SNE] Mean sigma: 6.383180
[t-SNE] Iteration 25: error = 1.1559377, gradient norm = 0.0004925
[t-SNE] Iteration 25: gradient norm 0.000492. Finished.
[t-SNE] Iteration 50: erro

In [65]:
# Interactive scatter of the t-SNE embedding, coloured by sample type
p = Scatter(pdf, x='x', y='y', title="t-SNE of GTEx and TCGA RNA-seq Expression",
            xlabel="1", ylabel="2",
            color='type',
            tooltips=tooltips,
            legend=True,
            plot_width=1024, plot_height=1024,
            palette=Accent[3],
            responsive=True)

p.title.align = 'center'

# Standalone HTML copy of the plot
output_file('expression-UCSF-type.html')
show(p)

# autoload_static returns the plot's JavaScript and a <script> tag that loads it
# from the given script path, so that path must name the JS file written here
# (same "js/bokeh/<file>" layout as the per-tissue exports).
js_name = 'expression-UCSF-type.js'
js, tag = autoload_static(p, CDN, "js/bokeh/" + js_name)
with open(js_name, 'w') as f:
    f.write(js)
# Append the embed tag to the shared "tags" file
with open("tags", 'a') as f:
    f.write(tag)

reset_output()

## All Tissues

In [71]:
# Reload the full panel gene list, one entry per line with surrounding whitespace stripped
with open('../../data/UCSF-RNAPanel-Final-412-genes.csv', 'r') as panel_file:
    ucsf_genes = [entry.strip() for entry in panel_file]

def process_and_plot_tissue(df, title):
    """Embed one tissue's samples with t-SNE and export the plot as Bokeh JS.

    Steps: collapse duplicate sample columns, keep the GTEX / '01' / '11'
    samples, restrict to the UCSF panel genes, apply log2(x + 1), reduce to
    50 components with TruncatedSVD, embed in 2-D with t-SNE, then write the
    scatter plot's JavaScript to ``expression-UCSF-<title>.js`` and append
    its <script> tag to the ``tags`` file.

    Relies on the notebook-level ``gene_map`` and ``ucsf_genes``.

    Parameters
    ----------
    df : pandas.DataFrame
        Expression table with samples as columns (genes as rows).
    title : str
        Tissue name; used in the plot title and in the output file names.

    Returns
    -------
    None
    """
    # Remove duplicate columns
    expr = df.T.groupby(level=0).first().T
    # Subset just tumor / normal (plus GTEX) samples, then put samples on the rows
    samples = [x for x in expr.columns
               if x.startswith('GTEX') or x.endswith(('01', '11'))]
    expr = expr[samples].T
    # Rename gene IDs to gene names where the map has an entry
    genes = [gene_map.get(x, x) for x in expr.columns]
    expr.columns = genes
    # Subset genes from UCSF (set lookup avoids rescanning the gene list per panel gene)
    known_genes = set(genes)
    sub = expr[[x for x in ucsf_genes if x in known_genes]]
    # Log normalize
    sub = sub.apply(lambda x: np.log2(x + 1))
    # Reduce to 50 components
    y = TruncatedSVD(n_components=50, random_state=0).fit_transform(np.array(sub))
    # t-SNE
    model = TSNE(n_components=2, random_state=1)
    z = model.fit_transform(np.array(y))

    # Pickles are binary; the `with` block closes the handle after loading.
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    with open('../../data/tissue_map.pickle', 'rb') as f:
        tissue_map = pickle.load(f)

    # Create dataframe for plot (one row per sample, aligned with the rows of z)
    plot_df = pd.DataFrame()
    plot_df['sample'] = samples
    plot_df['tissue'] = [tissue_map[x].capitalize() for x in samples]
    plot_df['x'] = z[:, 0]
    plot_df['y'] = z[:, 1]

    # Get type of samples; `samples` was filtered above, so every name is
    # GTEX-prefixed or ends in '01' / '11'
    plot_df['type'] = [
        'GTEX' if s.startswith('GTEX')
        else 'TCGA-Tumor' if s.endswith('01')
        else 'TCGA-Normal'
        for s in samples
    ]

    # Specify tooltip ('@name' refers to a plot_df column)
    tooltips = [
        ('Tissue', '@tissue'),
        ('Type', '@type'),
        ('Sample', '@sample'),
    ]

    # Plot
    p = Scatter(plot_df, x='x', y='y', title="t-SNE of: " + title,
                xlabel="1", ylabel="2",
                color='type',
                tooltips=tooltips,
                legend=True,
                plot_width=400, plot_height=400,
                palette=Accent[3],
                active_drag="pan",
                active_scroll="wheel_zoom",
                responsive=True)

    # Write the plot JS next to the notebook; the returned tag loads it from js/bokeh/
    js_name = "expression-UCSF-{}.js".format(title)
    js, tag = autoload_static(p, CDN, "js/bokeh/" + js_name)
    with open(js_name, 'w') as f:
        f.write(js)
    # Append the embed tag to the shared "tags" file
    with open("tags", 'a') as f:
        f.write(tag)

In [72]:
# Build and export one t-SNE plot per entry in `tissues`.
# The per-tissue table gets its own name so the combined notebook-level `df`
# is not overwritten by the last tissue processed.
for tissue in log_progress(tissues):
    tissue_df = pd.read_csv(os.path.join(exp_dir, tissue, tsv), sep='\t', index_col=0)
    process_and_plot_tissue(tissue_df, title=tissue)