# UMAP Sample Selection Comparison Analysis

Compare samples that are distinct from their counterpart match tissue and compare outlier results

# Inputs

In [23]:
import rnaseq_lib3 as r
import pandas as pd
import scipy.stats as st
import pymc3 as pm
import numpy as np
import time
import pickle
import os
import umap
from tqdm import tqdm_notebook as tqdm

import seaborn as sns
import matplotlib.pyplot as plt

import hvplot.pandas
import holoviews as hv
hv.extension('bokeh')

In [3]:
# Read in centered data
df = pd.read_hdf('/mnt/data/expression/tcga_gtex_tpm_norm_filt.hd5')
# Subset
gtex = df[df.label == 'gtex'].sort_values('tissue')
normal = df[df.label == 'tcga-normal'].sort_values('tissue')
tumor = df[df.label == 'tcga-tumor'].sort_values('tissue')
# Read in drug genes
genes = df.columns[5:]
drug_genes = [x.split('\t')[0] for x in open('../data/druggable-genes.tsv', 'r').readlines()]
drug_genes = [x for x in drug_genes if x in df.columns]

# UMAP of Adrenal

In [19]:
adrenal = df[df.tissue == 'Adrenal']
label_vec = adrenal['label'].reset_index(drop=True)

In [47]:
embedding = umap.UMAP().fit_transform(adrenal[genes])
embedding = pd.DataFrame(embedding, columns=['UMAP-1', 'UMAP-2'])
embedding['label'] = label_vec
embedding['sample'] = adrenal.index

scatter = hv.Scatter(data=embedding, kdims=['UMAP-1'], 
                     vdims=['UMAP-2', 'label', 'sample'],
                     label='UMAP of Adrenal in TCGA and GTEx')

In [74]:
%%opts Scatter [legend_position='left', color_index='label', width=700, height=500 tools=['hover']]
%%opts Scatter (size=5, alpha=0.75 cmap='Set1')
scatter * hv.VLine(-1.25).options(color='red', line_dash='dashed')

## Select samples from both groups

In [60]:
tgroup = scatter.data[scatter.data.label == 'tcga-tumor']
group1 = tgroup[tgroup['UMAP-1'] < -2]
group2 = tgroup[tgroup['UMAP-1'] > -2]

Run 10 samples from each group

In [72]:
g1_samples = np.random.choice(group1['sample'], 10, replace=False)
g2_samples = np.random.choice(group2['sample'], 10, replace=False)
print('Group 1 (Clustered away from GTEx)')
print('\n'.join(g1_samples))
print('\nGroup 2 (Clustered near GTEx)')
print('\n'.join(g2_samples))

Group 1 (Clustered away from GTEx)
TCGA-S7-A7WV-01
TCGA-RW-A67X-01
TCGA-WB-A80V-01
TCGA-WB-A81N-01
TCGA-QR-A6H0-01
TCGA-QT-A5XM-01
TCGA-WB-A820-01
TCGA-XG-A823-01
TCGA-S7-A7WR-01
TCGA-SA-A6C2-01

Group 2 (Clustered near GTEx)
TCGA-OR-A5LE-01
TCGA-OR-A5K4-01
TCGA-OR-A5LS-01
TCGA-OR-A5J3-01
TCGA-OR-A5JL-01
TCGA-OR-A5LM-01
TCGA-PA-A5YG-01
TCGA-OR-A5JV-01
TCGA-OR-A5LK-01
TCGA-OR-A5JW-01


## Compare Pearson Correlation of Drug Genes

In [154]:
sample_dir = '/mnt/outlier-runs/adrenal-cluster/'
far_dir = os.path.join(sample_dir, 'cluster-away')
near_dir = os.path.join(sample_dir, 'cluster-near')

In [151]:
from collections import OrderedDict

In [157]:
ppp = OrderedDict()
for sample_name in tqdm(os.listdir(far_dir)):
    m, t = r.outlier.load_model(os.path.join(far_dir, sample_name, 'model.pkl'))
    ppc = r.outlier.ppc(t, drug_genes)
    sample = tumor.loc[sample]
    ppp[sample_name] = r.outlier.posterior_predictive_pvals(sample, ppc)
    
for sample_name in tqdm(os.listdir(near_dir)):
    m, t = r.outlier.load_model(os.path.join(near_dir, sample_name, 'model.pkl'))
    ppc = r.outlier.ppc(t, drug_genes)
    sample = tumor.loc[sample_name]
    ppp[sample_name] = r.outlier.posterior_predictive_pvals(sample, ppc)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

NameError: name 'sample' is not defined

## Compare Ranks


## Compare Weights

# UMAP of All Tissues

In [101]:
umaps = {}
for tissue in tqdm(df.tissue.unique()):
    sub = df[df.tissue == tissue]
    if sub[sub.label == 'gtex'].empty or sub[sub.label == 'tcga-tumor'].empty:
        continue
    embedding = umap.UMAP().fit_transform(sub[genes])
    embedding = pd.DataFrame(embedding, columns=['UMAP-1', 'UMAP-2'])
    embedding['label'] = sub['label'].reset_index(drop=True)
    embedding['sample'] = sub.index
    embedding['subtype'] = sub['subtype'].reset_index(drop=True)

    umaps[tissue] = hv.Scatter(data=embedding, kdims=['UMAP-1'], 
                               vdims=['UMAP-2', 'label', 'sample', 'subtype'],
                               label='UMAP of TCGA and GTEx')

HBox(children=(IntProgress(value=0, max=39), HTML(value='')))

In [103]:
%%opts Scatter [legend_position='bottom_left' color_index='subtype', width=550, height=500 tools=['hover']]
%%opts Scatter (size=5, alpha=0.75 cmap='Set1')
hv.HoloMap(umaps, kdims='Tissue')

# GTEx UMAP

In [145]:
embedding = umap.UMAP().fit_transform(gtex[genes])
embedding = pd.DataFrame(embedding, columns=['UMAP-1', 'UMAP-2'])
embedding['label'] = gtex['label'].reset_index(drop=True)
embedding['sample'] = gtex.index
embedding['subtype'] = gtex['subtype'].reset_index(drop=True)
gtex_umap = hv.Scatter(data=embedding, kdims=['UMAP-1'], 
                       vdims=['UMAP-2', 'label', 'sample', 'subtype'],
                       label='UMAP of GTEx')

  n_components



In [149]:
%%opts Scatter [width=900 height=500 color_index='subtype' tools=['hover'] show_legend=False]
%%opts Scatter (cmap='Set1' size=3 alpha=0.5)
gtex_umap