# Tumor Biomarkers: One Sig DE Gene
For samples with sufficient sample size

In [1]:
# Development convenience: load IPython's autoreload extension so edited modules
# are re-imported automatically instead of requiring a kernel restart.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before running code.
%autoreload 2
# Explicitly mark the project library rnaseq_lib for auto-reloading.
%aimport rnaseq_lib

In [2]:
from collections import defaultdict
import pandas as pd
import rnaseq_lib as r
import numpy as np

import holoviews as hv
hv.extension('bokeh')

In [3]:
# Combined TCGA + GTEx expression and metadata table: tab-separated, first
# column used as the row index, column dtypes supplied by rnaseq_lib.
df_path = '/mnt/rnaseq-cancer/Objects/tcga-gtex-metadata-expression.tsv'
df = pd.read_csv(
    df_path,
    sep='\t',
    index_col=0,
    dtype=r.tissues.dtype,
)

In [4]:
# Holoviews plot object from rnaseq_lib, built on the expression/metadata frame.
# Later cells read `h.tissues` and call `h.sample_counts()` and
# `h.gene_distribution(gene)` on it.
h = r.plot.Holoview(df)

In [5]:
# Differential-expression table: tab-separated, first column is the row index,
# and the first two rows form a two-level column header, which later cells
# address as (tissue, feature), e.g. de[tissue, 'l2fc_gtex'].
de = pd.read_csv(
    'de-table.tsv',
    sep='\t',
    index_col=0,
    header=[0, 1],
)

In [None]:
# Select genes whose 'l2fc_gtex' value exceeds the threshold in exactly one tissue
# and whose 'exp_gtex' / 'exp_tumor' values in that tissue pass the filters below.
candidates = []   # only one DE tissue
# Filter parameters tuned to get top 50
l2_filter = 2      # a tissue counts as DE when its 'l2fc_gtex' value is above this
norm_filter = 5    # 'exp_gtex' in the DE tissue must be below this
exp_filter = 8     # 'exp_tumor' in the DE tissue must be above this
# Plot values (not used by the cells shown here; kept so the namespace is unchanged)
x, y = [], []
for gene, row in de.iterrows():
    # Collect the passing tissues directly instead of counting with
    # len(filter(...)) (a TypeError on Python 3) and then locating the tissue
    # with np.argmax (which returns the position of a NaN when one is present).
    # NaN > threshold is False, so missing values never count as DE.
    # The loop variable is deliberately not called `x`, so the list above is not shadowed.
    de_tissues = [t for t in h.tissues if row[t, 'l2fc_gtex'] > l2_filter]
    if len(de_tissues) != 1:
        continue
    tissue = de_tissues[0]
    if row[tissue, 'exp_gtex'] < norm_filter and row[tissue, 'exp_tumor'] > exp_filter:
        candidates.append(gene)

In [None]:
def extract_feature(feature, table=None, tissues=None):
    """Reshape one feature of the DE table into a long (tidy) DataFrame.

    Parameters
    ----------
    feature : str
        Second-level column label to extract, e.g. ``'l2fc_gtex'``.
    table : pandas.DataFrame, optional
        Table with two-level ``(tissue, feature)`` columns, indexed by gene.
        Defaults to the notebook-level ``de`` table.
    tissues : sequence, optional
        First-level column labels to include, in output order.
        Defaults to ``h.tissues``.

    Returns
    -------
    pandas.DataFrame
        Columns ``value``, ``gene`` and ``tissue`` with one row per
        (tissue, gene) pair, grouped tissue by tissue.
    """
    if table is None:
        table = de
    if tissues is None:
        tissues = h.tissues

    values, genes, labels = [], [], []
    for tissue in tissues:
        column = table[tissue, feature]
        values.extend(column.tolist())
        genes.extend(column.index.tolist())
        # Repeat the label once per gene so it lines up with the rows just
        # appended; the previous gene-major ordering mislabelled the tissues.
        labels.extend([tissue] * len(column))

    return pd.DataFrame(
        {'value': values, 'gene': genes, 'tissue': labels},
        columns=['value', 'gene', 'tissue'],
    )

In [None]:
# Sort candidates by l2fc, highest first.
# `l2` has one row per (tissue, gene) pair, so each candidate gene appears once
# per tissue. After sorting, drop_duplicates keeps only a gene's first
# (highest-valued) row, so no gene can be repeated in the ranked list that the
# cells below slice into groups of ten.
l2 = extract_feature('l2fc_gtex')
candidates = (
    l2[l2.gene.isin(candidates)]
    .sort_values('value', ascending=False)
    .gene
    .drop_duplicates()
)

## Sample Counts

In [9]:
%%opts Bars [xrotation=80]
# Display the sample-count plot from the Holoview helper; the %%opts line above
# rotates the x-axis labels of its Bars element.
h.sample_counts()

## Top 10

In [10]:
%%opts Overlay [tabs=True]
# Expression distributions for the candidates ranked 1-10, each labelled with its gene.
top_genes = candidates[:10]
dists = [h.gene_distribution(name).relabel(name) for name in top_genes]
hv.Overlay(dists)

## Top 11-20

In [11]:
%%opts Overlay [tabs=True]
# Expression distributions for the candidates ranked 11-20, each labelled with its gene.
top_genes = candidates[10:20]
dists = [h.gene_distribution(name).relabel(name) for name in top_genes]
hv.Overlay(dists)

## Top 21-30

In [12]:
%%opts Overlay [tabs=True]
# Expression distributions for the candidates ranked 21-30, each labelled with its gene.
top_genes = candidates[20:30]
dists = [h.gene_distribution(name).relabel(name) for name in top_genes]
hv.Overlay(dists)

## Top 31-40

In [13]:
%%opts Overlay [tabs=True]
# Expression distributions for the candidates ranked 31-40, each labelled with its gene.
top_genes = candidates[30:40]
dists = [h.gene_distribution(name).relabel(name) for name in top_genes]
hv.Overlay(dists)

## Top 41-50

In [14]:
%%opts Overlay [tabs=True]
# Expression distributions for the candidates ranked 41-50, each labelled with its gene.
top_genes = candidates[40:50]
dists = [h.gene_distribution(name).relabel(name) for name in top_genes]
hv.Overlay(dists)