# Treehouse PDX Samples

Run Treehouse PDX samples

# Inputs

In [31]:
import pandas as pd

import numpy as np
import seaborn as sns
import rnaseq_lib3 as r

In [7]:
df = pd.read_hdf('/mnt/data/expression/tcga_gtex_tpm_norm_filt.hd5')

Process raw input

In [13]:
path = '/mnt/treehouse-pdx-outliers/Vivian-PDX-RSEM-TPM.tsv'
pdx = pd.read_csv(path, sep='\t', index_col=0)
pdx = pdx.T
# Subset genes
pdx = pdx[df.columns[5:]]
# Normalize
pdx = pdx.apply(lambda x: np.log2(x + 1))

In [20]:
pdx.to_csv('/mnt/treehouse-pdx-outliers/pdx-samples-norm-filt.tsv', sep='\t')

# Run Samples

In [33]:
genes = df.columns[5:]

In [32]:
alpha = pdx.loc['alpha']
gtex = pd.read_hdf('/mnt/data/expression/gtex.hd5')

In [36]:
r.outlier.pairwise_distance_ranks(alpha, gtex, genes, 'tissue')

Unnamed: 0,Group,MedianRank
0,Cells,216.0
1,Uterus,736.5
2,Cervix,835.0
3,Ovary,875.0
4,Artery,1122.5
5,Adipose,1838.0
6,Colon,1889.0
7,Esophagus,2131.0
8,Breast,2291.0
9,Fallopian,2298.0


In [18]:
import subprocess

In [56]:
sample = '/mnt/treehouse-pdx-outliers/pdx-samples-norm-filt.tsv'
background = '/mnt/data/expression/gtex.hd5'
gene_list = '/mnt/Bayesian-Outlier-Model/data/drug-gene-list.txt'
out_dir = '/mnt/treehouse-pdx-outliers/'
col_skip = '5'

## Alpha

In [None]:
sample_names = ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta', 'iota']
outs, errs = [], []
for sample_name in sample_names:
    call = ['outlier-model',
            '--sample', sample,
            '--background', background,
            '--name', sample_name,
            '--gene-list', gene_list,
            '--out-dir', out_dir,
            '--col-skip', col_skip]
    p = subprocess.Popen(call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    outs.append(out)
    errs.append(err)

In [53]:
def get_sample(df_path: str, sample_name: str) -> pd.Series:
    """
    Loads dataframe containing sample and returns sample

    Args:
        df_path: Path to DataFrame containing sample
        sample_name: Name of sample in the index of the DataFrame

    Returns:
        Sample vector
    """
    if df_path.endswith('.csv'):
        df = pd.read_csv(df_path, index_col=0)
    elif df_path.endswith('.tsv'):
        df = pd.read_csv(df_path, sep='\t', index_col=0)
    else:
        try:
            df = pd.read_hdf(df_path)
        except:
            raise RuntimeError(f"Failed to open DataFrame: {df_path}")

    if sample_name in df.index:
        return df.loc[sample_name]
    else:
        raise RuntimeError(f"Sample {sample_name} not located in index of DataFrame {df_path}")


def load_df(df_path: str) -> pd.DataFrame:
    """
    Load background DataFrame

    Args:
        df_path: Path to DataFrame

    Returns:
        Background DataFrame
    """
    if df_path.endswith('.csv'):
        df = pd.read_csv(df_path, index_col=0)
    elif df_path.endswith('.tsv'):
        df = pd.read_csv(df_path, sep='\t', index_col=0)
    else:
        try:
            df = pd.read_hdf(df_path)
        except:
            raise RuntimeError(f"Failed to open DataFrame: {df_path}")
    return df


from sklearn.metrics import pairwise_distances
def pairwise_distance_ranks(sample: pd.Series, df: pd.DataFrame, genes, group: str) -> pd.DataFrame:
    """
    Calculate pairwise distance, rank, and take group median

    Args:
        sample: n-of-1 sample. Gets own label
        df: background dataset
        genes: genes to use for pairwise distance
        group: Column to use as class discriminator

    Returns:
        DataFrame of pairwise distance ranks
    """
    dist = pairwise_distances(np.array(sample[genes]).reshape(1, -1), df[genes])
    dist = pd.DataFrame([dist.ravel(), df[group]]).T
    dist.columns = ['Distance', 'Group']
    dist = dist.sort_values('Distance')

    # Pandas-FU
    dist = dist.reset_index(drop=True).reset_index()
    return dist.groupby('Group').apply(lambda x: x['index'].median()).sort_values().reset_index(name='MedianRank')

In [55]:
sample = get_sample(sample, 'alpha')

AttributeError: 'Series' object has no attribute 'endswith'

In [42]:
df = load_df(background)

In [44]:
group = 'tissue'

In [45]:
df = df.sort_values(group)

In [54]:
ranks = pairwise_distance_ranks(sample, df, genes, group)

In [61]:
n_bg = 5 if 5 < len(ranks) else len(ranks)

In [63]:
train_set = df[df[group].isin(ranks.head(n_bg).Group)]