In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import stanity
from matplotlib import pyplot as plt

import cache
import data
import models

%matplotlib inline

# Use seaborn's 'paper' plotting context as the default for the figures below.
sns.set(context='paper')



## load files for all cell types

In [2]:
# Load the annotated data through the project's cache helper; the log line
# below shows the result being served from cache when one is available.
df = cache.cached(data.prep_annotated_data)

INFO:cache:prep_annotated_data: Loading result from cache


In [3]:
# Sanity check: the rescaled expression column must not contain missing values.
# The vectorised .notnull().all() avoids iterating the Series element by element
# with the builtin all(), and the message makes a failure self-explanatory.
assert df['log1p_tpm_rescaled'].notnull().all(), \
    "log1p_tpm_rescaled contains missing values"

## prep data for a sample of genes

In [4]:
# Restrict the data to a sample of genes (sample_n=500) for model fitting.
# NOTE(review): if prep_sample_df samples at random, confirm a seed is fixed so
# the sample (and everything fitted on it below) is reproducible.
sample_df = models.prep_sample_df(df, sample_n=500)

## fit4: model at level of cell-type, without correlation matrix

In [5]:
# Build the data passed to Stan, grouped by the 'SubSet' column
# (the cell-type level, per the section heading).
stan_data = models.prep_stan_data(sample_df, by='SubSet')

In [6]:
# Locate the Stan source for model4 and print it for reference.
# NOTE(review): _read_file is a private helper of the cache module.
stan_file4 = models.get_model_file('model4')
print(cache._read_file(stan_file4))

## try neg binom parameterization
data {
    // dimensions
    int<lower=1> N;  // N obs
    int<lower=1> G;  // N genes
    int<lower=1> S;  // N samples
    int<lower=0> C;  // N classes (e.g. B-cell, T-cell, B_Naive, CD5, CD45RO, etc)
                     // note: classes should be mutually exclusive. Each row here should sum to 1
   
    // data
    int<lower=1, upper=G> gene[N];    // gene id for each obs
    int<lower=1, upper=S> sample[N];  // sample id for each obs
    vector<lower=0, upper=1>[C] x[N]; // map each obs to each class (0:'- or ?', 1:'+')
    int<lower=0> y[N];                // count/tpm for each obs
}
transformed data {
    int sample_y[S, G]; // array (size SxG) of ints
    vector[C] sample_x[S]; // array (size S) of vectors[C]
    for (n in 1:N) {
        sample_y[sample[n], gene[n]] = y[n];
        sample_x[sample[n]] = x[n,];
    }
}
parameters {
    matrix<lower=0>[G, C] theta; // loading factors for each gene, for each cell type
    vector[G] log_gene_base;

In [None]:
# Fit model4 (iter=500, chains=4) through the caching wrapper; per the log
# below it fetches the compiled model and the posterior draws from cache
# when possible.
# NOTE(review): no seed is passed here -- confirm whether the wrapper fixes one.
fit4 = cache.cached_stan_fit(model_name='model4', file=stan_file4, data=stan_data, iter=500, chains=4)

INFO:cache:Step 1: Get compiled model code, possibly from cache
INFO:cache:StanModel: Loading result from cache
INFO:cache:Step 2: Get posterior draws from model, possibly from cache
INFO:cache:sampling: Starting execution


## fit4: superficial check of convergence

In [None]:
# Plot the Rhat metric for the theta parameters of fit4.
models.plot_stan_summary(fit4, pars='theta', metric='Rhat')

In [None]:
# Print the summary for lp__ (Stan's log posterior density) as a second quick check.
models.print_stan_summary(fit4, pars='lp__')

## fit4: expression factors by gene & cell type

In [None]:
# Class names, taken from the columns of the 'x' entry of the Stan data.
colnames = list(stan_data['x'].columns)

In [None]:
# Use the first class as the group whose rank columns drive the plots below.
sort_by = colnames[0]
print(sort_by)

In [None]:
# Summarise the theta draws from fit4 for plotting.
# NOTE(review): presumably expose_group is what produces the '*_rank_<group>'
# columns used by the cells below -- confirm in models.prep_theta_summary.
theta_ldf = models.prep_theta_summary(fit4,
                                    colnames=colnames,
                                    sample_df=sample_df,
                                    expose_group=sort_by)

In [None]:
# Box plot of theta by class for the genes whose mean-value rank (for the
# sort_by class) is at most 50, ordered by that rank.
value_rank_col = 'mean_value_rank_{}'.format(sort_by)
theta_top_by_value = theta_ldf.loc[theta_ldf[value_rank_col] <= 50, :]
theta_top_by_value = theta_top_by_value.sort_values(value_rank_col)
g = sns.boxplot(
    data=theta_top_by_value,
    y='new_gene_cat',
    x='value',
    hue='variable',
    fliersize=0,
    width=2,
    linewidth=0.2,
)

In [None]:
# Box plot of theta by class for the genes whose absolute-difference rank
# (for the sort_by class) is at most 10.
# NOTE(review): rows are selected on 'mean_abs_diff_rank_*' but ordered by
# 'mean_diff_rank_*' -- confirm the signed ordering is intended.
abs_diff_rank_col = 'mean_abs_diff_rank_{}'.format(sort_by)
diff_rank_col = 'mean_diff_rank_{}'.format(sort_by)
theta_top_by_diff = theta_ldf.loc[theta_ldf[abs_diff_rank_col] <= 10, :]
theta_top_by_diff = theta_top_by_diff.sort_values(diff_rank_col)
g = sns.boxplot(
    data=theta_top_by_diff,
    y='new_gene_cat',
    x='value',
    hue='variable',
    fliersize=0,
    linewidth=0.2,
)

## fit4: review posterior predictions for sample genes

In [None]:
# Summarise posterior predictive draws from fit4; sample_kwds passes frac=0.5
# through to the helper.
# NOTE(review): presumably frac=0.5 subsamples half of the draws -- confirm in
# models.prep_yrep_summary.
yrep_df = models.prep_yrep_summary(fit4, sample_df=sample_df, sample_kwds=dict(frac=0.5))

In [None]:
# Unique gene labels among the rows whose absolute-difference rank (for the
# sort_by class) is at most 10.
top_genes_mask = theta_ldf['mean_abs_diff_rank_{}'.format(sort_by)] <= 10
top_genes = (
    theta_ldf.loc[top_genes_mask, :]
    .drop_duplicates(subset='new_gene_cat')['new_gene_cat']
    .values
)

In [None]:
# For the first three top genes, draw one panel each: box plots of the posterior
# predictive counts per SubSet, overlaid with the observed counts in black.
# NOTE(review): top_genes holds 'new_gene_cat' labels but rows are matched on
# 'gene_cat' here -- confirm the two columns share values.
with sns.plotting_context('talk'):
    f, axarr = plt.subplots(1, 3, sharey=True)
    # Pair each gene with its own axis instead of tracking a manual counter.
    for ax, gene_name in zip(axarr, top_genes[0:3]):
        predicted_rows = yrep_df.loc[yrep_df['gene_cat'] == gene_name, :]
        observed_rows = sample_df.loc[sample_df['gene_cat'] == gene_name, :]
        g = sns.boxplot(
            data=predicted_rows,
            y='SubSet',
            x='pp_est_counts',
            ax=ax,
            fliersize=0,
            linewidth=0.2,
        )
        sns.swarmplot(
            data=observed_rows,
            y='SubSet',
            ax=ax,
            x='est_counts',
            color='black',
        )
        plt.setp(ax.get_xticklabels(), rotation='vertical')
        ax.set_title(gene_name)


## fit5: model including estimated correlation matrix

In [None]:
# Locate the Stan source for model5 and print it for reference.
# NOTE(review): _read_file is a private helper of the cache module.
stan_file5 = models.get_model_file('model5')
print(cache._read_file(stan_file5))

In [None]:
# Fit model5 on the same data and with the same settings as fit4 (iter=500,
# chains=4). This calls cache.cached_stan_fit, the same entry point the fit4
# cell uses, rather than going through the models module, so both fits are
# produced the same way.
fit5 = cache.cached_stan_fit(model_name='model5', file=stan_file5, data=stan_data, iter=500, chains=4)

In [None]:
# Print the summary of the theta parameters for fit5.
models.print_stan_summary(fit5, pars='theta')

## fit5: review posterior estimates of theta_mu

In [None]:
# Extract the posterior draws of theta_mu from fit5.
mu_ex = fit5.extract('theta_mu')['theta_mu']

In [None]:
# One row per draw and one column per class; the positional index is exposed
# as an 'iter' column.
mu_df = (
    pd.DataFrame(mu_ex, columns=list(stan_data['x'].columns))
    .reset_index()
    .rename(columns={'index': 'iter'})
)

In [None]:
# Reshape to long format: one row per (iter, class), with the class name in
# 'variable' and the draw in 'value' (pd.melt's default column names).
mu_ldf = pd.melt(mu_df, id_vars='iter', value_vars=list(stan_data['x'].columns))
mu_ldf.head()

In [None]:
# Distribution of the theta_mu draws for each class.
sns.boxplot(data=mu_ldf, x='variable', y='value')

## fit5: review posterior estimates of Omega

In [None]:
# Extract the draws of Omega from fit5, then replace the integer 'SubSet' index
# with the matching class name (the index is 1-based, hence the -1).
omega_colnames = list(stan_data['x'].columns)
omega_df = models.extract_theta_summary(
    stan_fit=fit5,
    colnames=omega_colnames,
    gene_id='SubSet',
    par='Omega',
)
omega_df['SubSet'] = omega_df['SubSet'].apply(lambda idx: omega_colnames[idx - 1])

In [None]:
# Preview the first rows of the Omega draws.
omega_df.head()

In [None]:
# Mean of every numeric column per class (one row per 'SubSet').
# GroupBy.mean is used instead of apply(np.mean): np.mean on a DataFrame calls
# DataFrame.mean(axis=None), which returns a single scalar in pandas >= 2.0 and
# would collapse the per-class matrix. numeric_only=True keeps any non-numeric
# columns out of the average on every pandas version.
omega_summary = omega_df.groupby('SubSet').mean(numeric_only=True)
print(omega_summary)

In [None]:
# Heatmap of the mean Omega matrix. Rows and columns are both indexed by the
# same ordered list of class names: groupby() sorts its keys, so taking the rows
# as-is would only line up with the columns when the class names happen to be
# sorted already.
omega_classes = list(stan_data['x'].columns)
with sns.plotting_context('paper'):
    sns.heatmap(omega_summary.loc[omega_classes, omega_classes])

## fit5: review posterior estimates of tau

In [None]:
# Extract the posterior draws of tau from fit5.
tau_ex = fit5.extract('tau')['tau']

In [None]:
# Check the dimensions of the extracted draws before building a frame from them.
tau_ex.shape

In [None]:
# Wide frame of tau draws (one column per class, positional index exposed as
# 'iter'), followed by its long-format version for plotting.
tau_classes = list(stan_data['x'].columns)
tau_df = (
    pd.DataFrame(tau_ex, columns=tau_classes)
    .reset_index()
    .rename(columns={'index': 'iter'})
)
tau_ldf = pd.melt(tau_df, id_vars='iter', value_vars=tau_classes)

In [None]:
# Distribution of the tau draws for each class.
sns.boxplot(data=tau_ldf, x='variable', y='value')

## compare loo output for the models with (fit5) & without (fit4) correlation matrix

In [None]:
# Compute PSIS-LOO for each fit from its 'log_lik' draws.
# NOTE(review): this requires both Stan models to expose a 'log_lik' quantity --
# the model4 listing above is truncated, so confirm.
loo4 = stanity.psisloo(fit4.extract('log_lik')['log_lik'])
loo5 = stanity.psisloo(fit5.extract('log_lik')['log_lik'])

In [None]:
# Compare the two LOO estimates (fit4: no correlation matrix, fit5: with).
stanity.loo_compare(loo4, loo5)