In [4]:
import data
import models
import cache
import seaborn as sns
import numpy as np
import pandas as pd
import patsy
from matplotlib import pyplot as plt

In [5]:
# Apply seaborn's global defaults with the 'talk' plotting context
# (affects every figure drawn after this cell).
sns.set(context='talk')

In [6]:
# --- run configuration (everything below reads these four names) ---

# Stan model to fit; resolved to a source file by models.get_model_file().
model_name = 'model5.3.1'

# Passed as `sample_n=` to models.prep_sample_df() when building sample_df.
sample_n = 500

# Passed as `by=` to models.prep_stan_data() and as `gene_id=` to
# models.prep_omega_summary(); presumably a column name -- verify in models.
by = 'SubSet'

# Passed to the Stan program as `nu`, which its data block declares as the
# hyper-parameter for the lkj_corr prior on Omega.
nu = 2

## sample data for analysis

In [7]:
# Build the analysis sample by calling models.prep_sample_df(sample_n=sample_n)
# through cache.cached; per the log output of this cell, the result is
# loaded from a pickle cache keyed on sample_n when one already exists.
sample_df = cache.cached(models.prep_sample_df, sample_n=sample_n)

INFO:stancache.stancache:prep_sample_df: cache_filename set to prep_sample_df.cached.sample_n_500.pkl
INFO:stancache.stancache:prep_sample_df: Loading result from cache


## fit model

In [8]:
# Assemble the data handed to the Stan fit (`data=stan_data` below) from the
# sampled frame, the `by` key and `nu`. Later cells read stan_data['x'].columns,
# so the 'x' entry is expected to be DataFrame-like.
stan_data = models.prep_stan_data(sample_df, by=by, nu=nu)

In [11]:
# Resolve the Stan source file for `model_name`, then echo its contents so the
# model being fitted is visible alongside the results.
model_file = models.get_model_file(model_name=model_name)
model_source = cache._read_file(model_file)
print(model_source)

## neg binom parameterization
## estimate correlation matrix among cell types
data {
    // dimensions
    int<lower=1> N;  // N obs
    int<lower=1> G;  // N genes
    int<lower=1> S;  // N samples
    int<lower=0> C;  // N classes (e.g. B-cell, T-cell, B_Naive, CD5, CD45RO, etc)
                     //     note: classes should be mutually exclusive. Each row here should sum to 1
    // int<lower=0> M; // number of cell-level predictors 
   
    // data for each gene*sample
    int<lower=1, upper=G> gene[N];    // gene id for each obs
    int<lower=1, upper=S> sample[N];  // sample id for each obs
    vector<lower=0, upper=1>[C] x[N]; // map each obs to each class (0:'- or ?', 1:'+')
    int<lower=0> y[N];                // count/tpm for each obs
    
    int<lower=1> nu;  // hyper-parameter for lkj_corr prior in Omega.
    
}
transformed data {
    int sample_y[S, G];    // array (size SxG) of ints
    vector[C] sample_x[S]; // array (size S) of vectors[C]
    for (n in 1:N) {
      

In [12]:
# Compile (or fetch from the stancache cache) and fit the Stan model.
# NOTE(review): in the recorded run this cell raises ValueError while parsing the
# model: line 43 of the .stan file adds a row vector to a matrix
# (`theta_mu' + (diag_pre_multiply(tau, L_Omega) * z)'`), which Stan has no
# operator+ signature for. Every later cell depends on `model_fit` and was never
# executed. Likely fix, to be confirmed in the .stan file: broadcast explicitly,
# e.g. `rep_matrix(theta_mu', G) + ...`, assuming theta_mu has length C.
model_fit = models.cached_stan_fit(file=model_file, data=stan_data, model_name=model_name)

INFO:stancache.stancache:Step 1: Get compiled model code, possibly from cache
INFO:stancache.stancache:StanModel: cache_filename set to model5_3_1.cython_0_25_1.model_code_4310098347088349196.pystan_2_12_0_0.stanmodel.pkl
INFO:stancache.stancache:StanModel: Starting execution


ValueError: Failed to parse Stan model 'model5_3_1_feab693e7af487810de210e832c17043'. Error message:
SYNTAX ERROR, MESSAGE(S) FROM PARSER:

No matches for: 

  row vector + matrix

Available argument signatures for operator+:

  int + int
  real + real
  vector + vector
  row vector + row vector
  matrix + matrix
  vector + real
  row vector + real
  matrix + real
  real + vector
  real + row vector
  real + matrix
  +int
  +real
  +vector
  +row vector
  +matrix

expression is ill formed

ERROR at line 43

 41:        {
 42:            matrix[G, C] log_theta;
 43:            log_theta = theta_mu' + (diag_pre_multiply(tau, L_Omega) * z)';
                                                                              ^
 44:            theta = exp(log_theta);



## check convergence (superficially)

In [None]:
# Print the fit summary restricted to `lp__` as a first, coarse convergence check.
models.print_stan_summary(model_fit, pars='lp__')

In [None]:
# Plot the Rhat metric for the `theta` parameters.
models.plot_stan_summary(model_fit, pars='theta', metric='Rhat')

## summarize posterior draws of theta by gene

In [None]:
# Plot metadata is derived from the design matrix instead of being hard-coded,
# so the cells below stay valid whichever model was run.
x_columns = stan_data['x'].columns
colnames = list(x_columns)

# The first column of `x` is the one the rank columns below are keyed on.
sort_by = colnames[0]
print(sort_by)

In [None]:
# Summarise theta from the fit into a long-format frame. The cells below rely on
# its 'value', 'variable' and 'new_gene_cat' columns and on rank columns whose
# names end in `sort_by` (e.g. 'mean_value_rank_<sort_by>').
theta_ldf = models.prep_theta_summary(model_fit, sample_df=sample_df, colnames=colnames, expose_group=sort_by)

In [None]:
## show theta estimates for first 50 genes, by `sort_by`
value_rank_col = 'mean_value_rank_{}'.format(sort_by)

# Keep rows ranked 50 or better on that column, ordered by the same rank.
top50_theta = (
    theta_ldf
    .loc[theta_ldf[value_rank_col] <= 50, :]
    .sort_values(value_rank_col)
)

g = sns.boxplot(
    data=top50_theta,
    x='value',
    y='new_gene_cat',
    hue='variable',
    fliersize=0,
    width=2,
    linewidth=0.2,
)

In [None]:
## zoom in on the highest-ranked genes by `sort_by` difference from average
## across all cell types
# Rows are selected on the abs-diff rank (<= 10) but ordered by the
# signed-diff rank, so two different rank columns are involved.
abs_diff_rank_col = 'mean_abs_diff_rank_{}'.format(sort_by)
diff_rank_col = 'mean_diff_rank_{}'.format(sort_by)

top_diff_theta = (
    theta_ldf
    .loc[theta_ldf[abs_diff_rank_col] <= 10, :]
    .sort_values(diff_rank_col)
)

g = sns.boxplot(
    data=top_diff_theta,
    x='value',
    y='new_gene_cat',
    hue='variable',
    fliersize=0,
    linewidth=0.2,
)

## posterior-predictive checking for selected genes

In [None]:
# identify top_genes by name: rows ranked 10 or better on the abs-diff rank,
# one entry per distinct 'new_gene_cat' (first occurrence kept)
abs_diff_rank_col = 'mean_abs_diff_rank_{}'.format(sort_by)
is_top_ranked = theta_ldf[abs_diff_rank_col] <= 10

top_rows = theta_ldf.loc[is_top_ranked, :].drop_duplicates(subset='new_gene_cat')
top_genes = top_rows['new_gene_cat'].values
print(top_genes)

In [None]:
# get yrep draws, restricted to the first three entries of top_genes
yrep_genes = top_genes[:3]
yrep_df = models.prep_yrep_summary(
    model_fit,
    sample_df=sample_df,
    filter_genes=yrep_genes,
)

In [None]:
# plot estimates & observed values for the top genes, by Subset
# NOTE(review): n_genes=2 is passed here, whereas yrep_df was prepared for
# top_genes[0:3] and this comment used to say "top 3" -- confirm which is meant.
models.plot_posterior_predictive_checks(
    model_fit=model_fit,
    plot_genes=top_genes,
    sample_df=sample_df,
    yrep_df=yrep_df,
    n_genes=2,
)


## summarize posterior draws for `Omega`

In [None]:
# Summarise the posterior draws of the 'Omega' parameter from the fit.
# NOTE(review): `gene_id` receives `by` (set to 'SubSet' in the config cell) --
# confirm that this is the key prep_omega_summary expects.
omega_summary = models.prep_omega_summary(stan_fit=model_fit, stan_data=stan_data, par='Omega', gene_id=by)

In [None]:
# Heatmap of the Omega summary, columns selected and ordered as in stan_data['x'].
# The 'paper' plotting context applies to this figure only.
heatmap_cols = list(stan_data['x'].columns)
with sns.plotting_context('paper'):
    sns.heatmap(omega_summary.loc[:, heatmap_cols])