In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
import seaborn as sns

import cache
import data
import models
%matplotlib inline



In [2]:
# Apply seaborn's "talk" plotting context to the figures drawn below.
sns.set(context="talk")

In [3]:
# Run configuration read by the cells below.
model_name = "model7"  # selects the Stan file (models.get_model_file) and names the fit
by = "cell_type"       # passed as `by=` to models.prep_stan_data and `gene_id=` to prep_omega_summary
sample_n = 500         # forwarded as `sample_n` to models.prep_sample_df

## sample data for analysis

In [4]:
# Build the analysis sample through the caching wrapper; the recorded log
# shows the result being loaded from a pickle whose name includes `sample_n`.
sample_df = cache.cached(
    models.prep_sample_df,
    sample_n=sample_n
)

INFO:cache:prep_sample_df: cache_filename set to prep_sample_df.cached.sample_n_31194724242.pkl
INFO:cache:prep_sample_df: Loading result from cache


## fit model

In [5]:
# Assemble the inputs handed to the Stan fit (`data=stan_data` below).
# Later cells read `stan_data['x'].columns` for the class column names.
stan_data = models.prep_stan_data(
    sample_df,
    by=by
)

In [15]:
# Resolve the Stan program for `model_name` and echo its source so the
# notebook records exactly which model was fitted.
model_file = models.get_model_file(model_name=model_name)
# NOTE(review): `_read_file` is a private helper of the `cache` module.
stan_source = cache._read_file(model_file)
print(stan_source)

## neg binom parameterization
## estimate correlation matrix among cell types
data {
    // dimensions
    int<lower=1> N;  // N obs
    int<lower=1> G;  // N genes
    int<lower=1> S;  // N samples
    int<lower=0> C;  // N classes (e.g. B-cell, T-cell, B_Naive, CD5, CD45RO, etc)
                     //     note: classes should be mutually exclusive. Each row here should sum to 1
    int<lower=0> M; // number of cell-level predictors 
   
    // data for each gene*sample
    int<lower=1, upper=G> gene[N];    // gene id for each obs
    int<lower=1, upper=S> sample[N];  // sample id for each obs
    vector<lower=0, upper=1>[C] x[N]; // map each obs to each class (0:'- or ?', 1:'+')
    int<lower=0> y[N];                // count/tpm for each obs
    
    // group-level predictors for each class C
    matrix[C, M] cell_features; 
}
transformed data {
    int sample_y[S, G];    // array (size SxG) of ints
    vector[C] sample_x[S]; // array (size S) of vectors[C]
    for (n in 1:N) {
    

In [18]:
# Compile the Stan program (the log shows the compiled model is cached) and
# fit it with 500 iterations on 4 chains.
# NOTE(review): the recorded run of this cell stopped with a Stan parser
# error at `to_matrix(rep_array(theta_mu, G))` -- `to_matrix` has no
# signature accepting `vector[]`. Presumably `rep_matrix(theta_mu, G)` was
# intended for the `matrix[C, G]` target; confirm and fix in the model file.
model_fit = models.cached_stan_fit(
    file=model_file,
    data=stan_data,
    iter=500,
    chains=4,
    model_name=model_name
)

INFO:cache:Step 1: Get compiled model code, possibly from cache
INFO:cache:StanModel: cache_filename set to model7.model_code_57672475544.stanmodel.pkl
INFO:cache:StanModel: Starting execution


ValueError: Failed to parse Stan model 'model7_2d87d54763056c8a795979bddb0c11e8'. Error message:
SYNTAX ERROR, MESSAGE(S) FROM PARSER:

No matches for: 

  to_matrix(vector[])

Available argument signatures for to_matrix:

  to_matrix(matrix)
  to_matrix(vector)
  to_matrix(row vector)
  to_matrix(real[,])
  to_matrix(int[,])


ERROR at line 44

 42:        { 
 43:            matrix[C, G] tmp_theta_mu;
 44:            tmp_theta_mu = to_matrix(rep_array(theta_mu, G));
                                                                ^
 45:            theta = tmp_theta_mu + cell_features*theta_coefs_per_gene + (diag_pre_multiply(tau,L_Omega) * z)';



## check convergence (superficially)

In [None]:
# Plot the Rhat metric for the `theta` parameters of the fit.
models.plot_stan_summary(
    model_fit,
    pars='theta',
    metric='Rhat'
)

In [None]:
# Print the fit summary restricted to `lp__`.
models.print_stan_summary(
    model_fit,
    pars='lp__'
)

## summarize posterior draws of theta by gene

In [None]:
# Plotting metadata derived from the fitted data rather than hard-coded,
# so the cells below do not depend on which model/grouping was run.
class_columns = stan_data['x'].columns
colnames = list(class_columns)
# The first class column is the one used for ranking/sorting below.
sort_by = colnames[0]
print(sort_by)

In [None]:
# Posterior draws of theta summarised in long form. The cells below read the
# columns `new_gene_cat`, `variable`, `value` and the `*_rank_<sort_by>`
# rank columns from this frame.
theta_ldf = models.prep_theta_summary(
    model_fit,
    sample_df=sample_df,
    colnames=colnames,
    expose_group=sort_by
)

In [None]:
## theta estimates for rows ranked <= 50 on `sort_by`, ordered by that rank
value_rank_col = 'mean_value_rank_{}'.format(sort_by)
in_top_50 = theta_ldf[value_rank_col] <= 50
top_50_ldf = theta_ldf.loc[in_top_50, :].sort_values(value_rank_col)
g = sns.boxplot(data=top_50_ldf,
                x='value',
                y='new_gene_cat',
                hue='variable',
                fliersize=0, width=2, linewidth=0.2)

In [None]:
## zoom in on the genes ranked highest on `sort_by` difference from the
## average across all cell types
# Rows are selected on the `mean_abs_diff_rank_*` column but ordered by the
# `mean_diff_rank_*` column.
abs_diff_rank_col = 'mean_abs_diff_rank_{}'.format(sort_by)
diff_rank_col = 'mean_diff_rank_{}'.format(sort_by)
in_top_10 = theta_ldf[abs_diff_rank_col] <= 10
most_distinct_ldf = theta_ldf.loc[in_top_10, :].sort_values(diff_rank_col)
g = sns.boxplot(data=most_distinct_ldf,
                x='value',
                y='new_gene_cat',
                hue='variable',
                fliersize=0, linewidth=0.2)

## posterior-predictive checking for selected genes

In [None]:
# Summarise the yrep (posterior-predictive) draws from the fit. The plotting
# cell below reads `gene_cat`, `SubSet`, `cell_type` and `pp_est_counts`
# from the result.
yrep_df = models.prep_yrep_summary(
    model_fit,
    sample_df=sample_df
)

In [None]:
# Names of the genes ranked <= 10 on absolute difference for `sort_by`,
# de-duplicated (first occurrence kept) in the frame's existing row order.
abs_diff_rank = theta_ldf['mean_abs_diff_rank_{}'.format(sort_by)]
top_gene_names = theta_ldf.loc[abs_diff_rank <= 10, 'new_gene_cat']
top_genes = top_gene_names.drop_duplicates().values
print(top_genes)

In [None]:
# Posterior-predictive check for the first three entries of `top_genes`:
# one panel per gene, with boxplots of the replicated counts
# (`pp_est_counts`) per SubSet and the observed counts (`est_counts`)
# overlaid as black points.
# Requires `matplotlib.pyplot` to be imported as `plt` in the import cell.
# NOTE(review): `top_genes` holds `new_gene_cat` values while the filters
# below match on `gene_cat` -- confirm the two columns share the same labels.
with sns.plotting_context('talk'):
    f, axarr = plt.subplots(1, 3, sharey=True)
    # zip pairs each panel with its gene and stops at the shorter input, so
    # fewer than three genes simply leaves the remaining panels empty.
    for ax, gene_name in zip(axarr, top_genes[0:3]):
        g = sns.boxplot(data=yrep_df.loc[yrep_df['gene_cat'] == gene_name, :],
                        y='SubSet',
                        hue='cell_type',
                        x='pp_est_counts',
                        ax=ax,
                        fliersize=0, linewidth=0.2)
        sns.swarmplot(data=sample_df.loc[sample_df['gene_cat'] == gene_name, :],
                      y='SubSet', ax=ax,
                      x='est_counts', color='black')
        # vertical tick labels keep the count axis readable in narrow panels
        plt.setp(ax.get_xticklabels(), rotation='vertical')
        ax.set_title(gene_name)


## summarize posterior draws for `theta_mu`

In [None]:
# Long-form summary of the posterior draws for `theta_mu`; plotted below
# using its `variable` and `value` columns.
mu_ldf = models.prep_theta_mu_summary(
    stan_fit=model_fit,
    stan_data=stan_data,
    par='theta_mu'
)

In [None]:
# Distribution of the `theta_mu` draws, one box per `variable`.
sns.boxplot(
    data=mu_ldf,
    x='variable',
    y='value'
)

## summarize posterior draws for `Omega`

In [None]:
# Summary of the posterior draws for `Omega`; the heatmap below selects the
# class columns from this frame.
omega_summary = models.prep_omega_summary(
    stan_fit=model_fit,
    stan_data=stan_data,
    par='Omega',
    gene_id=by
)

In [None]:
# Heatmap of the Omega summary restricted to the class columns of the
# design matrix, drawn in the smaller "paper" context.
omega_class_columns = list(stan_data['x'].columns)
omega_by_class = omega_summary.loc[:, omega_class_columns]
with sns.plotting_context('paper'):
    sns.heatmap(omega_by_class)