In [3]:
import data
import seaborn as sns
import numpy as np
import pandas as pd
import patsy
%matplotlib inline

In [4]:
import pystan
import stanity

## load files for all cell types

In [5]:
# List the distinct cell subsets present in the metadata table.
subsets = data.prep_filename_metadata()
# Take the unique 'SubSet' values once; the earlier version also evaluated
# .unique() on a line of its own and threw the result away.
all_types = list(subsets['SubSet'].unique())
print(all_types)

['CD4_Th2', 'CD8_Effector', 'B_CD5', 'B_Memory', 'CD4_Th1', 'CD4_Naive', 'B_Naive', 'CD4_Effector_Memory', 'CD8_Central_Memory', 'CD4_Central_Memory', 'CD4_Treg', 'CD8_Naive', 'CD4_Th17']


In [6]:
# Load the data for every subset in all_types, handing the loader the
# metadata frame (`subsets`) built in the previous cell.
df = data.load_by_cell_type(all_types, metadata=subsets)

In [7]:
# Run the project's annotation/prep step on the loaded frame.
# NOTE(review): `df` is rebound to the helper's output, so re-running this
# cell on its own feeds already-prepared data back into the helper.
# Restart & Run All to be safe.
df = data.prep_annotated_data(df)

In [8]:
# Sanity check: no row may be missing its rescaled expression value.
assert df['log1p_tpm_rescaled'].notnull().all()

## prep gene ids

For model estimation, we first need to map each `gene_name` to a numeric ID. The IDs start at 1 (category code + 1).


In [9]:
# Encode gene names as a categorical, then turn the 0-based category codes
# into 1-based integer ids.
df['gene_cat'] = df['gene_name'].astype('category')
df['gene_id'] = df['gene_cat'].cat.codes.add(1)

In [10]:
# 0/1 indicator columns for the two coarse classes. Any row whose cell_type
# is not exactly 'B' is flagged as T_cell, so the two flags always sum to 1.
# Vectorised comparisons replace the per-row Python lambda.
df['B_cell'] = (df['cell_type'] == 'B').astype(int)
df['T_cell'] = (df['cell_type'] != 'B').astype(int)

In [11]:
# Preview the first rows to eyeball the id and indicator columns added above.
df.head()

Unnamed: 0,sample_id,filename,gene_name,est_counts,tpm,log1p_tpm,log1p_counts,CCR6,CCR7,CD127,...,CXCR3,SubSet,cell_type,log1p_tpm_rescaled_type,log1p_tpm_rescaled_subset,log1p_tpm_rescaled,gene_cat,gene_id,B_cell,T_cell
0,1,ERR431566,A1BG,56.74329,6.931783,2.070878,4.056007,,,,...,,CD4_Th2,CD4,-5.599303,-3.517738,-5.47818,A1BG,1,0,1
1,1,ERR431566,A1CF,5.19492,0.111335,0.105562,1.82373,,,,...,,CD4_Th2,CD4,-1.164584,-1.52902,-0.735391,A1CF,2,0,1
2,1,ERR431566,A2M,10.000009,3.877857,1.584706,2.397896,,,,...,,CD4_Th2,CD4,1.338389,-0.920287,1.188265,A2M,3,0,1
3,1,ERR431566,A2ML1,9.38766,1.860813,1.051106,2.340619,,,,...,,CD4_Th2,CD4,1.092135,1.947468,0.895292,A2ML1,4,0,1
4,1,ERR431566,A2MP1,31.00003,3.441394,1.490968,3.465737,,,,...,,CD4_Th2,CD4,1.945796,4.135726,1.532385,A2MP1,5,0,1


## sample genes for first pass

In [12]:
# Pick 100 distinct genes for a quick first-pass fit. The fixed random_state
# makes the draw repeatable across kernel restarts, so everything derived
# from it starts from the same subset each time.
sampled_genes = (
    df.drop_duplicates(subset='gene_name')
      .sample(n=100, random_state=0)
      .loc[:, 'gene_name']
)
# Inner-join back onto the full frame to keep every observation of the
# sampled genes.
sample_df = pd.merge(df, sampled_genes.to_frame(), on='gene_name', how='inner')

In [13]:
# Preview the subset: rows are the per-sample observations of the sampled genes.
sample_df.head()

Unnamed: 0,sample_id,filename,gene_name,est_counts,tpm,log1p_tpm,log1p_counts,CCR6,CCR7,CD127,...,CXCR3,SubSet,cell_type,log1p_tpm_rescaled_type,log1p_tpm_rescaled_subset,log1p_tpm_rescaled,gene_cat,gene_id,B_cell,T_cell
0,1,ERR431566,AC007679.1,2.18135,1.26437,0.817297,1.157306,,,,...,,CD4_Th2,CD4,2.454948,1.332365,2.346799,AC007679.1,475,0,1
1,2,ERR431579,AC007679.1,0.0,0.0,0.0,0.0,,,,...,,CD4_Th2,CD4,-0.669696,-1.260268,-0.639787,AC007679.1,475,0,1
2,3,ERR431600,AC007679.1,2.00136,1.02808,0.70709,1.099066,,,,...,,CD4_Th2,CD4,2.03361,0.982766,1.944078,AC007679.1,475,0,1
3,4,ERR431615,AC007679.1,2.0,1.04805,0.716888,1.098612,,,,...,,CD4_Th2,CD4,2.071072,1.013849,1.979884,AC007679.1,475,0,1
4,5,ERR431628,AC007679.1,0.0,0.0,0.0,0.0,,,,...,,CD4_Th2,CD4,-0.669696,-1.260268,-0.639787,AC007679.1,475,0,1


In [14]:
def prep_stan_data(df, sample_n=None, random_state=None):
    """Assemble the data dictionary passed to the Stan model.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (sample, gene) observation. Must contain the columns
        ``gene_name``, ``sample_id``, ``B_cell``, ``T_cell`` and
        ``est_counts``. The frame is never modified.
    sample_n : int, optional
        If truthy, keep only the observations of ``sample_n`` randomly chosen
        distinct genes. If ``None`` (or 0), every row is used.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample`` so the gene draw can be made
        repeatable. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    dict
        ``N`` (rows), ``G`` (distinct genes), ``S`` (distinct samples),
        ``C`` (columns of the class design matrix), ``gene`` and ``sample``
        (1-based integer ids per row), ``x`` (design matrix as a DataFrame)
        and ``y`` (integer counts per row).
    """
    if sample_n:
        sampled_genes = (
            df.drop_duplicates(subset='gene_name')
              .sample(n=sample_n, random_state=random_state)
              .loc[:, 'gene_name']
        )
        sample_df = pd.merge(df, sampled_genes.to_frame(), on='gene_name', how='inner')
    else:
        # Copy so the helper columns added below do not leak into (or trigger
        # chained-assignment warnings on) the caller's frame.
        sample_df = df.copy()

    # Re-index genes and samples within the (possibly reduced) data so the ids
    # are contiguous and 1-based.
    sample_df['new_gene_cat'] = sample_df['gene_name'].astype('category')
    sample_df['new_gene_id'] = sample_df['new_gene_cat'].cat.codes + 1
    sample_df['new_sample_cat'] = sample_df['sample_id'].astype('category')
    sample_df['new_sample_id'] = sample_df['new_sample_cat'].cat.codes + 1

    # '0 +' drops the intercept, leaving one indicator column per class.
    x = patsy.dmatrix('0 + B_cell + T_cell', data=sample_df, return_type='dataframe')

    stan_data = {
        'N': len(sample_df.index),
        'G': len(sample_df.new_gene_id.unique()),
        'S': len(sample_df.new_sample_id.unique()),
        # Derived from the design matrix instead of hard-coded, so it cannot
        # drift out of sync with the formula above.
        'C': x.shape[1],
        'gene': sample_df.new_gene_id.values,
        'sample': sample_df.new_sample_id.values,
        'x': x,
        # astype(int) truncates fractional estimated counts toward zero.
        'y': sample_df.est_counts.astype(int).values,
    }
    return stan_data

    

## fit model at level of cell-type

In [15]:
# Build the Stan input from the subset.
# NOTE(review): sample_df was already cut down to 100 sampled genes above, so
# sample_n=100 here presumably just re-selects those same genes. Confirm the
# second sampling pass is intended.
stan_data = prep_stan_data(sample_df, sample_n=100)

In [16]:
# Stan program for the cell-type-level model.
#   * Likelihood: y[n] ~ Poisson with log-rate
#       log_sum_exp(log_gene_base[g], log_gene_mult[g] + log(x[n] . theta[g]))
#     i.e. rate = exp(log_gene_base[g]) + exp(log_gene_mult[g]) * (x[n] . theta[g]),
#     where g = gene[n].
#   * theta[g] is constrained to be >= 0 and given a normal(0, 1) prior;
#     log_gene_base and log_gene_mult are unconstrained with normal(0, 1) priors.
# The text below is passed verbatim as model_code to data.cached_stan_fit.
# NOTE(review): editing it, even just a `//` comment, changes that argument and
# presumably invalidates any cached fit. Confirm in data.cached_stan_fit.
alt_stan_code2 = '''
data {
    // dimensions
    int<lower=1> N;  // N obs
    int<lower=1> G;  // N genes
    int<lower=1> S;  // N samples
    int<lower=0> C;  // N classes (e.g. B-cell, T-cell, B_Naive, CD5, CD45RO, etc)
                        // classes should be mutually exclusive. So, each row here should sum to 1
   
    // data
    int<lower=1, upper=G> gene[N];    // gene id for each obs
    int<lower=1, upper=S> sample[N];  // sample id for each obs
    matrix<lower=0, upper=1>[N, C] x; // map each obs to each class (0:'- or ?', 1:'+')
    int<lower=0> y[N];                // count/tpm for each obs
}
parameters {
    vector<lower=0>[C] theta[G];   // loading factors for each gene, for each cell type
    real log_gene_base[G];      // constant intercept expression level for each gene, irrespective of cell type
    real log_gene_mult[G];      // multiplicative factor for each gene. IE if counts are 0, 2, 4, 6
}
model {
    real log_exp[N];
    for (i in 1:G)
        theta[i] ~ normal(0, 1);
    log_gene_base ~ normal(0, 1);
    log_gene_mult ~ normal(0, 1);
    for (n in 1:N)
        log_exp[n] = log_sum_exp(log_gene_base[gene[n]], log_gene_mult[gene[n]] + log(x[n,]*theta[gene[n],]));
    y ~ poisson_log(log_exp);
}
'''

In [17]:
# Fit the model through the project's caching wrapper: 4 chains of 500
# iterations each.
# NOTE(review): no sampler seed is passed here. Check whether
# data.cached_stan_fit sets one before relying on draw-for-draw reproducibility.
alt_fit2 = data.cached_stan_fit(model_code=alt_stan_code2, data=stan_data, iter=500, chains=4)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_abce4d17dc3e64f8cb42af45b34f33e5 NOW.


NOT reusing model.
Ran in 1746.026 sec.


The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


In [18]:
# Print the fit's text summary (posterior mean/sd/quantiles, n_eff and Rhat
# per parameter).
print(alt_fit2)

Inference for Stan model: anon_model_abce4d17dc3e64f8cb42af45b34f33e5.
4 chains, each with iter=500; warmup=250; thin=1; 
post-warmup draws per chain=250, total post-warmup draws=1000.

                    mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
theta[0,0]          0.54    0.01   0.44   0.02    0.2   0.44   0.76   1.64   1000    1.0
theta[1,0]          0.94    0.02   0.54   0.13   0.53   0.86   1.29   2.05   1000    1.0
theta[2,0]           1.3    0.02   0.59   0.34   0.86   1.24   1.68   2.58   1000    1.0
theta[3,0]          0.65    0.02   0.48   0.02   0.28   0.55   0.92   1.82   1000    1.0
theta[4,0]          0.55    0.01   0.34   0.03   0.29    0.5   0.77   1.28    855   1.01
theta[5,0]          0.39    0.01   0.39   0.01   0.11   0.26   0.54   1.46   1000    1.0
theta[6,0]           1.2    0.02   0.57   0.27   0.78   1.13   1.58   2.42   1000    1.0
theta[7,0]          1.53    0.02   0.62   0.56   1.07   1.47   1.91   2.93   1000    1.0
theta[8,0]   