In [1]:
import data
import seaborn as sns
import numpy as np
import pandas as pd
import patsy
%matplotlib inline



In [2]:
import pystan
import stanity

## load files for all cell types

In [3]:
# Build the filename metadata table and collect the distinct values of its
# 'SubSet' column; `all_types` is the list of subsets loaded in the next step.
# (A stray `subsets['SubSet'].unique()` whose result was discarded was removed.)
subsets = data.prep_filename_metadata()
all_types = list(subsets['SubSet'].unique())
print(all_types)

['CD4_Th2', 'CD8_Effector', 'B_CD5', 'B_Memory', 'CD4_Th1', 'CD4_Naive', 'B_Naive', 'CD4_Effector_Memory', 'CD8_Central_Memory', 'CD4_Central_Memory', 'CD4_Treg', 'CD8_Naive', 'CD4_Th17']


In [4]:
# Load the data for every subset named in `all_types`, handing the filename
# metadata table to the loader (what is read is defined in data.load_by_cell_type).
df = data.load_by_cell_type(all_types, metadata=subsets)

In [5]:
# Replace `df` with the annotated version returned by the project helper.
# NOTE(review): `df` is rebound to the helper's own output, so re-running this
# cell on its own feeds already-prepared data back in — confirm that is safe.
df = data.prep_annotated_data(df)

In [6]:
# Sanity check: the rescaled expression column must contain no missing values.
assert df['log1p_tpm_rescaled'].notnull().all()

## prep gene ids

For model estimation, we first need to map each `gene_name` to a numeric ID. The IDs start at 1 (not 0) because Stan arrays are indexed from 1.


In [7]:
# Encode gene names as a pandas categorical. `.cat.codes` numbers the distinct
# categories from 0, so adding 1 gives ids that start at 1.
df['gene_cat'] = df['gene_name'].astype('category')
df['gene_id'] = df['gene_cat'].cat.codes+1

In [8]:
# 0/1 indicator columns for the two top-level classes. T_cell is defined as
# "cell_type is anything other than 'B'", so every row gets exactly one flag.
# Vectorised comparisons replace the original row-by-row `.apply(lambda ...)`.
df['B_cell'] = (df['cell_type'] == 'B').astype(int)
df['T_cell'] = (df['cell_type'] != 'B').astype(int)

In [9]:
# Preview the first rows, including the gene_id / B_cell / T_cell columns added above.
df.head()

Unnamed: 0,sample_id,filename,gene_name,est_counts,tpm,log1p_tpm,log1p_counts,CCR6,CCR7,CD127,...,CXCR3,SubSet,cell_type,log1p_tpm_rescaled_type,log1p_tpm_rescaled_subset,log1p_tpm_rescaled,gene_cat,gene_id,B_cell,T_cell
0,1,ERR431566,A1BG,56.74329,6.931783,2.070878,4.056007,,,,...,,CD4_Th2,CD4,-5.599303,-3.517738,-5.47818,A1BG,1,0,1
1,1,ERR431566,A1CF,5.19492,0.111335,0.105562,1.82373,,,,...,,CD4_Th2,CD4,-1.164584,-1.52902,-0.735391,A1CF,2,0,1
2,1,ERR431566,A2M,10.000009,3.877857,1.584706,2.397896,,,,...,,CD4_Th2,CD4,1.338389,-0.920287,1.188265,A2M,3,0,1
3,1,ERR431566,A2ML1,9.38766,1.860813,1.051106,2.340619,,,,...,,CD4_Th2,CD4,1.092135,1.947468,0.895292,A2ML1,4,0,1
4,1,ERR431566,A2MP1,31.00003,3.441394,1.490968,3.465737,,,,...,,CD4_Th2,CD4,1.945796,4.135726,1.532385,A2MP1,5,0,1


## sample genes for first pass

In [10]:
# Draw 100 distinct genes for a quick first-pass fit. `random_state` pins the
# draw so that Restart & Run All selects the same genes every time (the
# original call was unseeded and picked a different subset on each run).
sampled_genes = (
    df.drop_duplicates(subset='gene_name')
      .sample(n=100, random_state=0)
      .loc[:, 'gene_name']
)
# Inner-join back onto the full table to keep every observation of those genes.
sample_df = pd.merge(df, pd.DataFrame(sampled_genes), on='gene_name', how='inner')

In [11]:
# Preview the first rows of the gene-subsampled table.
sample_df.head()

Unnamed: 0,sample_id,filename,gene_name,est_counts,tpm,log1p_tpm,log1p_counts,CCR6,CCR7,CD127,...,CXCR3,SubSet,cell_type,log1p_tpm_rescaled_type,log1p_tpm_rescaled_subset,log1p_tpm_rescaled,gene_cat,gene_id,B_cell,T_cell
0,1,ERR431566,ABCG4,0.0,0.0,0.0,0.0,,,,...,,CD4_Th2,CD4,-1.280639,-0.776229,-1.160056,ABCG4,106,0,1
1,2,ERR431579,ABCG4,1.0,0.033301,0.032759,0.693147,,,,...,,CD4_Th2,CD4,-1.114252,-0.623156,-1.037061,ABCG4,106,0,1
2,3,ERR431600,ABCG4,4.0,0.307412,0.26805,1.609438,,,,...,,CD4_Th2,CD4,0.080842,0.476311,-0.153639,ABCG4,106,0,1
3,4,ERR431615,ABCG4,4.0,0.908946,0.646551,1.609438,,,,...,,CD4_Th2,CD4,2.003333,2.24497,1.267481,ABCG4,106,0,1
4,5,ERR431628,ABCG4,0.0,0.0,0.0,0.0,,,,...,,CD4_Th2,CD4,-1.280639,-0.776229,-1.160056,ABCG4,106,0,1


In [12]:
def prep_stan_data(df, sample_n=None, random_state=None):
    """Build the data dictionary consumed by the Stan models in this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with at least the columns ``gene_name``,
        ``sample_id``, ``B_cell``, ``T_cell`` and ``est_counts``.
        The frame is never modified; helper columns are added to a copy.
    sample_n : int, optional
        When given (and non-zero), keep only the rows of ``sample_n`` randomly
        chosen distinct genes. Otherwise every row is used.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the gene draw can be reproduced.
        The default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    dict
        ``N`` (rows), ``G`` (distinct genes), ``S`` (distinct samples),
        ``C`` (design-matrix columns), ``gene`` / ``sample`` (ids starting
        at 1, one per row), ``x`` (design matrix as a DataFrame) and ``y``
        (``est_counts`` cast to int, one per row).
    """
    if sample_n:
        sampled_genes = (
            df.drop_duplicates(subset='gene_name')
              .sample(n=sample_n, random_state=random_state)
              .loc[:, 'gene_name']
        )
        # The merge returns a new frame, so the caller's data stays untouched.
        sample_df = pd.merge(df, pd.DataFrame(sampled_genes), on='gene_name', how='inner')
    else:
        # Copy first: the helper columns added below must not leak into `df`.
        sample_df = df.copy()

    # Re-number genes and samples within the (possibly reduced) data so the ids
    # are contiguous; category codes start at 0, hence the +1.
    sample_df['new_gene_cat'] = sample_df['gene_name'].astype('category')
    sample_df['new_gene_id'] = sample_df['new_gene_cat'].cat.codes + 1
    sample_df['new_sample_cat'] = sample_df['sample_id'].astype('category')
    sample_df['new_sample_id'] = sample_df['new_sample_cat'].cat.codes + 1

    # "0 +" drops the intercept, leaving one indicator column per class.
    x = patsy.dmatrix('0 + B_cell + T_cell', data=sample_df, return_type='dataframe')

    stan_data = {'N': len(sample_df.index),
                 'G': len(sample_df.new_gene_id.unique()),
                 'S': len(sample_df.new_sample_id.unique()),
                 # Taken from the design matrix so C cannot drift from x.
                 'C': x.shape[1],
                 'gene': sample_df.new_gene_id.values,
                 'sample': sample_df.new_sample_id.values,
                 'x': x,
                 # astype(int) truncates fractional estimated counts.
                 'y': sample_df.est_counts.astype(int).values,
                 }
    return stan_data

    

## fit model at level of cell-type

In [13]:
# `sample_df` was already restricted to 100 sampled genes above, so asking for
# sample_n=100 re-draws 100 distinct genes from that subset (i.e. it should
# keep all of them, assuming the subset really holds 100 distinct genes).
stan_data = prep_stan_data(sample_df, sample_n=100)

In [14]:
# Stan program: Poisson model for the integer outcome `y`, parameterised on the
# log scale. For observation n the log-rate is
#   log_sample_base[sample[n]] + log_gene_base[gene[n]] + log(x[n,] * theta[gene[n],])
# with normal(0, 1) priors on both intercept arrays and on every theta[g]
# (theta is declared with lower=0).
# NOTE(review): the in-model comment on `y` says "count/tpm", but `y` is
# declared as an int array, so only integer counts can be passed.
stan_code = '''
data {
    // dimensions
    int<lower=1> N;  // N obs
    int<lower=1> G;  // N genes
    int<lower=1> S;  // N samples
    int<lower=0> C;  // N classes (e.g. B-cell, T-cell, B_Naive, CD5, CD45RO, etc)
                        // classes should be mutually exclusive. So, each row here should sum to 1
   
    // data
    int<lower=1, upper=G> gene[N];    // gene id for each obs
    int<lower=1, upper=S> sample[N];  // sample id for each obs
    matrix<lower=0, upper=1>[N, C] x; // map each obs to each class (0:'- or ?', 1:'+')
    int<lower=0> y[N];                // count/tpm for each obs
}
parameters {
    vector<lower=0>[C] theta[G];   // loading factors for each gene, for each cell type
    real log_gene_base[G];    // constant intercept expression level for each gene, irrespective of cell type
    real log_sample_base[S];  // constant intercept expression level for each sample
}
model {
    real log_exp[N];
    for (i in 1:G)
        theta[i] ~ normal(0, 1);
    log_gene_base ~ normal(0, 1);
    log_sample_base ~ normal(0, 1);
    for (n in 1:N) {
        log_exp[n] = log_sample_base[sample[n]] + log_gene_base[gene[n]] + log(x[n,]*theta[gene[n],]);
    }
    y ~ poisson_log(log_exp);
}
'''

In [15]:
#fit = data.cached_stan_fit(model_code=stan_code, data=stan_data, iter=5000, chains=4)

In [16]:
#print(fit)

In [23]:
# Alternative Stan program: the same Poisson model as `stan_code`, but without
# the per-sample intercept `log_sample_base`. The log-rate for observation n is
#   log_gene_base[gene[n]] + log(x[n,] * theta[gene[n],])
# `S` and `sample` are still declared in the data block (so the same data
# dictionary can be passed) but are not used in the model block.
alt_stan_code = '''
data {
    // dimensions
    int<lower=1> N;  // N obs
    int<lower=1> G;  // N genes
    int<lower=1> S;  // N samples
    int<lower=0> C;  // N classes (e.g. B-cell, T-cell, B_Naive, CD5, CD45RO, etc)
                        // classes should be mutually exclusive. So, each row here should sum to 1
   
    // data
    int<lower=1, upper=G> gene[N];    // gene id for each obs
    int<lower=1, upper=S> sample[N];  // sample id for each obs
    matrix<lower=0, upper=1>[N, C] x; // map each obs to each class (0:'- or ?', 1:'+')
    int<lower=0> y[N];                // count/tpm for each obs
}
parameters {
    vector<lower=0>[C] theta[G];   // loading factors for each gene, for each cell type
    real log_gene_base[G];    // constant intercept expression level for each gene, irrespective of cell type
}
model {
    real log_exp[N];
    for (i in 1:G)
        theta[i] ~ normal(0, 1);
    log_gene_base ~ normal(0, 1);
    for (n in 1:N)
        log_exp[n] = log_gene_base[gene[n]] + log(x[n,]*theta[gene[n],]);
    y ~ poisson_log(log_exp);
}
'''

In [None]:
# Fit `alt_stan_code` to `stan_data` through the project's caching wrapper,
# requesting chains=4 and iter=5000.
# NOTE(review): no seed is passed, so draws presumably differ between runs —
# confirm whether data.cached_stan_fit accepts/forwards a seed.
alt_fit = data.cached_stan_fit(model_code=alt_stan_code, data=stan_data, chains=4, iter=5000)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_22d159166d0f17079f256856d3edd3eb NOW.


NOT reusing model.


In [None]:
# Print the text representation of the fit (presumably the posterior summary table — verify).
print(alt_fit)

In [None]:
# Draw the fit object's default diagnostic plots (presumably per-parameter traces/densities — verify).
alt_fit.plot()