In [None]:
import data
import seaborn as sns
import numpy as np
import pandas as pd
import patsy
%matplotlib inline

In [2]:
import pystan
import stanity

## load files for all cell types

In [3]:
# List the distinct cell subsets (values of the 'SubSet' column) in the metadata.
subsets = data.prep_filename_metadata()
# A bare `subsets['SubSet'].unique()` in the middle of a cell is never displayed,
# so the unique values are computed once and printed explicitly.
all_types = list(subsets['SubSet'].unique())
print(all_types)

['CD4_Th2', 'CD8_Effector', 'B_CD5', 'B_Memory', 'CD4_Th1', 'CD4_Naive', 'B_Naive', 'CD4_Effector_Memory', 'CD8_Central_Memory', 'CD4_Central_Memory', 'CD4_Treg', 'CD8_Naive', 'CD4_Th17']


In [4]:
# Load the data for every subset named in `all_types`, passing along the
# metadata frame built above.
df = data.load_by_cell_type(
    all_types,
    metadata=subsets,
)

In [5]:
# Rebind `df` to the output of data.prep_annotated_data.
# NOTE(review): presumably this adds the annotation / rescaled columns used
# below (e.g. 'log1p_tpm_rescaled') — confirm in data.prep_annotated_data.
# Because `df` is overwritten, re-running this cell feeds already-prepared
# data back into the helper.
df = data.prep_annotated_data(df)

In [6]:
# Sanity check: the rescaled expression column must not contain missing values.
assert df['log1p_tpm_rescaled'].notnull().all()

## prep gene ids

For model estimation, we first need to map each `gene_name` to a numeric ID. The IDs start at 1, since Stan arrays are indexed from 1.


In [7]:
# Encode gene names as a categorical, then shift the 0-based category codes
# by one so that gene ids start at 1.
df['gene_cat'] = df['gene_name'].astype('category')
df['gene_id'] = df['gene_cat'].cat.codes.add(1)

## prep cell-type ids

In [8]:
# 0/1 indicator columns: a row is a B cell when cell_type == 'B'; every other
# cell_type is treated as a T cell. Vectorised comparisons replace the
# per-row Python lambda, which is slow on a frame with one row per
# gene x sample; the resulting int64 0/1 values are the same.
df['B_cell'] = (df['cell_type'] == 'B').astype('int64')
df['T_cell'] = (df['cell_type'] != 'B').astype('int64')

In [9]:
# 1-based cell-type id derived from the B_cell flag:
# rows with B_cell == 0 get id 1, rows with B_cell == 1 get id 2.
df['celltype_id'] = df['B_cell'].add(1)

In [10]:
# Preview the first rows, including the gene_id / B_cell / T_cell /
# celltype_id columns added in the cells above.
df.head()

Unnamed: 0,sample_id,filename,gene_name,est_counts,tpm,log1p_tpm,log1p_counts,CCR6,CCR7,CD127,...,SubSet,cell_type,log1p_tpm_rescaled_type,log1p_tpm_rescaled_subset,log1p_tpm_rescaled,gene_cat,gene_id,B_cell,T_cell,celltype_id
0,1,ERR431566,A1BG,56.74329,6.931783,2.070878,4.056007,,,,...,CD4_Th2,CD4,-5.599303,-3.517738,-5.47818,A1BG,1,0,1,1
1,1,ERR431566,A1CF,5.19492,0.111335,0.105562,1.82373,,,,...,CD4_Th2,CD4,-1.164584,-1.52902,-0.735391,A1CF,2,0,1,1
2,1,ERR431566,A2M,10.000009,3.877857,1.584706,2.397896,,,,...,CD4_Th2,CD4,1.338389,-0.920287,1.188265,A2M,3,0,1,1
3,1,ERR431566,A2ML1,9.38766,1.860813,1.051106,2.340619,,,,...,CD4_Th2,CD4,1.092135,1.947468,0.895292,A2ML1,4,0,1,1
4,1,ERR431566,A2MP1,31.00003,3.441394,1.490968,3.465737,,,,...,CD4_Th2,CD4,1.945796,4.135726,1.532385,A2MP1,5,0,1,1


## prep sample data

The example LDA code from the Stan manual uses a document-topic modeling framework, in which several word counts are observed per document & used to infer topics.

We will apply this to our dataset as an example / POC, using the following translations:

* document -> sample
* word -> gene
* topic -> cell type

The data used in the manual's example include one record per *word* observed; i.e., a word that appears 10 times results in 10 records in the dataset.

To simulate this, we will first select a random subset of genes, and modify the Stan code to effectively "expand" our observations by the number of counts observed.


We will wrap this in a function `prep_stan_data` so that ids assigned within the function can be passed to the `stan_data` dict.

In [15]:
def prep_stan_data(df, sample_genes=None, y_col='est_counts', random_state=None):
    """Assemble the data dictionary passed to the labeled-LDA Stan model.

    Rows whose integer-truncated ``y_col`` value is not positive are dropped,
    the remaining genes are optionally sub-sampled, and gene names are
    re-coded to contiguous 1-based ids so they can index the Stan arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with columns ``gene_name``, ``gene_id``,
        ``sample_id``, ``celltype_id`` and ``y_col``. It is not modified.
    sample_genes : int or None
        If truthy, the number of distinct genes (by ``gene_id``) to keep,
        drawn at random from the genes that have a positive count.
    y_col : str
        Column whose values, truncated to int, are used both to filter rows
        and as the ``counts`` handed to Stan.
    random_state : int, numpy RandomState or None
        Forwarded to pandas ``sample`` so the gene subset can be reproduced.

    Returns
    -------
    dict
        Keys ``N``, ``V``, ``M``, ``K``, ``w``, ``doc``, ``counts``,
        ``topic``, ``alpha`` and ``beta``.

    Raises
    ------
    ValueError
        If no row has a positive count.
    """
    # Compute the counts separately instead of writing a 'counts' column into
    # the caller's DataFrame.
    counts = df[y_col].astype(int)
    keep = counts > 0
    new_df = df.loc[keep, :].copy()
    new_df['counts'] = counts[keep].values

    if sample_genes:
        sampled_ids = (new_df['gene_id']
                       .drop_duplicates()
                       .sample(n=sample_genes, random_state=random_state))
        new_df = new_df.loc[new_df['gene_id'].isin(sampled_ids), :].copy()

    if new_df.empty:
        raise ValueError(
            "no rows with a positive integer value in column %r" % (y_col,))

    # Re-code genes after filtering so the ids are contiguous in 1..V.
    new_gene_id = new_df['gene_name'].astype('category').cat.codes + 1

    n_topics = int(new_df['celltype_id'].max())
    n_genes = len(new_gene_id.unique())

    stan_data = {'N': len(new_df.index),
                 'V': n_genes,
                 'M': len(new_df['sample_id'].unique()),
                 'K': n_topics,
                 'w': new_gene_id.values,
                 # sample_id is passed through unchanged.
                 # NOTE(review): the Stan model bounds doc to 1..M, so this
                 # assumes sample ids are already 1..M — confirm.
                 'doc': new_df['sample_id'].values,
                 # Use the same y_col-derived counts that drove the filter
                 # above, so every value is >= 1 as the Stan model requires.
                 'counts': new_df['counts'].values,
                 'topic': new_df['celltype_id'].astype(int).values,
                }
    # Symmetric prior weights: 1/K per topic and 1/V per gene.
    stan_data['alpha'] = np.full(n_topics, 1. / n_topics)
    stan_data['beta'] = np.full(n_genes, 1. / n_genes)
    return stan_data

## (modified) LDA model

The LDA model code below is taken from the Stan Manual (v2.12) with a few modifications:

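1. `topics` are labeled (observed rather than inferred), since the composition for these cells is known
2. the `target += ... ` contribution of each record is weighted by its observed count (equivalent to repeating the statement once per count)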

Otherwise, the model is basically the same.

In [25]:
# NOTE(review): `sample` is not defined in any cell shown above — presumably a
# subset of `df` restricted to randomly chosen genes; confirm and define it
# explicitly so the notebook survives Restart & Run All.
stan_data = prep_stan_data(sample, y_col='tpm')
# Number of distinct genes ("words") in the prepared data.
print(stan_data['V'])

79


In [26]:
# Labeled-topic variant of the Stan manual's LDA example.
# Because the topic of every record is observed, the likelihood of a record is
# simply counts[n] * log(phi[topic[n], w[n]]). The previous formulation filled
# gamma[1..K] with that same value and summed it once per count, which only
# added the constant log(K) per count to lp__ and cost O(counts[n]) per record;
# the posterior over phi is unchanged.
stan_code_lda = '''
data {
    // dimensions
    int<lower=2> K; // num topics
    int<lower=2> V; // num words
    int<lower=1> M; // num docs
    int<lower=1> N; // total word instances

    // observed data per word*doc combination
    int<lower=1,upper=V> w[N];     // word n
    int<lower=1,upper=M> doc[N];   // doc ID for word n (not used in the likelihood)
    int<lower=1> counts[N];        // number of times word w[n] observed in doc[n]
    int<lower=1,upper=K> topic[N]; // known topic for word n; indexes phi, so must be >= 1

    // prior on "words"
    // (no per-document topic prior alpha / theta: topics are observed)
    vector<lower=0>[V] beta; // word prior
}
parameters {
    simplex[V] phi[K]; // word dist for topic k
}
model {
    for (k in 1:K)
        phi[k] ~ dirichlet(beta); // prior
    // likelihood: each record contributes once per observed count
    for (n in 1:N)
        target += counts[n] * log(phi[topic[n], w[n]]);
}
'''

In [None]:
# Fit the Stan program above to the prepared data: 4 chains, 10000 iterations each.
fit = stanity.fit(
    model_code=stan_code_lda,
    data=stan_data,
    iter=10000,
    chains=4,
)

Reusing model.


In [None]:
# Print the fit object's text representation.
print(fit)