In [29]:
import data
import models
import cache
import seaborn as sns
import numpy as np
import pandas as pd
import patsy
from matplotlib import pyplot as plt

In [2]:
# Use seaborn's 'talk' plotting context for every figure drawn in this notebook.
sns.set(context='talk')

In [28]:
# Name of the Stan model to fit; resolved to a model file via models.get_model_file.
model_name = 'model5'
# Column name passed as `by` to models.prep_stan_data when building the Stan input.
by = 'cell_type'

# Import upenn-tex data (local cache)

In [35]:
def import_upenn_tex(data_path="../upenn-tex/data/inferelator_input/KP_RNAseq_counts.txt",
                    manifest_data_path="../upenn-tex/data/data_manifest.csv"):
    """Load the upenn-tex RNA-seq counts and label every sample with a cell type.

    The wide counts table (one row per transcript, one column per experiment)
    is reshaped to long format, given 1-based integer ids for sample and gene,
    stripped of the ambiguous '2-Mar' transcript, and joined to the RNA-Seq
    rows of the data manifest to attach a `cell_type` label.

    Parameters
    ----------
    data_path : str
        Tab-separated counts file with a `tracking_id` column (gene name) and
        one column per experiment.
    manifest_data_path : str
        CSV manifest with at least the columns `Type` and `File`.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with columns `new_gene_id`, `est_counts`,
        `new_sample_id` and `cell_type`.

    Raises
    ------
    AssertionError
        If a transcript other than '2-Mar' occurs more than once within an
        experiment, or if duplicates remain after '2-Mar' has been removed.
    """
    def one_based_ids(labels):
        # Enumerate the *sorted* distinct labels so the label -> id mapping is
        # the same on every run; iterating a bare set of strings is not
        # guaranteed to give the same order across interpreter sessions.
        # key=str keeps the sort well-defined should a label have been parsed
        # as a non-string (e.g. a missing value).
        lookup = {label: position + 1
                  for position, label in enumerate(sorted(set(labels), key=str))}
        return labels.apply(lambda label: lookup[label])

    def experiment_name_from_file(filename):
        # 'A_B_1.fastq.gz' -> 'AB1': keep the text before the first '.', drop '_'
        return filename.split('.')[0].replace('_', '')

    # --- expression data: wide (transcript x experiment) -> long -------------
    all_experiment_counts = pd.read_csv(data_path, sep="\t")
    # melt (rather than set_index + unstack) carries repeated tracking_id rows
    # such as '2-Mar' through as separate records for the duplicate check below
    upenn_df = pd.melt(all_experiment_counts,
                       id_vars=['tracking_id'],
                       var_name='experiment_name',
                       value_name='est_counts')
    upenn_df = upenn_df.rename(columns={'tracking_id': 'gene'})
    upenn_df = upenn_df.sort_values(by='gene', ascending=False).reset_index(drop=True)

    # numeric id for sample (experiment); 1-indexed like the gene id
    upenn_df['new_sample_id'] = one_based_ids(upenn_df['experiment_name'])

    # --- de-dup '2-Mar' transcript (for now) ---------------------------------
    # keep=False flags every member of a duplicated (experiment, gene) group
    in_duplicate_group = upenn_df.duplicated(subset=['experiment_name', 'gene'], keep=False)
    assert upenn_df.loc[in_duplicate_group, 'gene'].eq('2-Mar').all(), \
        "found duplicated transcripts other than '2-Mar'"
    upenn_df2 = upenn_df.loc[upenn_df['gene'] != '2-Mar', :].copy()

    # gene ids are assigned only after '2-Mar' is gone so they stay contiguous
    upenn_df2['new_gene_id'] = one_based_ids(upenn_df2['gene'])  # gene is 1-indexed in the stan model

    # confirm no duplicates
    assert not upenn_df2.duplicated(subset=['experiment_name', 'gene']).any()

    # --- data manifest: experiment name -> cell type -------------------------
    data_manifest = pd.read_csv(manifest_data_path)
    rna_data_manifest = data_manifest[data_manifest['Type'] == 'RNA-Seq'].copy()
    rna_data_manifest['experiment_name'] = rna_data_manifest['File'].apply(experiment_name_from_file)

    # map cell_type to index labels in rna_data_manifest (labels are inherited
    # from the full manifest, since the RNA-Seq filter does not re-index)
    # n.b. "d160" and "d235" are day 160, 235 after infection; others are day 30
    cell_type_index_dict = {
        'CD8_naive': [0, 1],
        'CD8_exh_untreated': list(range(2, 5)) + [9, 10],
        'CD8_exh_PD-L1': list(range(5, 9)) + [11, 12]
    }
    index_to_cell_type = {row_label: cell_type
                          for cell_type, row_labels in cell_type_index_dict.items()
                          for row_label in row_labels}
    # rows whose label is not listed above get None as their cell type
    rna_data_manifest['cell_type'] = rna_data_manifest.index.map(
        lambda row_label: index_to_cell_type.get(row_label))
    experiment_cell_map = rna_data_manifest[['experiment_name', 'cell_type']]

    # --- merge manifest & expression data ------------------------------------
    # NOTE(review): how='outer' keeps manifest experiments that have no counts
    # (their ids become NaN) as well as counted experiments missing from the
    # manifest (cell_type NaN) -- confirm this is intended.
    full_df_ = pd.merge(upenn_df2, experiment_cell_map, on='experiment_name', how='outer')
    full_df = full_df_[['new_gene_id', 'est_counts', 'new_sample_id', 'cell_type']]
    return full_df


In [36]:
# Run the import through cache.cached; per the log below the result is saved to the
# cache as import_upenn_tex.cached.default.pkl.
full_df = cache.cached(import_upenn_tex)

INFO:stancache.stancache:import_upenn_tex: cache_filename set to import_upenn_tex.cached.default.pkl
INFO:stancache.stancache:import_upenn_tex: Starting execution
INFO:stancache.stancache:import_upenn_tex: Execution completed (0:00:00.736580 elapsed)
INFO:stancache.stancache:import_upenn_tex: Saving results to cache


# Turn into `stan_data`

In [38]:
# Build the Stan input from the long-format frame, forwarding the `by` column name;
# the call goes through cache.cached, like the import step.
stan_data = cache.cached(
    models.prep_stan_data,
    sample_df=full_df,
    by=by,
)

INFO:stancache.stancache:prep_stan_data: cache_filename set to prep_stan_data.cached.by_cell_type.sample_df_8873201493.pkl
INFO:stancache.stancache:prep_stan_data: Starting execution
INFO:stancache.stancache:prep_stan_data: Execution completed (0:00:00.717323 elapsed)
INFO:stancache.stancache:prep_stan_data: Saving results to cache


# Run model fitting

In [25]:
# Resolve the Stan source file for the configured model and print its contents for reference.
model_file = models.get_model_file(model_name=model_name)
# NOTE(review): cache._read_file is a private (underscore-prefixed) helper of the cache module.
print(cache._read_file(model_file))

## neg binom parameterization
## estimate correlation matrix among cell types
data {
    // dimensions
    int<lower=1> N;  // N obs
    int<lower=1> G;  // N genes
    int<lower=1> S;  // N samples
    int<lower=0> C;  // N classes (e.g. B-cell, T-cell, B_Naive, CD5, CD45RO, etc)
                     //     note: classes should be mutually exclusive. Each row here should sum to 1
    // int<lower=0> M; // number of cell-level predictors 
   
    // data for each gene*sample
    int<lower=1, upper=G> gene[N];    // gene id for each obs
    int<lower=1, upper=S> sample[N];  // sample id for each obs
    vector<lower=0, upper=1>[C] x[N]; // map each obs to each class (0:'- or ?', 1:'+')
    int<lower=0> y[N];                // count/tpm for each obs
    
    // group-level predictors for each class C
    // (to come) - 
}
transformed data {
    int sample_y[S, G];    // array (size SxG) of ints
    vector[C] sample_x[S]; // array (size S) of vectors[C]
    for (n in 1:N) {
        sample

In [41]:
# Short fit (iter=5) before the full-length run further down.
model_fit = models.cached_stan_fit(
    file=model_file,
    data=stan_data,
    iter=5,
    model_name=model_name,
)

INFO:stancache.stancache:Step 1: Get compiled model code, possibly from cache
INFO:stancache.stancache:StanModel: cache_filename set to model5.cython_0_25_1.model_code_12673779526111968781.pystan_2_12_0_0.stanmodel.pkl
INFO:stancache.stancache:StanModel: Loading result from cache
INFO:stancache.stancache:Step 2: Get posterior draws from model, possibly from cache
INFO:stancache.stancache:sampling: cache_filename set to model5.cython_0_25_1.model_code_12673779526111968781.pystan_2_12_0_0.stanfit.chains_4.data_49159624310.iter_5.seed_1245502385.pkl
INFO:stancache.stancache:sampling: Loading result from cache


In [None]:
# Full-length fit (iter=1000); rebinds `model_fit`, replacing the 5-iteration result.
model_fit = models.cached_stan_fit(
    file=model_file,
    data=stan_data,
    iter=1000,
    model_name=model_name,
)

INFO:stancache.stancache:Step 1: Get compiled model code, possibly from cache
INFO:stancache.stancache:StanModel: cache_filename set to model5.cython_0_25_1.model_code_12673779526111968781.pystan_2_12_0_0.stanmodel.pkl
INFO:stancache.stancache:StanModel: Loading result from cache
INFO:stancache.stancache:Step 2: Get posterior draws from model, possibly from cache
INFO:stancache.stancache:sampling: cache_filename set to model5.cython_0_25_1.model_code_12673779526111968781.pystan_2_12_0_0.stanfit.chains_4.data_49159624310.iter_1000.seed_1245502385.pkl
INFO:stancache.stancache:sampling: Starting execution
