In [3]:
import data
import models
import cache
import seaborn as sns
import numpy as np
import pandas as pd
import patsy
from matplotlib import pyplot as plt

  "Cython.Distutils.old_build_ext does not properly handle dependencies "
INFO:stancache.seed:Setting seed to 1245502385
INFO:root:Setting CACHE_DIR = /mnt/modelcache/immune-infiltrate-explorations
INFO:stancache.seed:Setting seed to 1245502385


In [4]:
sns.set(context='talk')

In [5]:
model_name = 'model5'
by = 'cell_type'
sample_n = 500

# Import upenn-tex data (local cache)

In [7]:
data_path = "../upenn-tex/data/inferelator_input/KP_RNAseq_counts.txt"
all_experiment_counts = pd.read_csv(data_path, sep="\t")

In [8]:
all_experiment_counts.set_index(keys='tracking_id', inplace=True)

### Munge into `stan-data` format

In [71]:
upenn_df = all_experiment_counts.unstack().reset_index(name='est_counts')

upenn_df.columns = ['experiment_name', 'gene', 'est_counts']

upenn_df = upenn_df.sort_values(by='gene', ascending=False).reset_index()

experiment_set = set(upenn_df['experiment_name'])

experiment_ids = {x:i+1 for i,x in enumerate(experiment_set)}

upenn_df['experiment_id'] = upenn_df['experiment_name'].apply(lambda x: experiment_ids[x])
upenn_df['new_sample_id'] = upenn_df['experiment_id']

gene_set = set(upenn_df['gene'])

gene_ids = {x:i+1 for i,x in enumerate(gene_set)} # gene is 1-indexed in the stan model

upenn_df['new_gene_id'] = upenn_df['gene'].apply(lambda x: gene_ids[x])

In [72]:
duplicated_by_experiment = upenn_df[upenn_df.duplicated(subset=['experiment_name','gene'])].loc[:,['experiment_name','gene']].copy()

All duplicated records are gene '2-Mar'; likely a typo (excel date conversion?)

In [73]:
duplicated_records = pd.merge(upenn_df,
         duplicated_by_experiment,
         on=['experiment_name','gene'],
        )
duplicated_records.sort_values(['experiment_name','gene']).head()

Unnamed: 0,index,experiment_name,gene,est_counts,experiment_id,new_sample_id,new_gene_id
2,1299,KPRNA1,2-Mar,84.548,8,8,4662
3,857,KPRNA1,2-Mar,119.894,8,8,4662
6,91465,KPRNA10,2-Mar,95.0319,7,7,4662
7,91907,KPRNA10,2-Mar,54.0591,7,7,4662
12,103233,KPRNA11,2-Mar,58.5224,3,3,4662


In [74]:
duplicated_records['gene'].unique()

array(['2-Mar'], dtype=object)

Other than this gene, we have one count per experiment. Let's assume for now that each experiment is one sample, and enumerate our experiment IDs by sample.

In [79]:
upenn_df2 = upenn_df.loc[upenn_df['gene'] != '2-Mar',:].copy()

# renumber gene ids
gene_set2 = set(upenn_df2['gene'])

gene_ids2 = {x:i+1 for i,x in enumerate(gene_set2)} # gene is 1-indexed in the stan model

upenn_df2['new_gene_id'] = upenn_df2['gene'].apply(lambda x: gene_ids2[x])


Now, we should have no remaining duplicates by experiment*gene

In [82]:
duplicated_by_experiment = upenn_df2[upenn_df2.duplicated(subset=['experiment_name','gene'])].loc[:,['experiment_name','gene']].copy()
assert(len(duplicated_by_experiment)==0)

### Munge the data manifest for merging

In [83]:
data_manifest = pd.read_csv("../upenn-tex/data/data_manifest.csv")

def tmp_fn(filename):
    name = filename.split('.')[0]
    return name.replace('_', '') 

rna_data_manifest = data_manifest[data_manifest['Type'] == 'RNA-Seq'].copy()

rna_data_manifest['experiment_name'] = rna_data_manifest['File'].apply(lambda x: tmp_fn(x))

In [84]:
# map cell_type to indices in rna_data_manifest
# n.b. "d160" and "d235" are day 160, 235 after infection; others are day 30
cell_type_index_dict = {'CD8_naive': [0, 1], 
'CD8_exh_untreated': list(range(2,5)) + [9,10], 
'CD8_exh_PD-L1': list(range(5, 9)) + [11,12]
}

def map_cell_type(index, cell_type_index_dict):
    for k, v in cell_type_index_dict.items():
        if index in v:
            return k

rna_data_manifest['cell_type'] = rna_data_manifest.index.map(lambda x: map_cell_type(x, cell_type_index_dict))

experiment_cell_map = rna_data_manifest[['experiment_name', 'cell_type']]

In [85]:
# Do the merge

full_df_ = pd.merge(upenn_df2, experiment_cell_map, on='experiment_name', how='outer')

full_df = full_df_[['new_gene_id', 'est_counts', 'new_sample_id', 'cell_type']]

In [86]:
full_df.head()

Unnamed: 0,new_gene_id,est_counts,new_sample_id,cell_type
0,2172,47.4226,13,CD8_naive
1,6397,10.7483,13,CD8_naive
2,9083,9.02731,13,CD8_naive
3,318,202.869,13,CD8_naive
4,1767,5.92918,13,CD8_naive


# Turn into `stan_data`

In [87]:
stan_data = models.prep_stan_data(full_df, by=by)

In [88]:
from copy import deepcopy

In [89]:
cmdstan_data = deepcopy(stan_data)

In [90]:
stan_data

{'C': 3,
 'G': 11324,
 'M': 0,
 'N': 147212,
 'S': 13,
 'cell_features': Empty DataFrame
 Columns: []
 Index: [CD8_naive, CD8_exh_PD-L1, CD8_exh_untreated],
 'gene': array([2172, 6397, 9083, ..., 1110, 4885, 9959]),
 'sample': array([13, 13, 13, ..., 10, 10, 10]),
 'x':         cell_type[CD8_exh_PD-L1]  cell_type[CD8_exh_untreated]  \
 0                            0.0                           0.0   
 1                            0.0                           0.0   
 2                            0.0                           0.0   
 3                            0.0                           0.0   
 4                            0.0                           0.0   
 5                            0.0                           0.0   
 6                            0.0                           0.0   
 7                            0.0                           0.0   
 8                            0.0                           0.0   
 9                            0.0                           

In [91]:
for k, v in cmdstan_data.items():
    if isinstance(v, pd.DataFrame):
        cmdstan_data[k] = v.values

cmdstan_data

{'C': 3,
 'G': 11324,
 'M': 0,
 'N': 147212,
 'S': 13,
 'cell_features': array([], shape=(3, 0), dtype=float64),
 'gene': array([2172, 6397, 9083, ..., 1110, 4885, 9959]),
 'sample': array([13, 13, 13, ..., 10, 10, 10]),
 'x': array([[ 0.,  0.,  1.],
        [ 0.,  0.,  1.],
        [ 0.,  0.,  1.],
        ..., 
        [ 0.,  1.,  0.],
        [ 0.,  1.,  0.],
        [ 0.,  1.,  0.]]),
 'y': array([47, 10,  9, ...,  9, 14, 70])}

In [92]:
sum(cmdstan_data['sample'] < 0)

0

### Model fitting below doesn't work - try converting to R dump format to run from `cmdstan`

In [93]:
import pystan

In [94]:
# prep_stan_data() currently does not support pd.DataFrame. Must be ndarray
for k, v in cmdstan_data.items():
    if isinstance(v, pd.DataFrame):
        cmdstan_data[k] = v.values

pystan.misc.stan_rdump(cmdstan_data, "run_cmdstan/model5-tex.data.R")

## Run model fitting

In [95]:
model_file = models.get_model_file(model_name=model_name)
print(cache._read_file(model_file))

## neg binom parameterization
## estimate correlation matrix among cell types
data {
    // dimensions
    int<lower=1> N;  // N obs
    int<lower=1> G;  // N genes
    int<lower=1> S;  // N samples
    int<lower=0> C;  // N classes (e.g. B-cell, T-cell, B_Naive, CD5, CD45RO, etc)
                     //     note: classes should be mutually exclusive. Each row here should sum to 1
    // int<lower=0> M; // number of cell-level predictors 
   
    // data for each gene*sample
    int<lower=1, upper=G> gene[N];    // gene id for each obs
    int<lower=1, upper=S> sample[N];  // sample id for each obs
    vector<lower=0, upper=1>[C] x[N]; // map each obs to each class (0:'- or ?', 1:'+')
    int<lower=0> y[N];                // count/tpm for each obs
    
    // group-level predictors for each class C
    // (to come) - 
}
transformed data {
    int sample_y[S, G];    // array (size SxG) of ints
    vector[C] sample_x[S]; // array (size S) of vectors[C]
    for (n in 1:N) {
        sample

In [None]:
model_fit = models.cached_stan_fit(file=model_file, data=stan_data, iter=5, model_name=model_name)

INFO:stancache.stancache:Step 1: Get compiled model code, possibly from cache
INFO:stancache.stancache:StanModel: cache_filename set to model5.cython_0_25_1.model_code_12673779526111968781.pystan_2_12_0_0.stanmodel.pkl
INFO:stancache.stancache:StanModel: Loading result from cache
INFO:stancache.stancache:Step 2: Get posterior draws from model, possibly from cache
INFO:stancache.stancache:sampling: cache_filename set to model5.cython_0_25_1.model_code_12673779526111968781.pystan_2_12_0_0.stanfit.chains_4.data_6723842445.iter_5.seed_1245502385.pkl
INFO:stancache.stancache:sampling: Starting execution
INFO:stancache.stancache:sampling: Execution completed (0:00:19.206024 elapsed)
INFO:stancache.stancache:sampling: Saving results to cache
The relevant StanModel instance must be pickled along with this fit object.
When unpickling the StanModel must be unpickled first.
  pickle.dump(res, open(cache_filepath, 'wb'), pickle.HIGHEST_PROTOCOL)


In [None]:
model_fit = models.cached_stan_fit(file=model_file, data=stan_data, iter=5000, model_name=model_name)