In [14]:
import data
import models
import cache
import seaborn as sns
import numpy as np
import pandas as pd
import patsy
from matplotlib import pyplot as plt

In [15]:
sns.set(context='talk')

In [16]:
model_name = 'model5'
by = 'SubSet'
sample_n = 500

# Import upenn-tex data (local cache)

In [17]:
data_path = "/home/elizachang/upenn-tex-data/inferelator_input/KP_RNAseq_counts.txt"

all_experiment_counts = pd.read_csv(data_path, sep="\t")

In [18]:
all_experiment_counts.set_index(keys='tracking_id', inplace=True)

### Munge into `stan-data` format

In [19]:
upenn_df = all_experiment_counts.unstack().reset_index(name='est_counts')

upenn_df.columns = ['experiment_name', 'gene', 'est_counts']

upenn_df = upenn_df.sort_values(by='gene', ascending=False).reset_index()

upenn_df['new_sample_id'] = range(1, 1+len(upenn_df))

gene_set = set(upenn_df['gene'])

gene_ids = {x:i+1 for i,x in enumerate(gene_set)} # gene is 1-indexed in the stan model

upenn_df['new_gene_id'] = upenn_df['gene'].apply(lambda x: gene_ids[x])

### Munge the data manifest for merging

In [20]:
data_manifest = pd.read_csv("/home/elizachang/upenn-tex/data/data_manifest.csv")

def tmp_fn(filename):
    name = filename.split('.')[0]
    return name.replace('_', '') 

rna_data_manifest = data_manifest[data_manifest['Type'] == 'RNA-Seq']

rna_data_manifest['experiment_name'] = rna_data_manifest['File'].apply(lambda x: tmp_fn(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [25]:
# map cell_type to indices in rna_data_manifest
# n.b. "d160" and "d235" are day 160, 235 after infection; others are day 30
cell_type_index_dict = {'CD8_naive': [0, 1], 
'CD8_exh_untreated': list(range(2,5)) + [9,10], 
'CD8_exh_PD-L1': list(range(5, 9)) + [11,12]
}

def map_cell_type(index, cell_type_index_dict):
    for k, v in cell_type_index_dict.items():
        if index in v:
            return k

rna_data_manifest[by] = rna_data_manifest.index.map(lambda x: map_cell_type(x, cell_type_index_dict))

experiment_cell_map = rna_data_manifest[['experiment_name', by]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [26]:
# Do the merge

full_df_ = pd.merge(upenn_df, experiment_cell_map, on='experiment_name', how='outer')

full_df = full_df_[['new_gene_id', 'est_counts', 'new_sample_id', by]]

# Turn into `stan_data`

In [27]:
stan_data = models.prep_stan_data(full_df, by=by)

In [28]:
from copy import deepcopy

In [29]:
cmdstan_data = deepcopy(stan_data)

In [30]:
stan_data

{'C': 3,
 'G': 11325,
 'M': 0,
 'N': 147238,
 'S': 147238,
 'cell_features': Empty DataFrame
 Columns: []
 Index: [CD8_naive, CD8_exh_PD-L1, CD8_exh_untreated],
 'gene': array([8804, 9081, 7520, ..., 9344, 4780, 3274]),
 'sample': array([     1,     20,     33, ..., 147202, 147220, 147228]),
 'x':         SubSet[CD8_exh_PD-L1]  SubSet[CD8_exh_untreated]  SubSet[CD8_naive]
 0                         0.0                        0.0                1.0
 1                         0.0                        0.0                1.0
 2                         0.0                        0.0                1.0
 3                         0.0                        0.0                1.0
 4                         0.0                        0.0                1.0
 5                         0.0                        0.0                1.0
 6                         0.0                        0.0                1.0
 7                         0.0                        0.0                1.0
 8       

In [31]:
for k, v in cmdstan_data.items():
    if isinstance(v, pd.DataFrame):
        cmdstan_data[k] = v.values

cmdstan_data

{'C': 3,
 'G': 11325,
 'M': 0,
 'N': 147238,
 'S': 147238,
 'cell_features': array([], shape=(3, 0), dtype=float64),
 'gene': array([8804, 9081, 7520, ..., 9344, 4780, 3274]),
 'sample': array([     1,     20,     33, ..., 147202, 147220, 147228]),
 'x': array([[ 0.,  0.,  1.],
        [ 0.,  0.,  1.],
        [ 0.,  0.,  1.],
        ..., 
        [ 0.,  1.,  0.],
        [ 0.,  1.,  0.],
        [ 0.,  1.,  0.]]),
 'y': array([47, 10,  9, ...,  9, 14, 70])}

In [32]:
sum(cmdstan_data['sample'] < 0)

0

### Model fitting below doesn't work - try converting to R dump format to run from `cmdstan`

In [33]:
import pystan

In [34]:
# prep_stan_data() currently does not support pd.DataFrame. Must be ndarray
for k, v in cmdstan_data.items():
    if isinstance(v, pd.DataFrame):
        cmdstan_data[k] = v.values

rdump = "/home/elizachang/upenn-tex-data/immune-infiltrate/model5-tex-{}.data.R".format(by)        
        
pystan.misc.stan_rdump(cmdstan_data, rdump)

## Run model fitting

In [None]:
model_file = models.get_model_file(model_name=model_name)
print(cache._read_file(model_file))

## neg binom parameterization
## estimate correlation matrix among cell types
data {
    // dimensions
    int<lower=1> N;  // N obs
    int<lower=1> G;  // N genes
    int<lower=1> S;  // N samples
    int<lower=0> C;  // N classes (e.g. B-cell, T-cell, B_Naive, CD5, CD45RO, etc)
                     //     note: classes should be mutually exclusive. Each row here should sum to 1
    // int<lower=0> M; // number of cell-level predictors 
   
    // data for each gene*sample
    int<lower=1, upper=G> gene[N];    // gene id for each obs
    int<lower=1, upper=S> sample[N];  // sample id for each obs
    vector<lower=0, upper=1>[C] x[N]; // map each obs to each class (0:'- or ?', 1:'+')
    int<lower=0> y[N];                // count/tpm for each obs
    
    // group-level predictors for each class C
    // (to come) - 
}
transformed data {
    int sample_y[S, G];    // array (size SxG) of ints
    vector[C] sample_x[S]; // array (size S) of vectors[C]
    for (n in 1:N) {
        sample

In [None]:
model_fit = models.cached_stan_fit(file=model_file, data=stan_data, iter=5000, model_name=model_name)

INFO:stancache.stancache:Step 1: Get compiled model code, possibly from cache
INFO:stancache.stancache:StanModel: cache_filename set to model5.cython_0_25_1.model_code_15550038732164966801.pystan_2_12_0_0.stanmodel.pkl
INFO:stancache.stancache:StanModel: Starting execution
INFO:pystan:COMPILING THE C++ CODE FOR MODEL model5_ae1010523a62a92a2318939fc3104d5f NOW.
