In [7]:
import numpy as np
import data
import models
import cache
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

AttributeError: 'Settings' object has no attribute 'cache_dir'

In [52]:
# Run configuration for this notebook.
sample_n = 500  # number genes
by = 'SubSet'

# Resolve the Stan model file from its short name via the project helper.
model_name = 'model6.2'
model_file = models.get_model_file(model_name=model_name)

In [17]:
# Build (or load from the cache) the prepared sample frame for `sample_n` genes.
sample_df = cache.cached(
    models.prep_sample_df,
    sample_n=sample_n,
)

INFO:stancache.stancache:prep_sample_df: cache_filename set to prep_sample_df.cached.sample_n_500.pkl
INFO:stancache.stancache:prep_sample_df: Loading result from cache


In [32]:
# Peek at the first three rows of the prepared frame.
sample_df.head(3)

Unnamed: 0,index,sample_id,filename,gene_name,est_counts,tpm,log1p_tpm,log1p_counts,CCR6+,CCR7+,...,log1p_tpm_rescaled_subset,log1p_tpm_rescaled,gene_cat,gene_id,B_cell,T_cell,new_gene_cat,new_gene_id,new_sample_cat,new_sample_id
0,0,1,ERR431566,ABCC1,1519.2643,54.99271,4.025222,7.326639,0.0,0.0,...,-3.727516,2.87421,ABCC1,76,0,1,ABCC1,1,1,1
1,1,2,ERR431567,ABCC1,671.7973,29.17218,3.40692,6.511444,0.0,0.0,...,-3.534278,-3.463493,ABCC1,76,0,1,ABCC1,1,2,2
2,2,3,ERR431568,ABCC1,744.026575,14.615387,2.748257,6.61342,0.0,0.0,...,-4.645626,-10.214918,ABCC1,76,1,0,ABCC1,1,3,3


In [33]:
def get_sample_ids_by_subset(sample_df):
    """
    Map each distinct 'SubSet' value to the unique 'new_sample_id' values
    found on the rows carrying that SubSet.

    Keys follow the order in which SubSet values first appear in `sample_df`;
    each value is the array returned by `Series.unique()`.
    """
    ids_by_subset = {}
    for subset_label in sample_df['SubSet'].unique():
        in_subset = sample_df['SubSet'] == subset_label
        ids_by_subset[subset_label] = sample_df[in_subset]['new_sample_id'].unique()
    return ids_by_subset

def mix_cell_lines(xdata, subsets, weights, sample_ids=None, new_sample_id=10001):
    """
    Build one synthetic mixed sample as a weighted sum of existing samples.

    e.g. xdata=stan_data['x'], subsets=['B_Naive', 'B_Memory'], weights=[.5, .5], sample_ids=None

    Reads the notebook-level ``sample_df`` (columns used: 'SubSet',
    'new_sample_id', 'gene_name', 'new_gene_id', 'est_counts').

    Parameters
    ----------
    xdata : DataFrame
        Only its columns and column count are used, as the template for the
        returned one-row design frame.
    subsets : sequence of str
        SubSet label of each mixture component.
    weights : sequence of numbers
        One weight per subset; rescaled so they sum to 1.
    sample_ids : sequence, optional
        `new_sample_id` of the sample to use for each component. If None or
        empty, the first sample of each subset (in `sample_df` row order) is
        used; a subset with no rows then raises IndexError.
    new_sample_id : int
        Value written to the 'sample_id' column of the mixed sample.

    Returns
    -------
    (mixed_sample, x2_data)
        mixed_sample: per-gene weighted sum of 'est_counts' with columns
        'gene_name', 'new_gene_id', 'est_counts', 'sample_id'.
        x2_data: one-row frame of zeros over `xdata.columns`, with
        'SubSet[<subset>]' set to the normalized weight of each component.
    """
    assert len(weights) == len(subsets)

    if sample_ids is None or len(sample_ids) == 0:
        # Default documented above. Derived from sample_df here; the previous
        # code read an undefined global (`relevant_sample_ids`) and failed
        # with NameError on a fresh kernel.
        sample_ids = [
            sample_df.loc[sample_df['SubSet'] == subset, 'new_sample_id'].iloc[0]
            for subset in subsets
        ]
    # zip() below would otherwise silently drop unmatched components.
    assert len(sample_ids) == len(subsets)

    weights = np.array(weights)
    weights = weights / np.sum(weights)  # normalize

    x2_data = pd.DataFrame(np.zeros((1, xdata.shape[1])), columns=xdata.columns)

    transformed_lines = []
    for subset, weight, sample_id in zip(subsets, weights, sample_ids):
        # Copy so that scaling never touches sample_df itself.
        transformed = sample_df[sample_df['new_sample_id'] == sample_id].copy()
        transformed['est_counts'] = transformed['est_counts'] * weight
        transformed_lines.append(transformed)
        x2_data['SubSet[%s]' % subset] = weight

    # Sum the scaled counts of all components gene by gene.
    mixed_sample = (
        pd.concat(transformed_lines)
        .groupby(['gene_name', 'new_gene_id'])['est_counts']
        .sum()
        .reset_index()
    )
    mixed_sample['sample_id'] = new_sample_id

    return mixed_sample, x2_data

In [39]:
# Assemble the Stan input from the sample frame, grouped by the 'SubSet' column.
stan_data = models.prep_stan_data(
    sample_df,
    by='SubSet',
)

In [36]:
# here are the mixtures we want
# Each entry is (subsets, weights, sample_ids); entry k (0-based) is built with
# new_sample_id = 10001 + k and bound to mix<k+1> / mix<k+1>_x below.
_mixture_specs = [
    # naive vs memory B cells
    (['B_Naive', 'B_Memory'], [.5, .5], [7, 4]),
    (['B_Naive', 'B_Memory'], [.5, .5], [21, 29]),
    (['B_Naive', 'B_Memory'], [.25, .75], [7, 4]),
    (['B_Naive', 'B_Memory'], [.25, .75], [21, 29]),
    (['B_Naive', 'B_Memory'], [.75, .25], [7, 4]),
    (['B_Naive', 'B_Memory'], [.75, .25], [21, 29]),
    # tregs vs naive B cells
    (['B_Naive', 'CD4_Treg'], [.5, .5], [7, 18]),
    (['B_Naive', 'CD4_Treg'], [.25, .75], [7, 18]),
    (['B_Naive', 'CD4_Treg'], [.5, .5], [21, 24]),
    (['B_Naive', 'CD4_Treg'], [.25, .75], [21, 24]),
]

_mixtures = [
    mix_cell_lines(xdata=stan_data['x'],
                   subsets=spec_subsets,
                   weights=spec_weights,
                   sample_ids=spec_sample_ids,
                   new_sample_id=10001 + position)
    for position, (spec_subsets, spec_weights, spec_sample_ids) in enumerate(_mixture_specs)
]

# Keep the individual names that later cells refer to.
((mix1, mix1_x), (mix2, mix2_x), (mix3, mix3_x), (mix4, mix4_x),
 (mix5, mix5_x), (mix6, mix6_x), (mix7, mix7_x), (mix8, mix8_x),
 (mix9, mix9_x), (mix10, mix10_x)) = _mixtures

In [45]:
# Copy rather than alias: the renumbering loop below sorts and overwrites
# columns in place, and must not change sample_df, which mix_cell_lines reads.
training_df = sample_df.copy()

# make a test_df and x2_data with all of them
test_df = pd.concat([mix1, mix2, mix3, mix4, mix5, mix6, mix7, mix8, mix9, mix10])
test_df['gene_id'] = test_df['new_gene_id']
x2_data = pd.concat([mix1_x,mix2_x,mix3_x,mix4_x,mix5_x,mix6_x,mix7_x,mix8_x,mix9_x,mix10_x])

# Order rows by gene then sample, and renumber samples 1..S within each frame
# from the category codes of 'sample_id'. Both frames are owned by this cell,
# so the in-place operations do not leak into other variables.
for dat in [training_df, test_df]:
    dat.sort_values(['gene_id','sample_id'], inplace=True)
    dat['new_sample_cat'] = dat['sample_id'].astype('category')
    dat['new_sample_id'] = dat['new_sample_cat'].cat.codes+1

# Held-out ("2"-suffixed) entries to be merged into the Stan data.
test_data = {
    'N2': len(test_df.index),
    'S2': len(test_df.new_sample_id.unique()),
    'gene2': test_df.new_gene_id.values,
    'sample2': test_df.new_sample_id.values,
    'y2': test_df.est_counts.astype(int).values,  # astype(int) truncates the weighted counts
    'x2': x2_data, ## for easy access later
}

In [48]:
# Merge the mixture test entries (N2, S2, gene2, sample2, y2, x2) into stan_data;
# stan_data itself is updated, so re-running this cell repeats the merge.
stan_data.update(test_data)

In [50]:
import pystan

In [55]:
# Compile the Stan program at model_file (the log below shows the C++ build starting).
model = pystan.StanModel(file=model_file)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_a181b2be649df2bf097dcb4c6a9053f7 NOW.


In [None]:
# Run Stan's variational inference on the combined training + mixture data.
# The result is bound to a name so later cells can use it; previously it was
# discarded and only reachable through Out[]/_.
# NOTE(review): no seed is passed, so the fit is presumably not reproducible
# run to run — consider passing `seed=` once a value is agreed.
vb_results = model.vb(data=stan_data, iter=10000, verbose=True)



In [66]:
# Row count of the mixture test frame.
test_df.shape[0]

5000

In [None]:
# Display the assembled mixture test frame for inspection.
test_df

In [77]:
# Largest gene id present in the test frame.
test_df['gene_id'].max()

500

In [94]:
# 10 test mixture, 13 subsets
len(training_df['SubSet'].unique())

13