##### PyMC3 Examples

# GLM using Ensemble Sampler

**A minimal reproducible example of integrating `emcee` with `pymc3` to use the Ensemble Sampler.**

+ This is a bit of a hack
+ The dataset is medium-sized and is downloaded from an external source by this Notebook.


**Note:**

+ Python 3.4 project using latest available [PyMC3](https://github.com/pymc-devs/pymc3)
+ Developed using [ContinuumIO Anaconda](https://www.continuum.io/downloads) distribution on a Macbook Pro 3GHz i7, 16GB RAM, OSX 10.10.5.
+ If the models become unstable or Theano throws weird errors, try clearing the cache `$> theano-cache clear` and rerunning the notebook.


**Package Requirements (shown as a conda-env YAML):**
```
$> less conda_env_pymc3_examples.yml

name: pymc3_examples
channels:
  - defaults
dependencies:
  - python=3.4
  - ipython
  - ipython-notebook
  - ipython-qtconsole
  - numpy
  - scipy
  - matplotlib
  - pandas
  - seaborn
  - patsy
  - pip

$> conda env create --file conda_env_pymc3_examples.yml

$> source activate pymc3_examples

$> pip install --process-dependency-links git+https://github.com/pymc-devs/pymc3

```

# Setup

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Open a Qt console attached to this kernel (Linux colour scheme) for ad-hoc inspection.
%qtconsole --colors=linux

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, not just a specific noisy one -- confirm that is intended.
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import optimize
import emcee
import pymc3 as pm
import theano as thno
import theano.tensor as T 

# Notebook-wide defaults: reproducible RNG, HTML DataFrame rendering, plot theme.
np.random.seed(0)  # fixed seed for NumPy's global random state
pd.set_option('display.notebook_repr_html', True)

# Apply the seaborn theme first and set the figure size afterwards, so the
# explicit size is the value left in matplotlib's rcParams.
sns.set(palette="muted", style="darkgrid")
plt.rcParams['figure.figsize'] = (12, 8)

## Local Functions

In [None]:
# def emceevals_to_pymcdict(emcee_vals, vn_pymc):
#     """ 
#     Convenience function:
#     Transform flat emcee vals to a pymc dict for use in lnprob
#     """
    
#     vardict = {}
#     k = 0
#     for vn in vn_pymc.keys():
#         nvals = max(len(vn_pymc[vn]),1)
#         vals = emcee_vals[k:k+nvals]
#         vardict[vn] = vals if len(vals) > 1 else np.array(vals[0])
#         k += nvals

#     return vardict


# def flatten_arr3d_fortran(arr3d):
#     """ flatten 3d array to 2d array Fortran style  """
#     shptup = arr3d.shape
#     return arr3d.reshape((shptup[0] * shptup[1], shptup[2]), order='F')

# Acquire and Prepare Data

In [None]:
# def get_raw_data(storename='data/store_01.h5'):
#     """ Extract data from local store or go download it """

#     store = pd.HDFStore(storename)
#     zip_url = 'http://carfueldata.direct.gov.uk/additional/aug2015/'
#     zip_url_hr = 'http://carfueldata.direct.gov.uk/downloads/download.aspx?rg=aug2015'
#     zip_fn = 'download-data-for-Aug-2015-Euro-6.zip'
#     key_raw = '/raw'
    
#     if key_raw not in store.keys():  # then go get info from interwebs
#         try:         
#             r = requests.get(zip_url+zip_fn)
#             f = ZipFile(BytesIO(r.content),mode='r')
#             fn = f.namelist()[0]
#             dfraw = pd.read_csv(f.open(fn), encoding='cp1252', sep=',') # <- Excel encoding!   
#             dfraw.rename(columns=lambda x: col_renamer(x), inplace=True)
#             dfraw.drop([c for c in dfraw.columns if c[:3] == 'unn'], inplace=True, axis=1)

#             # TODO: clean dtypes! causing issues when saving
            
#             store[key_raw] = dfraw
#             print('Retrived dataset from web {}'.format(zip_url_hr))
        
#         except pd.io.pytables.PerformanceWarning as e:
#             logging.warning('{}'.format(e))
#         except Exception as e:
#             logging.error('{}'.format(e))
#             #return None
#     else:
#         dfraw = store[key_raw]
#         print('Used dataset from file {}'.format(storename))
    
#     store.close()
    
#     return dfraw

# Create Model

In [None]:
# with pymc.Model() as price_model:


#     # put cluster categories on hierarchical intercept
#     intercept_mu = pymc.Normal('intercept_mu', mu=0, sd=1e3)
#     intercept_tau = pymc.Gamma('intercept_tau', alpha=1, beta=10)  
#     intercept = pymc.Normal('intercept', mu=intercept_mu, tau=intercept_tau
#                         , shape=len(feats_fctr_li_train['clustername']['lbl']))


#     # define linear model
#     y_est = intercept[feats_fctr_li_train['clustername']['idx']]


#     # add on factor feats
#     f_feats = {}
#     for nm in feats_fctr_li_train.keys()[1:]:
#         f_feats['f_{}'.format(nm)] = pymc.Normal('f_{}'.format(nm),mu=0,sd=1e3)
#         y_est += f_feats['f_{}'.format(nm)] * feats_fctr_li_train[nm]['idx']


#     # add on linear coeffs (standardized feats)
#     b_feats = {}
#     for nm in feats_coeff_linear2sd:
#         b_feats['b_{}'.format(nm)] = pymc.Normal('b_{}'.format(nm),mu=0,sd=1e3)
#         y_est += b_feats['b_{}'.format(nm)] * mdi['df_train'][nm].values


#     # create likelihood dist with stochastic error
#     epsilon = pymc.Gamma('epsilon', alpha=1, beta=10)      

#     likelihood = pymc.Normal('likelihood', mu=y_est, tau=epsilon
#                                 ,observed=mdi['df_train'][feat_price].values)         
      

# ## find map
# with price_model:
#     start_MAP = find_MAP(fmin=optimize.fmin_powell)




In [None]:
    
# p0_MAP = []
# [p0_MAP.extend(start_MAP[vn].ravel()) for vn in mdi['vn_pymc'].keys()]

#         modelnm = 'model_{}'.format(feat_price)     # new dict for model run
#         modelnms.append(modelnm)
#         mdi[modelnm] = {'p0_MAP': np.array(p0_MAP)}  

#         ## Sample using emcee
#         def lnprob(emceevals):
#             """ calculate log-prob using pymc3 model logp(), convert vn_emcee to vn_pymc """
#             pymc_vardict = emceevals_to_pymcdict(emceevals, mdi['vn_pymc'])
#             logp_arr = price_model.logp(pymc_vardict).ravel()
#             return logp_arr[0]

#         nwalkers = 200
#         nsteps = 1000
#         p0 = emcee.utils.sample_ball(mdi[modelnm]['p0_MAP']
#                                      , np.abs(mdi[modelnm]['p0_MAP'])*0.3, nwalkers)         

#         sampler = emcee.EnsembleSampler(p0.shape[0], p0.shape[1], lnprob)
#         pbar = pb.ProgressBar(widgets=pbar_widgets_emcee, maxval=nsteps * nwalkers,).start()
#         for i, _ in enumerate(sampler.sample(p0, iterations=nsteps, storechain=True)):
#             pbar.update((i+1)*nwalkers)
#         pbar.finish()
#         mdi[modelnm]['sampler_chain'] = sampler.chain


In [None]:
# Optional diagnostic check, sample using PyMC NUTS Sampler

# <codecell>

# with price_model:
#     trace_nuts = pymc.sample(1000, step=pymc.NUTS(), start=start_MAP)

# <codecell>

# pymc.traceplot(trace_nuts)

---

Example originally contributed by Jonathan Sedar 2016-01-10 [github.com/jonsedar](https://github.com/jonsedar)