In [2]:
#!/usr/bin/env python

'''
GA Data Science Q2 2016

Code walk-through 16: Bayesian change detection
'''

import numpy as np
import pymc as pm

import matplotlib.pyplot as plt

%matplotlib inline

ImportError: No module named pymc

In [None]:
# Seed NumPy's global random number generator so that a fresh
# "Restart & Run All" regenerates exactly the same synthetic series.
np.random.seed(42)

# Generate some data from a Poisson distribution with a change point:
# 100 counts drawn at rate 5 followed by 200 counts drawn at rate 10,
# so the true change point sits at index 100.
y = np.concatenate((
    np.random.poisson(5, size=100),
    np.random.poisson(10, size=200)
))

# Plot generated data (labelled so the figure can be read on its own)
plt.plot(y)
plt.xlabel('observation index')
plt.ylabel('count')
plt.title('Simulated Poisson counts with a change point');

In [None]:
# Prior distributions for the three unknowns.
# The change point gets a discrete uniform prior: every index from 1 to
# len(y) - 1 is equally likely before seeing the data. The sampler starts
# from an arbitrary guess and, as it explores, ends up concentrating around
# the index where the rate shifts, which yields a distribution over the
# change point.
n_obs = len(y)
change_point = pm.DiscreteUniform('change_point', lower=1, upper=n_obs - 1)

# Both Poisson rates share the same exponential prior.
rate_prior_beta = 1.0
early_rate = pm.Exponential('early_rate', beta=rate_prior_beta)
late_rate = pm.Exponential('late_rate', beta=rate_prior_beta)

In [None]:
# Define the observed stochastic variable for the number of arrivals (probabilistic).
# There is a structural break: the mean changes from early_rate to late_rate at change_point.

@pm.stochastic(observed=True, dtype=int)
def arrivals(value=y, change_point=change_point,
             early_rate=early_rate, late_rate=late_rate):
    """Score the observed counts under a two-rate Poisson model.

    Counts before index ``change_point`` are scored with ``early_rate`` and
    counts from ``change_point`` onwards with ``late_rate``; the two
    ``pm.poisson_like`` results are added together and returned.
    """
    # Evaluate the early segment first, then the late one.
    before_ll = pm.poisson_like(value[:change_point], early_rate)
    after_ll = pm.poisson_like(value[change_point:], late_rate)
    return before_ll + after_ll

In [None]:
# Current value of change_point before any sampling has been run.
# The original run showed 254; presumably this is just an initial draw,
# so expect a different number on another run.
change_point.value

In [None]:
# Log-likelihood of the generated data at the current (pre-sampling) values
# of change_point, early_rate and late_rate. It is very low (about -4108 in
# the original run), i.e. the starting parameter values explain the data poorly.
arrivals.logp

In [None]:
# Build the model from its nodes and sample it with
# Markov chain Monte Carlo (MCMC).
model_nodes = [change_point, early_rate, late_rate, arrivals]
model = pm.MCMC(model_nodes)

# Sampler settings: total iterations, how many initial iterations to
# discard as burn-in, and the thinning interval.
sampler_settings = dict(iter=100000, burn=10000, thin=100)
model.sample(**sampler_settings)

In [None]:
# Explore posterior summary statistics.
# The sampler approximates the location of the change point by simulating many different candidate values for it.
# At the beginning the chain is very unstable, but at some point we hope it converges towards the true value.
# Because the start is unreliable, we discard it and only look at the part where the chain has settled (the burn-in).
# With the settings used above, 10000 of the 100000 iterations are ignored (burn), leaving 90000 iterations;
# thinning then keeps every 100th of those, so about 900 samples are used to build the distribution.
# Thinning reduces autocorrelation: sampling at regular intervals lets the simulation "forget" where
# it came from, so that the retained series is less autocorrelated.
#
model.stats()

In [None]:
# We now know that the posterior mean of the change point is estimated to be around 100
# (the simulated data switches rate at index 100).
# HPD = highest posterior density.
# The 95% HPD interval is the range that captures the change point with 95% posterior probability.
model.summary()

In [None]:
# Plot traces and posterior densities.
# The trace shows the chain has already converged around 100.
# In the change_point autocorrelation (acorr) panel we want to see little or no autocorrelation.
pm.Matplot.plot(model)

In [None]:
# Why do we want a distribution for the change point rather than a single number?
# Because it tells us the uncertainty around where the change point could be.
# E.g. for the Brexit vote, we can predict anywhere from 0-100% where we think the vote will land, based on the data.
# Once you have a distribution you can derive a number of properties from it, e.g. probabilities (areas under the curve).