In [2]:
import os
import sys
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
%precision 4
plt.style.use('ggplot')
import pystan
import scipy.stats as stats
pd.set_option('max_rows', 1000)
pd.set_option('max_columns', 50)
from sqlalchemy import create_engine
import psycopg2 as pq
golf_db = 'postgresql://localhost:5432/golf'

In [1]:
def exec_sql(sqlTxt, result=False):  
    with pq.connect(golf_db) as conn:
        with conn.cursor() as cursor:
            cursor.execute(sqlTxt) 
            if result:
                return [row for row in cursor] 
        
def pd_from_sql(sqlTxt):
    with pq.connect(golf_db) as conn:
        return pd.read_sql_query(sqlTxt, conn)
    
def np_to_sql(arr, table):
    df = pd.DataFrame(arr)
    with pq.connect(golf_db) as conn:
        df.to_sql(table, conn, if_exists='append')
        
def np_from_sql(sqlTxt):
    return pd_from_sql(sqlTxt).as_matrix()

In [49]:
sqlTxt = '''
select player_id, permanent_tournament_id, round, score from scores_view where player_id in (
    select player_id from stats_view where year = 2014 limit 100
)
'''
scores = pd_from_sql(sqlTxt)
num_scores = len (scores)

In [38]:
player_ids = scores['player_id'].unique()
num_players = len (player_ids)
player_map = dict(zip(player_ids, range(1,num_players + 1)))
tournament_ids = scores['permanent_tournament_id'].unique()
num_tournaments = len (tournament_ids)
tournament_map = dict(zip(tournament_ids, range(1, num_tournaments + 1)))

In [45]:
datum = sorted([[player_map[i[0]], tournament_map[i[1]], i[3]] for i in scores.as_matrix()], key = lambda (el): el[0])
p, t, y = zip(*datum)
rounds = np.zeros([100])
for s in p:
    rounds[s-1] += 1
rounds = rounds.tolist()

In [52]:
code = """
data {
  int N;
  int N_P;
  int N_T;
  int y[N];
  int p[N];
  int t[N];
  real rounds[N_P];
}
# Trending Model
parameters {
  real<lower=-3, upper=3> alpha[N_T];
  real<lower=0, upper=7> sigma[N_P];
  real<lower=65, upper=75> tau[N_P];
  real<lower=-2, upper=2> beta[N_P];
} 
model {
  real trend = 0;
  real m = 1;
  real scale = 1.0/70.0;  #  average score
  y[1] ~ normal(alpha[t[1]] + tau[p[1]], sigma[p[1]]);
  for (n in 2:N) {
    if (p[n] != p[n-1])
      m = n;
    trend = 1 - beta[p[n]]*scale/2 + beta[p[n]]*(n-m)*scale/rounds[p[n]];
    y[n] ~ normal(alpha[t[n]] + tau[p[n]]*trend, sigma[p[n]]);
    
  } 
}

"""
data = {
    'N': num_scores,
    'y': y,
    'p' : p,
    't' : t,
    'N_P' : num_players,
    'N_T' : num_tournaments,
    'rounds' : rounds
}

fit = pystan.stan(model_code=code, data=data, iter=1000, chains=4)
print fit

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_c698f7ef457b027af36067dedcc45bbe NOW.


Inference for Stan model: anon_model_c698f7ef457b027af36067dedcc45bbe.
4 chains, each with iter=1000; warmup=500; thin=1; 
post-warmup draws per chain=500, total post-warmup draws=2000.

            mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
alpha[0]   -0.41    0.02   0.18  -0.77  -0.53  -0.42  -0.29  -0.04   93.0   1.07
alpha[1]    1.21    0.02   0.19   0.84   1.07   1.21   1.33   1.57  103.0   1.06
alpha[2]    0.05    0.02   0.19  -0.31  -0.08   0.05   0.18   0.44  104.0   1.06
alpha[3]    0.38    0.02   0.19   0.01   0.25   0.38    0.5   0.75  110.0   1.05
alpha[4]    1.44    0.03   0.18   1.11   1.32   1.44   1.56   1.79   48.0   1.07
alpha[5]     0.8    0.02   0.19   0.44   0.67   0.79   0.92   1.17  107.0   1.05
alpha[6]    0.63    0.02   0.18   0.29   0.51   0.63   0.76   1.01  106.0   1.05
alpha[7]   -0.63    0.03    0.2  -1.03  -0.77  -0.63   -0.5  -0.22   51.0   1.06
alpha[8]   -0.07    0.02   0.18  -0.41  -0.19  -0.08   0.05   0.29   97.0   1.06
alp