In [1]:
import util
%matplotlib inline
%precision 4
import pystan
import numpy as np
import pandas as pd


In [11]:
year = 2014
tag = 'sg-200'
limit = 200
sqlTxt = '''
select * from (select year, player_id, row_number from rank_view where year = %s  limit %s) r 
    join scores_view s 
    using (player_id, year) order by r.row_number, player_id, year, permanent_tournament_id, round;
'''
scores = util.pd_from_sql(sqlTxt, [year, limit])
scores = scores[(pd.notnull(scores['sg_approach'])) & (pd.notnull(scores['sg_around']))]
scores

Unnamed: 0,player_id,year,row_number,date,permanent_tournament_id,round,score,sg_tee,sg_approach,sg_around,sg_putting
0,28237,2014,4,2014-02-27,10,1,63,1.993,0.681,3.700000e-01,4.371000e+00
1,28237,2014,4,2014-02-28,10,2,66,-0.149,0.494,1.319000e+00,2.780000e+00
2,28237,2014,4,2014-03-01,10,3,70,0.626,-0.513,4.140000e-01,8.760000e-01
3,28237,2014,4,2014-03-02,10,4,76,0.620,-2.855,-6.400000e-02,-8.670000e-01
4,28237,2014,4,2014-05-08,11,1,70,0.450,1.613,8.200000e-02,-9.580000e-01
5,28237,2014,4,2014-05-09,11,2,74,0.347,-1.185,-6.980000e-01,-1.366000e+00
6,28237,2014,4,2014-05-10,11,3,70,0.574,-0.045,1.157000e+00,5.760000e-01
7,28237,2014,4,2014-05-11,11,4,66,1.535,0.836,7.400000e-01,1.987000e+00
8,28237,2014,4,2014-04-03,20,1,71,1.236,1.079,5.230000e-01,-9.760000e-01
9,28237,2014,4,2014-04-04,20,2,72,-1.999,2.206,6.840000e-01,-3.240000e-01


In [3]:
num_scores = len (scores)
player_ids = scores['player_id'].unique()
num_players = len (player_ids)
player_map = dict(zip(player_ids, range(1,num_players + 1)))
player_map_reverse = dict(zip(range(1,num_players + 1), player_ids, ))
tournament_ids = scores['permanent_tournament_id'].unique()
num_tournaments = len (tournament_ids)
tournament_map = dict(zip(tournament_ids, range(1, num_tournaments + 1)))
tournament_map_reverse = dict(zip(range(1, num_tournaments + 1), tournament_ids))
x = np.ones((num_scores,1))
datum = sorted([[player_map[i[0]], tournament_map[i[4]], i[6], i[7], i[8], i[9], i[10]] for i in scores.as_matrix()], key = lambda (el): el[0])
p, t, y, x1, x2, x3, x4 = zip(*datum)
x = np.column_stack((x, x1))
x = np.column_stack((x, x2))
x = np.column_stack((x, x3))
x = np.column_stack((x, x4))

In [5]:
code = """
data {
  int N;
  int N_P;
  int N_T;
  int y[N];
  int p[N];
  int t[N];
  matrix[N, 5] x;
}
# 
parameters {
  matrix [N_T, 5] b;
  real<lower=0, upper=4> sigma;
} 
model {
  for (n in 1:N) {
    b[t[n]][1] ~ normal(70, 10);
    for (k in 2:5) {
      b[t[n]][k] ~ normal(0, 10);
    }
     y[n] ~ normal((x[n])*(b[t[n]])', sigma);
  }
 
}

"""
data = {
    'N': num_scores,
    'y': y,
    'p' : p,
    't' : t,
    'N_P' : num_players,
    'N_T' : num_tournaments,
    'x' : x
}

fit = pystan.stan(model_code=code, data=data, iter=1000, chains=4)
params = fs = fit.summary()['summary']
print fit

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_c00a671056895465abdb93a7e1b1fd77 NOW.


Inference for Stan model: anon_model_c00a671056895465abdb93a7e1b1fd77.
4 chains, each with iter=1000; warmup=500; thin=1; 
post-warmup draws per chain=500, total post-warmup draws=2000.

          mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
b[0,0]   70.95  8.9e-4   0.04  70.87  70.93  70.95  70.98  71.03 2000.0    1.0
b[1,0]   71.66  7.4e-4   0.03   71.6  71.64  71.66  71.68  71.73 2000.0    1.0
b[2,0]   72.32  9.6e-4   0.04  72.23  72.29  72.32  72.35  72.41 2000.0    1.0
b[3,0]   72.31  8.8e-4   0.04  72.23  72.28  72.31  72.34  72.39 2000.0    1.0
b[4,0]   71.48  8.0e-4   0.04  71.41  71.46  71.48   71.5  71.55 2000.0    1.0
b[5,0]    72.2  1.3e-3   0.06  72.08  72.16   72.2  72.24  72.32 2000.0    1.0
b[6,0]   72.68  9.2e-4   0.04   72.6  72.65  72.68  72.71  72.77 2000.0    1.0
b[7,0]   71.61  1.5e-3   0.07  71.48  71.56  71.61  71.66  71.74 2000.0    1.0
b[8,0]   72.82  1.3e-3   0.06  72.71  72.78  72.82  72.86  72.94 2000.0    1.0
b[9,0]   71.55  1.1e-3 

In [6]:
def create_stat(parm, i, offset):
    i += offset
    return (round(parm[i][0],3), round(parm[i][1],3), round(parm[i][2],3))

In [9]:
t_range = range(0,num_tournaments)
d = {
    'permanent_tournament_id': pd.Series([tournament_map_reverse[i+1] for i in t_range]),
    'tag': pd.Series([tag for i in t_range]),
    'year' : pd.Series([year for i in t_range]),
    'b_offset': pd.Series([create_stat(params, i, 0) for i in t_range]),
    'b_tee': pd.Series([create_stat(params, i, num_tournaments) for i in t_range]),
    'b_approach': pd.Series([create_stat(params, i, 2*num_tournaments) for i in t_range]),
    'b_around': pd.Series([create_stat(params, i, 3*num_tournaments) for i in t_range]),
    'b_putting': pd.Series([create_stat(params, i, 4*num_tournaments) for i in t_range])
}
t_stats = pd.DataFrame(d)
t_stats.to_sql(name= 'stan_sg_tournaments', con= util.golf_engine, if_exists= 'append', index=False)