In [12]:
import util
%matplotlib inline
%precision 4
import pystan
import numpy as np
import pandas as pd


In [13]:
year = 2016
tag = 'sg-50'
limit = 50
sqlTxt = '''
select * from (select year, player_id, row_number from rank_view where year = %s  limit %s) r 
    join scores_view s 
    using (player_id, year) order by r.row_number, player_id, year, permanent_tournament_id, round;
'''
scores = util.pd_from_sql(sqlTxt, [year, limit])
scores = scores[(pd.notnull(scores['sg_approach'])) & (pd.notnull(scores['sg_around']))]
scores

Unnamed: 0,player_id,year,row_number,date,permanent_tournament_id,round,score,sg_tee,sg_approach,sg_around,sg_putting
0,24502,2016,6,2016-01-14,6,1,68,2.110000e+00,2.685,0.313,-2.604
1,24502,2016,6,2016-01-15,6,2,69,1.387779e-16,2.645,-0.910,0.412
2,24502,2016,6,2016-01-16,6,3,69,-1.650000e-01,2.999,-0.749,0.131
3,24502,2016,6,2016-01-17,6,4,69,4.630000e-01,1.736,-0.541,-0.389
4,24502,2016,6,2016-02-18,7,1,68,-2.480000e-01,2.455,1.201,-0.149
5,24502,2016,6,2016-02-19,7,2,68,-4.930000e-01,1.668,-0.455,2.393
6,24502,2016,6,2016-02-20,7,3,67,7.150000e-01,2.323,2.014,-0.823
7,24502,2016,6,2016-02-21,7,4,68,-4.210000e-01,3.250,0.470,0.881
8,24502,2016,6,2016-03-17,9,1,67,1.234000e+00,1.157,-0.846,3.286
9,24502,2016,6,2016-03-18,9,2,74,8.690000e-01,-0.795,-0.187,-1.351


In [14]:
num_scores = len (scores)
player_ids = scores['player_id'].unique()
num_players = len (player_ids)
player_map = dict(zip(player_ids, range(1,num_players + 1)))
player_map_reverse = dict(zip(range(1,num_players + 1), player_ids, ))
tournament_ids = scores['permanent_tournament_id'].unique()
num_tournaments = len (tournament_ids)
tournament_map = dict(zip(tournament_ids, range(1, num_tournaments + 1)))
tournament_map_reverse = dict(zip(range(1, num_tournaments + 1), tournament_ids))
x = np.ones((num_scores,1))
datum = sorted([[player_map[i[0]], tournament_map[i[4]], i[6], i[7], i[8], i[9], i[10]] for i in scores.as_matrix()], key = lambda (el): el[0])
p, t, y, x1, x2, x3, x4 = zip(*datum)
x = np.column_stack((x, x1))
x = np.column_stack((x, x2))
x = np.column_stack((x, x3))
x = np.column_stack((x, x4))

In [15]:
code = """
data {
  int N;
  int N_P;
  int N_T;
  int y[N];
  int p[N];
  int t[N];
  matrix[N, 5] x;
}
# 
parameters {
  matrix [N_T, 5] b;
  real<lower=0, upper=4> sigma;
} 
model {
  for (n in 1:N) {
    b[t[n]][1] ~ normal(70, 5);
    for (k in 2:5) {
      b[t[n]][k] ~ normal(0, 5);
    }
     y[n] ~ normal((x[n])*(b[t[n]])', sigma);
  }
 
}

"""
data = {
    'N': num_scores,
    'y': y,
    'p' : p,
    't' : t,
    'N_P' : num_players,
    'N_T' : num_tournaments,
    'x' : x
}

fit = pystan.stan(model_code=code, data=data, iter=1000, chains=4)
params = fs = fit.summary()['summary']
print fit

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_cebadd112a607d14a57abfb19b7ad67b NOW.


Inference for Stan model: anon_model_cebadd112a607d14a57abfb19b7ad67b.
4 chains, each with iter=1000; warmup=500; thin=1; 
post-warmup draws per chain=500, total post-warmup draws=2000.

          mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
b[0,0]   70.72  2.3e-3    0.1  70.54  70.66  70.73  70.79  70.91 1784.0    1.0
b[1,0]   71.33  2.1e-3   0.09  71.14  71.26  71.32  71.39  71.51 1896.0    1.0
b[2,0]   72.08  1.6e-3   0.07  71.93  72.03  72.08  72.13  72.22 2000.0    1.0
b[3,0]   71.24  1.7e-3   0.07   71.1   71.2  71.24  71.29  71.39 2000.0    1.0
b[4,0]   71.99  1.2e-3   0.05  71.89  71.96  71.99  72.03   72.1 2000.0    1.0
b[5,0]   70.73  1.6e-3   0.07  70.58  70.68  70.73  70.78  70.87 2000.0    1.0
b[6,0]   71.79  1.3e-3   0.06  71.67  71.74  71.78  71.83   71.9 2000.0    1.0
b[7,0]   72.72  1.5e-3   0.07  72.59  72.68  72.73  72.77  72.85 2000.0    1.0
b[8,0]   71.44  1.7e-3   0.08  71.29  71.39  71.44  71.48  71.59 2000.0    1.0
b[9,0]   71.54  1.7e-3 

In [16]:
def create_stat(parm, i, offset):
    i += offset
    return (round(parm[i][0],3), round(parm[i][1],3), round(parm[i][2],3))

In [17]:
t_range = range(0,num_tournaments)
d = {
    'permanent_tournament_id': pd.Series([tournament_map_reverse[i+1] for i in t_range]),
    'tag': pd.Series([tag for i in t_range]),
    'year' : pd.Series([year for i in t_range]),
    'b_offset': pd.Series([create_stat(params, i, 0) for i in t_range]),
    'b_tee': pd.Series([create_stat(params, i, num_tournaments) for i in t_range]),
    'b_approach': pd.Series([create_stat(params, i, 2*num_tournaments) for i in t_range]),
    'b_around': pd.Series([create_stat(params, i, 3*num_tournaments) for i in t_range]),
    'b_putting': pd.Series([create_stat(params, i, 4*num_tournaments) for i in t_range])
}
t_stats = pd.DataFrame(d)
t_stats.to_sql(name= 'stan_sg_tournaments', con= util.golf_engine, if_exists= 'append', index=False)