In [2]:
from pystan import StanModel
import util
import pandas as pd
import numpy as np

In [3]:
# Stan model: an independent AR(1) regression for each of the sg_n series in
# `y`, plus one next-step value (y_tilda) per series.  The next-step values are
# combined with the supplied coefficients into a single predicted score.
code = '''
data {
    int<lower=1> sg_n;          // number of series (rows of y)
    int<lower=2> N;             // observations per series
    matrix[sg_n,N] y;
    vector[sg_n+1] s_coef;      // [1] additive offset, [2..sg_n+1] one weight per series
}
parameters {
    vector[sg_n] alpha;
    vector[sg_n] beta;
    vector<lower=0>[sg_n] sigma;
    vector[sg_n] y_tilda;       // value following y[k][N] for each series k
} 
model {
    for (k in 1:sg_n) {
        for (n in 2:N) {
            y[k][n] ~ normal(alpha[k]+ beta[k] * y[k][n-1], sigma[k]);
        }
    }
    for (j in 1:sg_n) {
        y_tilda[j] ~ normal(alpha[j]+ beta[j] * y[j][N], sigma[j]);
    }
}
generated quantities {
    real score_predict;
    // Weight every series, however many sg_n says there are, instead of
    // spelling out exactly four terms.
    score_predict = s_coef[1] + dot_product(segment(s_coef, 2, sg_n), y_tilda);
}
'''

In [29]:
# Run configuration shared by the query and prediction cells.
num_scores = 30   # how many consecutive previous rounds feed each model fit
year = 2016       # season used to filter every query
# Label stored with each inserted prediction.  The leading number is taken from
# num_scores so the label cannot silently disagree with the window size.
# NOTE(review): the 'sg-50' suffix presumably names the coefficient set read
# from stan_sg_tournaments -- confirm.
tag = '%d-prev-sg-50' % num_scores

In [30]:
# Players to predict for.
# NOTE(review): `limit 50` has no ORDER BY in this query, so which 50 players
# come back depends on whatever order stats_view yields -- confirm that is
# the intended selection.
sqlTxt = '''
select player_id from stats_view where year = %s limit 50;
'''
# .values is the supported replacement for the deprecated DataFrame.as_matrix();
# the result is still 2-D (one single-element row per player).
player_ids = util.pd_from_sql(sqlTxt, [year]).values

# Per-tournament coefficient means for the selected year and tag.
sqlTxt = '''
select permanent_tournament_id, (b_offset).mean as offset,
       (b_tee).mean as tee, (b_approach).mean as approach, 
       (b_around).mean as around, (b_putting).mean as putting
       from stan_sg_tournaments 
       where year = %s 
       and tag like 'sg-50';
'''
t_df = util.pd_from_sql(sqlTxt, [year])
# Maps permanent_tournament_id -> [offset, tee, approach, around, putting].
sg_coef_map = t_df.set_index(['permanent_tournament_id']).T.to_dict('list')

In [31]:
# Per-player query.  Parameters: (year, player_id).  Returns one row per round
# in which all four strokes-gained columns are present, ordered by date.
p_sqlTxt = '''
select player_id, permanent_tournament_id, round, sg_tee, sg_approach, 
       sg_around, sg_putting 
    from scores
    where year = %s 
      and player_id = %s 
      and sg_putting is not null
      and sg_tee is not null
      and sg_around is not null
      and sg_approach is not null
      order by date;
'''
# Insert template with 13 placeholders: five leading values followed by a
# parenthesised group of eight.
# NOTE(review): the statement names no columns, so it depends on the column
# order of stan_prediction; the parenthesised group presumably fills a single
# composite-typed column -- confirm against the table definition.
i_sqlTxt = '''
insert into stan_prediction values(%s, %s, %s, %s, %s, (%s, %s, %s, %s, %s, %s, %s, %s));
'''

In [7]:
# Compile the Stan program held in `code` once; the compiled model is reused
# for every fit in the prediction loop.
stan_model = StanModel(model_code=code)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_09b67b94dc836f93d73b5592b851118d NOW.


In [32]:

# Strokes-gained columns in the row order used for the model's `y` matrix.
# This lines up with entries 2..5 of each coefficient list in sg_coef_map
# (tee, approach, around, putting); entry 1 is the offset.
sg_columns = ['sg_tee', 'sg_approach', 'sg_around', 'sg_putting']

for p_id in player_ids:
    p_id = p_id[0]  # player_ids is 2-D: one single-element row per player
    players_df = util.pd_from_sql(p_sqlTxt, [year, p_id])
    tournament_ids = players_df['permanent_tournament_id'].values
    rounds = players_df['round'].values
    sg_series = [players_df[col].values for col in sg_columns]
    # Raises KeyError if a round belongs to a tournament without coefficients.
    s_coefs = [sg_coef_map[t_id] for t_id in tournament_ids]

    # Slide a num_scores-wide window over the player's rounds and predict the
    # round that immediately follows each window.  The range runs up to and
    # including the window that ends just before the player's last round; the
    # previous bound stopped one short, so the last round was never predicted.
    for start in range(len(s_coefs) - num_scores):
        target = start + num_scores
        y = [series[start:target] for series in sg_series]
        data = {
            'N': len(y[0]),
            'y': y,
            'sg_n': len(sg_columns),
            's_coef': s_coefs[target],
        }
        fit = stan_model.sampling(data=data, iter=1000, chains=4)

        # Find the score_predict row by name rather than by a fixed position,
        # so the lookup survives changes to the model's parameter list.
        summary = fit.summary()
        row = list(summary['summary_rownames']).index('score_predict')
        # First eight summary columns: mean, se_mean, sd and the five quantiles.
        stats = summary['summary'][row][:8].tolist()

        args = [p_id, tournament_ids[target], year, tag, rounds[target]] + stats
        util.insert_sql(i_sqlTxt, args)

In [14]:
# First-cut model to test: ARMA(2,2) on a single series, with one extra
# parameter (y_tilda) for the value following y[T].
draft_code = '''
data {
    int<lower=2> T;   // y[2] and y[T-1] are indexed below, so T must be >= 2
    real y[T];
}
parameters {
    real mu;
    vector<lower = -1, upper = 1>[2] phi;
    vector<lower = -1, upper = 1>[2] theta;
    real<lower=0> sigma;
    real y_tilda;
} 
model {
    vector[T] nu;
    vector[T] err;
    nu[1] = mu + phi[1] * mu;
    err[1] = y[1] - nu[1];
    nu[2] = mu + phi[1]*y[1] + theta[1]*err[1];
    err[2] = y[2] - nu[2];
    for (t in 3:T) {
        nu[t] = mu + phi[1] * y[t-1] + phi[2]*y[t-2] 
              + theta[1] * err[t-1] + theta[2] * err[t-2];
        err[t] = y[t] - nu[t];
    }
    mu ~ normal(0, 10);
    phi ~ normal(0, 2);
    theta ~ normal(0, 2);
    sigma ~ cauchy(0, 5);
    err ~ normal(0, sigma);
    y_tilda ~ normal(mu + y[T]*phi[1]+y[T-1]*phi[2] + theta[1]*err[T] + theta[2]*err[T-1], sigma);
}

'''
y =  [ 2.352,  2.619,  3.292,  2.657,  0.739,  1.115,  0.148,  0.572,
        0.859,  1.081,  1.104,  0.952, -1.324, -0.998,  1.124, -0.442,
        0.824, -0.825, -2.64 , -0.563,  1.129,  1.047,  0.943,  0.781,
       -0.933,  0.124,  0.341,  1.1  , -0.282,  1.173,  0.089,  1.544,
        0.026, -0.855,  1.448,  0.426, -0.245,  1.346, -0.36 , -0.328,
        0.482,  0.614,  0.572,  0.737, -0.097,  1.668, -0.012,  0.97 ,
       -0.898,  0.159,  1.543, -0.297,  1.257,  0.761,  0.725,  0.629,
        1.871,  0.061]
data = {
    'T': len(y),
    'y': y
}

# Compile and sample the draft model itself.  The earlier call passed `code`
# (a different model) and went through the `pystan` module, which this
# notebook never imports -- only StanModel is imported.
fit = StanModel(model_code=draft_code).sampling(data=data, iter=1000, chains=4)
print(fit)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_6abda944bb0b6acb21d3626f5be8dfef NOW.


Inference for Stan model: anon_model_6abda944bb0b6acb21d3626f5be8dfef.
4 chains, each with iter=1000; warmup=500; thin=1; 
post-warmup draws per chain=500, total post-warmup draws=2000.

           mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
mu         0.61  9.2e-3   0.26   0.17   0.44    0.6   0.76   1.16  766.0    1.0
phi[0]     0.33    0.02   0.34  -0.46   0.14   0.41   0.57   0.85  295.0   1.02
phi[1]    -0.45    0.02   0.32   -0.9  -0.67  -0.51  -0.28   0.33  262.0   1.03
theta[0]   -0.1    0.02    0.3  -0.59  -0.31  -0.16   0.06   0.62  300.0   1.02
theta[1]    0.6    0.01   0.31  -0.18   0.46   0.68   0.84   0.97  450.0   1.02
sigma      1.01  2.8e-3    0.1   0.84   0.94    1.0   1.07   1.22 1223.0    1.0
y_tilda    0.62    0.03   1.08  -1.45  -0.09   0.63   1.31   2.82 1249.0    1.0
lp__     -33.79    0.09   2.05 -38.48 -34.94  -33.4 -32.31 -30.78  523.0    1.0

Samples were drawn using NUTS at Thu Aug 10 22:57:51 2017.
For each parameter, n_eff is a cr

In [None]:

# 

# Segmented variant: each of the four series gets N_SEG intercepts (one per
# consecutive stretch of the history) with a shared slope, and N_PRED future
# values are chained forward from the last observation.
# NOTE(review): this rebinds the global `code`; re-running the StanModel
# compile cell afterwards would compile this model instead of the first one.
code = '''
data {
    int<lower=1> N_SEG;
    int<lower=2> N;
    matrix[4,N] y;
    int<lower=2> N_PRED;
    real s_base;
}
parameters {
    matrix[4,N_SEG] alpha;
    vector[4] beta;
    vector<lower=0>[4] sigma;
    matrix[4,N_PRED] y_tilda;
} 
model {
    int m;
    for (k in 1:4) {
        for (n in 2:N) {
            m = (n-1)*N_SEG/N + 1;
            y[k][n] ~ normal(alpha[k][m]+ beta[k] * y[k][n-1], sigma[k]);
        }
    }
    for (j in 1:4) {
        y_tilda[j][1] ~ normal(alpha[j][N_SEG]+ beta[j] * y[j][N], sigma[j]);
        for (i in 2:N_PRED) {
            y_tilda[j][i] ~ normal(alpha[j][N_SEG] + beta[j]* y_tilda[j][i-1], sigma[j]);
        }
    }
}
generated quantities {
      vector[N_PRED] score_predict;
      for (l in 1:N_PRED) {
        score_predict[l] = s_base - y_tilda[1][l] - y_tilda[2][l] - y_tilda[3][l] - y_tilda[4][l];
    }
}

'''
# The model declares y as matrix[4,N], so `y` must already hold four
# equal-length series when this cell runs.  Fail with a clear message if a
# differently shaped `y` is still bound (a flat list of numbers, for instance).
assert len(y) == 4, "y must contain 4 series to match the model's matrix[4,N] y"
data = {
    'N': len(y[0]),
    'y': y,
    'N_SEG' : 2,
    'N_PRED' :4, 
    's_base' : 71.571
}

# Use the imported StanModel class; the `pystan` module name itself is never
# imported in this notebook.
fit = StanModel(model_code=code).sampling(data=data, iter=1000, chains=4)
print(fit)