In [107]:
import nfldb
import pandas as pd
import numpy as np
from pymongo import MongoClient

from ml.feature_extraction.nfldb_feature_extraction import ExtractColumns
from ml.feature_extraction.nfldb_feature_extraction import load_feature_set
from ml.feature_extraction.nfldb_feature_extraction import prediction_feature_set

from ml.helpers.scoring_helpers import make_scorer
from ml.helpers.scoring_helpers import score_stats
from ml.helpers.testing_helpers import train_test_split_index
from ml.helpers.testing_helpers import split_by_year_week
from ml.helpers.nfldb_helpers import player_team_info
from ml.helpers.nfldb_helpers import player_game_info
from ml.mongo_helpers.web_helpers import VegasData

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KernelDensity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import clone

In [121]:
# NOTE(review): %reload_ext re-loads an IPython *extension* by module name. Confirm that
# ml.helpers.nfldb_helpers actually defines one; if the goal is only to pick up edits
# to the helper module, %autoreload is presumably what is wanted.
%reload_ext ml.helpers.nfldb_helpers

In [66]:
def fit_predict(model, X_train, y_train, X_test = None, y_test = None, predict_proba = False):
    """Fit a fresh clone of ``model`` and optionally predict on / score a test set.

    Parameters
    ----------
    model : estimator
        Cloned before fitting, so the object passed in is never fitted.
    X_train, y_train
        Training features and target.
    X_test : optional
        When given, predictions for it are appended to the result.
    y_test : optional
        When given together with ``X_test``, RMSE and MAE are appended.
    predict_proba : bool
        Use ``predict_proba`` instead of ``predict`` on the test set.

    Returns
    -------
    tuple
        ``(fitted_model,)``, ``(fitted_model, pred_test)`` or
        ``(fitted_model, pred_test, {'rmse': ..., 'mae': ...})`` depending on
        which optional arguments were supplied.
    """
    fitted = clone(model).fit(X_train, y_train)
    if X_test is None:
        return (fitted,)

    if predict_proba:
        pred_test = fitted.predict_proba(X_test)
    else:
        pred_test = fitted.predict(X_test)
    if y_test is None:
        return (fitted, pred_test)

    # predict_proba yields one column per class; the error metrics need a 1-D
    # vector, so for a binary target score the positive-class probability.
    # The full matrix is still what gets returned to the caller.
    scored = pred_test
    if predict_proba and np.ndim(pred_test) == 2 and np.shape(pred_test)[1] == 2:
        scored = np.asarray(pred_test)[:, 1]

    rmse = mean_squared_error(y_test, scored)**0.5
    mae = mean_absolute_error(y_test, scored)
    return (fitted, pred_test, {'rmse':rmse, 'mae':mae})

In [68]:
# Load the vegas lines from MongoDB through the VegasData transformer.
client = MongoClient()
mdb = client.data
vegas = VegasData(mdb)
vegas_pipe = Pipeline(steps=[('vegas', vegas)])
vegas_data = vegas_pipe.fit_transform(X=None)

# Start a frame of identifying columns that will receive the model output.
# NOTE: X_train_info, X, train_i, gb and y_col come from the modelling loop
# cell further down, so this cell only works after that loop has run.
cols = ['full_name','player_id','week','year']
# Explicit copy: the prediction column below must be added to an independent
# frame, not to a slice of X_train_info (avoids SettingWithCopyWarning).
X_with_info = X_train_info[cols].copy()

# predictions of the fitted gradient boosting model on the training rows
predicted = gb.predict(X.iloc[train_i])
X_with_info[y_col] = predicted

In [152]:
# Attach per-game team info to every player-week, then look the team up in the
# vegas data twice: once as the favorite and once as the underdog. All joins
# are left joins so player-weeks without a match are kept (an inner join on
# the first step was tried earlier and dropped them).
player_ids = X_with_info['player_id'].unique()
team_info = player_game_info(db, player_ids, use_current_team=False)

player_week_keys = ['player_id','year','week']
team_week_keys = ['team','week','year']

with_team = X_with_info.merge(team_info, how='left', on=player_week_keys)
with_vegas = with_team.merge(
    vegas_data, how='left',
    left_on=team_week_keys,
    right_on=['Favorite_Abbr','Week','Year'])
X_vegas = with_vegas.merge(
    vegas_data, how='left',
    left_on=team_week_keys,
    right_on=['Underdog_Abbr','Week','Year'])


In [153]:
# Inspect the merged columns: every vegas field shows up twice, with an _x suffix
# from the favorite join and a _y suffix from the underdog join.
X_vegas.columns

Index([    u'full_name_x',       u'player_id',            u'week',
                  u'year',     u'rushing_yds',         u'at_home',
             u'away_team',     u'full_name_y',         u'gsis_id',
             u'home_team',        u'opp_team',     u'season_type',
                  u'team',      u'DateTime_x',      u'Favorite_x',
       u'Favorite_Abbr_x',    u'Money_Odds_x',         u'Month_x',
              u'Spread_x',         u'Total_x',      u'Underdog_x',
       u'Underdog_Abbr_x',          u'Week_x',       u'Weekday_x',
                u'Year_x',      u'DateTime_y',      u'Favorite_y',
       u'Favorite_Abbr_y',    u'Money_Odds_y',         u'Month_y',
              u'Spread_y',         u'Total_y',      u'Underdog_y',
       u'Underdog_Abbr_y',          u'Week_y',       u'Weekday_y',
                u'Year_y'],
      dtype='object')

In [154]:

# TODO: dummy code weekday, month
# TODO: discuss whether looking at home team here makes sense

# Each vegas field exists as <name>_x and <name>_y after the two left joins;
# coalesce the pair into a single <name> column (prefer _x, fall back to _y).
cols_to_fill = ['Favorite_Abbr','Underdog_Abbr','Spread','Total', 'full_name']
for col in cols_to_fill:
    X_vegas[col] = X_vegas[col+'_x'].fillna(X_vegas[col+'_y'])

# +1 when the player's team is the favorite, -1 otherwise. Multiplying the
# spread by this flag removes the ambiguity of the raw spread by flipping its
# sign for the non-favorite side.
X_vegas['is_favorite'] = (X_vegas['team'] == X_vegas['Favorite_Abbr']).map({True:1, False:-1})
X_vegas['spread_x_favorite'] = X_vegas['is_favorite']*X_vegas['Spread']

# Drop, in place, everything that is not an identifier, a kept vegas feature
# or the prediction column (the coalesced helper columns go as well).
cols_to_keep = ['full_name','player_id','week','year','team','Total',
    'is_favorite','spread_x_favorite', y_col]
cols_to_drop = [col for col in X_vegas.columns if col not in cols_to_keep]
X_vegas.drop(cols_to_drop, axis=1, inplace=True)

# points the line implies for the player's team
X_vegas['team_points'] = (X_vegas['Total'] - X_vegas['spread_x_favorite'])/2


In [155]:
# Display the engineered frame (prediction, Total, is_favorite, spread_x_favorite, team_points).
# NOTE(review): a bare frame renders every row; X_vegas.head() would keep the output short.
X_vegas


Unnamed: 0,player_id,week,year,rushing_yds,team,Total,full_name,is_favorite,spread_x_favorite,team_points
0,00-0023500,1,2009,31.994063,SF,46.0,Frank Gore,-1,5.0,20.50
1,00-0023500,2,2009,165.116191,SF,39.5,Frank Gore,1,-1.0,20.25
2,00-0023500,3,2009,75.655422,SF,39.0,Frank Gore,-1,6.5,16.25
3,00-0023500,7,2009,39.943566,SF,44.5,Frank Gore,-1,3.0,20.75
4,00-0023500,8,2009,58.309613,SF,45.0,Frank Gore,-1,13.0,16.00
5,00-0023500,9,2009,74.298903,SF,40.5,Frank Gore,1,-4.5,22.50
6,00-0023500,10,2009,82.972138,SF,43.0,Frank Gore,1,-3.0,23.00
7,00-0023500,12,2009,58.468343,SF,42.0,Frank Gore,1,-3.0,22.50
8,00-0023500,13,2009,49.847474,SF,41.5,Frank Gore,1,-1.0,21.25
9,00-0023500,14,2009,96.934176,SF,45.0,Frank Gore,-1,4.0,20.50


In [142]:
# Re-query the per-game team info for the players in X_with_info
# (same call as the one used in the merge cell).
team_info = player_game_info(db, X_with_info['player_id'].unique(), use_current_team=False)

In [148]:
# Hand check of team_points = (Total - spread_x_favorite)/2 with Total=45 and a signed spread of -5.5.
(45 - (-5.5))/2

25.25

In [150]:
# Remainder of the 45-point total once 25.25 points are assigned to one team.
45-25.25

19.75

In [143]:
# Display of X_vegas from an earlier execution (In [143]); its output predates the team_points column.
X_vegas

Unnamed: 0,player_id,week,year,rushing_yds,team,Total,full_name,is_favorite,spread_x_favorite
0,00-0023500,1,2009,31.994063,SF,46.0,Frank Gore,-1,5.0
1,00-0023500,2,2009,165.116191,SF,39.5,Frank Gore,1,-1.0
2,00-0023500,3,2009,75.655422,SF,39.0,Frank Gore,-1,6.5
3,00-0023500,7,2009,39.943566,SF,44.5,Frank Gore,-1,3.0
4,00-0023500,8,2009,58.309613,SF,45.0,Frank Gore,-1,13.0
5,00-0023500,9,2009,74.298903,SF,40.5,Frank Gore,1,-4.5
6,00-0023500,10,2009,82.972138,SF,43.0,Frank Gore,1,-3.0
7,00-0023500,12,2009,58.468343,SF,42.0,Frank Gore,1,-3.0
8,00-0023500,13,2009,49.847474,SF,41.5,Frank Gore,1,-1.0
9,00-0023500,14,2009,96.934176,SF,45.0,Frank Gore,-1,4.0


In [123]:
# (rows, columns) of the per-game team info frame.
team_info.shape

(4509, 11)

In [92]:
# Pull the team_info rows for a single (player_id, year, week) key. More than one row here
# means a left join on these keys will duplicate that player-week.
team_info.groupby(['player_id','year','week']).get_group(('00-0023500',2009,2))

Unnamed: 0,at_home,away_team,full_name,gsis_id,home_team,opp_team,player_id,team,week,year
1350,True,OAK,Frank Gore,2009082257,SF,OAK,00-0023500,SF,2,2009
3047,True,SEA,Frank Gore,2009092010,SF,SEA,00-0023500,SF,2,2009


In [126]:
def build_vegas_dataframe(X, y, row_info, model, db, y_col):
    """Combine model predictions with vegas betting lines for each player-week.

    Parameters
    ----------
    X : feature matrix handed to ``model.predict``.
    y : unused; kept so existing calls keep working.
    row_info : DataFrame with 'full_name', 'player_id', 'week' and 'year'
        columns, row-aligned with ``X``. It is not modified.
    model : fitted estimator exposing ``predict``.
    db : handle passed through to ``player_game_info``.
    y_col : name of the column that receives the predictions.

    Returns
    -------
    DataFrame containing full_name, player_id, week, year, team, Total,
    is_favorite (+1 favorite / -1 otherwise), spread_x_favorite, ``y_col``
    and team_points.
    """
    import warnings

    # Load the vegas lines; release the Mongo connection once they are in memory.
    client = MongoClient()
    try:
        vegas_pipe = Pipeline(steps=[('vegas', VegasData(client.data))])
        vegas_data = vegas_pipe.fit_transform(X=None)
    finally:
        client.close()

    # Identifying columns plus the model output. reset_index returns a new
    # frame, so the assignment below never writes to (a slice of) row_info;
    # the joins below match on columns only, so the old index is not needed.
    info_cols = ['full_name','player_id','week','year']
    X_with_info = row_info[info_cols].reset_index(drop=True)
    X_with_info[y_col] = np.asarray(model.predict(X))

    # Team per player-week, then the vegas line where that team is the
    # favorite (_x columns) and where it is the underdog (_y columns).
    team_info = player_game_info(db, X_with_info['player_id'].unique(), use_current_team=False)
    with_team = pd.merge(X_with_info, team_info, how='left', on=['player_id','year','week'])
    team_week = ['team','week','year']
    with_vegas = pd.merge(with_team, vegas_data, how='left',
        left_on=team_week,
        right_on=['Favorite_Abbr','Week','Year'])
    X_vegas = pd.merge(with_vegas, vegas_data, how='left',
        left_on=team_week,
        right_on=['Underdog_Abbr','Week','Year'])

    # Callers pair the result with y by position, so a join that multiplies
    # rows (duplicate keys on the right-hand side) silently misaligns them.
    if len(X_vegas) != len(X_with_info):
        warnings.warn('build_vegas_dataframe: joins changed the row count from %d to %d; '
                      'rows no longer align with the input'
                      % (len(X_with_info), len(X_vegas)))

    # TODO: dummy code weekday, month
    # TODO: discuss whether looking at home team here makes sense

    # combine columns with NaN values, caused by left joins above
    for col in ['Favorite_Abbr','Underdog_Abbr','Spread','Total', 'full_name']:
        X_vegas[col] = X_vegas[col+'_x'].fillna(X_vegas[col+'_y'])

    # +1 when the player's team is favored, -1 otherwise; the interaction with
    # the spread removes the ambiguity of the raw spread by reversing its sign
    # for the non-favorite side.
    favored = X_vegas['team'] == X_vegas['Favorite_Abbr']
    X_vegas['is_favorite'] = favored.map({True:1, False:-1})
    X_vegas['spread_x_favorite'] = X_vegas['is_favorite']*X_vegas['Spread']

    # keep identifiers, the retained vegas features and the prediction only
    cols_to_keep = ['full_name','player_id','week','year','team','Total',
        'is_favorite','spread_x_favorite', y_col]
    X_vegas = X_vegas[[col for col in X_vegas.columns if col in cols_to_keep]].copy()

    # points the line implies for the player's team
    X_vegas['team_points'] = (X_vegas['Total'] - X_vegas['spread_x_favorite'])/2

    return X_vegas

In [4]:
# Connect to nfldb and load the training feature set together with the
# pipeline that produced it and the list of stat names.
result_path = '../results'
db = nfldb.connect()
full_train, pipe, stats = load_feature_set(db)

# Candidate model columns: a '_lag' and a '_mean' column per stat, plus two extra flags.
def _with_suffix(names, suffix):
    return [name + suffix for name in names]

lag_cols = _with_suffix(stats, '_lag')
mean_cols = _with_suffix(stats, '_mean')
other_cols = ['same_year_lag', 'played_lag']

# Identifying columns, extracted into a separate frame aligned with full_train.
info_fields = ['year','week','time','player_id','full_name']
infoColumns = ExtractColumns(exact=info_fields, like=[])
row_info = infoColumns.fit_transform(X=full_train)

In [5]:
# Build the data to predict on, reusing the training pipeline and the info-column extractor.
# predict_i is used below as positional indices into pred_data / pred_info.
pred_data, predict_i, pred_info, pred_yr_wk = prediction_feature_set(db, pipe, infoColumns)

In [53]:
X_all = full_train
pred_all = pred_data.iloc[predict_i]
# Explicit copy: the modelling loop adds one prediction column per target to
# this frame, which must not be a slice of pred_info.
pred_results = pred_info.iloc[predict_i].copy()

# Positional indices of the rows where the player actually played. Taken from
# the underlying array so the result is positional whatever index full_train
# carries (these indices are later intersected and used with .iloc).
played_bool = full_train['played'] == 1
played_index = np.flatnonzero(played_bool.values).tolist()

# random split train and test
train_index, test_index = train_test_split_index(X_all.shape[0], test_size=0.1, seed=0)

# model inputs: lag, mean and extra columns, for training and for prediction
feature_cols = lag_cols + mean_cols + other_cols
XColumns = ExtractColumns(like=feature_cols)
X = XColumns.fit_transform(X=X_all)
# NOTE(review): this re-fits the extractor on the prediction data; presumably
# harmless for a pure column selector, but confirm both frames end up with the
# same columns.
X_pred = XColumns.fit_transform(X=pred_all)

# when True, targets other than 'played' are modelled on played rows only
played_only = True


In [20]:
# Shape of the vegas-augmented training frame, to compare against the model input in the next cell.
X_train_all.shape

(3723, 9)

In [21]:
# Shape of the training rows fed to the base models; the row count should match the vegas frame.
X.iloc[train_i].shape

(3839, 68)

In [130]:
y_cols = ['rushing_yds']
vegas_adjustment = True
# full list of targets:
#y_cols = ['played', 'receiving_rec', 'receiving_tds', 'receiving_yds', 'rushing_att', 'rushing_tds','rushing_yds']

# order in which the models are fitted and reported
MODEL_ORDER = ('gb', 'rf', 'lin')

def _fit_models(models, X_train, y_train, X_test, y_test, predict_proba=False):
    """Run fit_predict for each model in MODEL_ORDER; returns a list of (model, test_pred, scores)."""
    return [fit_predict(
                model=models[name],
                X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                y_test=y_test,
                predict_proba=predict_proba)
            for name in MODEL_ORDER]

def _print_scores(y_col, gb_scores, rf_scores, lin_scores, lin_label, coef=None):
    """Print the RMSE/MAE summary; single-argument print() works on Python 2 and 3."""
    print('Predicting %s' % (y_col))
    if coef is not None:
        print(coef)
    print('Gradient Boosting: RMSE %.2f | MAE %.2f' % (gb_scores['rmse'], gb_scores['mae']))
    print('Random Forest: RMSE %.2f | MAE %.2f' % (rf_scores['rmse'], rf_scores['mae']))
    print('%s Regression: RMSE %.2f | MAE %.2f' % (lin_label, lin_scores['rmse'], lin_scores['mae']))

for y_col in y_cols:

    y = X_all[y_col]

    # Every target except 'played' is modelled on rows where the player played.
    # sorted() keeps the row order independent of set iteration order.
    if(played_only and y_col != 'played'):
        train_i = sorted(set(train_index) & set(played_index))
        test_i = sorted(set(test_index) & set(played_index))
    else:
        train_i = train_index
        test_i = test_index

    X_train = X.iloc[train_i]
    y_train = y.iloc[train_i]
    X_test = X.iloc[test_i]
    y_test = y.iloc[test_i]

    # get player info for train and test data
    X_train_info = row_info.iloc[train_i]
    X_test_info = row_info.iloc[test_i]

    ### Test Predictions

    # 'played' is a classification target scored on predicted probabilities
    predict_proba = y_col == 'played'

    # random_state is fixed (same value as the split seed) so that a re-run
    # reproduces the reported scores.
    if(predict_proba):
        models = {
            'gb':GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=0),
            'rf':RandomForestClassifier(random_state=0),
            'lin':LogisticRegression()
        }
    else:
        models = {
            'gb':GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=0),
            'rf':RandomForestRegressor(random_state=0),
            'lin':LinearRegression()
        }

    (gb, gb_test, gb_scores), (rf, rf_test, rf_scores), (lin, lin_test, lin_scores) = _fit_models(
        models, X_train, y_train, X_test, y_test, predict_proba=predict_proba)

    if vegas_adjustment and y_col != 'played':
        print('-'*50)
        print('Adjusted Prediction: %s' % y_col)

        # Second stage: the gradient boosting prediction plus the vegas
        # features become the inputs of a new set of models.
        X_train_all = build_vegas_dataframe(X=X_train, y=y_train,
            row_info=X_train_info, model=gb, db=db, y_col=y_col)
        X_test_all = build_vegas_dataframe(X=X_test, y=y_test,
            row_info=X_test_info, model=gb, db=db, y_col=y_col)

        features = [y_col, 'Total','is_favorite','spread_x_favorite']
        X_cols = ExtractColumns(exact=features)
        X_train_vegas = X_cols.fit_transform(X=X_train_all)
        X_test_vegas = X_cols.fit_transform(X=X_test_all)

        (gb_a, gb_test_a, gb_scores_a), (rf_a, rf_test_a, rf_scores_a), (lin_a, lin_test_a, lin_scores_a) = _fit_models(
            models, X_train_vegas, y_train, X_test_vegas, y_test)

        _print_scores(y_col, gb_scores_a, rf_scores_a, lin_scores_a, 'Linear', coef=lin_a.coef_)

    # Print Results
    _print_scores(y_col, gb_scores, rf_scores, lin_scores,
                  'Logistic' if predict_proba else 'Linear')

    # Build full models on all data
    # NOTE(review): this refits on every row of X, including rows excluded by
    # played_only above; confirm that is intended.
    gb = gb.fit(X, y)
    rf = rf.fit(X, y)
    lin = lin.fit(X, y)

    #### Next week's predictions
    # Make prediction, just gbr for now
    if(y_col == 'played'):
        preds = gb.predict_proba(X_pred)[:,1]
    else:
        preds = gb.predict(X_pred)

    pred_results.loc[:,y_col] = preds

--------------------------------------------------
Adjusted Prediction: rushing_yds
Predicting rushing_yds
[ 1.13853858 -0.08322251  0.45943673 -0.1734473 ]
Gradient Boosting: RMSE 31.87 | MAE 22.47
Random Forest: RMSE 35.96 | MAE 25.52
Linear Regression: RMSE 30.46 | MAE 21.90
Predicting rushing_yds
Gradient Boosting: RMSE 29.88 | MAE 21.64
Random Forest: RMSE 31.44 | MAE 22.95
Linear Regression: RMSE 30.33 | MAE 22.12


In [131]:
# Inspect the feature matrix used by the vegas-adjusted models
# (base prediction, Total, is_favorite, spread_x_favorite).
X_train_vegas

Unnamed: 0,rushing_yds,Total,is_favorite,spread_x_favorite
0,31.994063,46.0,-1,5.0
1,165.116191,39.5,1,-1.0
2,75.655422,39.0,-1,6.5
3,39.943566,44.5,-1,3.0
4,58.309613,45.0,-1,13.0
5,74.298903,40.5,1,-4.5
6,82.972138,43.0,1,-3.0
7,58.468343,42.0,1,-3.0
8,49.847474,41.5,1,-1.0
9,96.934176,45.0,-1,4.0


In [129]:
# Shape of the training rows selected by train_i in the modelling loop.
X.iloc[train_i].shape

(3845, 68)

In [34]:
# List the player-weeks whose 'Total' is null, i.e. rows without a vegas match.
# NOTE(review): this needs an X_train that has a 'Total' column, row-aligned with X_train_info;
# presumably it was run against an earlier definition of X_train — confirm before re-running.
X_train_info.loc[pd.isnull(X_train['Total']).values,:]

Unnamed: 0_level_0,Unnamed: 1_level_0,year,week,player_id,full_name
player_id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00-0029104,755,2014,7,00-0029104,Jonas Gray
00-0029104,756,2014,8,00-0029104,Jonas Gray
00-0029104,757,2014,9,00-0029104,Jonas Gray
00-0029104,761,2014,13,00-0029104,Jonas Gray
00-0029104,762,2014,14,00-0029104,Jonas Gray
00-0029104,763,2014,15,00-0029104,Jonas Gray
00-0029104,764,2014,16,00-0029104,Jonas Gray
00-0029104,768,2015,3,00-0029104,Jonas Gray
00-0029104,769,2015,4,00-0029104,Jonas Gray
00-0029104,771,2015,6,00-0029104,Jonas Gray


# Playing around with Ensembling

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class PredictionFeature(TransformerMixin, BaseEstimator):
    """Transformer that exposes a wrapped model's predictions as feature columns.

    Meant for use inside a FeatureUnion so that several models' outputs become
    the inputs of a downstream estimator.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict`` (and ``predict_proba`` when
        ``predict_proba=True``). It is fitted in place by ``fit``.
    predict_proba : bool
        Emit ``model.predict_proba`` output instead of ``model.predict``.

    ``get_params`` / ``set_params`` are inherited from BaseEstimator, which
    reports the constructor arguments; that is what lets ``sklearn.base.clone``
    (and therefore ``fit_predict``) rebuild this transformer.
    """
    def __init__(self, model, predict_proba=False):
        self.model = model
        self.predict_proba = predict_proba
    def fit(self, X, y):
        """Fit the wrapped model on (X, y) and return self."""
        self.model.fit(X, y)
        return self
    def transform(self, X):
        """Return the wrapped model's output for X as a 2-D array."""
        if(self.predict_proba):
            pred = self.model.predict_proba(X)
        else:
            pred = self.model.predict(X)
        pred = np.asarray(pred)
        # 1-D predictions become a single column; 2-D output (one column per
        # class from predict_proba) is already a valid feature matrix and must
        # not gain a third axis.
        if pred.ndim == 1:
            pred = pred.reshape(-1, 1)
        return pred


In [None]:
import copy

# Stacked ensemble: the predictions of independent copies of the three base
# models are placed side by side and fed to a second-stage gradient boosting
# regressor.
base_features = FeatureUnion([
    (name, PredictionFeature(copy.deepcopy(models[name]), predict_proba=predict_proba))
    for name in ('gb', 'rf', 'lin')
])
second_stage = GradientBoostingRegressor(
    n_estimators=1000, learning_rate=0.01, loss='lad',
    max_depth=5, min_samples_leaf=10)

models['en'] = Pipeline([('models', base_features), ('pred', second_stage)])

# Fit on the training split and score on the test split, like the base models.
en, en_test, en_scores = fit_predict(
    model=models['en'],
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    predict_proba=predict_proba)

In [None]:
# Show the {'rmse', 'mae'} dict that fit_predict returned for the ensemble.
en_scores