In [1]:
import copy

import nfldb
import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KernelDensity
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from ml.feature_extraction.nfldb_feature_extraction import ExtractColumns
from ml.feature_extraction.nfldb_feature_extraction import load_feature_set
from ml.feature_extraction.nfldb_feature_extraction import prediction_feature_set
from ml.helpers.scoring_helpers import make_scorer
from ml.helpers.scoring_helpers import score_stats
from ml.helpers.testing_helpers import train_test_split_index
from ml.helpers.testing_helpers import split_by_year_week

In [2]:
# Open the nfldb connection and load the training feature set. `pipe` and
# `stats` are reused below and by the prediction cell.
db = nfldb.connect()
result_path = '../results'
full_train, pipe, stats = load_feature_set(db)

# Candidate model inputs: one '<stat>_lag' and one '<stat>_mean' column per
# entry in `stats`, plus two fixed lag columns.
lag_cols = [stat + '_lag' for stat in stats]
mean_cols = [stat + '_mean' for stat in stats]
other_cols = ['same_year_lag', 'played_lag']

# Identifying (non-feature) columns, extracted once from the training frame.
infoColumns = ExtractColumns(
    like=[],
    exact=['year', 'week', 'time', 'player_id', 'full_name']
)
row_info = infoColumns.fit_transform(X=full_train)

In [3]:
# Build the prediction-time inputs from the same `db`, `pipe` and `infoColumns`
# used for the training set.
# NOTE(review): `predict_i` is consumed below as a positional row selector
# (`pred_data.iloc[predict_i]`); the exact meaning of each returned value is
# defined in ml.feature_extraction.nfldb_feature_extraction -- confirm there.
pred_data, predict_i, pred_info, pred_yr_wk = prediction_feature_set(db, pipe, infoColumns)

In [45]:
X_all = full_train
pred_all = pred_data.iloc[predict_i]
# Explicit copy: the modelling loop later writes prediction columns into
# `pred_results`; writing into an un-copied slice triggers pandas'
# SettingWithCopyWarning and is not guaranteed to stick.
pred_results = pred_info.iloc[predict_i].copy()

# which rows did players play
played_bool = full_train['played'] == 1
# Positional row numbers of the played rows. These are intersected with
# train/test indices and fed to `.iloc`, so they must be positions, not index
# labels (the previous `played_bool[i]` lookup was label-based and only agreed
# with positions for a default 0..n-1 index).
played_index = np.flatnonzero(played_bool.values).tolist()

# random split train and test
# NOTE(review): assumes train_test_split_index returns positional row indices
# in range(n) -- they are used with `.iloc` below; confirm in testing_helpers.
train_index, test_index = train_test_split_index(X_all.shape[0], test_size=0.1, seed=0)

feature_cols = lag_cols + mean_cols + other_cols
XColumns = ExtractColumns(like=feature_cols)
X = XColumns.fit_transform(X=X_all)
# NOTE(review): this re-fits the extractor on the prediction frame. If
# ExtractColumns.fit learns anything from its input, `transform` should be
# used here instead so both matrices share one fitted column set.
X_pred = XColumns.fit_transform(X=pred_all)

# When True, regression targets are trained/evaluated only on rows where the
# player played (see the modelling loop).
played_only = True


In [41]:
def fit_predict(model, X_train, y_train, X_test = None, y_test = None, predict_proba = False):
    """Fit ``model`` and optionally predict on, and score against, a test set.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` (which must return the fitted
        estimator) and ``predict(X)``; also ``predict_proba(X)`` when
        ``predict_proba`` is True.
    X_train, y_train
        Training features and target, passed straight to ``model.fit``.
    X_test : optional
        If given, predictions for these rows are returned.
    y_test : optional
        If given together with ``X_test``, RMSE and MAE are computed.
    predict_proba : bool
        Use ``model.predict_proba`` instead of ``model.predict``.

    Returns
    -------
    tuple
        ``(model,)`` when ``X_test`` is None,
        ``(model, pred_test)`` when only ``X_test`` is given,
        ``(model, pred_test, {'rmse': ..., 'mae': ...})`` when ``y_test`` is
        given as well. ``pred_test`` is always the raw model output.
    """
    model = model.fit(X_train, y_train)
    if X_test is None:
        return (model,)

    if predict_proba:
        pred_test = model.predict_proba(X_test)
    else:
        pred_test = model.predict(X_test)
    if y_test is None:
        return (model, pred_test)

    # A binary classifier's predict_proba yields an (n, 2) matrix, which cannot
    # be scored against a 1-D target (the metrics raise on the shape mismatch).
    # Score the second column instead -- for 0/1 labels that is P(y == 1),
    # matching the `[:, 1]` used for the final predictions in this notebook.
    scored = pred_test
    if predict_proba:
        proba = np.asarray(pred_test)
        if proba.ndim == 2 and proba.shape[1] == 2:
            scored = proba[:, 1]

    rmse = mean_squared_error(y_test, scored) ** 0.5
    mae = mean_absolute_error(y_test, scored)
    return (model, pred_test, {'rmse': rmse, 'mae': mae})

In [78]:
y_cols = ['receiving_yds']

for y_col in y_cols:

    y = X_all[y_col]

    # For every target except 'played' itself, optionally keep only the rows
    # where the player actually played.
    if played_only and y_col != 'played':
        played_rows = set(played_index)
        # sorted() fixes the row order; iterating a raw set gives an arbitrary
        # order, which makes the fitted models harder to reproduce.
        train_i = sorted(played_rows.intersection(train_index))
        test_i = sorted(played_rows.intersection(test_index))
    else:
        train_i = train_index
        test_i = test_index

    X_train = X.iloc[train_i]
    y_train = y.iloc[train_i]
    X_test = X.iloc[test_i]
    y_test = y.iloc[test_i]

    ### Test Predictions

    # 'played' is a binary outcome -> classifiers scored on probabilities;
    # every other target is treated as a regression problem.
    predict_proba = y_col == 'played'

    # random_state=0 mirrors the seed used for the train/test split so the
    # printed scores are reproducible across kernel restarts.
    if predict_proba:
        models = {
            'gb': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=0),
            'rf': RandomForestClassifier(random_state=0),
            'lin': LogisticRegression()
        }
    else:
        models = {
            'gb': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=0),
            'rf': RandomForestRegressor(random_state=0),
            'lin': LinearRegression()
        }

    # Arguments shared by all three hold-out evaluations.
    split_kwargs = dict(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        predict_proba=predict_proba)

    gb, gb_test, gb_scores = fit_predict(model=models['gb'], **split_kwargs)
    rf, rf_test, rf_scores = fit_predict(model=models['rf'], **split_kwargs)
    lin, lin_test, lin_scores = fit_predict(model=models['lin'], **split_kwargs)

    # Print Results (single-argument print(...) works on Python 2 and 3)
    print('Predicting %s' % (y_col))
    print('Gradient Boosting: RMSE %.2f | MAE %.2f' % (gb_scores['rmse'], gb_scores['mae']))
    print('Random Forest: RMSE %.2f | MAE %.2f' % (rf_scores['rmse'], rf_scores['mae']))
    print('%s Regression: RMSE %.2f | MAE %.2f' % ('Logistic' if predict_proba else 'Linear', lin_scores['rmse'], lin_scores['mae']))

    # Build full models on all data
    # NOTE(review): this refits on every row of X, including rows excluded
    # above when `played_only` is True, so the deployed models are trained on
    # a different population than the one they were scored on -- confirm this
    # is intended.
    gb = gb.fit(X, y)
    rf = rf.fit(X, y)
    # Was `lr.fit(X, y)`: `lr` is never defined in this notebook, so the cell
    # raised NameError on a fresh kernel.
    lin = lin.fit(X, y)

    #### Next week's predictions
    # Make prediction, just gbr for now
    if y_col == 'played':
        preds = gb.predict_proba(X_pred)[:, 1]
    else:
        preds = gb.predict(X_pred)

    pred_results.loc[:, y_col] = preds

Predicting receiving_yds
Gradient Boosting: RMSE 17.61 | MAE 12.30
Random Forest: RMSE 18.89 | MAE 13.13
Linear Regression: RMSE 17.30 | MAE 12.36


# Playing around with Ensembling

In [66]:
class PredictionFeature(TransformerMixin):
    """Transformer that exposes a wrapped model's predictions as feature columns.

    Intended for stacking: several instances can be combined (e.g. in a
    FeatureUnion) so that a downstream estimator is trained on the base
    models' outputs.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)`` (and ``predict_proba(X)``
        when ``predict_proba`` is True).
    predict_proba : bool
        Emit ``model.predict_proba(X)`` instead of ``model.predict(X)``.
    """

    def __init__(self, model, predict_proba=False):
        self.model = model
        self.predict_proba = predict_proba

    def fit(self, X, y):
        """Fit the wrapped model on ``(X, y)`` and return ``self``."""
        self.model.fit(X, y)
        return self

    def transform(self, X):
        """Return the wrapped model's output for ``X`` as a 2-D array.

        1-D output (the usual ``predict`` result) becomes a single column.
        Output that is already 2-D, such as ``predict_proba``'s
        (n_samples, n_classes) matrix, is returned unchanged; expanding it
        again would yield a 3-D array that cannot be stacked column-wise.
        """
        if self.predict_proba:
            pred = self.model.predict_proba(X)
        else:
            pred = self.model.predict(X)
        pred = np.asarray(pred)
        if pred.ndim == 1:
            pred = np.expand_dims(pred, 1)
        return pred

    def get_params(self, deep=True):
        """Return constructor parameters, following the scikit-learn convention.

        Returning an empty dict (as before) makes ``sklearn.base.clone`` call
        ``PredictionFeature()`` with no arguments and fail. With ``deep=True``
        the wrapped model's own parameters are included under ``model__<name>``.
        """
        params = {'model': self.model, 'predict_proba': self.predict_proba}
        if deep and hasattr(self.model, 'get_params'):
            for key, value in self.model.get_params(deep=True).items():
                params['model__' + key] = value
        return params

    def set_params(self, **parameters):
        """Set parameters on this transformer; ``model__<name>`` keys are
        forwarded to the wrapped model. Returns ``self``."""
        nested = {}
        prefix = 'model__'
        for parameter, value in parameters.items():
            if parameter.startswith(prefix):
                nested[parameter[len(prefix):]] = value
            else:
                setattr(self, parameter, value)
        # Applied last so that a `model=` replacement in the same call is the
        # object that receives the nested parameters.
        if nested:
            self.model.set_params(**nested)
        return self


In [125]:
# Stacked ensemble: each base learner's prediction becomes one input feature
# of a gradient-boosted meta-learner. (`copy` is imported with the other
# imports at the top of the notebook.)
#
# NOTE(review): Pipeline.fit trains the base learners and the meta-learner on
# the same rows, so the meta-learner only ever sees in-sample base predictions.
# That may be why the ensemble scores worse than its members; out-of-fold base
# predictions would be the usual remedy.

# Match the final estimator to the task, as the base models already are: a
# regressor has no predict_proba, so the classification path needs a classifier.
if predict_proba:
    meta_learner = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.01, max_depth=5, min_samples_leaf=10)
else:
    meta_learner = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01, loss='lad', max_depth=5, min_samples_leaf=10)

models['en'] = Pipeline([
        ('models', FeatureUnion([
                        # deepcopy so the ensemble never refits the standalone models
                        (name, PredictionFeature(copy.deepcopy(models[name]), predict_proba=predict_proba))
                        for name in ('gb', 'rf', 'lin')
                    ])),
        ('pred', meta_learner)
            ])

en, en_test, en_scores = fit_predict(
        model=models['en'],
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        predict_proba=predict_proba)

In [126]:
# Display the ensemble's hold-out scores: the {'rmse', 'mae'} dict returned by fit_predict.
en_scores

{'mae': 14.257369448787047, 'rmse': 20.877766389819243}