In [1]:
from evaluation import regression_evaluator
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [2]:
def partition(data, play_type="pass"):
    return data[data['play_type'] == play_type].drop('play_type', axis=1)

In [3]:
def objective(params):
    # Ensure parameters that should be integers are converted
    params['max_depth'] = int(params['max_depth'])
    params['n_estimators'] = int(params['n_estimators'])
    
    
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        learning_rate=params['eta'],
        max_depth=params['max_depth'],
        gamma=params['gamma'],
        reg_alpha=params['reg_alpha'],
        reg_lambda=params['reg_lambda'],
        colsample_bytree=params['colsample_bytree'],
        min_child_weight=params['min_child_weight'],
        n_estimators=params['n_estimators'],
        seed=4455542111
    )
    
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    rmse = np.abs(scores.mean())
    
    return {'loss': rmse, 'status': STATUS_OK}


def objective2(params):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    params['max_depth'] = int(params['max_depth'])
    cv_result = xgb.cv(params, dtrain, num_boost_round=100, nfold=5, metrics='rmse', seed=0)
    return cv_result['test-rmse-mean'].min()

In [1]:
data = pd.read_csv("fourth_down_dataset.csv")
print(data['play_type'].value_counts())


passing = partition(data,'pass')
run = partition(data,'run')
fg = partition(data,'field_goal')
punt = partition(data,'punt')

partitioned_dataset = [("run", run) , 
                       ("pass", passing), 
                       ("fg", fg), 
                       ("punt", punt)]

NameError: name 'pd' is not defined

In [5]:
param_space = {
        'objective': 'reg:squarederror',
        'eta': hp.uniform('eta',0.001, 0.1),
        'max_depth': hp.quniform("max_depth", 3, 13, 1),
        'gamma': hp.uniform ('gamma', 0,9),
        'reg_alpha' : hp.uniform('reg_alpha', 0,2),
        'reg_lambda' : hp.uniform('reg_lambda', 0,2),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 25)
    }

In [6]:
for name, play_data in partitioned_dataset: 
    
    print(f"Producing {name} model..,")
    y = play_data['wpa_avg'].to_numpy()
    X = play_data.drop('wpa_avg', axis=1)
    cols = X.columns
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    
    xg = xgb.XGBRegressor(objective='reg:squarederror')
    sel_xg = SelectFromModel(xg, threshold='median')
    sel_xg.fit(X_train, y_train)
    
    cols = X_train.columns[(sel_xg.get_support())]
    re = regression_evaluator(col_names=cols)
    X_train_sel = X_train[cols]
    X_test_sel = X_test[cols]
    
    
    
    trials = Trials()
    final_params = fmin(fn=objective2,
                        space=param_space,
                        algo=tpe.suggest,
                        max_evals=500,
                        trials=trials)
    
    print(final_params)
    
    final_params['max_depth'] = int(final_params['max_depth'])
#     final_params['n_estimators'] = int(final_params['n_estimators'])

    best_model = xgb.XGBRegressor(**final_params)
    best_model.fit(X_train_sel, y_train)
    re.evaluate(best_model, X_train_sel, X_test_sel, y_train, y_test)
    
    
    best_model.save_model(f"{name}_model_v2.model")
    
    
    
    
    

Producing run model..,
  6%|▍      | 30/500 [00:08<02:11,  3.57trial/s, best loss: 0.06776567026036638]


KeyboardInterrupt: 

In [None]:
'''
run: {
'colsample_bytree': 0.6111755709148441, 
'eta': 0.08909119216693924, 
'gamma': 0.0001562168730994257, 
'max_depth': 4.0, 
'min_child_weight': 7.0, 
'reg_alpha': 0.8008549945797766, 
'reg_lambda': 0.611875698595822}


pass:
{'colsample_bytree': 0.7527085633427926, 
'eta': 0.06125424545559793, 
'gamma': 0.0008624501548679542, 
'max_depth': 6.0, 
'min_child_weight': 1.0, 
'reg_alpha': 0.16054258456531909, 
'reg_lambda': 1.6510936257105764}

fg:
{'colsample_bytree': 0.7840046251333294, 
'eta': 0.05801846694430202, 
'gamma': 0.004287732919976334, 
'max_depth': 11.0, 
'min_child_weight': 10.0, 
'reg_alpha': 0.0013737772038666662, 
'reg_lambda': 1.475816724089246}


punt:

'''

In [None]:
fg

In [None]:
plt.scatter(fg['year'], fg['fg_prob'])

In [None]:
'''
TODO:
feature selection with each of the models
finish classifier
build system
wrap everything into a script
dockerize
'''