In [127]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics 

In [98]:
data = pd.read_csv('Data/processed/data_for_model.csv')

In [99]:
for col in data.select_dtypes(include=['int64']).columns:
    data[col] = data[col].astype(int)
for col in data.select_dtypes(include=['float64']).columns:
    data[col] = data[col].astype(float)
data['player_height'] = data['player_height'].apply(lambda x: int(x.split('-')[0])*12 + int(x.split('-')[1]))
data['seconds_left'] = data['game_clock'].apply(lambda x: int(x.split(':')[0])*60 + int(x.split(':')[1]))
extra_seconds = {1: 60*15*3, 2: 60*15*2, 3: 60*15, 4: 0}
data['seconds_left'] = data['seconds_left'] + data['quarter'].map(extra_seconds)
data['player_position'] = data['player_position'].astype('category')
data['play_action'] = data['play_action'].astype('category')
data['dropback_type'] = data['dropback_type'].astype('category')
data['team_coverage_type'] = data['team_coverage_type'].astype('category')
data['route_of_targeted_receiver'] = data['route_of_targeted_receiver'].astype('category')
data['qb_hit'] = data['qb_hit'].astype('category')
data['qb_hit'] = data['down'].astype('category')
data['outcome'] = data['outcome'].astype('category')

In [100]:
X = data.drop(['game_id', 'play_id', 'player_name', 'disruption', 'pd', 'int', 'quarter', 'game_clock',
               'pass_result', 'outcome'], axis = 1)

enc = LabelEncoder()
y = data[['outcome', 'week']].copy()
y['outcome'] = enc.fit_transform(y['outcome'])

In [101]:
X_train = X[X['week'] <= 12].drop(['week'], axis = 1)
X_validate = X[(X['week'] <= 15) & (X['week'] > 12)].drop(['week'], axis = 1)
X_test = X[X['week'] > 15].drop(['week'], axis = 1)

y_train = y[y['week'] <= 12].drop(['week'], axis = 1)
y_validate = y[(y['week'] <= 15) & (X['week'] > 12)].drop(['week'], axis = 1)
y_test = y[y['week'] > 15].drop(['week'], axis = 1)

In [103]:
Xy_train = xgb.DMatrix(X_train, y_train, enable_categorical = True)
Xy_validate = xgb.DMatrix(X_validate, y_validate, enable_categorical = True)
Xy_test = xgb.DMatrix(X_test, y_test, enable_categorical = True)

In [108]:
params = {
    'objective': 'multi:softprob',
}

clf = xgb.XGBClassifier(**params, random_state = 10142025, enable_categorical = True)
clf.fit(X_train,  y_train, 
        eval_set=[(X_train, y_train), (X_validate, y_validate)]);
clf.save_model("test_model.json")

[0]	validation_0-mlogloss:0.93188	validation_1-mlogloss:0.98104
[1]	validation_0-mlogloss:0.80674	validation_1-mlogloss:0.90413
[2]	validation_0-mlogloss:0.71801	validation_1-mlogloss:0.86071
[3]	validation_0-mlogloss:0.65413	validation_1-mlogloss:0.83101
[4]	validation_0-mlogloss:0.60182	validation_1-mlogloss:0.81097
[5]	validation_0-mlogloss:0.55584	validation_1-mlogloss:0.79408
[6]	validation_0-mlogloss:0.51726	validation_1-mlogloss:0.78448
[7]	validation_0-mlogloss:0.48051	validation_1-mlogloss:0.77212
[8]	validation_0-mlogloss:0.44937	validation_1-mlogloss:0.76381
[9]	validation_0-mlogloss:0.41519	validation_1-mlogloss:0.75872
[10]	validation_0-mlogloss:0.39557	validation_1-mlogloss:0.75358
[11]	validation_0-mlogloss:0.37553	validation_1-mlogloss:0.75310
[12]	validation_0-mlogloss:0.35497	validation_1-mlogloss:0.75412
[13]	validation_0-mlogloss:0.34052	validation_1-mlogloss:0.75220
[14]	validation_0-mlogloss:0.32569	validation_1-mlogloss:0.75334
[15]	validation_0-mlogloss:0.31741	

In [121]:
clf.predict(X_validate)

array([0, 2, 2, 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 2, 2,
       0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
       0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1, 0, 2, 0, 0, 2, 0, 2, 0, 1, 0,
       0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0,

In [122]:
pd.Series(clf.predict(X_validate)).value_counts()

0    465
2    112
1      7
Name: count, dtype: int64

In [125]:
clf.predict_proba(X_validate)

array([[9.3539488e-01, 1.9516963e-05, 6.4585619e-02],
       [7.8214251e-02, 2.9810760e-04, 9.2148763e-01],
       [3.2384157e-02, 8.8431734e-05, 9.6752745e-01],
       ...,
       [8.5188287e-01, 1.2682319e-04, 1.4799035e-01],
       [5.8667314e-01, 2.1759011e-03, 4.1115099e-01],
       [9.5796770e-01, 8.2007879e-03, 3.3831503e-02]], dtype=float32)

In [129]:
metrics.accuracy_score(clf.predict(X_validate), y_validate)

0.6523972602739726

In [135]:
print(metrics.classification_report(clf.predict(X_validate), y_validate))

              precision    recall  f1-score   support

           0       0.86      0.69      0.76       465
           1       0.24      0.86      0.38         7
           2       0.29      0.48      0.36       112

    accuracy                           0.65       584
   macro avg       0.46      0.68      0.50       584
weighted avg       0.74      0.65      0.68       584



In [137]:
metrics.roc_auc_score(y_validate, clf.predict_proba(X_validate), average='weighted', multi_class='ovr')

0.6165742140649713

# Control for imbalance

In [None]:
params = {
    'objective': 'multi:softprob',
}

clf_imbalance = xgb.XGBClassifier(**params, random_state = 10142025, enable_categorical = True)
clf_imbalance.fit(X_train,  y_train, 
        eval_set=[(X_train, y_train), (X_validate, y_validate)]);
clf_imbalance.save_model("test_model_imbalance.json")