In [25]:
import sys
import os
sys.path.append(os.path.abspath(".."))  # Go up one level to `project` directory

In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, roc_curve
from sklearn.calibration import calibration_curve, CalibrationDisplay
from data.shots_data_retriever import ShotsDataRetriever
from tqdm import tqdm

In [27]:
shotsDataRetriever = ShotsDataRetriever()
df = shotsDataRetriever.get_df_for_milestone2_part4()

In [28]:
df

Unnamed: 0,game_id,play_num,period,is_goal,x_coord,y_coord,shot_type,is_empty_net,distance,angle_to_goal,...,last_event_x_coord,last_event_y_coord,time_since_last_event,distance_from_last_event,rebound,angle_change,speed,time_since_powerplay,away_skaters,home_skaters
0,2016020001,7,1,0,77,5,wrist,0,13.928388,21.037511,...,61.0,11.0,1.0,17.088007,False,0.000000,17.088007,0,5,5
1,2016020001,14,1,0,86,13,wrist,0,13.601471,72.897271,...,54.0,-5.0,5.0,36.715120,False,0.000000,7.343024,0,5,5
2,2016020001,21,1,0,23,-38,wrist,0,77.025970,-29.560354,...,72.0,0.0,18.0,62.008064,False,0.000000,3.444892,0,5,5
3,2016020001,23,1,0,33,-15,slap,0,58.940648,-14.743563,...,77.0,-2.0,19.0,45.880279,False,0.000000,2.414752,0,5,5
4,2016020001,36,1,0,34,28,wrist,0,62.609903,26.565051,...,47.0,34.0,16.0,14.317821,False,0.000000,0.894864,0,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68485,2019021082,283,3,0,77,-23,backhand,0,26.419690,-60.524111,...,72.0,-21.0,21.0,5.385165,False,0.000000,0.256436,0,5,5
68486,2019021082,285,3,0,76,38,slap,0,40.496913,69.775141,...,95.0,-17.0,16.0,58.189346,False,0.000000,3.636834,0,5,5
68487,2019021082,286,3,0,79,10,backhand,0,14.866069,42.273689,...,76.0,38.0,2.0,28.160256,True,27.501452,14.080128,0,5,5
68488,2019021082,293,3,0,81,-7,snap,0,11.401754,-37.874984,...,60.0,40.0,32.0,51.478151,False,0.000000,1.608692,0,6,5


### XGBClassifier v0:
- Trained on `angle_to_goal` and `distance`
- No hyperparameter tuning

In [32]:
X0 = df[['angle_to_goal', 'distance']]
y = df['is_goal']

X_train_0, X_val_0, y_train, y_val = train_test_split(X0, y, test_size=0.3, random_state=42)


In [30]:
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
y_pred_prob = model.predict_proba(X_val)[:, 1]

### XGBClassifier v1:
- Trained on all features
- Hyperparameters optimized

In [None]:
df['shot_type'] = df['shot_type'].astype('category')
X1 = df.drop(columns=['is_goal'])
y = df['is_goal']

X_train_1, X_val_1, y_train, y_val = train_test_split(X1, y, test_size=0.3, random_state=42)

dtrain = xgb.DMatrix(X1, label=y, enable_categorical=True)

In [35]:
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_1, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Hyperparameters:", best_params)

y_pred = best_model.predict(X_val_1)
y_pred_prob = best_model.predict_proba(X_val_1)[:, 1]

print("\nClassification Report:")
print(classification_report(y_val, y_pred))

Fitting 3 folds for each of 324 candidates, totalling 972 fits


ValueError: 
All the 972 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
972 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\sklearn.py", line 1512, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\sklearn.py", line 596, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
                    ^^^^^^^^^^^^^^^
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\sklearn.py", line 1003, in _create_dmatrix
    return QuantileDMatrix(
           ^^^^^^^^^^^^^^^^
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\core.py", line 1573, in __init__
    self._init(
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\core.py", line 1632, in _init
    it.reraise()
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\core.py", line 569, in reraise
    raise exc  # pylint: disable=raising-bad-type
    ^^^^^^^^^
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\core.py", line 550, in _handle_exception
    return fn()
           ^^^^
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\core.py", line 637, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
                                          ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\data.py", line 1388, in next
    input_data(**self.kwargs)
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\core.py", line 617, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
                                                   ^^^^^^^^^^^^^^^^^
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\data.py", line 1431, in _proxy_transform
    df, feature_names, feature_types = _transform_pandas_df(
                                       ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\data.py", line 603, in _transform_pandas_df
    pandas_check_dtypes(data, enable_categorical)
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\data.py", line 569, in pandas_check_dtypes
    _invalid_dataframe_dtype(data)
  File "c:\Users\Daniel\.conda\envs\ift6758-conda-env\Lib\site-packages\xgboost\data.py", line 356, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:shot_type: object
