In [1]:
import ast
import json
import math
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# The relative order of the star imports and the joblib import is kept as-is:
# star imports can shadow names, so reordering them could change which object
# a name such as `load` or `dump` refers to.
from FeatureEngineering_2 import *

from joblib import dump, load
from AdvancedModel_helpers import *

In [2]:
# Export one tidy CSV per season into ./csv (one read_a_season call per year).
# exist_ok=True makes the directory creation idempotent and avoids the
# check-then-create race of `if not os.path.exists(...)`.
os.makedirs("csv", exist_ok=True)
for year in [2016, 2017, 2018, 2019]:  # 2020 has been excluded to be reserved as the test set
    season_df = read_a_season("../nhl_data/", year)
    season_df.to_csv(f'csv/tidy_{year}.csv', sep=',', index=False)

100%|██████████| 5456/5456 [00:12<00:00, 419.74it/s]
100%|██████████| 5456/5456 [01:45<00:00, 51.54it/s] 
100%|██████████| 5456/5456 [01:42<00:00, 53.23it/s] 
100%|██████████| 5456/5456 [01:26<00:00, 62.99it/s] 


### Consolidating and Sorting CSV Files into a Unified DataFrame

In [3]:
directory = 'csv'

# Seasons that make up the training/validation data. 2020 is deliberately
# absent because it is reserved as the test set.
train_val_seasons = [2016, 2017, 2018, 2019]

# Build the file list explicitly instead of listing every *.csv in the
# directory: a stray or held-out file (e.g. a 2020 export) dropped into the
# same folder must not silently end up in the training data.
csv_files = sorted(f'tidy_{season}.csv' for season in train_val_seasons)

# Fail early with an actionable message rather than part-way through loading.
missing_files = [
    name for name in csv_files
    if not os.path.isfile(os.path.join(directory, name))
]
if missing_files:
    raise FileNotFoundError(
        f"Missing season CSV file(s) in '{directory}': {missing_files}. "
        "Run the export cell first."
    )

# List to hold one DataFrame per season
dataframes_list = []

# Loop over the sorted list of csv files with tqdm for progress indication
for filename in tqdm(csv_files, desc="Loading files", unit="file"):
    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)
    dataframes_list.append(df)

# Concatenate all seasons into a single DataFrame
train_val_df = pd.concat(dataframes_list, ignore_index=True)

# Order events chronologically within each game, then rebuild a clean 0..n-1 index
train_val_df = (
    train_val_df
    .sort_values(by=['game_id', 'event_idx'])
    .reset_index(drop=True)
)

Loading files: 100%|██████████| 4/4 [00:01<00:00,  2.75file/s]


In [4]:
# Preview the first 10 rows of the combined, sorted training/validation frame.
train_val_df.head(10)

Unnamed: 0,game_id,event_idx,play_type,shot_type,shot_dist,game_time,goals_home,goals_away,attack_team_name,period,...,last_event_type,x_coord_last_event,y_coord_last_event,Time_from_the_last_event,Distance_from_the_last_event,Rebound,change_shot_angle,Speed,angle_net,is_goal
0,2016020978,4,Shot,Wrist Shot,61.294372,9,0,0,Colorado Avalanche,1,...,Faceoff,0.0,0.0,9,45.453273,False,,5.050364,-28.237446,0
1,2016020978,5,Shot,Wrist Shot,8.944272,10,0,0,Colorado Avalanche,1,...,Shot,-35.0,29.0,1,54.230987,True,-35.197503,54.230987,-63.434949,0
2,2016020978,16,Shot,Wrist Shot,39.0,243,0,0,Colorado Avalanche,1,...,Giveaway,54.0,35.0,43,108.853112,False,,2.531468,-22.619865,0
3,2016020978,17,Shot,Wrist Shot,39.962482,253,0,0,Carolina Hurricanes,1,...,Shot,-53.0,15.0,10,108.166538,True,25.671421,10.816654,31.70143,0
4,2016020978,20,Shot,Wrist Shot,18.867962,349,0,0,Colorado Avalanche,1,...,Giveaway,-66.0,-32.0,56,20.615528,False,,0.368134,57.994617,0
5,2016020978,22,Goal,Wrist Shot,49.73932,368,0,1,Carolina Hurricanes,1,...,Blocked Shot,42.0,24.0,3,49.162994,False,,16.387665,-30.17352,1
6,2016020978,28,Shot,Wrist Shot,17.204651,484,0,1,Colorado Avalanche,1,...,Missed Shot,-40.0,-8.0,11,39.357337,False,,3.57794,-35.537678,0
7,2016020978,31,Shot,Wrist Shot,53.0,551,0,1,Carolina Hurricanes,1,...,Faceoff,69.0,-22.0,31,25.70992,False,,0.829352,-31.890792,0
8,2016020978,35,Shot,Wrist Shot,60.033324,562,0,1,Carolina Hurricanes,1,...,Missed Shot,40.0,29.0,3,59.076222,False,,19.692074,-29.981639,0
9,2016020978,36,Shot,Wrap-around,6.082763,592,0,1,Colorado Avalanche,1,...,Shot,37.0,-30.0,30,129.247824,True,67.14518,4.308261,80.537678,0


In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.calibration import CalibrationDisplay

### Training and Validation sets split

### 1. Train with only **Distance** and **Angle**

#### 1.1 Training and Validating

In [None]:
# Split the combined frame into training and validation features/labels.
# NOTE(review): split_data is a project helper whose body is not visible here.
# 0.25 is presumably the validation fraction and True presumably restricts the
# features to distance and angle (per the section heading) — confirm in the helper.
train_X_1,train_Y,val_X_1,val_Y = split_data(train_val_df,0.25,True)
# Show the training features as the cell output.
train_X_1

In [None]:
# Fit an XGBoost classifier with default hyper-parameters on the training
# split and persist it so later cells can reload it from disk.
xgboost1 = XGBClassifier()
xgboost1.fit(train_X_1, train_Y)

xgboost1_path = './models/xgboost1.joblib'
# joblib.dump does not create missing parent directories, so make sure
# ./models exists; otherwise this cell fails on a fresh checkout.
os.makedirs(os.path.dirname(xgboost1_path), exist_ok=True)
dump(xgboost1, xgboost1_path)

# TODO: Discuss the results of this model

#### 1.2 Evaluation on the validation set

##### ROC AUC

In [None]:
# Reload the persisted model from xgboost1_path (set in the training cell)
# and compute predicted probabilities for the validation features.
xgboost1=load(xgboost1_path)
val_res = xgboost1.predict_proba(val_X_1)
# roc_auc_plot is a project helper (body not visible here); its three return
# values are unpacked as fpr, tpr and threshold.
fpr,tpr,threshold = roc_auc_plot(val_Y,val_res,'Xgboost','distance and angle')

##### Goal rate

In [None]:
# Build a helper table from the validation labels and predicted probabilities
# via the project helper helper_df. The plotting cells read its 'rank',
# '#goal/#goal+#shot' and 'cum_percent' columns.
probas_and_label = helper_df(val_Y,val_res)
# Show the table as the cell output.
probas_and_label

In [None]:
# Goal rate against the model-probability percentile.
# The first 1% of rows (by position) is left out of the curve.
start_length = int(0.01 * len(probas_and_label))
goal_rate_rows = probas_and_label.iloc[start_length:]

plt.plot(
    goal_rate_rows['rank'],
    goal_rate_rows['#goal/#goal+#shot'],
    label="distance and angle",
)

# Axis limits first (x is reversed: 105 on the left, -5 on the right), then labels.
plt.xlim(105, -5)
plt.ylim(-5, 105)
plt.grid(True)
plt.ylabel("Goals/(Shots+Goals) (%)")
plt.xlabel("Shot probability model percentile")
plt.legend()

In [None]:
# Cumulative proportion ('cum_percent') against the model-probability percentile.
percentile = probas_and_label['rank']
cumulative_share = probas_and_label['cum_percent']

plt.plot(percentile, cumulative_share, label="distance and angle")

# Same frame as the goal-rate plot: reversed x axis, both axes padded by 5.
plt.xlim(105, -5)
plt.ylim(-5, 105)
plt.grid(True)
plt.ylabel("Proportion (%)")
plt.xlabel("Shot probability model percentile")
plt.legend()

In [None]:
# Calibration curve of the fitted model on the validation split, using 15 bins.
disp = CalibrationDisplay.from_estimator(xgboost1,val_X_1,val_Y,n_bins=15)
plt.grid(True)

In [None]:
from FeatureEngineering_2 import *

In [6]:
# Path to a single raw game file (game 2016020978) used for spot checks.
json_path = '../nhl_data/2016020978.json'

In [None]:
# Parse the sample game with json_reader (a helper brought in by a star
# import, presumably from FeatureEngineering_2 — confirm).
df_1 = json_reader(json_path)

In [None]:
# Display the parsed sample-game frame.
df_1

In [None]:
# Number of rows in the parsed sample-game frame.
len(df_1)

In [None]:
# Run the new_features helper on the sample game and show its return value.
# NOTE(review): whether new_features mutates df_1 in place is not visible here — confirm.
new_features(json_path,df_1)

In [None]:
# Load the raw game JSON for manual inspection.
# The encoding is given explicitly: JSON is UTF-8, while open()'s default
# encoding is platform-dependent and can break on non-ASCII text.
with open(json_path, encoding='utf-8') as f:
    game_json = json.load(f)

# Despite the name, `games` holds the list of plays of this single game.
games = game_json['liveData']['plays']['allPlays']

In [None]:
# Debug aid: print the positional index of every play in the sample game.
for index,play in enumerate(games):
    print(index)

In [None]:
# Value of the 'event' field in the first play's 'result' entry.
games[0]['result']['event']

In [7]:
# Re-run json_reader on the sample game and display the resulting frame.
json_reader(json_path)

Unnamed: 0,game_id,event_idx,play_type,shot_type,shot_dist,game_time,goals_home,goals_away,attack_team_name,period,...,last_event_type,x_coord_last_event,y_coord_last_event,Time_from_the_last_event,Distance_from_the_last_event,Rebound,change_shot_angle,Speed,angle_net,is_goal
0,2016020978,4,Shot,Wrist Shot,61.294372,9,0,0,Colorado Avalanche,1,...,Faceoff,0.0,0.0,9,45.453273,False,,5.050364,-28.237446,0
1,2016020978,5,Shot,Wrist Shot,8.944272,10,0,0,Colorado Avalanche,1,...,Shot,-35.0,29.0,1,54.230987,True,-35.197503,54.230987,-63.434949,0
2,2016020978,16,Shot,Wrist Shot,39.0,243,0,0,Colorado Avalanche,1,...,Giveaway,54.0,35.0,43,108.853112,False,,2.531468,-22.619865,0
3,2016020978,17,Shot,Wrist Shot,39.962482,253,0,0,Carolina Hurricanes,1,...,Shot,-53.0,15.0,10,108.166538,True,25.671421,10.816654,31.70143,0
4,2016020978,20,Shot,Wrist Shot,18.867962,349,0,0,Colorado Avalanche,1,...,Giveaway,-66.0,-32.0,56,20.615528,False,,0.368134,57.994617,0
5,2016020978,22,Goal,Wrist Shot,49.73932,368,0,1,Carolina Hurricanes,1,...,Blocked Shot,42.0,24.0,3,49.162994,False,,16.387665,-30.17352,1
6,2016020978,28,Shot,Wrist Shot,17.204651,484,0,1,Colorado Avalanche,1,...,Missed Shot,-40.0,-8.0,11,39.357337,False,,3.57794,-35.537678,0
7,2016020978,31,Shot,Wrist Shot,53.0,551,0,1,Carolina Hurricanes,1,...,Faceoff,69.0,-22.0,31,25.70992,False,,0.829352,-31.890792,0
8,2016020978,35,Shot,Wrist Shot,60.033324,562,0,1,Carolina Hurricanes,1,...,Missed Shot,40.0,29.0,3,59.076222,False,,19.692074,-29.981639,0
9,2016020978,36,Shot,Wrap-around,6.082763,592,0,1,Colorado Avalanche,1,...,Shot,37.0,-30.0,30,129.247824,True,67.14518,4.308261,80.537678,0
