In [None]:
###############
### Imports ###
###############

from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import math
from math import pi
from zipfile import ZipFile
import seaborn as sns
import matplotlib.pyplot as plt

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from joblib import dump

In [None]:
#################
### Functions ###
#################

###############################
### File Manipulation Funcs ###
###############################

# File listing function
def zlist_files(data_path, zipname, cond):
    """Open every member of a zip archive whose name contains ``cond``.

    Parameters
    ----------
    data_path : str
        Directory that holds the archive.
    zipname : str
        File name of the archive inside ``data_path``.
    cond : str
        Substring a member name must contain to be selected.

    Returns
    -------
    list
        One open binary file object per matching member, in archive order.
    """
    archive_path = join(data_path, zipname)
    with ZipFile(archive_path) as archive:
        # Select first, then open, so the handles follow namelist() order
        wanted = []
        for member in archive.namelist():
            if cond in member:
                wanted.append(member)
        handles = []
        for member in wanted:
            handles.append(archive.open(member))
    return handles

# File listing function
def list_files(data_path, cond):
    """Return the paths of regular files in ``data_path`` whose name contains ``cond``.

    Parameters
    ----------
    data_path : str
        Directory to scan (not recursive).
    cond : str
        Substring the file name must contain.

    Returns
    -------
    list of str
        ``data_path`` joined with each matching file name, in ``listdir`` order.
    """
    matches = []
    for entry in listdir(data_path):
        full_path = join(data_path, entry)
        # Directories and other non-regular entries are skipped
        if isfile(full_path) and cond in entry:
            matches.append(full_path)
    return matches

# Create pandas data loader for multi-json loads
def pandas_loader(flist, idx_name):
    """Read several JSON sources and join them side by side on a shared key.

    Parameters
    ----------
    flist : iterable
        Paths or file-like objects accepted by ``pd.read_json``.
    idx_name : str
        Column present in every source; used as the join key.

    Returns
    -------
    pandas.DataFrame
        Column-wise inner join of all sources, with ``idx_name`` restored
        as an ordinary column.
    """
    # Read everything first, then index each frame by the join key
    frames = []
    for source in flist:
        frames.append(pd.read_json(source))
    keyed = [frame.set_index(idx_name) for frame in frames]

    combined = pd.concat(keyed, axis=1, join='inner')
    return combined.reset_index()

#################################
### Feature Engineering Funcs ###
#################################

# For time based columns
def transformation(column, period=None):
    """Encode a cyclical numeric column as sine and cosine components.

    Each value ``x`` is mapped to the angle ``2*pi*x / period``.

    Parameters
    ----------
    column : pandas.Series (or any object with ``.max()`` that is iterable)
        Values to encode.
    period : number, optional
        Length of one full cycle (e.g. 7 for a 0-6 day of week, 24 for an
        hour of day). When omitted the largest value found in ``column`` is
        used, which is the historical behaviour. With that default the
        smallest value 0 and the largest value map to the same angle, so
        pass ``period`` explicitly when those two must stay distinguishable.

    Returns
    -------
    tuple of (list, list)
        ``(sin_values, cos_values)``, one float per element of ``column``.

    Notes
    -----
    A divisor of zero (for instance an all-zero column with no ``period``)
    is not guarded against.
    """
    divisor = column.max() if period is None else period
    # Keep the (2*pi*x)/divisor evaluation order so default results are unchanged
    angles = [(2*pi*x)/divisor for x in list(column)]
    sin_values = [math.sin(angle) for angle in angles]
    cos_values = [math.cos(angle) for angle in angles]
    return sin_values, cos_values

###################
### Stats Funcs ###
###################

# Verify column skew for normalization requirement
def skewness_test(df, col_list):
    """Tabulate the skew of selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    col_list : list of str
        Columns of ``df`` to evaluate.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``features`` (column name) and ``skew``
        (the value reported by ``DataFrame.skew`` for that column).
    """
    subset = df[col_list]
    return pd.DataFrame({'features': subset.columns,
                         'skew': subset.skew().values})

# Verify Column Vif scores for multicollinearity 
def vif_test(df, col_list):
    """Tabulate the variance inflation factor of selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    col_list : list of str
        Columns of ``df`` to evaluate together.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``features`` (column name) and ``vif`` (the value
        returned by ``variance_inflation_factor`` for that column position).
    """
    subset = df[col_list]
    design = subset.values
    scores = []
    for position in range(subset.shape[1]):
        scores.append(variance_inflation_factor(design, position))
    return pd.DataFrame({'features': subset.columns, 'vif': scores})

########################################
### Model Reporting & Plotting Funcs ###
########################################

# Nicely Formatted Confusion Matrix
def formatted_cf(yt, yp, n):
    """Plot a confusion matrix as a heat map annotated with counts and shares.

    Parameters
    ----------
    yt : array-like
        Ground-truth labels.
    yp : array-like
        Predicted labels.
    n : int
        Kept for backward compatibility. The grid size is now taken from the
        confusion matrix itself, so this value is no longer used.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    ## Confusion Matrix source: https://medium.com/@dtuk81/confusion-matrix-visualization-fc31e3f30fea

    # Use the labels seen in either vector so the tick labels always line up
    # with the matrix rows/columns, even when a class is only ever predicted
    xy_label = sorted(set(yt) | set(yp))
    cf_matrix = confusion_matrix(yt, yp, labels=xy_label)

    flat_counts = cf_matrix.flatten()

    # CF values as string formatted numbers
    group_counts = ["{0:0.0f}".format(value) for value in flat_counts]

    # CF values as string formatted percentages of all samples
    group_percentages = ["{0:.2%}".format(value) for value in
                         flat_counts/np.sum(cf_matrix)]

    # One "count\npercentage" annotation per cell
    labels = [f"{v1}\n{v2}" for v1, v2 in
              zip(group_counts, group_percentages)]

    # Give the flat annotation list the same 2-D shape as the matrix
    labels = np.asarray(labels).reshape(cf_matrix.shape)

    # Set size of plot
    fig, ax = plt.subplots(figsize=(8,8))

    # Draw on the axes created above instead of relying on the current axes
    sns.heatmap(cf_matrix,
                annot=labels,
                fmt="",
                cmap='Blues',
                xticklabels=xy_label,
                yticklabels=xy_label,
                ax=ax)

    ax.set_ylabel("True label")
    ax.set_xlabel("Predicted label")
    ax.set_title("Confusion Matrix")

    plt.show()

# Classification Report
def model_reporting(ytest, ypred, label_list, n):
    """Print a classification report and draw the confusion matrix.

    Parameters
    ----------
    ytest : array-like
        Ground-truth labels.
    ypred : array-like of int
        Predictions, used as positions into ``label_list``.
    label_list : sequence
        Label to substitute for each predicted position.
    n : int
        Forwarded to ``formatted_cf``.
    """
    # Swap every predicted position for the matching entry of label_list
    mapped_pred = np.choose(ypred, label_list)
    mapped_pred = mapped_pred.astype(np.int64)

    report = classification_report(ytest, mapped_pred)
    print("\n\n", report, "\n\n")

    formatted_cf(ytest, mapped_pred, n)
    
###################################
### Pipeline & Gridsearch Funcs ###
###################################

# Pipeline model with Gridsearch
def pipe_grid_model(xtrain, ytrain,
                    numeric_features, categorical_features,
                    estimator='LogisticRegression',
                    cv=10, random_state=0):
    """Tune a preprocessing + classifier pipeline with a halving random search.

    Numeric features are standardised and passed through PCA, categorical
    features are one-hot encoded (first level dropped), and the selected
    classifier is tuned with ``HalvingRandomSearchCV``.

    Parameters
    ----------
    xtrain : pandas.DataFrame
        Training features; must contain the listed feature columns.
    ytrain : array-like
        Training target.
    numeric_features : list of str
        Columns sent through ``StandardScaler`` and ``PCA``.
    categorical_features : list of str
        Columns sent through ``OneHotEncoder``.
    estimator : {'LogisticRegression', 'XGBClassifier'}
        Which classifier and parameter space to use.
    cv : int, default 10
        Cross-validation setting handed to the search.
    random_state : int or None, default 0
        Seed for the candidate sampling of the search so repeated runs pick
        the same candidates. Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``best_estimator_`` of the fitted search.

    Raises
    ------
    ValueError
        If ``estimator`` is not one of the supported names.
    """
    # Fail early with a clear message instead of an UnboundLocalError later on
    supported = ('LogisticRegression', 'XGBClassifier')
    if estimator not in supported:
        raise ValueError(
            f"Unknown estimator {estimator!r}; expected one of {supported}")

    # Numerical features pipeline with StandardScaler followed by PCA
    numeric_transformer = Pipeline([('scaler', StandardScaler()),
                                    ('pca', PCA())])
    # Categorical features pipeline with OHE
    categorical_transformer = OneHotEncoder(drop='first')

    # Combine Num and Cat feature Pipelines into preprocessor
    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_features),
                     ('cat', categorical_transformer, categorical_features)])

    if estimator == 'LogisticRegression':
        # Set pipeline with Preprocessor, Regressor
        pipe = Pipeline([('preprocessor', preprocessor),
                         ('reg', LogisticRegression())])

        # Parameter grid
        param_grid = [{'preprocessor__num__pca__n_components': np.arange(50,500,50),
                       'reg__penalty': ['elasticnet'],
                       'reg__solver': ['saga'],
                       'reg__random_state': [0],
                       'reg__l1_ratio': [0.25,0.5,0.75],
                       'reg__fit_intercept': [True, False],
                       'reg__max_iter': [1000,2000],
                       'reg__C': [0.25,0.5,1,2,3,4],
                       'reg__tol': [0.000001,0.00001,0.0001,0.001]}]

    else:
        # Set pipeline with Preprocessor, XGBoost classifier
        pipe = Pipeline([('preprocessor', preprocessor),
                         ('xgb', XGBClassifier())])

        # Parameter grid
        # NOTE(review): 'feature_selector' and 'num_round' may not be used by
        # every booster / by the sklearn wrapper -- confirm they have an effect.
        param_grid = [{'preprocessor__num__pca__n_components': np.arange(50,500,50),
                       'xgb__objective': ['binary:logistic'],
                       'xgb__booster': ['gblinear', 'gbtree'],
                       'xgb__eval_metric': ['error'],
                       'xgb__feature_selector': ['shuffle', 'cyclic'],
                       'xgb__max_depth': [3, 4, 5, 7],
                       'xgb__learning_rate': [0.1, 0.01, 0.05],
                       'xgb__gamma': [0, 0.25, 1],
                       'xgb__reg_lambda': [0, 1, 10],
                       'xgb__scale_pos_weight': [1, 3, 5],
                       'xgb__subsample': [0.5, 0.7, 0.9],
                       'xgb__colsample_bytree': [0.5, 0.7, 0.9],
                       'xgb__n_estimators': np.arange(50,400,50),
                       'xgb__num_round': [500, 1000],
                       'xgb__random_state': [0]}]

    # Call search with parameters and pipeline
    grid = HalvingRandomSearchCV(pipe, param_grid, cv=cv,
                                 random_state=random_state)
    grid.fit(xtrain, ytrain)

    # Review best params
    print(grid.best_params_)

    # Pipeline refit by the search with the best parameter set
    model = grid.best_estimator_

    return model

In [None]:
#################
### LOAD DATA ###
#################

# Folder and archive that hold the NFL json exports
data_path = '../data'
nfl_zip = 'nfl_data_2002_2021.zip'

# Statistics: every archive member with 'stats' in its name, joined on 'id'
stats_files = zlist_files(data_path, nfl_zip, 'stats')
stats_df = pandas_loader(stats_files, 'id')

# Record: every archive member with 'record' in its name, joined on 'id'
record_files = zlist_files(data_path, nfl_zip, 'record')
record_df = pandas_loader(record_files, 'id')

# Matches: first archive member with 'matches' in its name
matches_files = zlist_files(data_path, nfl_zip, 'matches')
matches_df = pd.read_json(matches_files[0])

# Teams: first archive member with 'teams' in its name
teams_files = zlist_files(data_path, nfl_zip, 'teams')
teams_df = pd.read_json(teams_files[0])

In [None]:
#########################
### JOIN DATA & CLEAN ###
#########################

# Remove Duplicate columns from Stats and Records df
stats_df = stats_df.loc[:,~stats_df.columns.duplicated()]
record_df = record_df.loc[:,~record_df.columns.duplicated()]

# Merge Stats and Record dfs
merged_df = stats_df.merge(record_df, how='left', on=['id'])
#merged_df.head(10)

# Merge Merged df with Teams df
remerge_df = merged_df.merge(teams_df, how='left', left_on='team_id_x', 
                             right_on='id')
#remerge_df.head(35)

# Merge Remerged df to Matches df
nfl_df = matches_df.merge(remerge_df, left_on=['season', 'home_team'], 
                          right_on=['season_x','team_id_x'])\
                   .merge(remerge_df, left_on=['season', 'away_team'], 
                          right_on=['season_x','team_id_x'],
                          suffixes=('_home', '_away'))

# Convert date_time column to datetime object
nfl_df['date_time'] = pd.to_datetime(nfl_df['date_time'], utc=True)
# Sort NFL df by date_time
nfl_df = nfl_df.sort_values(['date_time'])

# Drop redundant columns like id_x/id_y, etc
drop_cols = [col for col in nfl_df.columns if '_x_' in col or '_y_' in col]
nfl_df = nfl_df.drop(drop_cols, axis=1)

# Drop Rank and PG columns
drop_rankpg_cols = [col for col in nfl_df.columns\
                    if 'rank' in col or 'pg' in col]
nfl_df = nfl_df.drop(drop_rankpg_cols, axis=1)

# Drop stats columns with 999 values in it
drop_999_cols = [col for col in nfl_df.columns[nfl_df.isin([999]).any()]\
                   if 'stat' in col]
nfl_df = nfl_df.drop(drop_999_cols, axis=1)

# Drop stats and record columns with only 0 values
drop_0_cols = [col for col, is_zero in ((nfl_df == 0).sum() == nfl_df.shape[0])\
               .items() if is_zero and ('stat' in col or 'record' in col)]
nfl_df = nfl_df.drop(drop_0_cols, axis=1)

# Replace 999 values in matches info
replace_999_cols = ['home_team_win', 'home_team_score', 
                    'away_team_win', 'away_team_score']
nfl_df[replace_999_cols] = nfl_df[replace_999_cols].replace(999,0)

# Drop Abbreviation team name col
drop_abb_col = [col for col in nfl_df.columns if 'abbreviation' in col]
nfl_df = nfl_df.drop(drop_abb_col, axis=1)

# Replace remaining NaNs with 0
nfl_df = nfl_df.fillna(0)

nfl_df.info()

In [None]:
#########################
### DATA VERIFICATION ###
#########################

# Float and Int cols with Stats and Record
verification_cols = [col for col in list(nfl_df.select_dtypes\
                                         (include=['float64', 'int64']).columns)\
                    if 'stat' in col or 'record' in col]

# Skewness of the data for int and float columns
skew_data = skewness_test(nfl_df, verification_cols)
print(skew_data[(skew_data['skew'] > 0.5) | (skew_data['skew'] < -0.5)]\
      .sort_values('skew', ascending=False))
skew_data['skew'].hist(bins='auto')
plt.show()
        
# View Columns with high multicollinearity
vif_data = vif_test(nfl_df, verification_cols)
print(vif_data[vif_data['vif'] > 1].sort_values('vif', ascending=False))
vif_data[vif_data['vif'] < 1000]['vif'].hist(bins='auto')
plt.show()

In [None]:
###########################
### FEATURE ENGINEERING ###
###########################

# Date_time to Month, Day of Week, Time of Day as ints
nfl_df['month'] = nfl_df['date_time'].dt.month.astype('int64')
nfl_df['day_week'] = nfl_df['date_time'].dt.dayofweek.astype('int64')
nfl_df['time_day'] = nfl_df['date_time'].dt.hour.astype('int64')

# Calculate Cosine and Sine for new dates
for col in ['month', 'day_week', 'time_day']:
    time_sine, time_cos = transformation(nfl_df[col])
    nfl_df[col[0:3]+'_sine'] = time_sine
    nfl_df[col[0:3]+'_cos'] = time_cos

In [None]:
###################
### X & y SPLIT ###
###################

# X cols list
keep_cols = ['stat', 'record', 'sine', 'cos',
             'division', 'conference', 'team_name', 'location']

remove_cols = ['team_win', 'team_Score', 'id', 'date_time',
               'month', 'day_week', 'time_day']

X_cols =  [col for col in list(nfl_df.columns)\
                    if any(kc in col for kc in keep_cols)\
           and any(rc not in col for rc in remove_cols)]

## Split X and y into Train and Test
# X,y Train
X_train = nfl_df[~nfl_df['season'].isin([2021])][X_cols]
y_train = nfl_df[~nfl_df['season'].isin([2021])]['home_team_win']
# X,y Test
X_test = nfl_df[nfl_df['season'].isin([2021])][X_cols]
y_test = nfl_df[nfl_df['season'].isin([2021])]['home_team_win']

In [None]:
######################################
### MODELLING: LOGISTIC REGRESSION ###
######################################

# Numerical cols to StandardScaler
num_cols = [col for col in list(X_train.select_dtypes\
                                         (include=['float64', 'int64']).columns)\
                    if 'stat' in col or 'record' in col]

# Categorical cols to OHE
cat_cols = ['division_home', 'conference_home', 
            'location_home', 'team_name_home',
            'division_away', 'conference_away', 
            'location_away', 'team_name_away']


logreg_model = pipe_grid_model(X_train, y_train,
                               num_cols, cat_cols,
                               estimator='LogisticRegression')

In [None]:
###################
### SAVE MODELS ###
###################

# Model path
model_path = '../model'

# file names
log_modelName = 'winprob_logreg_2002_2020.joblib'

# Save files to model folder
dump(logreg_model, join(model_path, log_modelName))

In [None]:
############################################
### 2021 PREDICTIONS, TABLES, & PLOTTING ###
############################################

## 2021 DATA

# Fit the tuned pipeline on the training seasons
logreg_model.fit(X_train, y_train)

# Predicted labels and class probabilities for the held-out season
pred_label = logreg_model.predict(X_test)
prob = logreg_model.predict_proba(X_test)

# Create confusion matrix and classification report
model_reporting(y_test, pred_label, list(np.unique(y_test)), 2)

# Weekly results table. .copy() makes it an independent frame so the column
# assignments below do not write into a slice of nfl_df.
nfl_df2021 = nfl_df.loc[nfl_df['season'] == 2021,
                        ['team_name_home', 'team_name_away',
                         'week', 'home_team_win']]\
                   .rename(columns={'team_name_home':'home_team',
                                    'team_name_away':'away_team'})\
                   .copy()

# Second column of predict_proba, used as the home-win probability
home_win_prob = prob[:,1]

# Name of the predicted winner (label 1 -> home team, otherwise away team)
nfl_df2021['predicted_winner'] = np.where(pred_label == 1,
                                          nfl_df2021['home_team'],
                                          nfl_df2021['away_team'])
# Name of the actual winner
nfl_df2021['actual_winner'] = np.where(nfl_df2021['home_team_win'] == 1,
                                       nfl_df2021['home_team'],
                                       nfl_df2021['away_team'])
# Probability assigned to whichever team was predicted to win
nfl_df2021['win_probability'] = np.where(
    nfl_df2021['predicted_winner'] == nfl_df2021['home_team'],
    home_win_prob, 1 - home_win_prob)
# 1 when the predicted winner matches the actual winner, else 0
nfl_df2021['correct_prediction'] = (nfl_df2021['predicted_winner'] == nfl_df2021['actual_winner']).astype(int)

# The raw outcome flag is no longer needed in the table
nfl_df2021 = nfl_df2021.drop(columns=['home_team_win'])
# Show top predictions
print(nfl_df2021.sort_values(by='win_probability', ascending=False).reset_index(drop=True).head(10))

# Per-week accuracy. Summing over all rows (not only the correct ones) keeps
# weeks without a single correct pick at 0 instead of turning them into NaN.
weekly_correct = nfl_df2021.groupby('week')['correct_prediction']
logreg_correct = weekly_correct.sum()
# Number of games per week
num_games = weekly_correct.size()
# Share of correctly predicted games per week
logreg_results = logreg_correct / num_games

print(logreg_results)

In [None]:
#####################################
### MODELLING: XGB CLASSIFICATION ###
#####################################

# Numerical cols to StandardScaler
num_cols = [col for col in list(X_train.select_dtypes\
                                         (include=['float64', 'int64']).columns)\
                    if 'stat' in col or 'record' in col]

# Categorical cols to OHE
cat_cols = ['division_home', 'conference_home', 
            'location_home', 'team_name_home',
            'division_away', 'conference_away', 
            'location_away', 'team_name_away']


xgb_model = pipe_grid_model(X_train, y_train,
                            num_cols, cat_cols,
                            estimator='XGBClassifier')

In [None]:
###################
### SAVE MODELS ###
###################

# Model path
model_path = '../model'

# file names
xgb_modelName = 'winprob_xgb_2002_2020.joblib'

# Save files to model folder
dump(xgb_model, join(model_path, xgb_modelName))

In [None]:
############################################
### 2021 PREDICTIONS, TABLES, & PLOTTING ###
############################################

## 2021 DATA

# Fit the tuned pipeline on the training seasons
xgb_model.fit(X_train, y_train)

# Predicted labels and class probabilities for the held-out season
pred_label = xgb_model.predict(X_test)
prob = xgb_model.predict_proba(X_test)

# Create confusion matrix and classification report
model_reporting(y_test, pred_label, list(np.unique(y_test)), 2)

# Weekly results table. .copy() makes it an independent frame so the column
# assignments below do not write into a slice of nfl_df.
nfl_df2021 = nfl_df.loc[nfl_df['season'] == 2021,
                        ['team_name_home', 'team_name_away',
                         'week', 'home_team_win']]\
                   .rename(columns={'team_name_home':'home_team',
                                    'team_name_away':'away_team'})\
                   .copy()

# Second column of predict_proba, used as the home-win probability
home_win_prob = prob[:,1]

# Name of the predicted winner (label 1 -> home team, otherwise away team)
nfl_df2021['predicted_winner'] = np.where(pred_label == 1,
                                          nfl_df2021['home_team'],
                                          nfl_df2021['away_team'])
# Name of the actual winner
nfl_df2021['actual_winner'] = np.where(nfl_df2021['home_team_win'] == 1,
                                       nfl_df2021['home_team'],
                                       nfl_df2021['away_team'])
# Probability assigned to whichever team was predicted to win
nfl_df2021['win_probability'] = np.where(
    nfl_df2021['predicted_winner'] == nfl_df2021['home_team'],
    home_win_prob, 1 - home_win_prob)
# 1 when the predicted winner matches the actual winner, else 0
nfl_df2021['correct_prediction'] = (nfl_df2021['predicted_winner'] == nfl_df2021['actual_winner']).astype(int)

# The raw outcome flag is no longer needed in the table
nfl_df2021 = nfl_df2021.drop(columns=['home_team_win'])
# Show top predictions
print(nfl_df2021.sort_values(by='win_probability', ascending=False).reset_index(drop=True).head(10))

# Per-week accuracy. Summing over all rows (not only the correct ones) keeps
# weeks without a single correct pick at 0 instead of turning them into NaN.
weekly_correct = nfl_df2021.groupby('week')['correct_prediction']
xgb_correct = weekly_correct.sum()
# Number of games per week
num_games = weekly_correct.size()
# Share of correctly predicted games per week
xgb_results = xgb_correct / num_games

print(xgb_results)

In [None]:
######################################
### LOGISTIC REGRESSION VS XGBOOST ###
######################################

# Compare the first 13 entries of the two weekly accuracy series.
# Values are paired by position, as before.
xgb_acc = xgb_results.values[0:13]
logreg_acc = logreg_results.values[0:13]

results_comp = pd.DataFrame({'xgb': xgb_acc, 'logreg': logreg_acc},
                            index=xgb_results.index[0:13])
results_comp['dif'] = xgb_acc - logreg_acc

# Winner per week: 'xgb', 'logreg', or 'tie' when both scores are equal
results_comp['best_week'] = np.where(results_comp['xgb'] > results_comp['logreg'],
                                     'xgb', 'logreg')
results_comp['best_week'] = np.where(results_comp['xgb'] == results_comp['logreg'],
                                     'tie', results_comp['best_week'])
# Plain object column so the summary row below can hold a count
results_comp['best_week'] = results_comp['best_week'].astype(object)

# Score of the better model for each week
results_comp['best_week_val'] = np.where(results_comp['xgb'] > results_comp['logreg'],
                                         results_comp['xgb'], results_comp['logreg'])

# Summary row: column means of the numeric columns only (the text column
# 'best_week' cannot be averaged)
results_comp.loc['mean'] = results_comp.mean(numeric_only=True)
# For 'best_week' the summary row holds the highest count among the labels.
# A single .loc call writes into results_comp itself; the former chained
# form could assign to a temporary copy.
results_comp.loc['mean', 'best_week'] = max(results_comp['best_week'].value_counts())
results_comp

In [None]:
###########################
### WEEK 14 PREDICTIONS ###
###########################

# Week 14: matchup, predicted winner and its probability
table_cols = ['home_team','away_team', 'predicted_winner','win_probability']
week14_df = nfl_df2021.loc[nfl_df2021['week'] == 14, table_cols]

week14_df

In [None]:
###########################
### WEEK 15 PREDICTIONS ###
###########################

# Week 15: matchup, predicted winner and its probability
table_cols = ['home_team','away_team', 'predicted_winner','win_probability']
week15_df = nfl_df2021.loc[nfl_df2021['week'] == 15, table_cols]

week15_df

In [None]:
###########################
### WEEK 16 PREDICTIONS ###
###########################

# Week 16: matchup, predicted winner and its probability
table_cols = ['home_team','away_team', 'predicted_winner','win_probability']
week16_df = nfl_df2021.loc[nfl_df2021['week'] == 16, table_cols]

week16_df

In [None]:
###########################
### WEEK 17 PREDICTIONS ###
###########################

# Week 17: matchup, predicted winner and its probability
table_cols = ['home_team','away_team', 'predicted_winner','win_probability']
week17_df = nfl_df2021.loc[nfl_df2021['week'] == 17, table_cols]

week17_df

In [None]:
###########################
### WEEK 18 PREDICTIONS ###
###########################

# Week 18: matchup, predicted winner and its probability
table_cols = ['home_team','away_team', 'predicted_winner','win_probability']
week18_df = nfl_df2021.loc[nfl_df2021['week'] == 18, table_cols]

week18_df