# Setup notebook and import data

In [3]:
import pandas as pd
import os
import zipfile


In [4]:
# helper function that will check if the data has been unzipped into the data folder and if not, unpack it

def unzip_if_empty(directory, zip_file):
    """Unzip contents if the specified directory is empty."""
    # Check if the directory is empty
    if not os.listdir(directory):  # List is empty if the directory is empty
        print(f"The directory {directory} is empty. Unzipping the file...")
        # Open the ZIP file
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            # Extract all the contents into the directory
            zip_ref.extractall(directory)
        print("Unzip completed.")
    else:
        print(f"The directory {directory} is not empty.")

unzip_if_empty('./data', 'march-machine-learning-mania-2024.zip')

The directory ./data is not empty.


In [5]:
# create a list of the the csv file names
file_names = []
for file in os.listdir('./data'):
    file_names.append(file[:-4])

file_names

['2024_tourney_seeds',
 'Cities',
 'Conferences',
 'MConferenceTourneyGames',
 'MGameCities',
 'MMasseyOrdinals_thruSeason2024_day128',
 'MNCAATourneyCompactResults',
 'MNCAATourneyDetailedResults',
 'MNCAATourneySeedRoundSlots',
 'MNCAATourneySeeds',
 'MNCAATourneySlots',
 'MRegularSeasonCompactResults',
 'MRegularSeasonDetailedResults',
 'MSeasons',
 'MSecondaryTourneyCompactResults',
 'MSecondaryTourneyTeams',
 'MTeamCoaches',
 'MTeamConferences',
 'MTeams',
 'MTeamSpellings',
 'sample_submission',
 'WGameCities',
 'WNCAATourneyCompactResults',
 'WNCAATourneyDetailedResults',
 'WNCAATourneySeeds',
 'WNCAATourneySlots',
 'WRegularSeasonCompactResults',
 'WRegularSeasonDetailedResults',
 'WSeasons',
 'WTeamConferences',
 'WTeams',
 'WTeamSpellings']

# Data Cleaning

## Narrow that data
- We will focus on these files on the dataset:
    - MNCAATourneyDetailedResults: details stats from past mens tournament games. we will use the previous two seasons as an indicator of how a team will perform in the tourney. Using the past two seasons, becuase the make up of the team (individual players) will not change much in two years.
    - WNCAATourneyDetailedResults: the same information for womens teams.
    - MRegularSeasonDetailedResults: detailed stats for each mens team in the regular season. will limit this data to the 2024 season.
    - WRegularSeasonDetailedResults: same detailed regular season stats for womens teams.
    - 2024_tourney_seeds: starting tournament rankings for both mens and womens teams. a lower seed indicates a stronger team.

- We will limit the teams to 64 mens teams and 64 womens teams because there are only 64 teams invited to play in the end of season tournament. 
- Using these files we will assemble one dataset and limit it to two years of historical performance
- will use the MTeams and WTeams files at the end to map team names to team ID's

In [6]:
# import data
m_reg_season = pd.read_csv('./data/MRegularSeasonDetailedResults.csv')
w_reg_season = pd.read_csv('./data/WRegularSeasonDetailedResults.csv')
m_hist_tourney = pd.read_csv('./data/MNCAATourneyDetailedResults.csv')
w_hist_tourney = pd.read_csv('./data/WNCAATourneyDetailedResults.csv')
tourney_seeds = pd.read_csv('./data/2024_tourney_seeds.csv')
m_teams = pd.read_csv('./data/MTeams.csv')
w_teams = pd.read_csv('./data/WTeams.csv')


### Assemble dataframes into one df for training
- first we will concat the mens and womens data frames into one data frame
- then we will combine the regular season data and historical tournament data.
- the goal is to create a dataframe where a single row is a single game. the label for each row will be a true or false if the team won that particular game.
- adding the tournament seeds for each team will add a feature for training.
- we can also use the seeds to make a baseline estimate of winners. we can simply pick the higher seed for any particular matchup and that will give a goal for the model to beat.

In [7]:
m_reg_season.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [8]:
w_reg_season.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2010,11,3103,63,3237,49,H,0,23,54,...,13,6,10,11,27,11,23,7,6,19
1,2010,11,3104,73,3399,68,N,0,26,62,...,21,14,27,14,26,7,20,4,2,27
2,2010,11,3110,71,3224,59,A,0,29,62,...,14,19,23,17,23,8,15,6,0,15
3,2010,11,3111,63,3267,58,A,0,27,52,...,26,16,25,22,22,15,11,14,5,14
4,2010,11,3119,74,3447,70,H,1,30,74,...,17,11,21,21,32,12,14,4,2,14


In [9]:
# add a column to each df that indicates if it is mens data or womens. will make this a boolean so it can be 0 or 1
m_reg_season['mens_data'] = True
w_reg_season['mens_data'] = False

# limit regular season data to only 2024 and then concat the mens and womens data into one df
m_reg_season = m_reg_season[m_reg_season['Season'] == 2024]
w_reg_season = w_reg_season[w_reg_season['Season'] == 2024]
reg_season = pd.concat([m_reg_season, w_reg_season], axis=0)

# add a column that indicates if each row (game) is a tournament game. will all be false for this df
reg_season['tourney_game'] = False


In [10]:
reg_season.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,mens_data,tourney_game
107634,2024,0,1101,64,1329,59,A,0,26,57,...,20,6,26,13,12,9,2,16,True,False
107635,2024,0,1103,81,1355,75,A,0,26,57,...,13,5,18,14,12,8,2,17,True,False
107636,2024,0,1104,105,1287,73,H,0,32,57,...,19,11,15,7,14,6,3,25,True,False
107637,2024,0,1112,122,1288,59,H,0,42,76,...,15,6,17,10,25,3,6,25,True,False
107638,2024,0,1114,71,1402,66,H,0,22,59,...,22,17,31,14,22,2,4,23,True,False


### Do the same for historical tournament data. since we do not have 2024 tournament data, limit this data to the previous two years

In [11]:
# add a column to each df that indicates if it is mens data or womens. will make this a boolean so it can be 0 or 1
m_hist_tourney['mens_data'] = True
w_hist_tourney['mens_data'] = False

# limit tournament data to only 2022 and 2023 and then concat the mens and womens data into one df
m_hist_tourney = m_hist_tourney[m_hist_tourney['Season'] >= 2022]
w_hist_tourney = w_hist_tourney[w_hist_tourney['Season'] >= 2022]
hist_tourney = pd.concat([m_hist_tourney, w_hist_tourney], axis=0)

# add a column that indicates if each row (game) is a tournament game. will all be true for this df
hist_tourney['tourney_game'] = True

In [12]:
hist_tourney.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,mens_data,tourney_game
1181,2022,134,1231,66,1461,58,N,0,26,62,...,18,7,23,6,18,2,1,17,True,True
1182,2022,134,1411,76,1394,67,N,0,23,55,...,19,7,29,17,8,9,3,22,True,True
1183,2022,135,1323,89,1353,87,N,2,37,72,...,6,13,28,22,13,1,6,15,True,True
1184,2022,135,1460,93,1136,82,N,0,29,61,...,23,10,22,7,13,11,5,24,True,True
1185,2022,136,1116,75,1436,71,N,0,24,56,...,17,5,26,13,6,1,1,21,True,True


### Combine historical tournament data and 2024 regular season data

In [13]:
data = pd.concat([hist_tourney, reg_season], axis=0)

# add seeds column
data = pd.merge(data, tourney_seeds, left_on='WTeamID', right_on='TeamID', how='left')
data.drop(columns=['Tournament', 'TeamID'], inplace=True) # drop some redundant columns

### Assign "Seed" to teams without one
- if a team does not have a seed it indicates that they were not invited to play in the 2024 tournament.
- these teams may have played in the tournament in previous years, so rather than dropping these rows we will assign a "Seed" of 20. This will indicate that they were not a strong team in 2024.
- Teams that are seeded have a value with a letter followed by a number between 1 and 16. we are only interested in the number so we will trim the letter from seeded teams

In [14]:
data['Seed'] = data['Seed'].fillna('20')
data['Seed'] = data['Seed'].apply(lambda x: x[1:] if len(x) == 3 else x)

# convert to an int
data['Seed'] = data['Seed'].astype('int64')

In [15]:
data['WLoc'].unique()

array(['N', 'H', 'A'], dtype=object)

#### WLoc column
- The 'WLoc' column indicates the location of the game for the winning team (WTeamID).
    - N = neautral location
    - H = home game
    - A = away game
- will need to encode these to a numeric values

In [16]:
# use pandas "get_dummies" method to one hot encode 'WLoc' feature
data = pd.get_dummies(data, columns=['WLoc'])

### Add "IsWin" feature
- In order to add a feature that will be the target (i.e. whether or not the team won the game) we will need to reorganize the data
- assign each row a "gameID" to give each game a unique value
- Split the winner and loser features into seperate dataframes and the rename them to neutral names. for example, instead of "WTeamID" for winning team ID, just TeamID
- Add an "IsWin" column to each df with the appropriate value, and then concat them back together

In [17]:
# add a gameID col
data['gameID'] = range(1, len(data) + 1)

# split data into winners and losers
winner_cols = ['gameID', 'WTeamID', 'WScore', 'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst',
    'WTO', 'WStl', 'WBlk', 'WPF', 'mens_data', 'tourney_game', 'Seed', 'WLoc_A', 'WLoc_H', 'WLoc_N']

loser_cols = ['gameID', 'LTeamID', 'LScore', 'NumOT', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst',
    'LTO', 'LStl', 'LBlk', 'LPF', 'mens_data', 'tourney_game', 'Seed', 'WLoc_A', 'WLoc_H', 'WLoc_N']

winners = data[winner_cols]
loosers = data[loser_cols]

In [18]:
# rename columns to neutral names
winners = winners.rename(columns={'WTeamID': 'TeamID', 'WScore': 'Score', 'WFGM': 'FGM', 'WFGA': 'FGA', 'WFGM3': 'FGM3',
    'WFGA3': 'FGA3', 'WFTM': 'FTM', 'WFTA': 'FTA', 'WOR': 'OR', 'WDR': 'DR', 'WAst': 'Ast', 'WTO': 'TO', 'WStl': 'Stl', 'WBlk': 'Blk',
    'WPF': 'PF', 'WLoc_A': 'Loc_A', 'WLoc_H': 'Loc_H', 'WLoc_N': 'Loc_N'})

loosers = loosers.rename(columns={'LTeamID': 'TeamID', 'LScore': 'Score', 'LFGM': 'FGM', 'LFGA': 'FGA', 'LFGM3': 'FGM3', 'LFGA3': 'FGA3',
    'LFTM': 'FTM', 'LFTA': 'FTA', 'LOR': 'OR', 'LDR': 'DR', 'LAst': 'Ast', 'LTO': 'TO', 'LStl': 'Stl', 'LBlk': 'Blk', 'LPF': 'PF', 'WLoc_A': 'Loc_A',
    'WLoc_H': 'Loc_H', 'WLoc_N': 'Loc_N' })

In [19]:
# add "IsWin" column with appropriate value
winners['IsWin'] = True
loosers['IsWin'] = False

# for loosers flip the value of the location columns

# temporarily split the neutral games out
neutral = loosers[loosers['Loc_N'] == True]
loosers = loosers[loosers['Loc_N'] == False]

# flip values
loosers['Loc_A'] = ~loosers['Loc_A']
loosers['Loc_H'] = ~loosers['Loc_H']

# concat neutral games back in
loosers = pd.concat([loosers, neutral], axis=0)

### Add opponent stats columns
- concat winners and loosers back together width wise and rename the columns to indicate they are opponent game stats
- combine winners and loosers back together vertically

In [20]:
winner_opponent = loosers.add_prefix('opp_')
loser_opponent = winners.add_prefix('opp_')

winners = pd.concat([winners, loser_opponent], axis=1)
loosers = pd.concat([loosers, winner_opponent], axis=1)

data = pd.concat([winners, loosers], axis=0)

# drop redundant columns
data.drop(columns=['opp_gameID', 'opp_mens_data', 'opp_tourney_game', 'opp_Loc_A', 'opp_Loc_H', 'opp_Loc_N', 'opp_IsWin'], inplace=True)

In [28]:
data.to_csv('prepared_data.csv', index=False)

In [None]:
data.head()

#### Create aggregate data for each team
- in order to predict one teams performace against another, we will create some data aggregations for each team. i.e. average score, winning percentage, and averages or medians of other game stats.
- In the final product, the user will be asked to enter two teams and the model will use these aggregated stats for those teams as inputs to make a prediction of which team will win

In [21]:
# split data aggregations into home, away and neutral location games
neutral_games = data[data['Loc_N'] == True]
home_games = data[data['Loc_H'] == True]
away_games = data[data['Loc_A'] == True]

In [27]:
# perform aggregations for each team
neutral_games = neutral_games.groupby('TeamID').agg({
    'Score': 'mean',   # Average score
    'FGM': 'mean',     # Average field goals made
    'FGA': 'mean',     # Average field goals attempted
    'FGM3': 'mean',    # Average three-point field goals made
    'FGA3': 'mean',    # Average three-point field goals attempted
    'FTM': 'mean',     # Average free throws made
    'FTA': 'mean',     # Average free throws attempted
    'OR': 'mean',      # Average offensive rebounds
    'DR': 'mean',      # Average defensive rebounds
    'Ast': 'mean',     # Average assists
    'TO': 'mean',      # Average turnovers
    'Stl': 'mean',     # Average steals
    'Blk': 'mean',     # Average blocks
    'PF': 'mean'       # Average personal fouls
}).reset_index()

home_games = home_games.groupby('TeamID').agg({
    'Score': 'mean',
    'FGM': 'mean',
    'FGA': 'mean',
    'FGM3': 'mean',
    'FGA3': 'mean',
    'FTM': 'mean',
    'FTA': 'mean',
    'OR': 'mean',
    'DR': 'mean',
    'Ast': 'mean',
    'TO': 'mean',
    'Stl': 'mean',
    'Blk': 'mean',
    'PF': 'mean'
}).reset_index()

away_games = away_games.groupby('TeamID').agg({
    'Score': 'mean',
    'FGM': 'mean',
    'FGA': 'mean',
    'FGM3': 'mean',
    'FGA3': 'mean',
    'FTM': 'mean',
    'FTA': 'mean',
    'OR': 'mean',
    'DR': 'mean',
    'Ast': 'mean',
    'TO': 'mean',
    'Stl': 'mean',
    'Blk': 'mean',
    'PF': 'mean'
}).reset_index()

neutral_games.to_csv('neutral_game_aggregations.csv', index=False)
home_games.to_csv('home_game_aggregations.csv', index=False)
away_games.to_csv('away_game_aggregations.csv', index=False)

# Data Exploration

In [None]:
# there are two rows for each game. one for each team
data.sort_values(by='gameID').head()

In [None]:
data.info()

In [None]:
# drop opp_NumOT since it is redundanct
del data['opp_NumOT']

In [None]:
data.describe()

#### Column info
- gameID: unique id for each game. there will be two rows for each id. one for each team
- TeamID: id to identify each team
- Score: teams score for that game
- NumOT: if the game went into overtime, this number indicates the number of overtime periods played
- FGM: field goals made
- FGA: field goals attempted
- FGM3: 3-point field goals made
- FGA3: 3-point field goals attempted
- FTM: free throws made
- FTA: free throws attempted
- OR: offensive rebounds
- DR: defensive rebounds
- Ast: assists
- TO: turn overs
- Stl: steals
- Blk: blocks
- PF: total personal fouls commited by the team in that game
- mens_data: if this is a mens game
- tourney_game: if this is a tournament game
- Seed: the teams seed in the 2024 NCAA tourney. if the team was not invited to the 2024 tournament a seed of 20 was assigned to indicate a weaker team
- Loc_A: if the location the game was away
- Loc_H: if the location the game was home
- Loc_N: if the location the game was neutral
- IsWin: team won the game
- opp_TeamID: opponents teamID
- opp_Score: opponents score
- opp_FGM: opponent field goals made
- opp_FGA: opponent field goals attempted
- opp_FGM3: opponent 3-point field goals made
- opp_FGA3: opponent 3-point field goals attempted
- opp_FTM: opponent free throws made
- opp_FTA: opponent free throws attempted
- opp_OR: opponent offensive rebounds
- opp_DR: opponent defensive rebounds
- opp_Ast: opponent assists
- opp_TO: opponent turn overs
- opp_Stl: opponent steals
- opp_Blk: opponent blocks
- opp_PF: opponent total personal fouls commited in that game
- opp_Seed: opponent seed

#### Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# selected features
features = [
    # 'gameID',
    # 'TeamID',
    'Score',
    # 'NumOT',
    'FGM',
    'FGA',
    'FGM3',
    'FGA3',
    'FTM',
    'FTA',
    'OR',
    'DR',
    'Ast',
    'TO',
    'Stl',
    'Blk',
    'PF',
    # 'mens_data',
    # 'tourney_game',
    'Seed',
    'Loc_A',
    'Loc_H',
    'Loc_N',
    'IsWin',
    # 'opp_TeamID',
    'opp_Score',
    'opp_FGM',
    'opp_FGA',
    'opp_FGM3',
    'opp_FGA3',
    'opp_FTM',
    'opp_FTA',
    'opp_OR',
    'opp_DR',
    'opp_Ast',
    'opp_TO',
    'opp_Stl',
    'opp_Blk',
    'opp_PF',
    'opp_Seed'
    ]

In [None]:
# histogram
data[features].hist(bins=30, figsize=(15, 14))
plt.show()

In [None]:
# create correlation matrix
correlation_matrix = data.corr()
correlation_matrix['IsWin'].sort_values(ascending=False)

#### Stats correlated with winning games
- Highly correlated:
    - Score (duh)
    - Field goals made
    - Assists
    - Defensive rebounds
- Somewhat correlated:
    - Free throws made
    - 3-point field goals made
    - If the game was played at home
- Low correlation:
    - Steals
    - Blocks
    - Offensive rebounds (this is surprising since offensive rebounds grant a team an extra offensive possision and another opportunity to score)
- Negative correlations (detrimental to winning the game):
    - Turnovers
    - Personal Fouls
    - If the game was played away

- Opponent stat correlations mirror these. i.e. an opponent that has a lot of field goals made is detrimental to winning the game
- This gives a good idea of which features to focus on

# Model Selection

## Split and scale the data

In [None]:
# set a random state
rand_state = 54

In [None]:
# split target from data
X = data.copy()
X.drop(columns=[
    'IsWin',
    'gameID',
    'NumOT',
    'mens_data',
    'tourney_game',
    'Seed',
    'opp_Seed'
], inplace=True)
y = data['IsWin']

In [None]:
import numpy as np
import xgboost

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn import ensemble, naive_bayes
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, Ridge, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rand_state)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(X_train)

# Transform both training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Test some models and their accuracy on the data

In [None]:
def evaluate_classifier(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = metrics.accuracy_score(y_test, y_pred)
    
    # Print the metrics
    print(f"{model.__class__.__name__}:")
    print(f"Accuracy = {accuracy:.4f}")
    print("-"*100)

In [None]:
models = [
    LogisticRegression(max_iter=1000),
    KNeighborsClassifier(),
    RidgeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    SVC(),
    xgboost.XGBClassifier(),
    SGDClassifier(),
    DecisionTreeClassifier(),
    AdaBoostClassifier(algorithm='SAMME'),
]

for model in models:
    evaluate_classifier(model, X_train_scaled, X_test_scaled, y_train, y_test)

### Initial results
- Best performing models are LogisticRegression, SVC and XGBClassifier
- will try to reduce dimensions and see how it affects the results

In [None]:
from sklearn.decomposition import PCA

# reduce to the number of dimensions that will contain 95% of the variance of the data
pca = PCA(n_components=0.95)

X_pca = pca.fit_transform(X)
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=rand_state)

In [None]:
for model in models:
    evaluate_classifier(model, X_train_pca, X_test_pca, y_train, y_test)

### Ineffective dimensionality reduction
- using pca reduction significantly reduced accuracy

### Ensemble models

In [None]:
# create a list of ensemble classifier models to try
classifier_models = [
    (ensemble.BaggingClassifier(estimator=LogisticRegression(random_state=rand_state, max_iter=1000), n_estimators=20), 'BaggingClassifier'),
    (ensemble.RandomForestClassifier(random_state=rand_state), 'RandomForestClassifier'),
    (ensemble.ExtraTreesClassifier(random_state=rand_state), 'ExtraTreesClassifier'),
    (ensemble.AdaBoostClassifier(algorithm='SAMME', random_state=rand_state), 'AdaBoostClassifier'),
    (ensemble.GradientBoostingClassifier(random_state=rand_state), 'GradientBoostingClassifier'),
    (ensemble.HistGradientBoostingClassifier(random_state=rand_state), 'HistGradientBoostingClassifier'),
    (xgboost.XGBClassifier(random_state=rand_state), 'XGBoostClassifer')
]

In [None]:
for model, name in classifier_models:
    evaluate_classifier(model, X_train_scaled, X_test_scaled, y_train, y_test)

### Voting Classifer

In [None]:
clf1 = ensemble.BaggingClassifier(estimator=RidgeClassifier(random_state=rand_state))
clf2 = naive_bayes.GaussianNB()
clf3 = ensemble.HistGradientBoostingClassifier(random_state=rand_state)
clf4 = ensemble.GradientBoostingClassifier(random_state=rand_state)
clf5 = xgboost.XGBClassifier(random_state=rand_state)

estimators = [
    ('bagging', clf1),
    ('naive_bayes', clf2),
    ('HistGradient', clf3),
    ('GrandientBoost', clf4),
    ('XGBoost', clf5)
]

voting_classifier = ensemble.VotingClassifier(estimators=estimators)
voting_classifier.fit(X_train_scaled, y_train)

yclassifier_pred = voting_classifier.predict(X_test_scaled)

accuracy = metrics.accuracy_score(y_test, yclassifier_pred)
print("Accuracy: ", accuracy)

#### Regression models

In [None]:
# create a list of regression models to be used
model_list_reg = [
    (ensemble.BaggingRegressor(estimator=Ridge(random_state=rand_state), n_estimators=20), 'BaggingRegressor'),
    (ensemble.RandomForestRegressor(random_state=rand_state), 'RandomForestRegressor'),
    (ensemble.ExtraTreesRegressor(random_state=rand_state), 'ExtraTreesRegressor'),
    (ensemble.AdaBoostRegressor(random_state=rand_state), 'AdaBoostRegressor'),
    (ensemble.GradientBoostingRegressor(random_state=rand_state), 'GradientBoostingRegressor'),
    (ensemble.HistGradientBoostingRegressor(random_state=rand_state), 'HistGradientBoostingRegressor'),
    (xgboost.XGBRegressor(random_state=rand_state), 'XGBRegressor')
]

In [None]:
def fit_model_reg(reg):
    reg.fit(X_train_scaled, y_train)
    yreg_pred = reg.predict(X_test_scaled)

    mse = metrics.mean_squared_error(y_test, yreg_pred)
    print("MSE: ", mse)

In [None]:
for reg, name in model_list_reg:
    print("-"*1000)
    print(name)
    fit_model_reg(reg)

### Stack

In [None]:
estimators = [
    ('ridge', Ridge(random_state=rand_state)),
    ('linear', LinearRegression()),
    ('knr', KNeighborsRegressor()),
    ('xgboost', xgboost.XGBRegressor(random_state=rand_state))
]

final_estimator = ensemble.HistGradientBoostingRegressor(random_state=rand_state)

stack_regressor = ensemble.StackingRegressor(estimators=estimators, final_estimator=final_estimator)

stack_regressor.fit(X_train_scaled, y_train)
yreg_pred = stack_regressor.predict(X_test_scaled)

mse = metrics.mean_squared_error(y_test, yreg_pred)
print("MSE: ", mse)

### Stack of stacks
- use best performing ensemble regression models as the estimators

In [None]:
estimators_2 = [
    ('gradientboot', ensemble.GradientBoostingRegressor(random_state=rand_state)),
    ('histgradient', ensemble.HistGradientBoostingRegressor(random_state=rand_state)),
    ('xgboost', xgboost.XGBRegressor(random_state=rand_state)),
    ('bagging', ensemble.BaggingRegressor(random_state=rand_state))
]

final_estimator = ensemble.StackingRegressor(
    estimators=estimators_2,
    final_estimator=ensemble.HistGradientBoostingRegressor(random_state=rand_state)
)

# run the previous estimators also
estimators = [
    ('ridge', Ridge(random_state=rand_state)),
    ('linear', LinearRegression()),
    ('knr', KNeighborsRegressor()),
    ('xgboost', xgboost.XGBRegressor(random_state=rand_state))
]

stack_of_stacks_reg = ensemble.StackingRegressor(estimators=estimators, final_estimator=final_estimator)

stack_of_stacks_reg.fit(X_train_scaled, y_train)
yreg_pred = stack_of_stacks_reg.predict(X_test_scaled)

mse = metrics.mean_squared_error(y_test, yreg_pred)
print("MSE: ", mse)

### Voting Regressor

In [None]:
reg1 = ensemble.GradientBoostingRegressor(random_state=rand_state)
reg2 = ensemble.HistGradientBoostingRegressor(random_state=rand_state)
reg3 = xgboost.XGBRegressor(random_state=rand_state)
reg4 = ensemble.BaggingRegressor(random_state=rand_state)

estimators = [
    ('gradientBoost', reg1),
    ('histGradient', reg2),
    ('xgbRegressor', reg3),
    ('bagging', reg4),
]

voting_regressor = ensemble.VotingRegressor(estimators=estimators)
voting_regressor.fit(X_train_scaled, y_train)

yreg_pred = voting_regressor.predict(X_test_scaled)

mse = metrics.mean_squared_error(y_test, yreg_pred)
print("MSE: ", mse)

## Results
#### Initial models
- LogisticRegression: Accuracy = 0.8840

### After PCA dimensionality reduction
- XGBClassifier: Accuracy = 0.5016

### Ensemble classification models
- BaggingClassifier: Accuracy = 0.8822

### Ensemble Voting Classifiction model
- BaggingClassifier: Accuracy = 0.8822

### Ensemble Regression model
- HistGradientBoostingRegressor: MSE = 0.10200270964882302

### Ensemble Stacking regressor
- estimators used:
    - Ridge
    - LinearRegression
    - KNeighborsRegressor
    - XGBRegressor
    - Final estimator: ensemble.HistGradientBoostingRegressor
    - MSE:  0.0887182854997212

### Ensemble Stack of Stacks regressor
- Additional models used as estimators:
    - ensemble.GradientBoostingRegressor
    - ensemble.HistGradientBoostingRegressor
    - xgboost.XGBRegressor
    - ensemble.BaggingRegressor
    - Previous stack used as the final estimator
- MSE:  0.08994140523121094

### Voting Ensemble Regressor
- Estimators used:
    - ensemble.GradientBoostingRegressor
    - ensemble.HistGradientBoostingRegressor
    - xgboost.XGBRegressor
    - ensemble.BaggingRegressor
- MSE:  0.10309174643982315

### Test Models to determine accuracy

In [None]:
logRegressor = LogisticRegression(max_iter=1000, random_state=rand_state)
baggingClassifier = ensemble.BaggingClassifier(estimator=LogisticRegression(random_state=rand_state, max_iter=1000), n_estimators=20)
histGradRegressor = ensemble.HistGradientBoostingRegressor(random_state=rand_state)

test_models = {
    'logRegressor': logRegressor,
    'baggingClassifier': baggingClassifier,
    'histGradRegressor': histGradRegressor,
    'stack_regressor': stack_regressor,
    'stack_of_stacks_reg': stack_of_stacks_reg,
    'voting_classifier': voting_classifier,
    'voting_regressor': voting_regressor
}

In [None]:
for key, value in test_models.items():    
    value.fit(X_train_scaled, y_train)
    predictions = value.predict(X_train_scaled)

    prediction_df = X_train.copy()
    prediction_df['IsWin'] = y_train

    threshold = 0.5
    binary_predictions = np.where(predictions > threshold, 1, 0)
    prediction_df['Predicted_Label'] = binary_predictions

    accuracy = metrics.accuracy_score(prediction_df['IsWin'], prediction_df['Predicted_Label'])
    print("Model: ", key)
    print("Accuracy:", accuracy)
    print(metrics.classification_report(prediction_df['IsWin'], prediction_df['Predicted_Label']))

# Fine Tuning
- After testing, it appears that the voting regressor is performing best on the data
- will fine tune each of the estimator models that make up the voting regressor

In [None]:
from sklearn.model_selection import GridSearchCV

# fine tune GradientBoostingRegressor
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=ensemble.GradientBoostingRegressor(random_state=rand_state),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1)

# Fit GridSearchCV
grid_search.fit(X_train_scaled, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: ", -grid_search.best_score_)


In [None]:
# Parameter grid for HistGradientBoostingRegressor
param_grid_hg = {
    'max_iter': [100, 200],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_samples_leaf': [20, 40, 60]
}

# Grid search object
grid_hg = GridSearchCV(
    ensemble.HistGradientBoostingRegressor(random_state=rand_state),
    param_grid_hg,
    cv=5,
    verbose=1,
    scoring='neg_mean_squared_error'
)

grid_hg.fit(X_train_scaled, y_train)
print("Best parameters for HistGradientBoostingRegressor:", grid_hg.best_params_)

In [None]:
# Parameter grid for XGBRegressor
param_grid_xgb = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Grid search object
grid_xgb = GridSearchCV(
    xgboost.XGBRegressor(random_state=rand_state),
    param_grid_xgb,
    cv=5,
    verbose=1,
    scoring='neg_mean_squared_error'
)

grid_xgb.fit(X_train_scaled, y_train)
print("Best parameters for XGBRegressor:", grid_xgb.best_params_)

In [None]:
# Parameter grid for BaggingRegressor
param_grid_bag = {
    'n_estimators': [10, 20, 30],
    'max_samples': [0.8, 0.9, 1.0],
    'max_features': [0.8, 0.9, 1.0]
}

# Grid search object
grid_bag = GridSearchCV(
    ensemble.BaggingRegressor(random_state=rand_state),
    param_grid_bag,
    cv=5,
    verbose=1,
    scoring='neg_mean_squared_error'
)

grid_bag.fit(X_train_scaled, y_train)
print("Best parameters for BaggingRegressor:", grid_bag.best_params_)

# Train and Export Final model

In [None]:
gbr_best_params = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
hgbr_best_params = {'learning_rate': 0.1, 'max_depth': 10, 'max_iter': 200, 'min_samples_leaf': 60}
XGBR_best_params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.05,
    'max_depth': 7,
    'n_estimators': 200,
    'subsample': 0.8
    }
bag_reg_best_params = {'max_features': 0.9, 'max_samples': 0.8, 'n_estimators': 30}

In [None]:
# Assemble the voting regressor with the best parameters
voting_regressor = ensemble.VotingRegressor(
    estimators=[
        ('gradientBoost', ensemble.GradientBoostingRegressor(**gbr_best_params)),
        ('histGradient', ensemble.HistGradientBoostingRegressor(**hgbr_best_params)),
        ('xgbRegressor', xgboost.XGBRegressor(**XGBR_best_params)),
        ('bagging', ensemble.BaggingRegressor(**bag_reg_best_params))
    ]
)
voting_regressor.fit(X_train_scaled, y_train)
yreg_pred = voting_regressor.predict(X_test_scaled)
mse = metrics.mean_squared_error(y_test, yreg_pred)
print("MSE of tuned VotingRegressor: ", mse)


In [None]:
predictions = voting_regressor.predict(X_train_scaled)

prediction_df = X_train.copy()
prediction_df['IsWin'] = y_train

threshold = 0.5
binary_predictions = np.where(predictions > threshold, 1, 0)
prediction_df['Predicted_Label'] = binary_predictions

accuracy = metrics.accuracy_score(prediction_df['IsWin'], prediction_df['Predicted_Label'])
print("Accuracy:", accuracy)
print(metrics.classification_report(prediction_df['IsWin'], prediction_df['Predicted_Label']))

In [None]:
prediction_df['Predicted_Label'] = prediction_df['Predicted_Label'].astype(bool)

In [None]:
prediction_df[
    (prediction_df['IsWin'] == False)
    & (prediction_df['Predicted_Label'] == True)
]

## Save the trained model

In [None]:
from joblib import dump, load

# Save the model to a file
dump(model, 'voting_regressor.pkl')

In [None]:
# Load the model from the file
model = load('voting_regressor.pkl')

In [23]:
def create_game_helper(teamID_1, teamID_2):
    """ helper function to add the two selected teams agg stats into in DF """
    # select two teams and concat them into on df. then add the 'Loc_A', 'Loc_H', 'Loc_N' columns
    predict_game = neutral_games[neutral_games['TeamID'] == teamID_1].reset_index(drop=True)
    team2 = neutral_games[neutral_games['TeamID'] == teamID_2].reset_index(drop=True)

    # for team2 drop the teamID and add prefix for columns
    team2 = team2.add_prefix('opp_')

    # predict_game = pd.concat([team1, team2], axis=1)
    for col in team2.columns:
        predict_game[col] = team2[col]

    # add location columns
    predict_game['Loc_A'] = False
    predict_game['Loc_H'] = False
    predict_game['Loc_N'] = True

    return predict_game

In [24]:
test_loaded_model = create_game_helper(1152, 1541)

In [26]:
test_loaded_model.columns

Index(['TeamID', 'Score', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR',
       'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF', 'opp_TeamID', 'opp_Score',
       'opp_FGM', 'opp_FGA', 'opp_FGM3', 'opp_FGA3', 'opp_FTM', 'opp_FTA',
       'opp_OR', 'opp_DR', 'opp_Ast', 'opp_TO', 'opp_Stl', 'opp_Blk', 'opp_PF',
       'Loc_A', 'Loc_H', 'Loc_N'],
      dtype='object')