# Madness of March
## Jim Haines & Josh McCoy
### [Project Website](https://joshmccoy2.github.io/NCAA_March_Madness/)

## Current Datasets
[Kaggle datasets](https://www.kaggle.com/competitions/mens-march-mania-2022/data)

# ETL & EDA

In [2]:
# Import necessary libraries
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import xgboost as xgb
import requests
import numpy as np
import csv
import re
import time
import Levenshtein 
from io import StringIO
from io import StringIO
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import brier_score_loss, accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error, r2_score, roc_curve, auc, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from scipy.special import expit 
from scipy.special import expit  
from xgboost import XGBClassifier
from bs4 import BeautifulSoup
from fuzzywuzzy import process

We can first look at how some basic factors affect the amount the winning team wins by. We want to look at the difference in score to see how dominant a team is against a competitor. A team who wins by a lot is typically a sign that this team is significantly better than its oponent.

# Regular Season Data

In [3]:
# To do this we can look at regular season game stats
detailed_results_data = 'MRegularSeasonDetailedResults.csv'
detailed_results = pd.read_csv(detailed_results_data)
detailed_results

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113236,2024,132,1120,86,1196,67,N,0,31,61,...,13,18,25,13,26,9,11,8,6,17
113237,2024,132,1182,57,1433,51,N,0,17,57,...,24,12,14,9,25,9,16,10,9,16
113238,2024,132,1228,93,1458,87,N,0,30,57,...,20,20,23,13,17,17,7,7,1,20
113239,2024,132,1412,85,1396,69,N,0,31,63,...,21,14,17,11,26,17,14,6,6,18



Let's check the datatypes

In [4]:
detailed_results.dtypes

Season      int64
DayNum      int64
WTeamID     int64
WScore      int64
LTeamID     int64
LScore      int64
WLoc       object
NumOT       int64
WFGM        int64
WFGA        int64
WFGM3       int64
WFGA3       int64
WFTM        int64
WFTA        int64
WOR         int64
WDR         int64
WAst        int64
WTO         int64
WStl        int64
WBlk        int64
WPF         int64
LFGM        int64
LFGA        int64
LFGM3       int64
LFGA3       int64
LFTM        int64
LFTA        int64
LOR         int64
LDR         int64
LAst        int64
LTO         int64
LStl        int64
LBlk        int64
LPF         int64
dtype: object

All the dtypes look correct, so lets move on to our analysis

In [5]:
# Create variable for amount by which winning team wins
detailed_results['SCOREDIFF'] = detailed_results['WScore'] - detailed_results['LScore']

When you see W or L this stands for Winning or Losing team.

In [6]:
# Winner stats
detailed_results.dtypes
detailed_results['WFGPCT'] = detailed_results['WFGM']/detailed_results['WFGA']       # field goal pt completion %
detailed_results['W3PCT'] = detailed_results['WFGM3']/detailed_results['WFGA3']      # 3 pt completion %
detailed_results['WFTPCT'] = detailed_results['WFTM']/detailed_results['WFTA']       # free throw completion %
detailed_results['WORBCHANCE'] =  detailed_results['WOR'] + detailed_results['LDR']  # total potential rebounds
detailed_results['WORPCT'] = detailed_results['WOR']/detailed_results['WORBCHANCE']  # rebound completion %

In [7]:
# Losing stats
detailed_results['LFGPCT'] = detailed_results['LFGM']/detailed_results['LFGA']       
detailed_results['L3PCT'] = detailed_results['LFGM3']/detailed_results['LFGA3']
detailed_results['LFTPCT'] = detailed_results['LFTM']/detailed_results['LFTA']
detailed_results['LORBCHANCE'] =  detailed_results['LOR'] + detailed_results['WDR']
detailed_results['LORPCT'] = detailed_results['LOR']/detailed_results['LORBCHANCE']

We can now take a more detailed look at the correlation between winner field goal % and score difference.

We can look at other factors as well:

Despite the common phrase "board man gets paid", we can see that there is a less strong correlation between rebound % and score difference. Perhaps this saying holds more true a higher level of play.

We can also take a look at detailed results from the actual tournament to see if there are any obvious differences from the regular season data. To do this we can replicate the above regular season analysis.

# Tournament Data

In [8]:
detailed_tourney_data = 'MNCAATourneyDetailedResults.csv'
detailed_tourney_results = pd.read_csv(detailed_tourney_data)
detailed_tourney_results

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1310,2023,146,1274,88,1400,81,N,0,29,49,...,25,11,15,9,14,20,13,6,1,23
1311,2023,146,1361,57,1166,56,N,0,25,66,...,17,10,11,8,24,9,9,3,2,11
1312,2023,152,1163,72,1274,59,N,0,28,57,...,20,12,12,12,17,10,9,8,1,12
1313,2023,152,1361,72,1194,71,N,0,25,57,...,22,16,21,7,24,6,9,6,2,17


In [9]:
detailed_tourney_results.dtypes

Season      int64
DayNum      int64
WTeamID     int64
WScore      int64
LTeamID     int64
LScore      int64
WLoc       object
NumOT       int64
WFGM        int64
WFGA        int64
WFGM3       int64
WFGA3       int64
WFTM        int64
WFTA        int64
WOR         int64
WDR         int64
WAst        int64
WTO         int64
WStl        int64
WBlk        int64
WPF         int64
LFGM        int64
LFGA        int64
LFGM3       int64
LFGA3       int64
LFTM        int64
LFTA        int64
LOR         int64
LDR         int64
LAst        int64
LTO         int64
LStl        int64
LBlk        int64
LPF         int64
dtype: object

In [10]:
#Create variable for amount by which winning team wins
detailed_tourney_results['SCOREDIFF'] = detailed_tourney_results['WScore'] - detailed_tourney_results['LScore']

In [11]:
# Winner stats
detailed_tourney_results.dtypes
detailed_tourney_results['WFGPCT'] = detailed_tourney_results['WFGM']/detailed_tourney_results['WFGA']       # field goal pt completion %
detailed_tourney_results['W3PCT'] = detailed_tourney_results['WFGM3']/detailed_tourney_results['WFGA3']      # 3 pt completion %
detailed_tourney_results['WFTPCT'] = detailed_tourney_results['WFTM']/detailed_tourney_results['WFTA']       # free throw completion %
detailed_tourney_results['WORBCHANCE'] =  detailed_tourney_results['WOR'] + detailed_tourney_results['LDR']  # total potential rebounds
detailed_tourney_results['WORPCT'] = detailed_tourney_results['WOR']/detailed_tourney_results['WORBCHANCE']  # rebound completion %

In [12]:
# Losing stats
detailed_tourney_results['LFGPCT'] = detailed_tourney_results['LFGM']/detailed_tourney_results['LFGA']       
detailed_tourney_results['L3PCT'] = detailed_tourney_results['LFGM3']/detailed_tourney_results['LFGA3']
detailed_tourney_results['LFTPCT'] = detailed_tourney_results['LFTM']/detailed_tourney_results['LFTA']
detailed_tourney_results['LORBCHANCE'] =  detailed_tourney_results['LOR'] + detailed_tourney_results['WDR']
detailed_tourney_results['LORPCT'] = detailed_tourney_results['LOR']/detailed_tourney_results['LORBCHANCE']

In [13]:
detailed_tourney_results.dtypes

Season          int64
DayNum          int64
WTeamID         int64
WScore          int64
LTeamID         int64
LScore          int64
WLoc           object
NumOT           int64
WFGM            int64
WFGA            int64
WFGM3           int64
WFGA3           int64
WFTM            int64
WFTA            int64
WOR             int64
WDR             int64
WAst            int64
WTO             int64
WStl            int64
WBlk            int64
WPF             int64
LFGM            int64
LFGA            int64
LFGM3           int64
LFGA3           int64
LFTM            int64
LFTA            int64
LOR             int64
LDR             int64
LAst            int64
LTO             int64
LStl            int64
LBlk            int64
LPF             int64
SCOREDIFF       int64
WFGPCT        float64
W3PCT         float64
WFTPCT        float64
WORBCHANCE      int64
WORPCT        float64
LFGPCT        float64
L3PCT         float64
LFTPCT        float64
LORBCHANCE      int64
LORPCT        float64
dtype: obj

In [14]:
detailed_tourney_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,WFGPCT,W3PCT,WFTPCT,WORBCHANCE,WORPCT,LFGPCT,L3PCT,LFTPCT,LORBCHANCE,LORPCT
0,2003,134,1421,92,1411,84,N,1,32,69,...,0.463768,0.37931,0.653846,42,0.333333,0.432836,0.387097,0.451613,47,0.361702
1,2003,136,1112,80,1436,51,N,0,31,66,...,0.469697,0.304348,0.785714,37,0.297297,0.3125,0.25,1.0,44,0.181818
2,2003,136,1113,84,1272,71,N,0,31,59,...,0.525424,0.428571,0.727273,32,0.3125,0.362319,0.25,0.666667,47,0.425532
3,2003,136,1141,79,1166,73,N,0,29,53,...,0.54717,0.428571,0.72,28,0.392857,0.45,0.411765,0.705882,34,0.411765
4,2003,136,1143,76,1301,74,N,1,27,64,...,0.421875,0.35,0.652174,44,0.409091,0.446429,0.428571,0.75,30,0.333333


# Merge with MTeams.csv to get TeamName

In [15]:
# mteams_df = pd.read_csv('MTeams.csv')

# # Assuming df_team_data is your first dataframe containing data about the teams
# # df_seed_data is your second dataframe containing team IDs and their associated seed in a tournament
# # and you have columns WTeamID and LTeamID in df_team_data

# # Merge for WTeamID
# merged_df_winning = pd.merge(detailed_tourney_results, mteams_df, left_on=['WTeamID'], right_on=['TeamID'], how='left')
# merged_df_winning.rename(columns={'TeamName': 'TeamName1'}, inplace=True)

# # Merge for LTeamID
# merged_df_losing = pd.merge(detailed_tourney_results, mteams_df, left_on=['LTeamID'], right_on=['TeamID'], how='left')
# merged_df_losing.rename(columns={'TeamName': 'TeamName2'}, inplace=True)

# # Combine the two merged dataframes
# merged_df_names = pd.concat([merged_df_winning, merged_df_losing], ignore_index=True)

# # Display the resulting DataFrame
# merged_df_names

In [16]:
# Load the MTeams.csv file
mteams_df = pd.read_csv('MTeams.csv')

# Assuming detailed_tourney_results is your dataframe containing data about the tournament results
# and you have columns WTeamID and LTeamID in detailed_tourney_results

# Merge for WTeamID
merged_df_winning = pd.merge(detailed_tourney_results, mteams_df, left_on=['WTeamID'], right_on=['TeamID'], how='left')
merged_df_winning.rename(columns={'TeamName': 'TeamName1'}, inplace=True)

# Merge for LTeamID
merged_df_losing = pd.merge(detailed_tourney_results, mteams_df, left_on=['LTeamID'], right_on=['TeamID'], how='left')
merged_df_losing.rename(columns={'TeamName': 'TeamName2'}, inplace=True)

# Combine the two merged dataframes horizontally
merged_df_names = pd.concat([merged_df_winning, merged_df_losing[['TeamName2']]], axis=1)


# Display the resulting DataFrame
merged_df_names


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGPCT,L3PCT,LFTPCT,LORBCHANCE,LORPCT,TeamID,TeamName1,FirstD1Season,LastD1Season,TeamName2
0,2003,134,1421,92,1411,84,N,1,32,69,...,0.432836,0.387097,0.451613,47,0.361702,1421,UNC Asheville,1987,2024,TX Southern
1,2003,136,1112,80,1436,51,N,0,31,66,...,0.312500,0.250000,1.000000,44,0.181818,1112,Arizona,1985,2024,Vermont
2,2003,136,1113,84,1272,71,N,0,31,59,...,0.362319,0.250000,0.666667,47,0.425532,1113,Arizona St,1985,2024,Memphis
3,2003,136,1141,79,1166,73,N,0,29,53,...,0.450000,0.411765,0.705882,34,0.411765,1141,C Michigan,1985,2024,Creighton
4,2003,136,1143,76,1301,74,N,1,27,64,...,0.446429,0.428571,0.750000,30,0.333333,1143,California,1985,2024,NC State
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1310,2023,146,1274,88,1400,81,N,0,29,49,...,0.500000,0.400000,0.733333,28,0.321429,1274,Miami FL,1986,2024,Texas
1311,2023,146,1361,57,1166,56,N,0,25,66,...,0.400000,0.117647,0.909091,31,0.258065,1361,San Diego St,1985,2024,Creighton
1312,2023,152,1163,72,1274,59,N,0,28,57,...,0.322581,0.350000,1.000000,39,0.307692,1163,Connecticut,1985,2024,Miami FL
1313,2023,152,1361,72,1194,71,N,0,25,57,...,0.442308,0.409091,0.761905,30,0.233333,1361,San Diego St,1985,2024,FL Atlantic


# Merge with MNCAATourneySeeds.csv to get seeds of each team

In [17]:
# # Assuming df_team_data is your first dataframe containing data about the teams
# # df_seed_data is your second dataframe containing team IDs and their associated seed in a tournament
# # and you have columns WTeamID and LTeamID in df_team_data

# # Merge for WTeamID
# merged_df_winning = pd.merge(merged_df_names, seeds_df, left_on=['WTeamID', 'Season'], right_on=['TeamID', 'Season'], how='left')
# merged_df_winning.rename(columns={'Seed': 'WTeamSeed'}, inplace=True)

# # Merge for LTeamID
# merged_df_losing = pd.merge(merged_df_names, seeds_df, left_on=['LTeamID', 'Season'], right_on=['TeamID', 'Season'], how='left')
# merged_df_losing.rename(columns={'Seed': 'LTeamSeed'}, inplace=True)

# # Combine the two merged dataframes
# merged_df_seeds = pd.concat([merged_df_winning, merged_df_losing], ignore_index=True)

# # Display the resulting DataFrame
# merged_df_seeds

In [18]:
# Load the seeds dataframe
seeds_df = pd.read_csv('MNCAATourneySeeds.csv')

# Extract the integer part from the Seed column and store it as integer
seeds_df['Seed_correct'] = seeds_df['Seed'].str.extract('(\d+)').astype(int)
seeds_df

Unnamed: 0,Season,Seed,TeamID,Seed_correct
0,1985,W01,1207,1
1,1985,W02,1210,2
2,1985,W03,1228,3
3,1985,W04,1260,4
4,1985,W05,1374,5
...,...,...,...,...
2485,2023,Z12,1433,12
2486,2023,Z13,1233,13
2487,2023,Z14,1213,14
2488,2023,Z15,1421,15


In [19]:
# Assuming merged_df_names is your already merged dataframe containing team names
# seeds_df is your dataframe containing team IDs and their associated seed in a tournament
# and you have columns WTeamID and LTeamID in merged_df_names

# Merge for WTeamID
merged_df_winning = pd.merge(merged_df_names, seeds_df, left_on=['WTeamID', 'Season'], right_on=['TeamID', 'Season'], how='left')
merged_df_winning.rename(columns={'Seed_correct': 'WTeamSeedCorr'}, inplace=True)

# Merge for LTeamID
merged_df_losing = pd.merge(merged_df_names, seeds_df, left_on=['LTeamID', 'Season'], right_on=['TeamID', 'Season'], how='left')
merged_df_losing.rename(columns={'Seed_correct': 'LTeamSeedCorr'}, inplace=True)

# Combine the two merged dataframes horizontally
merged_df_seeds = pd.concat([merged_df_winning, merged_df_losing[['LTeamSeedCorr']]], axis=1)

# Drop unnecessary columns from the losing team dataframe
#merged_df_seeds.drop(['TeamID', 'TeamName'], axis=1, inplace=True)

# Display the resulting DataFrame
merged_df_seeds


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LORPCT,TeamID_x,TeamName1,FirstD1Season,LastD1Season,TeamName2,Seed,TeamID_y,WTeamSeedCorr,LTeamSeedCorr
0,2003,134,1421,92,1411,84,N,1,32,69,...,0.361702,1421,UNC Asheville,1987,2024,TX Southern,X16b,1421,16,16
1,2003,136,1112,80,1436,51,N,0,31,66,...,0.181818,1112,Arizona,1985,2024,Vermont,Z01,1112,1,16
2,2003,136,1113,84,1272,71,N,0,31,59,...,0.425532,1113,Arizona St,1985,2024,Memphis,Z10,1113,10,7
3,2003,136,1141,79,1166,73,N,0,29,53,...,0.411765,1141,C Michigan,1985,2024,Creighton,Z11,1141,11,6
4,2003,136,1143,76,1301,74,N,1,27,64,...,0.333333,1143,California,1985,2024,NC State,W08,1143,8,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1310,2023,146,1274,88,1400,81,N,0,29,49,...,0.321429,1274,Miami FL,1986,2024,Texas,Y05,1274,5,2
1311,2023,146,1361,57,1166,56,N,0,25,66,...,0.258065,1361,San Diego St,1985,2024,Creighton,X05,1361,5,6
1312,2023,152,1163,72,1274,59,N,0,28,57,...,0.307692,1163,Connecticut,1985,2024,Miami FL,Z04,1163,4,5
1313,2023,152,1361,72,1194,71,N,0,25,57,...,0.233333,1361,San Diego St,1985,2024,FL Atlantic,X05,1361,5,9


In [20]:
merged_df_seeds.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
       'SCOREDIFF', 'WFGPCT', 'W3PCT', 'WFTPCT', 'WORBCHANCE', 'WORPCT',
       'LFGPCT', 'L3PCT', 'LFTPCT', 'LORBCHANCE', 'LORPCT', 'TeamID_x',
       'TeamName1', 'FirstD1Season', 'LastD1Season', 'TeamName2', 'Seed',
       'TeamID_y', 'WTeamSeedCorr', 'LTeamSeedCorr'],
      dtype='object')

# Merge 247 data (stars)

In [21]:
# import requests
# from bs4 import BeautifulSoup
# import csv
# import re

# def scrape_season(year):
#     url = f"https://247sports.com/Season/{year}-Basketball/RecruitRankings/?InstitutionGroup=HighSchool"
#     print(f"Scraping URL: {url}")

#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

#     try:
#         response = requests.get(url, headers=headers)
#         response.raise_for_status()
#     except requests.exceptions.HTTPError as err:
#         print(f"HTTP Error: {err}")
#         return
#     except Exception as err:
#         print(f"Error occurred: {err}")
#         return

#     if not response.text:
#         print(f"No data received for {year}, skipping...")
#         return

#     soup = BeautifulSoup(response.text, 'html.parser')
#     recruits = soup.find_all('li', class_='rankings-page__list-item')

#     if not recruits:
#         print(f"No recruits found for {year}, check if page structure is different.")
#         return

#     with open(f'recruits_{year}.csv', 'w', newline='', encoding='utf-8') as file:
#         writer = csv.writer(file)
#         writer.writerow(['Rank', 'Player', 'High School', 'Position', 'Height / Weight', 'Rating', 'Team'])

#         for recruit in recruits:
#             rank = recruit.find('div', class_='primary').get_text(strip=True)
#             name = recruit.find('a', class_='rankings-page__name-link').get_text(strip=True)
#             high_school = recruit.find('div', class_='recruit').get_text(strip=True)
#             position = recruit.find('div', class_='position').get_text(strip=True)
#             height_weight = recruit.find('div', class_='metrics').get_text(strip=True)
#             rating_info = recruit.find('div', class_='rating').get_text(strip=True)
#             match = re.search(r'\b(100|[1-9]?[0-9])', rating_info)
#             rating = match.group(0) if match else 'N/A'
            
#             # Extract team name from alt attribute of img tag within a tag within div with class "status"
#             status_div = recruit.find('div', class_='status')
#             team_alt = 'N/A'
#             if status_div:
#                 img_link = status_div.find('a', class_='img-link')
#                 if img_link:
#                     img_tag = img_link.find('img')
#                     if img_tag:
#                         team_alt = img_tag.get('alt')

#             writer.writerow([rank, name, high_school, position, height_weight, rating, team_alt])

# # Loop through the years
# for year in range(2011, 2025):
#     scrape_season(year)

In [22]:
playerRating_df = pd.DataFrame()

# Loop through each year, read the CSV, add a 'Year' column, and append to the main DataFrame
for year in range(2011, 2025):
    file_name = f'recruits_{year}.csv'
    if os.path.exists(file_name):  # Check if the file exists
        df = pd.read_csv(file_name)
        df['Year'] = year  # Add a column for the year
        playerRating_df = pd.concat([playerRating_df, df], ignore_index=True)

# Save the combined data to a new CSV file
playerRating_df.to_csv('combined_recruits.csv', index=False)

# If you want to see the DataFrame
playerRating_df

Unnamed: 0,Rank,Player,High School,Position,Height / Weight,Rating,Team,Year
0,1,Anthony Davis,"Anthony DavisPerspectives Charter (Chicago, IL)",PF,6-10 / 220,100,Kentucky,2011
1,2,Michael Kidd-Gilchrist,"Michael Kidd-GilchristSt. Patrick (Elizabeth, NJ)",SF,6-6 / 190,100,Kentucky,2011
2,3,Austin Rivers,"Austin RiversWinter Park (Winter Park, FL)",PG,6-3 / 175,100,Duke,2011
3,4,Bradley Beal,"Bradley BealChaminade (Saint Louis, MO)",SG,6-4 / 195,100,Florida,2011
4,5,Quincy Miller,Quincy MillerWestchester Country Day School (H...,PF,6-8 / 193,100,Baylor,2011
...,...,...,...,...,...,...,...,...
2925,146,Bryce Dortch,Bryce DortchBrimmer And May School (Chestnut H...,PF,6-8 / 190,88,Rutgers,2024
2926,147,Angelo Ciaravino,"Angelo CiaravinoMount Carmel (Chicago, IL)",SF,6-5 / 175,88,Northwestern,2024
2927,148,Malcolm Thomas,"Malcolm ThomasDeMatha Catholic (Hyattsville, MD)",PF,6-8 / 195,88,Villanova,2024
2928,149,Chris Tadjo,Chris TadjoNBA Academy Latin America (Montreal...,PF,6-8 / 220,88,Iowa,2024


In [23]:
def rating_to_stars(rating):
    if 100 >= rating >= 98:
        return 5
    elif 97 >= rating >= 94:
        return 4
    elif 93 >= rating >= 0:
        return 3
    else:
        return None  # or some default value for ratings outside the specified ranges

# Apply the function to the 'Rating' column to create the 'Star' column
playerRating_df['Stars'] = playerRating_df['Rating'].apply(rating_to_stars)

# Now you can check the first few rows of your DataFrame
playerRating_df.head()

Unnamed: 0,Rank,Player,High School,Position,Height / Weight,Rating,Team,Year,Stars
0,1,Anthony Davis,"Anthony DavisPerspectives Charter (Chicago, IL)",PF,6-10 / 220,100,Kentucky,2011,5
1,2,Michael Kidd-Gilchrist,"Michael Kidd-GilchristSt. Patrick (Elizabeth, NJ)",SF,6-6 / 190,100,Kentucky,2011,5
2,3,Austin Rivers,"Austin RiversWinter Park (Winter Park, FL)",PG,6-3 / 175,100,Duke,2011,5
3,4,Bradley Beal,"Bradley BealChaminade (Saint Louis, MO)",SG,6-4 / 195,100,Florida,2011,5
4,5,Quincy Miller,Quincy MillerWestchester Country Day School (H...,PF,6-8 / 193,100,Baylor,2011,5


In [24]:
top_players = playerRating_df.groupby('Year')['Rating'].min()
print(top_players)

Year
2011    70
2012    10
2013    83
2014    87
2015    87
2016    10
2017    87
2018    88
2019    88
2020    90
2021    89
2022    89
2023    88
2024    88
Name: Rating, dtype: int64


In [25]:
def adjust_rating(row):
    if row['Rating'] == 10:
        return 101
    elif row['Player'] == 'Josh Jackson':
        return 102
    else:
        return row['Rating']

# Apply the function to each row
playerRating_df['Rating'] = playerRating_df.apply(adjust_rating, axis=1)

# Now you can check the first few rows of your DataFrame
playerRating_df.head()


Unnamed: 0,Rank,Player,High School,Position,Height / Weight,Rating,Team,Year,Stars
0,1,Anthony Davis,"Anthony DavisPerspectives Charter (Chicago, IL)",PF,6-10 / 220,100,Kentucky,2011,5
1,2,Michael Kidd-Gilchrist,"Michael Kidd-GilchristSt. Patrick (Elizabeth, NJ)",SF,6-6 / 190,100,Kentucky,2011,5
2,3,Austin Rivers,"Austin RiversWinter Park (Winter Park, FL)",PG,6-3 / 175,100,Duke,2011,5
3,4,Bradley Beal,"Bradley BealChaminade (Saint Louis, MO)",SG,6-4 / 195,100,Florida,2011,5
4,5,Quincy Miller,Quincy MillerWestchester Country Day School (H...,PF,6-8 / 193,100,Baylor,2011,5


In [26]:
columns = ['Rank', 'Player', 'Team', 'Year', 'Stars']

playerRating_df = playerRating_df[columns]
playerRating_df

Unnamed: 0,Rank,Player,Team,Year,Stars
0,1,Anthony Davis,Kentucky,2011,5
1,2,Michael Kidd-Gilchrist,Kentucky,2011,5
2,3,Austin Rivers,Duke,2011,5
3,4,Bradley Beal,Florida,2011,5
4,5,Quincy Miller,Baylor,2011,5
...,...,...,...,...,...
2925,146,Bryce Dortch,Rutgers,2024,3
2926,147,Angelo Ciaravino,Northwestern,2024,3
2927,148,Malcolm Thomas,Villanova,2024,3
2928,149,Chris Tadjo,Iowa,2024,3


In [27]:
playerRating_df

Unnamed: 0,Rank,Player,Team,Year,Stars
0,1,Anthony Davis,Kentucky,2011,5
1,2,Michael Kidd-Gilchrist,Kentucky,2011,5
2,3,Austin Rivers,Duke,2011,5
3,4,Bradley Beal,Florida,2011,5
4,5,Quincy Miller,Baylor,2011,5
...,...,...,...,...,...
2925,146,Bryce Dortch,Rutgers,2024,3
2926,147,Angelo Ciaravino,Northwestern,2024,3
2927,148,Malcolm Thomas,Villanova,2024,3
2928,149,Chris Tadjo,Iowa,2024,3


In [28]:
# Group the DataFrame by 'Team' and 'Year', and count occurrences of 3, 4, and 5 stars
team_year_counts = playerRating_df.groupby(['Team', 'Year'])['Stars'].value_counts().unstack(fill_value=0)

# Calculate the total stars by multiplying the count of each star rating by its corresponding rating
total_stars = (team_year_counts[[3, 4, 5]] * [3, 4, 5]).sum(axis=1)

# Select only the columns corresponding to 3, 4, and 5 stars
byTeamStars = team_year_counts[[3, 4, 5]]

# Add the total stars to the DataFrame
byTeamStars['Total Stars'] = total_stars

# Reset index to make 'Team' and 'Year' columns regular columns instead of index
byTeamStars = byTeamStars.reset_index()

# Rename columns for clarity
byTeamStars.columns.name = None  # Remove column name for better formatting
byTeamStars.columns = ['Team', 'Year', '3 Stars', '4 Stars', '5 Stars', 'Total Stars']

# Print the resulting DataFrame
byTeamStars

Unnamed: 0,Team,Year,3 Stars,4 Stars,5 Stars,Total Stars
0,Alabama,2011,2,2,0,14
1,Alabama,2012,0,0,1,5
2,Alabama,2013,1,1,0,7
3,Alabama,2014,2,2,0,14
4,Alabama,2015,3,0,0,9
...,...,...,...,...,...,...
1272,Xavier,2021,2,0,0,6
1273,Xavier,2022,2,0,0,6
1274,Xavier,2023,4,0,0,12
1275,Xavier,2024,1,0,0,3


In [29]:
# Convert the "Year" column to string type
byTeamStars['Year'] = byTeamStars['Year'].astype(str)

In [30]:
byTeamStars.dtypes

Team           object
Year           object
3 Stars         int64
4 Stars         int64
5 Stars         int64
Total Stars     int64
dtype: object

In [31]:
merged_df_seeds.dtypes

Season             int64
DayNum             int64
WTeamID            int64
WScore             int64
LTeamID            int64
LScore             int64
WLoc              object
NumOT              int64
WFGM               int64
WFGA               int64
WFGM3              int64
WFGA3              int64
WFTM               int64
WFTA               int64
WOR                int64
WDR                int64
WAst               int64
WTO                int64
WStl               int64
WBlk               int64
WPF                int64
LFGM               int64
LFGA               int64
LFGM3              int64
LFGA3              int64
LFTM               int64
LFTA               int64
LOR                int64
LDR                int64
LAst               int64
LTO                int64
LStl               int64
LBlk               int64
LPF                int64
SCOREDIFF          int64
WFGPCT           float64
W3PCT            float64
WFTPCT           float64
WORBCHANCE         int64
WORPCT           float64


In [32]:
# # Perform fuzzy merge on merged_df_seeds['TeamName1'] and byTeamStars['Team']
# merged_df_seeds['TeamName1'] = merged_df_seeds['TeamName1'].apply(lambda x: process.extractOne(x, byTeamStars['Team'])[0])
# merged_df_seeds['TeamName2'] = merged_df_seeds['TeamName2'].apply(lambda x: process.extractOne(x, byTeamStars['Team'])[0])

# # Merge merged_df_seeds with byTeamStars on 'Team' and 'Year' columns
# merged_df_with_stars = pd.merge(merged_df_seeds, byTeamStars, left_on=['TeamName1', 'Season'], right_on=['Team', 'Year'], how='left')
# merged_df_with_stars = pd.merge(merged_df_with_stars, byTeamStars, left_on=['TeamName2', 'Season'], right_on=['Team', 'Year'], how='left', suffixes=('_Team1', '_Team2'))

# # Drop redundant columns after merging
# merged_df_with_stars.drop(['Team_Team1', 'Year_Team1', 'Team_Team2', 'Year_Team2'], axis=1, inplace=True)

# # Print the resulting DataFrame
# print(merged_df_with_stars)


# Many Nans because of year discrepancies, and Stars dont go back as far

def fuzzy_merge(df, df_to_match, key1, key2, threshold=90, limit=1):
    matches = df_to_match[key2].apply(lambda x: process.extractOne(x, df[key1], scorer=process.extractOne))
    df['matched_name'] = [match[0] if isinstance(match, tuple) and match[1] >= threshold else None for match in matches]
    df['matched_score'] = [match[1] if isinstance(match, tuple) and match[1] >= threshold else None for match in matches]
    return df


merged_df_seeds = fuzzy_merge(merged_df_seeds, byTeamStars, 'TeamName1', 'Team', threshold=80)
merged_df_seeds = fuzzy_merge(merged_df_seeds, byTeamStars, 'TeamName2', 'Team', threshold=80)
final_merged_df = pd.merge(merged_df_seeds, byTeamStars, left_on='matched_name', right_on='Team', how='left')


In [33]:
from fuzzywuzzy import process , fuzz
import pandas as pd

# Define a function to perform fuzzy matching and merge the dataframes
def fuzzy_merge(df1, df2, key1, key2, threshold=90, limit=1):
    s = df2[key2].tolist()

    m = df1[key1].apply(lambda x: process.extractOne(x, s, scorer=fuzz.token_sort_ratio))    
    df1['matches'] = m

    m2 = df1['matches'].apply(lambda x: x[0] if x[1] >= threshold else None)
    df1['matches'] = m2

    return df1

# Perform fuzzy matching and merge the dataframes for TeamName1
merged_df_seeds = fuzzy_merge(merged_df_seeds, byTeamStars, 'TeamName1', 'Team', threshold=80)
merged_df_seeds = pd.merge(merged_df_seeds, byTeamStars, left_on=['matches', 'Season'], right_on=['Team', 'Year'], how='left', suffixes=('_TeamName1', '_Team'))
merged_df_seeds.drop(columns=['Team', 'Year', 'matches'], inplace=True)

# Perform fuzzy matching and merge the dataframes for TeamName2
merged_df_seeds = fuzzy_merge(merged_df_seeds, byTeamStars, 'TeamName2', 'Team', threshold=80)
merged_df_seeds = pd.merge(merged_df_seeds, byTeamStars, left_on=['matches', 'Season'], right_on=['Team', 'Year'], how='left', suffixes=('_TeamName2', '_Team'))
merged_df_seeds.drop(columns=['Team', 'Year', 'matches'], inplace=True)

# Now you have the merged dataframe with data from byTeamStars for each team in each row of merged_df_seeds



ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [34]:
# Access the row and display all columns and values
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(merged_df_seeds.iloc[0])


In [35]:
# # Assuming merged_df_seeds is your merged dataframe

# # Define the columns of interest
# columns_of_interest = ['3 Stars_1', '4 Stars_1', '5 Stars_1', 'Total Stars_1', '3 Stars_2', '4 Stars_2', '5 Stars_2', 'Total Stars_2']

# # Count the number of NaN values in each column
# nan_counts = merged_df_seeds[columns_of_interest].isna().sum()

# # Print the results
# print("Number of NaN values in each column:")
# print(nan_counts)


In [36]:
merged_df_seeds.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
       'SCOREDIFF', 'WFGPCT', 'W3PCT', 'WFTPCT', 'WORBCHANCE', 'WORPCT',
       'LFGPCT', 'L3PCT', 'LFTPCT', 'LORBCHANCE', 'LORPCT', 'TeamID_x',
       'TeamName1', 'FirstD1Season', 'LastD1Season', 'TeamName2', 'Seed',
       'TeamID_y', 'WTeamSeedCorr', 'LTeamSeedCorr', 'matches'],
      dtype='object')

In [37]:
merged_df_seeds.head(1)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,TeamID_x,TeamName1,FirstD1Season,LastD1Season,TeamName2,Seed,TeamID_y,WTeamSeedCorr,LTeamSeedCorr,matches
0,2003,134,1421,92,1411,84,N,1,32,69,...,1421,UNC Asheville,1987,2024,TX Southern,X16b,1421,16,16,UNC Asheville


In [38]:
filtered_df = merged_df_seeds[merged_df_seeds['Season'] >= 2011]

In [39]:
filtered_df.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
       'SCOREDIFF', 'WFGPCT', 'W3PCT', 'WFTPCT', 'WORBCHANCE', 'WORPCT',
       'LFGPCT', 'L3PCT', 'LFTPCT', 'LORBCHANCE', 'LORPCT', 'TeamID_x',
       'TeamName1', 'FirstD1Season', 'LastD1Season', 'TeamName2', 'Seed',
       'TeamID_y', 'WTeamSeedCorr', 'LTeamSeedCorr', 'matches'],
      dtype='object')

In [40]:

additional_columns_to_drop = ['3 Stars_1', '4 Stars_1', '5 Stars_1', 'Total Stars_1',
                              'Team_byTeamStars_1', 'Year_byTeamStars_1',
                              '3 Stars_byTeamStars_1', '4 Stars_byTeamStars_1',
                              '5 Stars_byTeamStars_1', 'Total Stars_byTeamStars_1']

# Drop the additional columns
filtered_df = filtered_df.drop(columns=additional_columns_to_drop)

KeyError: "['3 Stars_1' '4 Stars_1' '5 Stars_1' 'Total Stars_1' 'Team_byTeamStars_1'\n 'Year_byTeamStars_1' '3 Stars_byTeamStars_1' '4 Stars_byTeamStars_1'\n '5 Stars_byTeamStars_1' 'Total Stars_byTeamStars_1'] not found in axis"

In [41]:
filtered_df.head(1)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,TeamID_x,TeamName1,FirstD1Season,LastD1Season,TeamName2,Seed,TeamID_y,WTeamSeedCorr,LTeamSeedCorr,matches
512,2011,134,1155,70,1412,52,N,0,26,50,...,1155,Clemson,1985,2024,UAB,W12a,1155,12,12,Clemson


In [42]:
filtered_df.shape

(803, 55)

In [43]:
pd.set_option('display.max_columns', None)
filtered_df.isna().sum()

Season            0
DayNum            0
WTeamID           0
WScore            0
LTeamID           0
LScore            0
WLoc              0
NumOT             0
WFGM              0
WFGA              0
WFGM3             0
WFGA3             0
WFTM              0
WFTA              0
WOR               0
WDR               0
WAst              0
WTO               0
WStl              0
WBlk              0
WPF               0
LFGM              0
LFGA              0
LFGM3             0
LFGA3             0
LFTM              0
LFTA              0
LOR               0
LDR               0
LAst              0
LTO               0
LStl              0
LBlk              0
LPF               0
SCOREDIFF         0
WFGPCT            0
W3PCT             0
WFTPCT            0
WORBCHANCE        0
WORPCT            0
LFGPCT            0
L3PCT             0
LFTPCT            0
LORBCHANCE        0
LORPCT            0
TeamID_x          0
TeamName1         0
FirstD1Season     0
LastD1Season      0
TeamName2         0


# KENPOM

In [44]:
Pomeroy_data = 'kenpom_2023.csv'
pomeroy_df = pd.read_csv(Pomeroy_data)
pomeroy_df

# Display df
pomeroy_df

Unnamed: 0.1,Unnamed: 0,Season,TeamName,adj_o,adj_d,adj_tempo,luck,sos_adj_o,sos_adj_d
0,0,2011,Ohio St.,125.4,88.4,66.0,0.043,107.4,98.3
1,1,2011,Duke,118.8,87.2,70.1,0.006,106.0,97.4
2,2,2011,Kansas,119.8,88.3,69.6,0.071,106.1,98.7
3,3,2011,Texas,114.0,85.3,67.2,-0.055,105.6,97.8
4,4,2011,Purdue,116.1,87.2,67.1,-0.004,108.1,97.3
...,...,...,...,...,...,...,...,...,...
4218,358,2023,Florida A&M;,89.0,110.7,64.1,-0.013,103.0,105.2
4219,359,2023,IUPUI,93.1,117.9,66.9,-0.079,102.1,108.0
4220,360,2023,Green Bay,90.6,116.6,65.1,0.031,104.4,106.9
4221,361,2023,LIU,85.7,114.3,72.0,-0.027,101.2,108.6


In [45]:
merged_df_seeds['Season'] = merged_df_seeds['Season'].astype(str)
merged_df_seeds.dtypes

Season            object
DayNum             int64
WTeamID            int64
WScore             int64
LTeamID            int64
LScore             int64
WLoc              object
NumOT              int64
WFGM               int64
WFGA               int64
WFGM3              int64
WFGA3              int64
WFTM               int64
WFTA               int64
WOR                int64
WDR                int64
WAst               int64
WTO                int64
WStl               int64
WBlk               int64
WPF                int64
LFGM               int64
LFGA               int64
LFGM3              int64
LFGA3              int64
LFTM               int64
LFTA               int64
LOR                int64
LDR                int64
LAst               int64
LTO                int64
LStl               int64
LBlk               int64
LPF                int64
SCOREDIFF          int64
WFGPCT           float64
W3PCT            float64
WFTPCT           float64
WORBCHANCE         int64
WORPCT           float64


In [46]:
pomeroy_df['Season'] = pomeroy_df['Season'].astype(str)
pomeroy_df.dtypes

Unnamed: 0      int64
Season         object
TeamName       object
adj_o         float64
adj_d         float64
adj_tempo     float64
luck          float64
sos_adj_o     float64
sos_adj_d     float64
dtype: object

In [None]:

# Define a function to perform fuzzy matching and merge the dataframes
def fuzzy_merge(df1, df2, on, left_prefix, right_prefix):
    merged_df = df1.copy()
    for index, row in merged_df.iterrows():
        # Fuzzy match team names
        w_team_name = row['TeamName1']
        l_team_name = row['TeamName2']
        
        # Fuzzy match W team name
        w_match = process.extractOne(w_team_name, df2['TeamName'])
        l_match = process.extractOne(l_team_name, df2['TeamName'])
        
        # Find the corresponding rows in df2
        w_row = df2[df2['TeamName'] == w_match[0]]
        l_row = df2[df2['TeamName'] == l_match[0]]
        
        # Rename columns and merge for W team
        w_row.columns = [f"{right_prefix}W_{col}" if col != on else col for col in w_row.columns]
        for col in w_row.columns:
            merged_df.loc[index, f"{left_prefix}{col}"] = w_row[col].values[0]
        
        # Rename columns and merge for L team
        l_row.columns = [f"{right_prefix}L_{col}" if col != on else col for col in l_row.columns]
        for col in l_row.columns:
            merged_df.loc[index, f"{left_prefix}{col}"] = l_row[col].values[0]
    
    return merged_df

# Perform fuzzy merge and rename columns
merged_df = fuzzy_merge(merged_df_seeds, pomeroy_df, on='TeamName', left_prefix='W', right_prefix='Pomeroy_')

# Display the merged dataframe
merged_df


In [None]:
merged_df.columns

In [None]:
def remove_prefix(column_name):
    return column_name.replace('WPomeroy_', '')

# Rename columns
merged_df.columns = [remove_prefix(col) for col in merged_df.columns]

In [None]:
merged_df.columns

# Clean Data

In [None]:
import pandas as pd

def prepare_data(df_data):
    df = df_data.copy()
    df.rename(columns={'WLoc': 'location'}, inplace=True)

    # Drop unnecessary columns
    df.drop(columns=['TeamID_x', 'TeamName1', 'FirstD1Season', 'LastD1Season', 'TeamName2', 'Seed',
                     'TeamID_y', 'W_Unnamed: 0', 'W_Season', 'WTeamName',
                     'L_Unnamed: 0', 'L_Season'], inplace=True)

    dfswap = df[['Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'location', 'NumOT',
                 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
                 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF',
                 'WTeamSeedCorr', 'LTeamSeedCorr', 'SCOREDIFF', 'WFGPCT', 'W3PCT', 'WFTPCT', 'WORBCHANCE', 'WORPCT',
                 'LFGPCT', 'L3PCT', 'LFTPCT', 'LORBCHANCE', 'LORPCT']]

    df.columns = df.columns.str.replace('W', 'T1_')
    df.columns = df.columns.str.replace('L', 'T2_')
    dfswap.columns = dfswap.columns.str.replace('L', 'T1_')
    dfswap.columns = dfswap.columns.str.replace('W', 'T2_')

    output = pd.concat([df, dfswap]).reset_index(drop=True)
    output.loc[output.location == 'N', 'location'] = '0'
    output.loc[output.location == 'H', 'location'] = '1'
    output.loc[output.location == 'A', 'location'] = '-1'
    output.location = output.location.astype(int)
    output['PointDiff'] = output['T1_Score'] - output['T2_Score']
    output['Outcome'] = (output['PointDiff'] > 0).astype(int)

    return output



In [None]:
#tournament_data = prepare_data(detailed_tourney_results)
tournament_data = prepare_data(merged_df)

In [None]:
tournament_data

In [None]:
tournament_data.columns

# Get Average amounts

In [None]:
# Choose the features that you want
# boxscore_cols = ['T1_Score', 'T2_Score',
#         'T1_FGM', 'T1_FGA', 'T1_FGM3', 'T1_FGA3', 'T1_OR', 'T1_Ast', 'T1_TO', 'T1_Stl', 'T1_PF', 
#         'T2_FGM', 'T2_FGA', 'T2_FGM3', 'T2_FGA3', 'T2_OR', 'T2_Ast', 'T2_TO', 'T2_Stl', 'T2_Blk',  
#         'PointDiff','T1_FGPCT', 'T1_3PCT', 'T1_FTPCT', 'T1_ORBCHANCE',
#        'T1_ORPCT', 'T2_FGPCT', 'T2_3PCT', 'T2_FTPCT', 'T2_ORBCHANCE',
#        'T2_ORPCT', 'T1_TeamSeedCorr', 'T2_TeamSeedCorr', 'T1__adj_o',
#        'T1__adj_d', 'T1__adj_tempo', 'T1__luck', 'T1__sos_adj_o',
#        'T1__sos_adj_d', 'T2__adj_o', 'T2__adj_d', 'T2__adj_tempo', 'T2__luck',
#        'T2__sos_adj_o', 'T2__sos_adj_d']

boxscore_cols = ['Season', 'DayNum', 'T1_TeamID', 'T1_Score', 'T2_TeamID', 'T2_Score',
       'location', 'NumOT', 'T1_FGM', 'T1_FGA', 'T1_FGM3', 'T1_FGA3', 'T1_FTM',
       'T1_FTA', 'T1_OR', 'T1_DR', 'T1_Ast', 'T1_TO', 'T1_Stl', 'T1_Blk',
       'T1_PF', 'T2_FGM', 'T2_FGA', 'T2_FGM3', 'T2_FGA3', 'T2_FTM', 'T2_FTA',
       'T2_OR', 'T2_DR', 'T2_Ast', 'T2_TO', 'T2_Stl', 'T2_Blk', 'T2_PF',
       'SCOREDIFF', 'T1_FGPCT', 'T1_3PCT', 'T1_FTPCT', 'T1_ORBCHANCE',
       'T1_ORPCT', 'T2_FGPCT', 'T2_3PCT', 'T2_FTPCT', 'T2_ORBCHANCE',
       'T2_ORPCT', 'T1_TeamSeedCorr', 'T2_TeamSeedCorr', '3 Stars_1',
       '4 Stars_1', '5 Stars_1', 'Total Stars_1', '3 Stars_2', '4 Stars_2',
       '5 Stars_2', 'Total Stars_2', 'T1__adj_o', 'T1__adj_d', 'T1__adj_tempo',
       'T1__luck', 'T1__sos_adj_o', 'T1__sos_adj_d', 'T2__adj_o', 'T2__adj_d',
       'T2__adj_tempo', 'T2__luck', 'T2__sos_adj_o', 'T2__sos_adj_d',
       'PointDiff', 'Outcome']

In [None]:
# Get season averages by team and yearb
funcs = [np.mean]
season_statistics = tournament_data.groupby(["Season", 'T1_TeamID'])[boxscore_cols].agg(funcs).reset_index()
season_statistics.columns = [''.join(col).strip() for col in season_statistics.columns.values]
season_statistics

In [None]:
season_statistics.columns

In [None]:
tournament_data = pd.merge(tournament_data, season_statistics, on = ['Season', 'T1_TeamID'], how = 'left')

In [None]:
tournament_data.columns

In [None]:
# Calculate %FGM and %FGM3 for Team 1
tournament_data['T1_FG_PCT'] = (tournament_data['T1_FGMmean'] / tournament_data['T1_FGAmean']) * 100
tournament_data['T1_FGM3_PCT'] = (tournament_data['T1_FGM3mean'] / tournament_data['T1_FGA3mean']) * 100

# Calculate %FGM and %FGM3 for Team 2
tournament_data['T2_FG_PCT'] = (tournament_data['T2_FGMmean'] / tournament_data['T2_FGAmean']) * 100
tournament_data['T2_FGM3_PCT'] = (tournament_data['T2_FGM3mean'] / tournament_data['T2_FGA3mean']) * 100


In [None]:
# Convert DataFrame's columns to a list
columns_list = tournament_data.columns.tolist()

# Print the list of columns to display all columns
print(columns_list)


# XGBoost RUN 1

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


# Load your dataframe
df = tournament_data.copy()


# Select the features
selected_features = ['T1_Scoremean', 'T2_Scoremean', 'T1_FG_PCT', 'T1_FGM3_PCT', 'T2_FG_PCT', 'T2_FGM3_PCT']

# Extract the features and target variable
X = df[selected_features]
y = df['Outcome']  # Assuming 'Outcome' indicates the winner (1 for Team 1, 0 for Team 2)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost model
model = XGBClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:

# Make probability predictions on the test data
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate the Brier score
brier_score = brier_score_loss(y_test, y_pred_proba)

print("Brier Score:", brier_score)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.metrics import mean_squared_error

# Learning Curve
train_sizes, train_scores, valid_scores = learning_curve(model, X, y, train_sizes=np.linspace(0.1, 1.0, 10), cv=5)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std, valid_scores_mean + valid_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, valid_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.title("Learning Curve")
plt.legend(loc="best")
plt.show()

# Validation Curve
param_range = np.arange(1, 11)
train_scores, valid_scores = validation_curve(model, X, y, param_name="max_depth", param_range=param_range, cv=5)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.fill_between(param_range, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(param_range, valid_scores_mean - valid_scores_std, valid_scores_mean + valid_scores_std, alpha=0.1, color="g")
plt.plot(param_range, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(param_range, valid_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.xlabel("Max Depth")
plt.ylabel("Score")
plt.title("Validation Curve")
plt.legend(loc="best")
plt.show()

# Feature Importance Plot
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(10, 6))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.array(X.columns)[sorted_idx])
plt.xlabel('Feature Importance')
plt.title('Feature Importance Plot')
plt.show()

# Residual Plot (if regression)
if isinstance(model, XGBRegressor):
    y_pred = model.predict(X)
    residuals = y - y_pred
    plt.figure(figsize=(10, 6))
    plt.scatter(y_pred, residuals)
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residual Plot')
    plt.axhline(y=0, color='r', linestyle='-')
    plt.show()


# XGBoost RUN 2

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Load your dataframe
df = tournament_data.copy()

# Add additional features if needed
# For example, you can add interactions between features, polynomial features, or other derived features

# Select features
selected_features = ['T1_Scoremean', 'T2_Scoremean', 'T1_FGMmean', 'T1_FGAmean', 'T1_FGM3mean', 'T1_FGA3mean', 
                     'T1_ORmean', 'T1_Astmean', 'T1_TOmean', 'T1_Stlmean', 'T1_PFmean', 
                     'T2_FGMmean', 'T2_FGAmean', 'T2_FGM3mean', 'T2_FGA3mean', 'T2_ORmean', 
                     'T2_Astmean', 'T2_TOmean', 'T2_Stlmean', 'T2_Blkmean', 'PointDiffmean','T1_FG_PCT',
                     'T1_FGM3_PCT', 'T2_FG_PCT', 'T2_FGM3_PCT', '5 Stars_2mean', '5 Stars_1mean']

# Extract the features and target variable
X = df[selected_features]
y = df['Outcome']  # Assuming 'Outcome' indicates the winner (1 for Team 1, 0 for Team 2)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost model
model = XGBClassifier()

# Define hyperparameters to search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001]
}

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_model = grid_search.best_estimator_

# Train the best model
best_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:

# Make probability predictions on the test data
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate the Brier score
brier_score = brier_score_loss(y_test, y_pred_proba)

print("Brier Score:", brier_score)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt="d", cbar=False)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# Generate ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Plot feature importance
feature_importance = best_model.feature_importances_
sorted_idx = np.argsort(feature_importance)

plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), np.array(X.columns)[sorted_idx])
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('Feature Importance Plot')
plt.show()


# XGBoost RUN 3

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


# Load your dataframe
df = tournament_data.copy()

label_encoder = LabelEncoder()

# Encode the 'T1_TeamSeed' and 'T2_TeamSeed' columns
df['T1_TeamSeedCorr'] = label_encoder.fit_transform(df['T1_TeamSeedCorr'])
df['T2_TeamSeedCorr'] = label_encoder.fit_transform(df['T2_TeamSeedCorr'])


# Feature engineering: Add new features or transform existing ones
# For example, you can try adding interactions between features or polynomial features

# Select features
selected_features = ['T1_Scoremean', 'T2_Scoremean', 'T1_FG_PCT', 'T1_FGM3_PCT', 'T2_FG_PCT', 'T2_FGM3_PCT', 'PointDiffmean', 'T1_TeamSeedCorr', 'T2_TeamSeedCorr']

# Extract the features and target variable
X = df[selected_features]
y = df['Outcome']  # Assuming 'Outcome' indicates the winner (1 for Team 1, 0 for Team 2)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001]
}

# Initialize the XGBoost model
model = XGBClassifier()

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_model = grid_search.best_estimator_

# Train the best model
best_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Improved Accuracy:", accuracy)


In [None]:

# Make probability predictions on the test data
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate the Brier score
brier_score = brier_score_loss(y_test, y_pred_proba)

print("Brier Score:", brier_score)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt="d", cbar=False)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# Generate ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Plot feature importance
feature_importance = best_model.feature_importances_
sorted_idx = np.argsort(feature_importance)

plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), np.array(X.columns)[sorted_idx])
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('Feature Importance Plot')
plt.show()


# Run 4

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


# Load your dataframe
df = tournament_data.copy()

label_encoder = LabelEncoder()

# Encode the 'T1_TeamSeed' and 'T2_TeamSeed' columns
df['T1_TeamSeedCorr'] = label_encoder.fit_transform(df['T1_TeamSeedCorr'])
df['T2_TeamSeedCorr'] = label_encoder.fit_transform(df['T2_TeamSeedCorr'])


# Feature engineering: Add new features or transform existing ones
# For example, you can try adding interactions between features or polynomial features

# Select features
selected_features = ['T1_Scoremean', 'T2_Scoremean', 'T1_FGMmean',
       'T1_FGAmean', 'T1_FGM3mean', 'T1_FGA3mean', 'T1_ORmean', 'T1_Astmean',
       'T1_TOmean', 'T1_Stlmean', 'T1_PFmean', 'T2_FGMmean', 'T2_FGAmean',
       'T2_FGM3mean', 'T2_FGA3mean', 'T2_ORmean', 'T2_Astmean', 'T2_TOmean',
       'T2_Stlmean', 'T2_Blkmean', 'PointDiffmean', 'T1_FGPCTmean',
       'T1_3PCTmean', 'T1_FTPCTmean', 'T1_ORBCHANCEmean', 'T1_ORPCTmean',
       'T2_FGPCTmean', 'T2_3PCTmean', 'T2_FTPCTmean', 'T2_ORBCHANCEmean',
       'T2_ORPCTmean', 'T1_TeamSeedCorrmean', 'T2_TeamSeedCorrmean',
       'T1__adj_omean', 'T1__adj_dmean', 'T1__adj_tempomean', 'T1__luckmean',
       'T1__sos_adj_omean', 'T1__sos_adj_dmean', 'T2__adj_omean',
       'T2__adj_dmean', 'T2__adj_tempomean', 'T2__luckmean',
       'T2__sos_adj_omean', 'T2__sos_adj_dmean']

# Extract the features and target variable
X = df[selected_features]
y = df['Outcome']  # Assuming 'Outcome' indicates the winner (1 for Team 1, 0 for Team 2)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001]
}

# Initialize the XGBoost model
model = XGBClassifier()

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_model = grid_search.best_estimator_

# Train the best model
best_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Improved Accuracy:", accuracy)

In [None]:

# Make probability predictions on the test data
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate the Brier score
brier_score = brier_score_loss(y_test, y_pred_proba)

print("Brier Score:", brier_score)
