# Madness of March
## Jim Haines & Josh McCoy
### [Project Website](https://joshmccoy2.github.io/NCAA_March_Madness/)

## Current Datasets
[Kaggle datasets](https://www.kaggle.com/competitions/mens-march-mania-2022/data)

# ETL & EDA

In [1]:
# Import necessary libraries
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import xgboost as xgb
import requests
import numpy as np
import csv
import re
import time
import Levenshtein 
from io import StringIO
from io import StringIO
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import brier_score_loss, accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error, r2_score, roc_curve, auc, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from scipy.special import expit 
from scipy.special import expit  
from xgboost import XGBClassifier
from bs4 import BeautifulSoup
from fuzzywuzzy import process

We can first look at how some basic factors affect the amount the winning team wins by. We want to look at the difference in score to see how dominant a team is against a competitor. A team who wins by a lot is typically a sign that this team is significantly better than its oponent.

# Regular Season Data

In [2]:
# To do this we can look at regular season game stats
detailed_results_data = 'MRegularSeasonDetailedResults.csv'
detailed_results = pd.read_csv(detailed_results_data)
detailed_results

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100418,2022,98,1400,79,1242,76,H,0,28,67,...,13,15,23,5,24,10,15,3,5,21
100419,2022,98,1411,66,1126,63,A,0,24,59,...,21,15,24,5,23,10,19,13,2,23
100420,2022,98,1422,68,1441,49,A,0,23,56,...,24,8,11,10,18,5,16,8,2,12
100421,2022,98,1438,69,1181,68,A,0,31,65,...,17,18,22,11,25,14,14,3,9,11



Let's check the datatypes

In [3]:
detailed_results.dtypes

Season      int64
DayNum      int64
WTeamID     int64
WScore      int64
LTeamID     int64
LScore      int64
WLoc       object
NumOT       int64
WFGM        int64
WFGA        int64
WFGM3       int64
WFGA3       int64
WFTM        int64
WFTA        int64
WOR         int64
WDR         int64
WAst        int64
WTO         int64
WStl        int64
WBlk        int64
WPF         int64
LFGM        int64
LFGA        int64
LFGM3       int64
LFGA3       int64
LFTM        int64
LFTA        int64
LOR         int64
LDR         int64
LAst        int64
LTO         int64
LStl        int64
LBlk        int64
LPF         int64
dtype: object

All the dtypes look correct, so lets move on to our analysis

In [4]:
# Create variable for amount by which winning team wins
detailed_results['SCOREDIFF'] = detailed_results['WScore'] - detailed_results['LScore']

When you see W or L this stands for Winning or Losing team.

In [5]:
# Winner stats
detailed_results.dtypes
detailed_results['WFGPCT'] = detailed_results['WFGM']/detailed_results['WFGA']       # field goal pt completion %
detailed_results['W3PCT'] = detailed_results['WFGM3']/detailed_results['WFGA3']      # 3 pt completion %
detailed_results['WFTPCT'] = detailed_results['WFTM']/detailed_results['WFTA']       # free throw completion %
detailed_results['WORBCHANCE'] =  detailed_results['WOR'] + detailed_results['LDR']  # total potential rebounds
detailed_results['WORPCT'] = detailed_results['WOR']/detailed_results['WORBCHANCE']  # rebound completion %

In [6]:
# Losing stats
detailed_results['LFGPCT'] = detailed_results['LFGM']/detailed_results['LFGA']       
detailed_results['L3PCT'] = detailed_results['LFGM3']/detailed_results['LFGA3']
detailed_results['LFTPCT'] = detailed_results['LFTM']/detailed_results['LFTA']
detailed_results['LORBCHANCE'] =  detailed_results['LOR'] + detailed_results['WDR']
detailed_results['LORPCT'] = detailed_results['LOR']/detailed_results['LORBCHANCE']

We can now take a more detailed look at the correlation between winner field goal % and score difference.

We can look at other factors as well:

Despite the common phrase "board man gets paid", we can see that there is a less strong correlation between rebound % and score difference. Perhaps this saying holds more true a higher level of play.

We can also take a look at detailed results from the actual tournament to see if there are any obvious differences from the regular season data. To do this we can replicate the above regular season analysis.

# Tournament Data

In [7]:
detailed_tourney_data = 'MNCAATourneyDetailedResults.csv'
detailed_tourney_results = pd.read_csv(detailed_tourney_data)
detailed_tourney_results

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1310,2023,146,1274,88,1400,81,N,0,29,49,...,25,11,15,9,14,20,13,6,1,23
1311,2023,146,1361,57,1166,56,N,0,25,66,...,17,10,11,8,24,9,9,3,2,11
1312,2023,152,1163,72,1274,59,N,0,28,57,...,20,12,12,12,17,10,9,8,1,12
1313,2023,152,1361,72,1194,71,N,0,25,57,...,22,16,21,7,24,6,9,6,2,17


In [8]:
detailed_tourney_results.dtypes

Season      int64
DayNum      int64
WTeamID     int64
WScore      int64
LTeamID     int64
LScore      int64
WLoc       object
NumOT       int64
WFGM        int64
WFGA        int64
WFGM3       int64
WFGA3       int64
WFTM        int64
WFTA        int64
WOR         int64
WDR         int64
WAst        int64
WTO         int64
WStl        int64
WBlk        int64
WPF         int64
LFGM        int64
LFGA        int64
LFGM3       int64
LFGA3       int64
LFTM        int64
LFTA        int64
LOR         int64
LDR         int64
LAst        int64
LTO         int64
LStl        int64
LBlk        int64
LPF         int64
dtype: object

In [9]:
#Create variable for amount by which winning team wins
detailed_tourney_results['SCOREDIFF'] = detailed_tourney_results['WScore'] - detailed_tourney_results['LScore']

In [10]:
# Winner stats
detailed_tourney_results.dtypes
detailed_tourney_results['WFGPCT'] = detailed_tourney_results['WFGM']/detailed_tourney_results['WFGA']       # field goal pt completion %
detailed_tourney_results['W3PCT'] = detailed_tourney_results['WFGM3']/detailed_tourney_results['WFGA3']      # 3 pt completion %
detailed_tourney_results['WFTPCT'] = detailed_tourney_results['WFTM']/detailed_tourney_results['WFTA']       # free throw completion %
detailed_tourney_results['WORBCHANCE'] =  detailed_tourney_results['WOR'] + detailed_tourney_results['LDR']  # total potential rebounds
detailed_tourney_results['WORPCT'] = detailed_tourney_results['WOR']/detailed_tourney_results['WORBCHANCE']  # rebound completion %

In [11]:
# Losing stats
detailed_tourney_results['LFGPCT'] = detailed_tourney_results['LFGM']/detailed_tourney_results['LFGA']       
detailed_tourney_results['L3PCT'] = detailed_tourney_results['LFGM3']/detailed_tourney_results['LFGA3']
detailed_tourney_results['LFTPCT'] = detailed_tourney_results['LFTM']/detailed_tourney_results['LFTA']
detailed_tourney_results['LORBCHANCE'] =  detailed_tourney_results['LOR'] + detailed_tourney_results['WDR']
detailed_tourney_results['LORPCT'] = detailed_tourney_results['LOR']/detailed_tourney_results['LORBCHANCE']

In [12]:
detailed_tourney_results.dtypes

Season          int64
DayNum          int64
WTeamID         int64
WScore          int64
LTeamID         int64
LScore          int64
WLoc           object
NumOT           int64
WFGM            int64
WFGA            int64
WFGM3           int64
WFGA3           int64
WFTM            int64
WFTA            int64
WOR             int64
WDR             int64
WAst            int64
WTO             int64
WStl            int64
WBlk            int64
WPF             int64
LFGM            int64
LFGA            int64
LFGM3           int64
LFGA3           int64
LFTM            int64
LFTA            int64
LOR             int64
LDR             int64
LAst            int64
LTO             int64
LStl            int64
LBlk            int64
LPF             int64
SCOREDIFF       int64
WFGPCT        float64
W3PCT         float64
WFTPCT        float64
WORBCHANCE      int64
WORPCT        float64
LFGPCT        float64
L3PCT         float64
LFTPCT        float64
LORBCHANCE      int64
LORPCT        float64
dtype: obj

In [13]:
detailed_tourney_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,WFGPCT,W3PCT,WFTPCT,WORBCHANCE,WORPCT,LFGPCT,L3PCT,LFTPCT,LORBCHANCE,LORPCT
0,2003,134,1421,92,1411,84,N,1,32,69,...,0.463768,0.37931,0.653846,42,0.333333,0.432836,0.387097,0.451613,47,0.361702
1,2003,136,1112,80,1436,51,N,0,31,66,...,0.469697,0.304348,0.785714,37,0.297297,0.3125,0.25,1.0,44,0.181818
2,2003,136,1113,84,1272,71,N,0,31,59,...,0.525424,0.428571,0.727273,32,0.3125,0.362319,0.25,0.666667,47,0.425532
3,2003,136,1141,79,1166,73,N,0,29,53,...,0.54717,0.428571,0.72,28,0.392857,0.45,0.411765,0.705882,34,0.411765
4,2003,136,1143,76,1301,74,N,1,27,64,...,0.421875,0.35,0.652174,44,0.409091,0.446429,0.428571,0.75,30,0.333333


# Separating It out

In [14]:
# Create a new data frame for winning teams
df_winning = detailed_tourney_results[['Season', 'DayNum', 'NumOT', 'SCOREDIFF'] + [col for col in detailed_tourney_results.columns if col.startswith('W')]].copy()
df_winning['Outcome'] = 1
df_winning['Outcome'] = df_winning['Outcome'].astype(int)


# Create a new data frame for losing teams
df_losing = detailed_tourney_results[['Season', 'DayNum', 'NumOT', 'SCOREDIFF'] + [col for col in detailed_tourney_results.columns if col.startswith('L')]].copy()
df_losing['Outcome'] = 0
df_losing['Outcome'] = df_losing['Outcome'].astype(int)

df_winning.head()
df_losing.head()
# Remove the 'W' or 'L' prefix from the column names
df_winning.columns = [col[1:] if col.startswith('W') else col for col in df_winning.columns]
df_losing.columns = [col[1:] if col.startswith('L') else col for col in df_losing.columns]
# Combine the two DataFrames
combined_df = pd.concat([df_winning, df_losing], ignore_index=True)
combined_df.head(1)

Unnamed: 0,Season,DayNum,NumOT,SCOREDIFF,TeamID,Score,Loc,FGM,FGA,FGM3,...,TO,Stl,Blk,PF,FGPCT,3PCT,FTPCT,ORBCHANCE,ORPCT,Outcome
0,2003,134,1,8,1421,92,N,32,69,11,...,12,5,3,22,0.463768,0.37931,0.653846,42,0.333333,1


In [15]:
mteams_df = pd.read_csv('MTeams.csv')

# Merge on 'TeamID'
combined_df = pd.merge(combined_df, mteams_df[['TeamID', 'TeamName']], on='TeamID', how='left')
combined_df

Unnamed: 0,Season,DayNum,NumOT,SCOREDIFF,TeamID,Score,Loc,FGM,FGA,FGM3,...,Stl,Blk,PF,FGPCT,3PCT,FTPCT,ORBCHANCE,ORPCT,Outcome,TeamName
0,2003,134,1,8,1421,92,N,32,69,11,...,5,3,22,0.463768,0.379310,0.653846,42,0.333333,1,UNC Asheville
1,2003,136,0,29,1112,80,N,31,66,7,...,10,7,8,0.469697,0.304348,0.785714,37,0.297297,1,Arizona
2,2003,136,0,13,1113,84,N,31,59,6,...,7,4,19,0.525424,0.428571,0.727273,32,0.312500,1,Arizona St
3,2003,136,0,6,1141,79,N,29,53,3,...,13,1,19,0.547170,0.428571,0.720000,28,0.392857,1,C Michigan
4,2003,136,1,2,1143,76,N,27,64,7,...,8,2,14,0.421875,0.350000,0.652174,44,0.409091,1,California
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2625,2023,146,0,7,1400,81,,30,60,10,...,6,1,23,0.500000,0.400000,0.733333,28,0.321429,0,Texas
2626,2023,146,0,1,1166,56,,22,55,2,...,3,2,11,0.400000,0.117647,0.909091,31,0.258065,0,Creighton
2627,2023,152,0,13,1274,59,,20,62,7,...,8,1,12,0.322581,0.350000,1.000000,39,0.307692,0,Miami FL
2628,2023,152,0,1,1194,71,,23,52,9,...,6,2,17,0.442308,0.409091,0.761905,30,0.233333,0,FL Atlantic


In [16]:
columns_to_keep = ['Season', 'TeamID', 'TeamName']
combined_df = combined_df[columns_to_keep]

# Drop duplicates within each season
combined_df = combined_df.groupby('Season').apply(lambda x: x.drop_duplicates(subset=['TeamID', 'TeamName']))

# Reset index after dropping duplicates
combined_df = combined_df.reset_index(drop=True)

# Merge with MNCAATourneySeeds.csv to get seeds of each team

In [17]:
# Load the seeds dataframe
seeds_df = pd.read_csv('MNCAATourneySeeds.csv')

# Extract the integer part from the Seed column and store it as integer
seeds_df['Seed_correct'] = seeds_df['Seed'].str.extract('(\d+)').astype(int)
seeds_df

Unnamed: 0,Season,Seed,TeamID,Seed_correct
0,1985,W01,1207,1
1,1985,W02,1210,2
2,1985,W03,1228,3
3,1985,W04,1260,4
4,1985,W05,1374,5
...,...,...,...,...
2485,2023,Z12,1433,12
2486,2023,Z13,1233,13
2487,2023,Z14,1213,14
2488,2023,Z15,1421,15


In [18]:
merged_df = pd.merge(combined_df, seeds_df,on=['TeamID', 'Season'], how='left')

In [19]:
merged_df

Unnamed: 0,Season,TeamID,TeamName,Seed,Seed_correct
0,2003,1421,UNC Asheville,X16b,16
1,2003,1112,Arizona,Z01,1
2,2003,1113,Arizona St,Z10,10
3,2003,1141,C Michigan,Z11,11
4,2003,1143,California,W08,8
...,...,...,...,...,...
1330,2023,1179,Drake,Y12,12
1331,2023,1425,USC,W10,10
1332,2023,1235,Iowa St,Y06,6
1333,2023,1433,VCU,Z12,12


# Merge 247 data (stars)

In [20]:
playerRating_df = pd.DataFrame()

# Loop through each year, read the CSV, add a 'Year' column, and append to the main DataFrame
for year in range(2011, 2025):
    file_name = f'recruits_{year}.csv'
    if os.path.exists(file_name):  # Check if the file exists
        df = pd.read_csv(file_name)
        df['Year'] = year  # Add a column for the year
        playerRating_df = pd.concat([playerRating_df, df], ignore_index=True)

# Save the combined data to a new CSV file
playerRating_df.to_csv('combined_recruits.csv', index=False)

# If you want to see the DataFrame
playerRating_df

Unnamed: 0,Rank,Player,High School,Position,Height / Weight,Rating,Team,Year
0,1,Anthony Davis,"Anthony DavisPerspectives Charter (Chicago, IL)",PF,6-10 / 220,100,Kentucky,2011
1,2,Michael Kidd-Gilchrist,"Michael Kidd-GilchristSt. Patrick (Elizabeth, NJ)",SF,6-6 / 190,100,Kentucky,2011
2,3,Austin Rivers,"Austin RiversWinter Park (Winter Park, FL)",PG,6-3 / 175,100,Duke,2011
3,4,Bradley Beal,"Bradley BealChaminade (Saint Louis, MO)",SG,6-4 / 195,100,Florida,2011
4,5,Quincy Miller,Quincy MillerWestchester Country Day School (H...,PF,6-8 / 193,100,Baylor,2011
...,...,...,...,...,...,...,...,...
2925,146,Bryce Dortch,Bryce DortchBrimmer And May School (Chestnut H...,PF,6-8 / 190,88,Rutgers,2024
2926,147,Angelo Ciaravino,"Angelo CiaravinoMount Carmel (Chicago, IL)",SF,6-5 / 175,88,Northwestern,2024
2927,148,Malcolm Thomas,"Malcolm ThomasDeMatha Catholic (Hyattsville, MD)",PF,6-8 / 195,88,Villanova,2024
2928,149,Chris Tadjo,Chris TadjoNBA Academy Latin America (Montreal...,PF,6-8 / 220,88,Iowa,2024


In [21]:
def rating_to_stars(rating):
    if 100 >= rating >= 98:
        return 5
    elif 97 >= rating >= 94:
        return 4
    elif 93 >= rating >= 0:
        return 3
    else:
        return None  # or some default value for ratings outside the specified ranges

# Apply the function to the 'Rating' column to create the 'Star' column
playerRating_df['Stars'] = playerRating_df['Rating'].apply(rating_to_stars)

# Now you can check the first few rows of your DataFrame
playerRating_df.head()

Unnamed: 0,Rank,Player,High School,Position,Height / Weight,Rating,Team,Year,Stars
0,1,Anthony Davis,"Anthony DavisPerspectives Charter (Chicago, IL)",PF,6-10 / 220,100,Kentucky,2011,5
1,2,Michael Kidd-Gilchrist,"Michael Kidd-GilchristSt. Patrick (Elizabeth, NJ)",SF,6-6 / 190,100,Kentucky,2011,5
2,3,Austin Rivers,"Austin RiversWinter Park (Winter Park, FL)",PG,6-3 / 175,100,Duke,2011,5
3,4,Bradley Beal,"Bradley BealChaminade (Saint Louis, MO)",SG,6-4 / 195,100,Florida,2011,5
4,5,Quincy Miller,Quincy MillerWestchester Country Day School (H...,PF,6-8 / 193,100,Baylor,2011,5


In [22]:
top_players = playerRating_df.groupby('Year')['Rating'].min()
print(top_players)

Year
2011    70
2012    10
2013    83
2014    87
2015    87
2016    10
2017    87
2018    88
2019    88
2020    90
2021    89
2022    89
2023    88
2024    88
Name: Rating, dtype: int64


In [23]:
def adjust_rating(row):
    if row['Rating'] == 10:
        return 101
    elif row['Player'] == 'Josh Jackson':
        return 102
    else:
        return row['Rating']

# Apply the function to each row
playerRating_df['Rating'] = playerRating_df.apply(adjust_rating, axis=1)


In [24]:
columns = ['Rank', 'Player', 'Team', 'Year', 'Stars']

playerRating_df = playerRating_df[columns]
playerRating_df.head(1)

Unnamed: 0,Rank,Player,Team,Year,Stars
0,1,Anthony Davis,Kentucky,2011,5


In [25]:
# Group the DataFrame by 'Team' and 'Year', and count occurrences of 3, 4, and 5 stars
team_year_counts = playerRating_df.groupby(['Team', 'Year'])['Stars'].value_counts().unstack(fill_value=0)

# Calculate the total stars by multiplying the count of each star rating by its corresponding rating
total_stars = (team_year_counts[[3, 4, 5]] * [3, 4, 5]).sum(axis=1)

# Select only the columns corresponding to 3, 4, and 5 stars
byTeamStars = team_year_counts[[3, 4, 5]]

# Add the total stars to the DataFrame
byTeamStars['Total Stars'] = total_stars

# Reset index to make 'Team' and 'Year' columns regular columns instead of index
byTeamStars = byTeamStars.reset_index()

# Rename columns for clarity
byTeamStars.columns.name = None  # Remove column name for better formatting
byTeamStars.columns = ['Team', 'Year', '3 Stars', '4 Stars', '5 Stars', 'Total Stars']

# Print the resulting DataFrame
byTeamStars

Unnamed: 0,Team,Year,3 Stars,4 Stars,5 Stars,Total Stars
0,Alabama,2011,2,2,0,14
1,Alabama,2012,0,0,1,5
2,Alabama,2013,1,1,0,7
3,Alabama,2014,2,2,0,14
4,Alabama,2015,3,0,0,9
...,...,...,...,...,...,...
1272,Xavier,2021,2,0,0,6
1273,Xavier,2022,2,0,0,6
1274,Xavier,2023,4,0,0,12
1275,Xavier,2024,1,0,0,3


In [26]:
# Convert the "Year" column to string type
byTeamStars['Year'] = byTeamStars['Year'].astype(str)

In [27]:
byTeamStars.dtypes

Team           object
Year           object
3 Stars         int64
4 Stars         int64
5 Stars         int64
Total Stars     int64
dtype: object

In [28]:
byu_data = byTeamStars[byTeamStars['Team'] == 'Western Kentucky']
byu_data

Unnamed: 0,Team,Year,3 Stars,4 Stars,5 Stars,Total Stars
1236,Western Kentucky,2011,2,1,0,10
1237,Western Kentucky,2015,0,1,0,4
1238,Western Kentucky,2017,1,1,1,12
1239,Western Kentucky,2018,1,0,1,8
1240,Western Kentucky,2019,1,0,0,3
1241,Western Kentucky,2021,1,0,0,3


In [29]:
byTeamStars = byTeamStars.rename(columns={'Year': 'Season'})
byTeamStars['Season'] = byTeamStars['Season'].astype(int)
byTeamStars = byTeamStars.rename(columns={'Team': 'TeamName'})
byTeamStars.dtypes

TeamName       object
Season          int64
3 Stars         int64
4 Stars         int64
5 Stars         int64
Total Stars     int64
dtype: object

In [30]:
merged_df.dtypes

Season           int64
TeamID           int64
TeamName        object
Seed            object
Seed_correct     int64
dtype: object

In [31]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def fuzzy_merge(df1, df2, key1, key2, threshold=90, limit=1):
    """
    :param df1: the left table to join
    :param df2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: how close the matches should be to return a match, based on Levenshtein distance
    :param limit: the amount of matches that will get returned, these are sorted high to low
    :return: dataframe with boths keys and matches
    """
    s = df2[key2].tolist()

    m = df1[key1].apply(lambda x: process.extract(x, s, limit=limit))    
    df1['matches'] = m

    m2 = df1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df1['matches'] = m2

    return df1

# Apply fuzzy matching on TeamName column
df1_matched = fuzzy_merge(merged_df, byTeamStars, 'TeamName', 'TeamName', threshold=80)

# Now merge the DataFrames on Season and matched TeamName
merged_df = pd.merge(df1_matched, byTeamStars, how='left', on=['Season', 'TeamName'])

# If needed, drop the 'matches' column
merged_df.drop('matches', axis=1, inplace=True)


In [32]:
merged_df.dtypes

Season            int64
TeamID            int64
TeamName         object
Seed             object
Seed_correct      int64
3 Stars         float64
4 Stars         float64
5 Stars         float64
Total Stars     float64
dtype: object

In [33]:
merged_df['Season'] = merged_df['Season'].astype(int)

### Now filter from 2011 

In [34]:
filtered_df = merged_df[merged_df['Season'] >= 2011]
filtered_df.head(60)

Unnamed: 0,Season,TeamID,TeamName,Seed,Seed_correct,3 Stars,4 Stars,5 Stars,Total Stars
520,2011,1155,Clemson,W12a,12,3.0,0.0,0.0,9.0
521,2011,1421,UNC Asheville,Y16b,16,,,,
522,2011,1427,UT San Antonio,W16b,16,,,,
523,2011,1433,VCU,Z11b,11,,,,
524,2011,1139,Butler,Y08,8,2.0,0.0,0.0,6.0
525,2011,1140,BYU,Y03,3,,,,
526,2011,1153,Cincinnati,X06,6,2.0,0.0,0.0,6.0
527,2011,1163,Connecticut,X03,3,0.0,1.0,1.0,9.0
528,2011,1196,Florida,Y02,2,1.0,0.0,1.0,8.0
529,2011,1211,Gonzaga,Y11,11,4.0,0.0,0.0,12.0


In [35]:
filtered_df.isna().sum()

Season            0
TeamID            0
TeamName          0
Seed              0
Seed_correct      0
3 Stars         414
4 Stars         414
5 Stars         414
Total Stars     414
dtype: int64

In [36]:
filtered_df[['3 Stars', '4 Stars', '5 Stars', 'Total Stars']] = filtered_df[['3 Stars', '4 Stars', '5 Stars', 'Total Stars']].fillna(0)
filtered_df.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[['3 Stars', '4 Stars', '5 Stars', 'Total Stars']] = filtered_df[['3 Stars', '4 Stars', '5 Stars', 'Total Stars']].fillna(0)


Season          0
TeamID          0
TeamName        0
Seed            0
Seed_correct    0
3 Stars         0
4 Stars         0
5 Stars         0
Total Stars     0
dtype: int64

# NOTES HERE JIM

THE Nans are fine it just means they didnt have anyone in the recruiting class that had stars most likely. We however need to verify this. We also have a flaw with Stars data because it is based on recruiting class. Therefore, we need to match this star data with player roster data to make it more useful and then count the number of stars on a team

# Eventually Merge in KENPOM

In [37]:
Pomeroy_data = 'kenpom_2023.csv'
pomeroy_df = pd.read_csv(Pomeroy_data)
pomeroy_df

# Display df
pomeroy_df
# team_count_per_season_pomeroy = pomeroy_df.groupby('Season')['TeamName'].nunique()
# team_count_per_season_pomeroy

Unnamed: 0.1,Unnamed: 0,Season,TeamName,adj_o,adj_d,adj_tempo,luck,sos_adj_o,sos_adj_d
0,0,2011,Ohio St.,125.4,88.4,66.0,0.043,107.4,98.3
1,1,2011,Duke,118.8,87.2,70.1,0.006,106.0,97.4
2,2,2011,Kansas,119.8,88.3,69.6,0.071,106.1,98.7
3,3,2011,Texas,114.0,85.3,67.2,-0.055,105.6,97.8
4,4,2011,Purdue,116.1,87.2,67.1,-0.004,108.1,97.3
...,...,...,...,...,...,...,...,...,...
4218,358,2023,Florida A&M;,89.0,110.7,64.1,-0.013,103.0,105.2
4219,359,2023,IUPUI,93.1,117.9,66.9,-0.079,102.1,108.0
4220,360,2023,Green Bay,90.6,116.6,65.1,0.031,104.4,106.9
4221,361,2023,LIU,85.7,114.3,72.0,-0.027,101.2,108.6


# DO NOT TOUCH

In [38]:
#import pandas as pd
#from fuzzywuzzy import process, fuzz

# Assuming filtered_df and pomeroy_df are already defined

# Define the fuzzy matching function
#def fuzzy_merge(row):
 #   return process.extractOne(row['TeamName'], filtered_df['TeamName'], scorer=fuzz.token_sort_ratio)[1]

# Apply the fuzzy matching function to pomeroy_df
#pomeroy_df['fuzzy_ratio'] = pomeroy_df.apply(fuzzy_merge, axis=1)

# Filter out low fuzzy match ratios if needed
#threshold = 70 # You can adjust this threshold as per your requirement
#pomeroy_df_filtered = pomeroy_df[pomeroy_df['fuzzy_ratio'] >= threshold]

# Merge the DataFrames based on Season and TeamName using a left merge
#merged_df = pd.merge(filtered_df, pomeroy_df_filtered, on=['Season', 'TeamName'], how='left')

# Drop the 'fuzzy_ratio' column if you don't need it anymore
#pomeroy_df_filtered.drop(columns=['fuzzy_ratio'], inplace=True)

# Now merged_df contains all rows from filtered_df, with matching rows from pomeroy_df_filtered if available
# You can locate the unmatched rows by checking for NaN values in columns introduced from pomeroy_df_filtered
#unmatched_rows = merged_df[merged_df['fuzzy_ratio'].isna()]

# You can then further analyze or store these unmatched rows as needed


In [39]:
#merged_df.head()
#merged_df['TeamID'] = merged_df['TeamID'].astype(int)
#merged_df
#merged_df.isna().sum()
#unmatched_rows
#Checking here 
# team_count_per_season = merged_df.groupby('Season')['TeamID'].nunique()
# team_count_per_season

# READ THIS: 

So we have an issue with the fuzzy match.... the names are drastically different

# Getting the Tournament Data for a game

In [40]:
detailed_tourney_results.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
       'SCOREDIFF', 'WFGPCT', 'W3PCT', 'WFTPCT', 'WORBCHANCE', 'WORPCT',
       'LFGPCT', 'L3PCT', 'LFTPCT', 'LORBCHANCE', 'LORPCT'],
      dtype='object')

# Clean Data

In [41]:
def prepare_data(df_data):
    df = df_data.copy()
    df.rename(columns={'WLoc': 'location'}, inplace=True)

    # Drop unnecessary columns

    dfswap = df[['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'location',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
       'SCOREDIFF', 'WFGPCT', 'W3PCT', 'WFTPCT', 'WORBCHANCE', 'WORPCT',
       'LFGPCT', 'L3PCT', 'LFTPCT', 'LORBCHANCE', 'LORPCT']]

    df.columns = df.columns.str.replace('W', 'T1_')
    df.columns = df.columns.str.replace('L', 'T2_')
    dfswap.columns = dfswap.columns.str.replace('L', 'T1_')
    dfswap.columns = dfswap.columns.str.replace('W', 'T2_')

    output = pd.concat([df, dfswap]).reset_index(drop=True)
    output.loc[output.location == 'N', 'location'] = '0'
    output.loc[output.location == 'H', 'location'] = '1'
    output.loc[output.location == 'A', 'location'] = '-1'
    output.location = output.location.astype(int)
    output['PointDiff'] = output['T1_Score'] - output['T2_Score']
    output['Outcome'] = (output['PointDiff'] > 0).astype(int)

    return output

In [42]:
#tournament_data = prepare_data(detailed_tourney_results)
tournament_data = prepare_data(detailed_tourney_results)
tournament_data

Unnamed: 0,Season,DayNum,T1_TeamID,T1_Score,T2_TeamID,T2_Score,location,NumOT,T1_FGM,T1_FGA,...,T1_FTPCT,T1_ORBCHANCE,T1_ORPCT,T2_FGPCT,T2_3PCT,T2_FTPCT,T2_ORBCHANCE,T2_ORPCT,PointDiff,Outcome
0,2003,134,1421,92,1411,84,0,1,32,69,...,0.653846,42,0.333333,0.432836,0.387097,0.451613,47,0.361702,8,1
1,2003,136,1112,80,1436,51,0,0,31,66,...,0.785714,37,0.297297,0.312500,0.250000,1.000000,44,0.181818,29,1
2,2003,136,1113,84,1272,71,0,0,31,59,...,0.727273,32,0.312500,0.362319,0.250000,0.666667,47,0.425532,13,1
3,2003,136,1141,79,1166,73,0,0,29,53,...,0.720000,28,0.392857,0.450000,0.411765,0.705882,34,0.411765,6,1
4,2003,136,1143,76,1301,74,0,1,27,64,...,0.652174,44,0.409091,0.446429,0.428571,0.750000,30,0.333333,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2625,2023,146,1400,81,1274,88,0,0,30,60,...,0.733333,28,0.321429,0.591837,0.250000,0.875000,18,0.222222,-7,0
2626,2023,146,1166,56,1361,57,0,0,22,55,...,0.909091,31,0.258065,0.378788,0.230769,0.666667,36,0.333333,-1,0
2627,2023,152,1274,59,1163,72,0,0,20,62,...,1.000000,39,0.307692,0.491228,0.346154,0.538462,30,0.433333,-13,0
2628,2023,152,1194,71,1361,72,0,0,23,52,...,0.761905,30,0.233333,0.438596,0.500000,0.590909,36,0.333333,-1,0


In [43]:
tournament_data.columns

Index(['Season', 'DayNum', 'T1_TeamID', 'T1_Score', 'T2_TeamID', 'T2_Score',
       'location', 'NumOT', 'T1_FGM', 'T1_FGA', 'T1_FGM3', 'T1_FGA3', 'T1_FTM',
       'T1_FTA', 'T1_OR', 'T1_DR', 'T1_Ast', 'T1_TO', 'T1_Stl', 'T1_Blk',
       'T1_PF', 'T2_FGM', 'T2_FGA', 'T2_FGM3', 'T2_FGA3', 'T2_FTM', 'T2_FTA',
       'T2_OR', 'T2_DR', 'T2_Ast', 'T2_TO', 'T2_Stl', 'T2_Blk', 'T2_PF',
       'SCOREDIFF', 'T1_FGPCT', 'T1_3PCT', 'T1_FTPCT', 'T1_ORBCHANCE',
       'T1_ORPCT', 'T2_FGPCT', 'T2_3PCT', 'T2_FTPCT', 'T2_ORBCHANCE',
       'T2_ORPCT', 'PointDiff', 'Outcome'],
      dtype='object')

Below this I am separating out team 1 and team 2 stats to then recombine

In [44]:

# Splitting dataframe based on T1 and T2
t1_columns = [col for col in tournament_data.columns if col.startswith('T1_')]
t2_columns = [col for col in tournament_data.columns if col.startswith('T2_')]

# Creating separate dataframes for T1 and T2
t1_df = tournament_data[['Season'] + ['DayNum'] + t1_columns + ['SCOREDIFF','T2_TeamID']]
t2_df = tournament_data[['Season'] + ['DayNum'] + t2_columns + ['SCOREDIFF','T1_TeamID']]
# Now t1_df and t2_df contain the desired dataframes with 'Season' and respective team information


In [45]:
t1_df
t1_df.rename(columns={'T1_TeamID': 'TeamID'}, inplace=True)
t2_df.rename(columns={'T2_TeamID': 'TeamID'}, inplace=True)

t2_df = t2_df[t2_df['Season'] >= 2011]
t2_df
team_count_per_season_t2 = t2_df.groupby('Season')['TeamID'].nunique()
team_count_per_season_t2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t1_df.rename(columns={'T1_TeamID': 'TeamID'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t2_df.rename(columns={'T2_TeamID': 'TeamID'}, inplace=True)


Season
2011    68
2012    68
2013    68
2014    68
2015    68
2016    68
2017    68
2018    68
2019    68
2021    67
2022    68
2023    68
Name: TeamID, dtype: int64

In [46]:
team_count_per_season_t1 = t1_df.groupby('Season')['TeamID'].nunique()
team_count_per_season_t1

Season
2003    65
2004    65
2005    65
2006    65
2007    65
2008    65
2009    65
2010    65
2011    68
2012    68
2013    68
2014    68
2015    68
2016    68
2017    68
2018    68
2019    68
2021    67
2022    68
2023    68
Name: TeamID, dtype: int64

In [47]:
merged_df_t1 = pd.merge(t1_df, merged_df, on=['Season', 'TeamID'], how='left')
merged_df_t1.rename(columns=lambda x: 'T1_' + x if x not in ['Season', 'TeamID'] else x, inplace=True)
merged_df_t1 = merged_df_t1[merged_df_t1['Season'] >= 2011]
merged_df_t1

Unnamed: 0,Season,T1_DayNum,TeamID,T1_T1_Score,T1_T1_FGM,T1_T1_FGA,T1_T1_FGM3,T1_T1_FGA3,T1_T1_FTM,T1_T1_FTA,...,T1_T1_ORPCT,T1_SCOREDIFF,T1_T2_TeamID,T1_TeamName,T1_Seed,T1_Seed_correct,T1_3 Stars,T1_4 Stars,T1_5 Stars,T1_Total Stars
512,2011,134,1155,70,26,50,4,13,14,16,...,0.153846,18,1412,Clemson,W12a,12,3.0,0.0,0.0,9.0
513,2011,134,1421,81,27,54,4,12,23,28,...,0.187500,4,1114,UNC Asheville,Y16b,16,,,,
514,2011,135,1427,70,23,54,4,16,20,30,...,0.263158,9,1106,UT San Antonio,W16b,16,,,,
515,2011,135,1433,59,20,59,9,24,10,15,...,0.395349,13,1425,VCU,Z11b,11,,,,
516,2011,136,1139,60,22,54,7,26,9,11,...,0.529412,2,1330,Butler,Y08,8,2.0,0.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2625,2023,146,1400,81,30,60,10,25,11,15,...,0.321429,7,1274,Texas,Y02,2,1.0,0.0,0.0,3.0
2626,2023,146,1166,56,22,55,2,17,10,11,...,0.258065,1,1361,Creighton,X06,6,,,,
2627,2023,152,1274,59,20,62,7,20,12,12,...,0.307692,13,1163,Miami FL,Y05,5,,,,
2628,2023,152,1194,71,23,52,9,22,16,21,...,0.233333,1,1361,FL Atlantic,W09,9,,,,


In [48]:
merged_df_t2 = pd.merge(t2_df, merged_df, on=['Season', 'TeamID'], how='left')
merged_df_t2.rename(columns=lambda x: 'T2_' + x if x not in ['Season', 'TeamID'] else x, inplace=True)

merged_df_t2 = merged_df_t2[merged_df_t2['Season'] >= 2011]

In [49]:
merged_df_t1
merged_df_t1.isna().sum()

Season               0
T1_DayNum            0
TeamID               0
T1_T1_Score          0
T1_T1_FGM            0
T1_T1_FGA            0
T1_T1_FGM3           0
T1_T1_FGA3           0
T1_T1_FTM            0
T1_T1_FTA            0
T1_T1_OR             0
T1_T1_DR             0
T1_T1_Ast            0
T1_T1_TO             0
T1_T1_Stl            0
T1_T1_Blk            0
T1_T1_PF             0
T1_T1_FGPCT          0
T1_T1_3PCT           0
T1_T1_FTPCT          0
T1_T1_ORBCHANCE      0
T1_T1_ORPCT          0
T1_SCOREDIFF         0
T1_T2_TeamID         0
T1_TeamName          0
T1_Seed              0
T1_Seed_correct      0
T1_3 Stars         644
T1_4 Stars         644
T1_5 Stars         644
T1_Total Stars     644
dtype: int64

In [50]:
combined_df = pd.merge(merged_df_t1, merged_df_t2, left_on=['Season', 'T1_T2_TeamID'], right_on=['Season', 'TeamID'], how='left')
combined_df
merged_df_t2.isna().sum()

Season               0
T2_DayNum            0
TeamID               0
T2_T2_Score          0
T2_T2_FGM            0
T2_T2_FGA            0
T2_T2_FGM3           0
T2_T2_FGA3           0
T2_T2_FTM            0
T2_T2_FTA            0
T2_T2_OR             0
T2_T2_DR             0
T2_T2_Ast            0
T2_T2_TO             0
T2_T2_Stl            0
T2_T2_Blk            0
T2_T2_PF             0
T2_T2_FGPCT          0
T2_T2_3PCT           0
T2_T2_FTPCT          0
T2_T2_ORBCHANCE      0
T2_T2_ORPCT          0
T2_SCOREDIFF         0
T2_T1_TeamID         0
T2_TeamName          0
T2_Seed              0
T2_Seed_correct      0
T2_3 Stars         644
T2_4 Stars         644
T2_5 Stars         644
T2_Total Stars     644
dtype: int64

# XGBoost RUN 1

In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


# Load your dataframe
df = tournament_data.copy()


# Select the features
selected_features = ['Season', 'DayNum', 'T1_TeamID', 'T1_Score', 'T2_TeamID', 'T2_Score',
       'location', 'NumOT', 'T1_FGM', 'T1_FGA', 'T1_FGM3', 'T1_FGA3', 'T1_FTM',
       'T1_FTA', 'T1_OR', 'T1_DR', 'T1_Ast', 'T1_TO', 'T1_Stl', 'T1_Blk',
       'T1_PF', 'T2_FGM', 'T2_FGA', 'T2_FGM3', 'T2_FGA3', 'T2_FTM', 'T2_FTA',
       'T2_OR', 'T2_DR', 'T2_Ast', 'T2_TO', 'T2_Stl', 'T2_Blk', 'T2_PF',
       'SCOREDIFF', 'T1_FGPCT', 'T1_3PCT', 'T1_FTPCT', 'T1_ORBCHANCE',
       'T1_ORPCT', 'T2_FGPCT', 'T2_3PCT', 'T2_FTPCT', 'T2_ORBCHANCE',
       'T2_ORPCT', 'PointDiff']

# Extract the features and target variable
X = df[selected_features]
y = df['Outcome']  # Assuming 'Outcome' indicates the winner (1 for Team 1, 0 for Team 2)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost model
model = XGBClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 1.0


In [52]:

# Make probability predictions on the test data
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate the Brier score
brier_score = brier_score_loss(y_test, y_pred_proba)

print("Brier Score:", brier_score)

NameError: name 'best_model' is not defined

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.metrics import mean_squared_error

# Learning Curve
train_sizes, train_scores, valid_scores = learning_curve(model, X, y, train_sizes=np.linspace(0.1, 1.0, 10), cv=5)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std, valid_scores_mean + valid_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, valid_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.title("Learning Curve")
plt.legend(loc="best")
plt.show()

# Validation Curve
param_range = np.arange(1, 11)
train_scores, valid_scores = validation_curve(model, X, y, param_name="max_depth", param_range=param_range, cv=5)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.fill_between(param_range, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(param_range, valid_scores_mean - valid_scores_std, valid_scores_mean + valid_scores_std, alpha=0.1, color="g")
plt.plot(param_range, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(param_range, valid_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.xlabel("Max Depth")
plt.ylabel("Score")
plt.title("Validation Curve")
plt.legend(loc="best")
plt.show()

# Feature Importance Plot
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(10, 6))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.array(X.columns)[sorted_idx])
plt.xlabel('Feature Importance')
plt.title('Feature Importance Plot')
plt.show()

# Residual Plot (if regression)
if isinstance(model, XGBRegressor):
    y_pred = model.predict(X)
    residuals = y - y_pred
    plt.figure(figsize=(10, 6))
    plt.scatter(y_pred, residuals)
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residual Plot')
    plt.axhline(y=0, color='r', linestyle='-')
    plt.show()
