In [1]:

import pandas as pd
import numpy as np
import pickle
from bs4 import BeautifulSoup
from bs4 import NavigableString

from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing
from sklearn import model_selection 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## Import Data

In [2]:
# Load necessary data
data_dir = 'mens-machine-learning-competition-2019/DataFiles/'
df_seeds = pd.read_csv(data_dir + 'NCAATourneySeeds.csv')
df_tour = pd.read_csv(data_dir + 'NCAATourneyCompactResults.csv')
df_regular_season = pd.read_csv(data_dir + 'RegularSeasonCompactResults.csv')
df_conference_games = pd.read_csv(data_dir + 'ConferenceTourneyGames.csv')
df_team_rpis = pd.read_csv("RPI/TeamRPI.csv") # scrapped from the web
df_teams = pd.read_csv(data_dir + "Teams.csv")
df_tourn_slots = pd.read_csv(data_dir + "NCAATourneySlots.csv")
df_team_conferences = pd.read_csv(data_dir + "TeamConferences.csv")
df_sos_rankings = pd.read_csv("sos.csv")

In [3]:
# matchups w/ teamid's
df_tourney_strong_seeds = df_seeds.copy().rename(columns={'Seed': 'StrongSeed', 'TeamID': 'FirstTeamID'})
df_tourney_slot_games = pd.merge(left=df_tourn_slots, right=df_tourney_strong_seeds, how='left', on=['Season', 'StrongSeed'])

df_tourney_week_seeds = df_seeds.copy().rename(columns={'Seed': 'WeakSeed', 'TeamID': 'SecondTeamID'})
df_tourney_slot_games = pd.merge(left=df_tourney_slot_games, right=df_tourney_week_seeds, how='left', on=['Season', 'WeakSeed'])
df_tourney_slot_games = df_tourney_slot_games.drop(df_tourney_slot_games[df_tourney_slot_games.StrongSeed.str.len() > 3].index)
df_tourney_slot_games['StrongSeed'] = df_tourney_slot_games['StrongSeed'].str[1:3].apply(pd.to_numeric)
df_tourney_slot_games['WeakSeed'] = df_tourney_slot_games['WeakSeed'].str[1:3].apply(pd.to_numeric)
# df_tourney_slot_games = df_tourney_slot_games.dropna()
df_tourney_slot_games.loc[df_tourney_slot_games.Season == 2019].head(32)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,FirstTeamID,SecondTeamID
2189,2019,R1W2,2,15,1277.0,1133.0
2190,2019,R1W3,3,14,1261.0,1463.0
2191,2019,R1W4,4,13,1439.0,1387.0
2192,2019,R1W5,5,12,1280.0,1251.0
2193,2019,R1W6,6,11,1268.0,1125.0
2194,2019,R1W7,7,10,1257.0,1278.0
2195,2019,R1W8,8,9,1433.0,1416.0
2196,2019,R1X1,1,16,1211.0,1192.0
2197,2019,R1X2,2,15,1276.0,1285.0
2198,2019,R1X3,3,14,1403.0,1297.0


In [4]:
# collect win counts for each team per season
df_regular_season.drop(labels=['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'], inplace=True, axis=1)
df_win_counts = df_regular_season.groupby(['Season', 'WTeamID']).count().reset_index()
df_win_counts = df_win_counts.rename(columns = {'LTeamID': 'RWinCount', 'WTeamID': 'TeamID'})

In [5]:
# collectin loss counts for each team per son
df_loss_counts = df_regular_season.groupby(['Season', 'LTeamID']).count().reset_index()
df_loss_counts = df_loss_counts.rename(columns = {'WTeamID': 'RLossCount', 'LTeamID': 'TeamID'})

In [6]:
# merge win/loss counts and show regular season win percentage
df_regular_season_totals = pd.merge(left=df_win_counts, right=df_loss_counts, how='left', on=['Season', 'TeamID'])
df_regular_season_totals['RWinPerc'] = df_regular_season_totals.RWinCount / (df_regular_season_totals.RWinCount + df_regular_season_totals.RLossCount)
df_regular_season_totals.head()

Unnamed: 0,Season,TeamID,RWinCount,RLossCount,RWinPerc
0,1985,1102,5,19.0,0.208333
1,1985,1103,9,14.0,0.391304
2,1985,1104,21,9.0,0.7
3,1985,1106,10,14.0,0.416667
4,1985,1108,19,6.0,0.76


In [7]:
# drop unused labels
df_conference_games.drop(labels=['ConfAbbrev', 'DayNum'], inplace=True, axis=1)

In [8]:
# gather win/loss totals for conference tournaments and win %
df_c_win_counts = df_conference_games.groupby(['Season', 'WTeamID']).count().reset_index()
df_c_win_counts = df_c_win_counts.rename(columns = {'LTeamID': 'CWinCount', 'WTeamID': 'TeamID'})
df_c_loss_counts = df_conference_games.groupby(['Season', 'LTeamID']).count().reset_index()
df_c_loss_counts = df_c_loss_counts.rename(columns = {'WTeamID': 'CLossCount', 'LTeamID': 'TeamID'})
df_conference_tournament_totals = pd.merge(left=df_c_win_counts, right=df_c_loss_counts, how='left', on=['Season', 'TeamID'])
df_conference_tournament_totals = df_conference_tournament_totals.fillna(value={'CWinCount': 0, 'CLossCount': 0})
df_conference_tournament_totals['CWinPerc'] = df_conference_tournament_totals.CWinCount / (df_conference_tournament_totals.CWinCount + df_conference_tournament_totals.CLossCount)
df_conference_tournament_totals.head()

Unnamed: 0,Season,TeamID,CWinCount,CLossCount,CWinPerc
0,2001,1104,1,1.0,0.5
1,2001,1106,3,0.0,1.0
2,2001,1108,2,1.0,0.666667
3,2001,1111,1,1.0,0.5
4,2001,1114,1,1.0,0.5


In [9]:
# combine regular season and conference tournament win/loss
df_win_totals = pd.merge(left=df_regular_season_totals, right=df_conference_tournament_totals, how='left', on=['Season', 'TeamID'])
df_win_totals.tail(10)

Unnamed: 0,Season,TeamID,RWinCount,RLossCount,RWinPerc,CWinCount,CLossCount,CWinPerc
11217,2019,1457,15,12.0,0.555556,,,
11218,2019,1458,23,10.0,0.69697,1.0,1.0,0.5
11219,2019,1459,26,4.0,0.866667,3.0,0.0,1.0
11220,2019,1460,19,13.0,0.59375,2.0,1.0,0.666667
11221,2019,1461,7,24.0,0.225806,,,
11222,2019,1462,18,15.0,0.545455,1.0,1.0,0.5
11223,2019,1463,21,7.0,0.75,2.0,0.0,1.0
11224,2019,1464,10,20.0,0.333333,,,
11225,2019,1465,12,14.0,0.461538,,,
11226,2019,1466,7,22.0,0.241379,,,


In [10]:
# convert seed into int
df_seeds['Seed_Int'] = df_seeds['Seed'].str[1:3]
df_seeds['Seed_Int'] = df_seeds['Seed_Int'].apply(pd.to_numeric)
# Drop unused labels
df_seeds.drop(labels=['Seed'], inplace=True, axis=1)
df_tour.drop(labels=['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'], inplace=True, axis=1)
# Rename to winning team
df_winseeds = df_seeds.rename(columns={'TeamID': 'WTeamID', 'Seed_Int': 'WSeed'})
# Rename lossing teams
df_losseeds = df_seeds.rename(columns={'TeamID': 'LTeamID', 'Seed_Int': 'LSeed'})
# Lets make a table with teams and their seeds

In [11]:
df_seeds.tail()

Unnamed: 0,Season,TeamID,Seed_Int
2279,2019,1332,12
2280,2019,1414,13
2281,2019,1330,14
2282,2019,1159,15
2283,2019,1205,16


In [12]:
df_tour.head()

Unnamed: 0,Season,WTeamID,LTeamID
0,1985,1116,1234
1,1985,1120,1345
2,1985,1207,1250
3,1985,1229,1425
4,1985,1242,1325


In [13]:
df_winseeds.head()

Unnamed: 0,Season,WTeamID,WSeed
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


In [14]:
df_losseeds.head()

Unnamed: 0,Season,LTeamID,LSeed
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


In [15]:
df_dummy = pd.merge(left=df_tour, right=df_winseeds, how='left', on=['Season', 'WTeamID'])
df_concat = pd.merge(left=df_dummy, right=df_losseeds, how='left', on=['Season', 'LTeamID'])
df_concat['SeedDiff'] = df_concat.WSeed - df_concat.LSeed

In [16]:
df_concat.tail()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,SeedDiff
2179,2018,1242,1181,1,2,-1
2180,2018,1437,1403,1,3,-2
2181,2018,1276,1260,3,11,-8
2182,2018,1437,1242,1,1,0
2183,2018,1437,1276,1,3,-2


In [17]:
df_team_master = pd.merge(left=df_team_rpis, right=df_teams, how="left", on=["TeamName"])
df_team_master.drop(labels=['FirstD1Season', 'LastD1Season'], inplace=True, axis=1)
df_team_master.tail()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Season,TeamName,RPI,TeamID
7055,7055,7055,2019,New Hampshire,0.36,1306
7056,7056,7056,2019,MS Valley St,0.356,1290
7057,7057,7057,2019,Delaware St,0.349,1175
7058,7058,7058,2019,SC Upstate,0.347,1367
7059,7059,7059,2019,Alabama A&M,0.345,1105


In [18]:
df_seeds.head()

Unnamed: 0,Season,TeamID,Seed_Int
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


In [19]:
# add seeds
df_team_master = pd.merge(left=df_team_master, right=df_seeds, how="left", on=["Season", "TeamID"])
# drop teams that weren't in the tournament
df_team_master = pd.merge(left=df_team_master, right=df_sos_rankings, how="left", on=["Season", "TeamName"])
df_team_master = df_team_master.fillna(value={"SOS": 0})
df_team_master.tail()

Unnamed: 0,Unnamed: 0_x,Unnamed: 0.1,Season,TeamName,RPI,TeamID,Seed_Int,Unnamed: 0_y,SOS
7055,7055,7055,2019,New Hampshire,0.36,1306,,6709.0,-5.85
7056,7056,7056,2019,MS Valley St,0.356,1290,,6717.0,-6.98
7057,7057,7057,2019,Delaware St,0.349,1175,,6720.0,-10.11
7058,7058,7058,2019,SC Upstate,0.347,1367,,6701.0,-4.3
7059,7059,7059,2019,Alabama A&M,0.345,1105,,6713.0,-8.44


In [20]:
# drop teams that didn't participate in the tournament
# df_team_master = df_team_master.dropna()

In [21]:
df_team_master = df_team_master.rename(columns={'Seed_Int': 'Seed'})
df_team_master.head()

Unnamed: 0,Unnamed: 0_x,Unnamed: 0.1,Season,TeamName,RPI,TeamID,Seed,Unnamed: 0_y,SOS
0,0,0,2000,Duke,0.67,1181,1.0,1.0,8.88
1,1,1,2000,Kansas,0.657,1242,8.0,17.0,9.79
2,2,2,2000,Virginia,0.655,1438,,39.0,6.22
3,3,3,2000,North Carolina,0.653,1314,8.0,21.0,11.8
4,4,4,2000,Houston,0.648,1222,,133.0,6.44


In [22]:
# add win/loss totals
df_team_master = pd.merge(left=df_team_master, right=df_win_totals, how='left', on=["Season", "TeamID"])
df_team_master.head()

Unnamed: 0,Unnamed: 0_x,Unnamed: 0.1,Season,TeamName,RPI,TeamID,Seed,Unnamed: 0_y,SOS,RWinCount,RLossCount,RWinPerc,CWinCount,CLossCount,CWinPerc
0,0,0,2000,Duke,0.67,1181,1.0,1.0,8.88,27.0,4.0,0.870968,,,
1,1,1,2000,Kansas,0.657,1242,8.0,17.0,9.79,22.0,9.0,0.709677,,,
2,2,2,2000,Virginia,0.655,1438,,39.0,6.22,19.0,11.0,0.633333,,,
3,3,3,2000,North Carolina,0.653,1314,8.0,21.0,11.8,18.0,13.0,0.580645,,,
4,4,4,2000,Houston,0.648,1222,,133.0,6.44,8.0,22.0,0.266667,,,


In [23]:
top_40 = df_team_rpis.groupby("Season").head(40)
top_40 = pd.merge(left=top_40, right=df_teams, how="left", on=["TeamName"])
top_40.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Season,TeamName,RPI,TeamID,FirstD1Season,LastD1Season
0,0,0,2000,Duke,0.67,1181,1985,2019
1,1,1,2000,Kansas,0.657,1242,1985,2019
2,2,2,2000,Virginia,0.655,1438,1985,2019
3,3,3,2000,North Carolina,0.653,1314,1985,2019
4,4,4,2000,Houston,0.648,1222,1985,2019


In [24]:
df_regular_season_wins_again = pd.read_csv(data_dir + 'RegularSeasonCompactResults.csv')
df_regular_wins_only = df_regular_season_wins_again[['Season', 'WTeamID', 'DayNum']].copy()
df_regular_wins_only['win'] = 1
df_regular_wins_only = df_regular_wins_only.rename(columns={'WTeamID': 'TeamID'})

df_regular_losses_only = df_regular_season_wins_again[['Season', 'LTeamID', 'DayNum']].copy()
df_regular_losses_only['win'] = 0
df_regular_losses_only = df_regular_losses_only.rename(columns={'LTeamID': 'TeamID'})

games = [df_regular_wins_only, df_regular_losses_only]
df_last_ten_games = pd.concat(games).sort_values(['Season', 'DayNum']).groupby(['Season', 'TeamID']).tail(10).groupby(['Season', 'TeamID']).mean()[['win']].reset_index()   
df_last_ten_games = df_last_ten_games.rename(columns={'win': 'Last10WinPerc'})

In [25]:
df_team_master = pd.merge(left=df_team_master, right=df_last_ten_games, how="left", on=["Season", "TeamID"])
df_team_master.head()

Unnamed: 0,Unnamed: 0_x,Unnamed: 0.1,Season,TeamName,RPI,TeamID,Seed,Unnamed: 0_y,SOS,RWinCount,RLossCount,RWinPerc,CWinCount,CLossCount,CWinPerc,Last10WinPerc
0,0,0,2000,Duke,0.67,1181,1.0,1.0,8.88,27.0,4.0,0.870968,,,,0.9
1,1,1,2000,Kansas,0.657,1242,8.0,17.0,9.79,22.0,9.0,0.709677,,,,0.6
2,2,2,2000,Virginia,0.655,1438,,39.0,6.22,19.0,11.0,0.633333,,,,0.4
3,3,3,2000,North Carolina,0.653,1314,8.0,21.0,11.8,18.0,13.0,0.580645,,,,0.5
4,4,4,2000,Houston,0.648,1222,,133.0,6.44,8.0,22.0,0.266667,,,,0.1


In [26]:
top_40_games = top_40['TeamID'].to_list()
df_top_40_games = df_regular_season_wins_again[['Season', 'WTeamID', 'LTeamID', 'DayNum']].copy()
df_top_40_games = df_top_40_games.loc[(df_top_40_games['WTeamID'].isin(top_40_games)) | (df_top_40_games['LTeamID'].isin(top_40_games))]

df_top_40_regular_wins_only = df_top_40_games[['Season', 'WTeamID', 'DayNum']].copy()
df_top_40_regular_wins_only['win'] = 1
df_top_40_regular_wins_only = df_top_40_regular_wins_only.rename(columns={'WTeamID': 'TeamID'})

df_top_40_regular_losses_only = df_top_40_games[['Season', 'LTeamID', 'DayNum']].copy()
df_top_40_regular_losses_only['win'] = 0
df_top_40_regular_losses_only = df_top_40_regular_losses_only.rename(columns={'LTeamID': 'TeamID'})

games_40 = [df_top_40_regular_wins_only, df_top_40_regular_losses_only]
df_top_40_games = pd.concat(games_40).sort_values(['Season', 'DayNum']).groupby(['Season', 'TeamID']).tail(10).groupby(['Season', 'TeamID']).mean()[['win']].reset_index()
df_top_40_games = df_top_40_games.rename(columns={'win': 'Top40WinPerc'})             

In [27]:
df_team_master = pd.merge(left=df_team_master, right=df_top_40_games, how='left', on=['Season', 'TeamID'])
df_team_master.head()

Unnamed: 0,Unnamed: 0_x,Unnamed: 0.1,Season,TeamName,RPI,TeamID,Seed,Unnamed: 0_y,SOS,RWinCount,RLossCount,RWinPerc,CWinCount,CLossCount,CWinPerc,Last10WinPerc,Top40WinPerc
0,0,0,2000,Duke,0.67,1181,1.0,1.0,8.88,27.0,4.0,0.870968,,,,0.9,0.9
1,1,1,2000,Kansas,0.657,1242,8.0,17.0,9.79,22.0,9.0,0.709677,,,,0.6,0.6
2,2,2,2000,Virginia,0.655,1438,,39.0,6.22,19.0,11.0,0.633333,,,,0.4,0.4
3,3,3,2000,North Carolina,0.653,1314,8.0,21.0,11.8,18.0,13.0,0.580645,,,,0.5,0.5
4,4,4,2000,Houston,0.648,1222,,133.0,6.44,8.0,22.0,0.266667,,,,0.1,0.1


In [28]:
df_team_master.drop(labels=['Unnamed: 0_x', 'Unnamed: 0.1', 'Unnamed: 0_y'], inplace=True, axis=1)
df_team_master.tail()

Unnamed: 0,Season,TeamName,RPI,TeamID,Seed,SOS,RWinCount,RLossCount,RWinPerc,CWinCount,CLossCount,CWinPerc,Last10WinPerc,Top40WinPerc
7055,2019,New Hampshire,0.36,1306,,-5.85,3.0,24.0,0.111111,,,,0.2,0.0
7056,2019,MS Valley St,0.356,1290,,-6.98,5.0,26.0,0.16129,,,,0.3,0.0
7057,2019,Delaware St,0.349,1175,,-10.11,4.0,25.0,0.137931,1.0,1.0,0.5,0.3,
7058,2019,SC Upstate,0.347,1367,,-4.3,3.0,26.0,0.103448,,,,0.1,
7059,2019,Alabama A&M,0.345,1105,,-8.44,5.0,27.0,0.15625,,,,0.1,0.0


In [29]:
df_cleansed_master = df_team_master.drop(labels=["RWinCount", "RLossCount", "CWinCount", "CLossCount", "CWinPerc"], inplace=False, axis=1)
df_cleansed_master.tail()

Unnamed: 0,Season,TeamName,RPI,TeamID,Seed,SOS,RWinPerc,Last10WinPerc,Top40WinPerc
7055,2019,New Hampshire,0.36,1306,,-5.85,0.111111,0.2,0.0
7056,2019,MS Valley St,0.356,1290,,-6.98,0.16129,0.3,0.0
7057,2019,Delaware St,0.349,1175,,-10.11,0.137931,0.3,
7058,2019,SC Upstate,0.347,1367,,-4.3,0.103448,0.1,
7059,2019,Alabama A&M,0.345,1105,,-8.44,0.15625,0.1,0.0


In [30]:
df_tourney_slot_games.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,FirstTeamID,SecondTeamID
0,1985,R1W1,1,16,1207.0,1250.0
1,1985,R1W2,2,15,1210.0,1273.0
2,1985,R1W3,3,14,1228.0,1318.0
3,1985,R1W4,4,13,1260.0,1233.0
4,1985,R1W5,5,12,1374.0,1330.0


In [31]:
df_tournament_history = df_tour.loc[(df_tour['Season'] >= 2000) | (df_tour['Season'] < 2019)]
df_tournament_2019 = df_tourney_slot_games.loc[df_tourney_slot_games['Season'] == 2019]

In [32]:
df_tournament_2019.FirstTeamID = df_tournament_2019.FirstTeamID.astype(int)
df_tournament_2019.SecondTeamID = df_tournament_2019.SecondTeamID.astype(int)
df_tournament_2019.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,FirstTeamID,SecondTeamID
2189,2019,R1W2,2,15,1277,1133
2190,2019,R1W3,3,14,1261,1463
2191,2019,R1W4,4,13,1439,1387
2192,2019,R1W5,5,12,1280,1251
2193,2019,R1W6,6,11,1268,1125


In [33]:
df_current_year_dummy = df_cleansed_master.copy().rename(columns = { 'TeamID': 'FirstTeamID', "SOS": "FirstSOS", "ConfAbbrev": "FirstConfAbbrev", 'RPI': 'FirstRPI', 'Seed': 'FirstSeed', 'RWinPerc': 'FirstRegWinPerc', 'Last10WinPerc': 'FirstLast10WinPerc', 'Top40WinPerc': 'FirstTop40WinPerc' })
df_current_year_dummy_2 = df_cleansed_master.copy().rename(columns = { 'TeamID': 'SecondTeamID', "SOS": "SecondSOS", "ConfAbbrev": "SecondConfAbbrev", 'RPI': 'SecondRPI', 'Seed': 'SecondSeed', 'RWinPerc': 'SecondRegWinPerc', 'Last10WinPerc': 'SecondLast10WinPerc', 'Top40WinPerc': 'SecondTop40WinPerc'})
df_dummy = df_cleansed_master.rename(columns = { 'TeamID': 'WTeamID', "SOS": "FirstSOS", "ConfAbbrev": "FirstConfAbbrev", 'RPI': 'FirstRPI', 'Seed': 'FirstSeed', 'RWinPerc': 'FirstRegWinPerc', 'Last10WinPerc': 'FirstLast10WinPerc', 'Top40WinPerc': 'FirstTop40WinPerc' })
df_dummy_2 = df_cleansed_master.rename(columns = { 'TeamID': 'LTeamID', "SOS": "SecondSOS", "ConfAbbrev": "SecondConfAbbrev", 'RPI': 'SecondRPI', 'Seed': 'SecondSeed', 'RWinPerc': 'SecondRegWinPerc', 'Last10WinPerc': 'SecondLast10WinPerc', 'Top40WinPerc': 'SecondTop40WinPerc' })

df_current_year_dummy.head()


Unnamed: 0,Season,TeamName,FirstRPI,FirstTeamID,FirstSeed,FirstSOS,FirstRegWinPerc,FirstLast10WinPerc,FirstTop40WinPerc
0,2000,Duke,0.67,1181,1.0,8.88,0.870968,0.9,0.9
1,2000,Kansas,0.657,1242,8.0,9.79,0.709677,0.6,0.6
2,2000,Virginia,0.655,1438,,6.22,0.633333,0.4,0.4
3,2000,North Carolina,0.653,1314,8.0,11.8,0.580645,0.5,0.5
4,2000,Houston,0.648,1222,,6.44,0.266667,0.1,0.1


In [34]:
df_current_year_dummy_2.head()

Unnamed: 0,Season,TeamName,SecondRPI,SecondTeamID,SecondSeed,SecondSOS,SecondRegWinPerc,SecondLast10WinPerc,SecondTop40WinPerc
0,2000,Duke,0.67,1181,1.0,8.88,0.870968,0.9,0.9
1,2000,Kansas,0.657,1242,8.0,9.79,0.709677,0.6,0.6
2,2000,Virginia,0.655,1438,,6.22,0.633333,0.4,0.4
3,2000,North Carolina,0.653,1314,8.0,11.8,0.580645,0.5,0.5
4,2000,Houston,0.648,1222,,6.44,0.266667,0.1,0.1


In [35]:
df_tournament_games = pd.merge(left=df_tournament_history, right=df_dummy, how="left", on=["Season", "WTeamID"])
df_tournament_games.drop(labels=["TeamName"], inplace=True, axis=1)
df_tournament_games = pd.merge(left=df_tournament_games, right=df_dummy_2, how="left", on=["Season", "LTeamID"])
df_tournament_games.drop(labels=["TeamName"], inplace=True, axis=1)

df_tournament_2019_games = pd.merge(left=df_tournament_2019, right=df_current_year_dummy, how="left", on=["Season", "FirstTeamID"])
# df_tournament_2019_games.drop(labels=["TeamName"], inplace=True, axis=1)
df_tournament_2019_games = df_tournament_2019_games.rename(columns={'TeamName': 'FirstTeamName'})
df_tournament_2019_games = pd.merge(left=df_tournament_2019_games, right=df_current_year_dummy_2, how="left", on=["Season", "SecondTeamID"])
df_tournament_2019_games = df_tournament_2019_games.rename(columns={'TeamName': 'SecondTeamName'})
# df_tournament_2019_games.drop(labels=["TeamName"], inplace=True, axis=1)

df_tournament_2019_games.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,FirstTeamID,SecondTeamID,FirstTeamName,FirstRPI,FirstSeed,FirstSOS,FirstRegWinPerc,FirstLast10WinPerc,FirstTop40WinPerc,SecondTeamName,SecondRPI,SecondSeed,SecondSOS,SecondRegWinPerc,SecondLast10WinPerc,SecondTop40WinPerc
0,2019,R1W2,2,15,1277,1133,Michigan St,0.647,2.0,12.27,0.823529,0.9,0.9,Bradley,0.504,15.0,-1.47,0.575758,0.8,
1,2019,R1W3,3,14,1261,1463,LSU,0.626,3.0,8.9,0.8125,0.8,0.8,Yale,0.577,14.0,-1.72,0.75,0.7,0.0
2,2019,R1W4,4,13,1439,1387,Virginia Tech,0.605,4.0,7.94,0.75,0.6,0.6,St Louis,0.551,13.0,1.99,0.657143,0.7,0.25
3,2019,R1W5,5,12,1280,1251,Mississippi St,0.608,5.0,9.54,0.69697,0.7,0.7,Liberty,0.549,12.0,-5.12,0.806452,0.9,
4,2019,R1W6,6,11,1268,1125,Maryland,0.602,6.0,10.43,0.6875,0.5,0.5,Belmont,0.579,11.0,-3.19,0.833333,0.9,0.333333


In [36]:
df_tournament_games['RPIDifference'] = df_tournament_games['FirstRPI'] - df_tournament_games['SecondRPI']
df_tournament_games['SOSDifference'] = df_tournament_games['FirstSOS'] - df_tournament_games['SecondSOS']
df_tournament_games['SeedDifference'] = df_tournament_games['FirstSeed'] - df_tournament_games['SecondSeed']
df_tournament_games['RegWinPercDifference'] = df_tournament_games['FirstRegWinPerc'] - df_tournament_games['SecondRegWinPerc']
df_tournament_games['Last10WinPercDifference'] = df_tournament_games['FirstLast10WinPerc'] - df_tournament_games['SecondLast10WinPerc']
df_tournament_games['Top40WinPercDifference'] = df_tournament_games['FirstTop40WinPerc'] - df_tournament_games['SecondTop40WinPerc']

In [37]:
df_tournament_games.tail()

Unnamed: 0,Season,WTeamID,LTeamID,FirstRPI,FirstSeed,FirstSOS,FirstRegWinPerc,FirstLast10WinPerc,FirstTop40WinPerc,SecondRPI,...,SecondSOS,SecondRegWinPerc,SecondLast10WinPerc,SecondTop40WinPerc,RPIDifference,SOSDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference
2182,2018,1242,1181,0.657,1.0,11.87,0.794118,0.8,0.8,0.67,...,9.71,0.787879,0.7,0.7,-0.013,2.16,-1.0,0.006239,0.1,0.1
2183,2018,1437,1403,0.62,1.0,10.24,0.882353,0.8,0.8,0.619,...,9.6,0.727273,0.5,0.5,0.001,0.64,-2.0,0.15508,0.3,0.3
2184,2018,1276,1260,0.635,3.0,9.41,0.794118,0.9,0.9,0.522,...,1.81,0.84375,1.0,,0.113,7.6,-8.0,-0.049632,-0.1,
2185,2018,1437,1242,0.62,1.0,10.24,0.882353,0.8,0.8,0.657,...,11.87,0.794118,0.8,0.8,-0.037,-1.63,0.0,0.088235,0.0,0.0
2186,2018,1437,1276,0.62,1.0,10.24,0.882353,0.8,0.8,0.635,...,9.41,0.794118,0.9,0.9,-0.015,0.83,-2.0,0.088235,-0.1,-0.1


In [38]:
df_tournament_2019_games['RPIDifference'] = df_tournament_2019_games['FirstRPI'] - df_tournament_2019_games['SecondRPI']
df_tournament_2019_games['SOSDifference'] = df_tournament_2019_games['FirstSOS'] - df_tournament_2019_games['SecondSOS']
df_tournament_2019_games['SeedDifference'] = df_tournament_2019_games['FirstSeed'] - df_tournament_2019_games['SecondSeed']
df_tournament_2019_games['RegWinPercDifference'] = df_tournament_2019_games['FirstRegWinPerc'] - df_tournament_2019_games['SecondRegWinPerc']
df_tournament_2019_games['Last10WinPercDifference'] = df_tournament_2019_games['FirstLast10WinPerc'] - df_tournament_2019_games['SecondLast10WinPerc']
df_tournament_2019_games['Top40WinPercDifference'] = df_tournament_2019_games['FirstTop40WinPerc'] - df_tournament_2019_games['SecondTop40WinPerc']

In [39]:
df_tournament_2019_games.head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed,FirstTeamID,SecondTeamID,FirstTeamName,FirstRPI,FirstSeed,FirstSOS,...,SecondSOS,SecondRegWinPerc,SecondLast10WinPerc,SecondTop40WinPerc,RPIDifference,SOSDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference
0,2019,R1W2,2,15,1277,1133,Michigan St,0.647,2.0,12.27,...,-1.47,0.575758,0.8,,0.143,13.74,-13.0,0.247772,0.1,
1,2019,R1W3,3,14,1261,1463,LSU,0.626,3.0,8.9,...,-1.72,0.75,0.7,0.0,0.049,10.62,-11.0,0.0625,0.1,0.8
2,2019,R1W4,4,13,1439,1387,Virginia Tech,0.605,4.0,7.94,...,1.99,0.657143,0.7,0.25,0.054,5.95,-9.0,0.092857,-0.1,0.35
3,2019,R1W5,5,12,1280,1251,Mississippi St,0.608,5.0,9.54,...,-5.12,0.806452,0.9,,0.059,14.66,-7.0,-0.109482,-0.2,
4,2019,R1W6,6,11,1268,1125,Maryland,0.602,6.0,10.43,...,-3.19,0.833333,0.9,0.333333,0.023,13.62,-5.0,-0.145833,-0.4,0.166667


In [40]:
df_tournament_games.drop(labels=["FirstSOS", "SecondSOS", "FirstRPI", "SecondRPI", "FirstSeed", "SecondSeed", "FirstRegWinPerc", "SecondRegWinPerc", "FirstLast10WinPerc", "SecondLast10WinPerc", "FirstTop40WinPerc", "SecondTop40WinPerc"], inplace=True, axis=1)
df_tournament_2019_games.drop(labels=["FirstSOS", "SecondSOS", "FirstRPI", "SecondRPI", "FirstSeed", "SecondSeed", "FirstRegWinPerc", "SecondRegWinPerc", "FirstLast10WinPerc", "SecondLast10WinPerc", "FirstTop40WinPerc", "SecondTop40WinPerc"], inplace=True, axis=1)

In [41]:
df_tournament_games.tail()

Unnamed: 0,Season,WTeamID,LTeamID,RPIDifference,SOSDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference
2182,2018,1242,1181,-0.013,2.16,-1.0,0.006239,0.1,0.1
2183,2018,1437,1403,0.001,0.64,-2.0,0.15508,0.3,0.3
2184,2018,1276,1260,0.113,7.6,-8.0,-0.049632,-0.1,
2185,2018,1437,1242,-0.037,-1.63,0.0,0.088235,0.0,0.0
2186,2018,1437,1276,-0.015,0.83,-2.0,0.088235,-0.1,-0.1


In [42]:
df_tournament_games.drop(labels=["Season", "WTeamID", "LTeamID"], inplace=True, axis=1)
df_tournament_2019_games.drop(labels=["Season", "FirstTeamID", "SecondTeamID", "StrongSeed", "WeakSeed", "Slot"], inplace=True, axis=1)
df_tournament_games = df_tournament_games.dropna()

In [43]:
df_tournament_games.head()

Unnamed: 0,RPIDifference,SOSDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference
947,0.051,-4.12,3.0,0.12931,-0.1,-0.1
949,0.117,7.42,-7.0,0.0,-0.1,0.1
950,0.202,8.26,-9.0,0.052995,0.1,0.233333
952,0.183,13.62,-15.0,0.21645,0.0,0.566667
953,0.079,14.04,-11.0,0.120192,-0.2,0.366667


In [44]:
df_tournament_2019_games.head()

Unnamed: 0,FirstTeamName,SecondTeamName,RPIDifference,SOSDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference
0,Michigan St,Bradley,0.143,13.74,-13.0,0.247772,0.1,
1,LSU,Yale,0.049,10.62,-11.0,0.0625,0.1,0.8
2,Virginia Tech,St Louis,0.054,5.95,-9.0,0.092857,-0.1,0.35
3,Mississippi St,Liberty,0.059,14.66,-7.0,-0.109482,-0.2,
4,Maryland,Belmont,0.023,13.62,-5.0,-0.145833,-0.4,0.166667


In [45]:
df_tournament_games["win"] = 1

In [46]:
df_tournament_games.head()

Unnamed: 0,RPIDifference,SOSDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference,win
947,0.051,-4.12,3.0,0.12931,-0.1,-0.1,1
949,0.117,7.42,-7.0,0.0,-0.1,0.1,1
950,0.202,8.26,-9.0,0.052995,0.1,0.233333,1
952,0.183,13.62,-15.0,0.21645,0.0,0.566667,1
953,0.079,14.04,-11.0,0.120192,-0.2,0.366667,1


In [47]:
df_tournament_games_flipped = df_tournament_games.copy()
df_tournament_games_flipped['RPIDifference'] = -1 * df_tournament_games_flipped['RPIDifference']
df_tournament_games_flipped['SOSDifference'] = -1 * df_tournament_games_flipped['SOSDifference']
df_tournament_games_flipped['SeedDifference'] = -1 * df_tournament_games_flipped['SeedDifference']
df_tournament_games_flipped['RegWinPercDifference'] = -1 * df_tournament_games_flipped['RegWinPercDifference']
df_tournament_games_flipped['Last10WinPercDifference'] = -1 * df_tournament_games_flipped['Last10WinPercDifference']
df_tournament_games_flipped['Top40WinPercDifference'] = -1 * df_tournament_games_flipped['Top40WinPercDifference']
df_tournament_games_flipped['win'] = 0

In [48]:
df_tournament_games_flipped.head()

Unnamed: 0,RPIDifference,SOSDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference,win
947,-0.051,4.12,-3.0,-0.12931,0.1,0.1,0
949,-0.117,-7.42,7.0,-0.0,0.1,-0.1,0
950,-0.202,-8.26,9.0,-0.052995,-0.1,-0.233333,0
952,-0.183,-13.62,15.0,-0.21645,-0.0,-0.566667,0
953,-0.079,-14.04,11.0,-0.120192,0.2,-0.366667,0


In [49]:
frames = [df_tournament_games, df_tournament_games_flipped]
df_historical_games = pd.concat(frames)

In [50]:
df_historical_games.head()

Unnamed: 0,RPIDifference,SOSDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference,win
947,0.051,-4.12,3.0,0.12931,-0.1,-0.1,1
949,0.117,7.42,-7.0,0.0,-0.1,0.1,1
950,0.202,8.26,-9.0,0.052995,0.1,0.233333,1
952,0.183,13.62,-15.0,0.21645,0.0,0.566667,1
953,0.079,14.04,-11.0,0.120192,-0.2,0.366667,1


In [51]:
outcomes = df_historical_games["win"]
features = df_historical_games.drop('win', axis=1)

In [52]:
features.head()

Unnamed: 0,RPIDifference,SOSDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference
947,0.051,-4.12,3.0,0.12931,-0.1,-0.1
949,0.117,7.42,-7.0,0.0,-0.1,0.1
950,0.202,8.26,-9.0,0.052995,0.1,0.233333
952,0.183,13.62,-15.0,0.21645,0.0,0.566667
953,0.079,14.04,-11.0,0.120192,-0.2,0.366667


In [53]:
df_tournament_2019_games['Top40WinPercDifference'].fillna(0, inplace=True)
df_tournament_2019_games.head()

Unnamed: 0,FirstTeamName,SecondTeamName,RPIDifference,SOSDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference
0,Michigan St,Bradley,0.143,13.74,-13.0,0.247772,0.1,0.0
1,LSU,Yale,0.049,10.62,-11.0,0.0625,0.1,0.8
2,Virginia Tech,St Louis,0.054,5.95,-9.0,0.092857,-0.1,0.35
3,Mississippi St,Liberty,0.059,14.66,-7.0,-0.109482,-0.2,0.0
4,Maryland,Belmont,0.023,13.62,-5.0,-0.145833,-0.4,0.166667


In [54]:
from sklearn.metrics import accuracy_score

# X_train, X_test, y_train, y_test = train_test_split(features, outcomes, test_size=0.2, random_state=42)
# def createAndTestModel(depth = 1, split = 2, leaf = 0.01):
#     new_model = DecisionTreeClassifier(max_depth=depth, min_samples_split=split, min_samples_leaf=leaf)
#     new_model.fit(X_train, y_train)

#     # TODO: Make predictions
#     y_train_pred = new_model.predict(X_train)
#     y_test_pred = new_model.predict(X_test)
    
#     test_accuracy = accuracy_score(y_test, y_test_pred)
#     return test_accuracy

In [55]:
# best_depth = 1
# best_split = 2
# best_leaf = 0.01
# best_acc = 0.0
# for depth in range(1, 20):
#     for min_split in range(2, 10):
#         for min_leaf in range(1, 500):
#             leaf = min_leaf / 1000
#             test_acc = createAndTestModel(depth, min_split, leaf)
#             if test_acc > best_acc:
#                 best_depth = depth
#                 best_split = min_split
#                 best_leaf = leaf
#                 best_acc = test_acc
# print('Best Depth: ', best_depth)
# print('Best Split: ', best_split)
# print('Best Leaf: ', best_leaf)
# print('Best Acc: ', best_acc)
# X_train, X_test, y_train, y_test = train_test_split(features, outcomes, test_size=0.2, random_state=42)
# model = DecisionTreeClassifier(max_depth=9, min_samples_split=4, min_samples_leaf=0.012)
# model.fit(X_train, y_train)

In [56]:
# y_train_pred = model.predict(X_train)
# y_test_pred = model.predict(X_test)
# from sklearn.metrics import accuracy_score
# train_accuracy = accuracy_score(y_train, y_train_pred)
# test_accuracy = accuracy_score(y_test, y_test_pred)
# print('The training accuracy is', train_accuracy)
# print('The test accuracy is', test_accuracy)

In [57]:
df_tournament_matchups = df_tournament_2019_games[['FirstTeamName', 'SecondTeamName']]
df_tournament_2019_game_predictions_data = df_tournament_2019_games.drop(labels=['FirstTeamName', 'SecondTeamName'], axis=1)
df_tournament_2019_game_predictions_data.head(32)

Unnamed: 0,RPIDifference,SOSDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference
0,0.143,13.74,-13.0,0.247772,0.1,0.0
1,0.049,10.62,-11.0,0.0625,0.1,0.8
2,0.054,5.95,-9.0,0.092857,-0.1,0.35
3,0.059,14.66,-7.0,-0.109482,-0.2,0.0
4,0.023,13.62,-5.0,-0.145833,-0.4,0.166667
5,0.016,0.56,-3.0,-0.011586,-0.2,0.1
6,0.004,-1.96,-1.0,0.039315,0.2,0.2
7,0.158,12.48,-15.0,0.328446,0.0,0.0
8,0.084,16.67,-13.0,0.073529,-0.2,0.0
9,0.077,10.82,-11.0,0.0625,0.1,0.9


In [58]:
def runDecisionTreeSimulation():
    X_train, X_test, y_train, y_test = train_test_split(features, outcomes, test_size=0.2, random_state=42)
#     model = DecisionTreeClassifier(max_depth=9, min_samples_split=4, min_samples_leaf=0.012)
    dtc = DecisionTreeClassifier()
    params = {'max_depth': np.arange(1, 10), 'min_samples_split': np.arange(2, 30), 'min_samples_leaf': np.arange(0.01, 0.5, 0.01)}
    model = GridSearchCV(dtc, params, scoring='accuracy')
    model.fit(X_train, y_train)
    print(model.best_score_)
    print(model.best_params_)
    print(model.best_estimator_)
    results = model.predict(df_tournament_2019_game_predictions_data)
    return results

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

def runLinearRegressionSimulation():
    X_train, X_test, y_train, y_test = train_test_split(features, outcomes, test_size=0.2, random_state=42)
    logreg = LogisticRegression()
    params = {'C': np.logspace(start=-5, stop=5, num=10)}
    model = GridSearchCV(logreg, params, scoring='neg_log_loss', refit=True)
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    print('Best log_loss: {:.4}, with best C: {}'.format(model.best_score_, model.best_params_['C']))
    results = model.predict(df_tournament_2019_game_predictions_data)
    return results

In [59]:
predictions = runDecisionTreeSimulation()
linear_predictions = runLinearRegressionSimulation()

0.7220055710306407
{'max_depth': 4, 'min_samples_leaf': 0.06999999999999999, 'min_samples_split': 2}
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=0.06999999999999999, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Best log_loss: -0.5418, with best C: 46.41588833612782


In [60]:
df_tournament_2019_games['outcome'] = predictions
df_tournament_2019_games['linear_outcome'] = linear_predictions
df_tournament_2019_games.head(32)

Unnamed: 0,FirstTeamName,SecondTeamName,RPIDifference,SOSDifference,SeedDifference,RegWinPercDifference,Last10WinPercDifference,Top40WinPercDifference,outcome,linear_outcome
0,Michigan St,Bradley,0.143,13.74,-13.0,0.247772,0.1,0.0,1,1
1,LSU,Yale,0.049,10.62,-11.0,0.0625,0.1,0.8,1,1
2,Virginia Tech,St Louis,0.054,5.95,-9.0,0.092857,-0.1,0.35,1,1
3,Mississippi St,Liberty,0.059,14.66,-7.0,-0.109482,-0.2,0.0,1,1
4,Maryland,Belmont,0.023,13.62,-5.0,-0.145833,-0.4,0.166667,1,1
5,Louisville,Minnesota,0.016,0.56,-3.0,-0.011586,-0.2,0.1,0,0
6,VA Commonwealth,UCF,0.004,-1.96,-1.0,0.039315,0.2,0.2,0,0
7,Gonzaga,F Dickinson,0.158,12.48,-15.0,0.328446,0.0,0.0,1,1
8,Michigan,Montana,0.084,16.67,-13.0,0.073529,-0.2,0.0,1,1
9,Texas Tech,N Kentucky,0.077,10.82,-11.0,0.0625,0.1,0.9,1,1
