In [1]:
# Much of the work done here based on 
# https://www.kaggle.com/duvallwh/ncaa-2k19-starter-kernel-with-graphical-eda
# Using as template starter kernel to get off the ground with first attempt
# Decided to rely primarily on Massey Ordinal and Seed data
# Wanted to begin with a simple model both to help starting with first ever Kaggle Kernel and 
# I've found in prior bracket challenges simple can be better


# TLDR; this kernel ends in a model using tourney_compact_data, Massey Ordinal data from day 133, and seeds
import sklearn
import os

# Fixes XGB boost error that killed kernel
# https://stackoverflow.com/questions/51164771/python-xgboost-kernel-died
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import pandas as pd # data processing
import numpy as np # linear algebra analysis
import matplotlib.pyplot as plt
import seaborn as sns # visualization
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import warnings
from xgboost import XGBClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings("ignore")

In [5]:
# Store path to data file on local computer
PATH = '/Users/kelson/Kaggle/MarchMadness/Data/'

# Read in sample submission
sub = pd.read_csv(f'{PATH}SampleSubmissionStage2.csv')

# Read in seeds
seeds = pd.read_csv(f'{PATH}Stage2DataFiles/NCAATourneySeeds.csv')

# Transform seed data into numerical value
seeds['seed_num'] = seeds.Seed.apply(lambda x: int(x[1:3]))

# Read in Basic Tourney Data
games = pd.read_csv(f'{PATH}Stage2DataFiles/NCAATourneyCompactResults.csv')

# According to Kaggle, Team1 is the team with the lower TeamID
games['Team1'] = np.where((games.WTeamID < games.LTeamID), games.WTeamID, games.LTeamID)
games['Team2'] = np.where((games.WTeamID > games.LTeamID), games.WTeamID, games.LTeamID)

# Represent a win by Team1 as a "1"
games['target'] = np.where((games['WTeamID'] < games['LTeamID']), 1, 0)


In [6]:
# Read in Massey Ordinals
mo_up_to_128 = pd.read_csv(f'{PATH}MasseyOrdinals_thru_2019_day_128.csv')

# Kaggle released 2019 day 133 seperatly, so must merge
mo_133 = pd.read_csv(f'{PATH}MasseyOrdinals_2019_only_day_133_61systems.csv')
mo = pd.concat([mo_up_to_128, mo_133])

# Use only most recent Massey Ordinal Ratingas
mo = mo[(mo['RankingDayNum'] >=128)]

In [7]:
# Add seed info to games df
games = games.merge(seeds, how='left', left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
games = games.merge(seeds, how='left', left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'], suffixes=('Team1', 'Team2'))
games.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,Team1,Team2,target,SeedTeam1,TeamIDTeam1,seed_numTeam1,SeedTeam2,TeamIDTeam2,seed_numTeam2
0,1985,136,1116,63,1234,54,N,0,1116,1234,1,X09,1116,9,X08,1234,8
1,1985,136,1120,59,1345,58,N,0,1120,1345,1,Z11,1120,11,Z06,1345,6
2,1985,136,1207,68,1250,43,N,0,1207,1250,1,W01,1207,1,W16,1250,16
3,1985,136,1229,58,1425,55,N,0,1229,1425,1,Y09,1229,9,Y08,1425,8
4,1985,136,1242,49,1325,38,N,0,1242,1325,1,Z03,1242,3,Z14,1325,14


In [8]:
# Add Massey Ordinal data to dataframe
# Team1
games = games.merge(mo, left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
# Team2
games = games.merge(mo, left_on=['Season', 'Team2', 'SystemName'], right_on=['Season', 'TeamID', 'SystemName'], suffixes=['Team1', 'Team2'])


In [9]:
features = ['seed_numTeam1', 'OrdinalRankTeam1', 'seed_numTeam2', 'OrdinalRankTeam2']
# Select only the data from desired features
X = games[features]
y = games['target']

In [10]:
# Split into training and testing data 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 25)
X_train.head()

Unnamed: 0,seed_numTeam1,OrdinalRankTeam1,seed_numTeam2,OrdinalRankTeam2
109608,3,9,7,28
101957,16,214,16,184
56998,6,53,11,16
43690,10,45,7,46
78621,1,4,5,15


In [11]:
xgb = XGBClassifier()

# Fit model
xgb.fit(X_train, y_train)

# Retrieve predictions
preds = xgb.predict_proba(X_test).clip(0.025, .975)

In [12]:
print(log_loss(y_test, preds))


0.5097063832564824


In [10]:
# Compare to performance with only Massey Ordinals
features_mo = ['OrdinalRankTeam1', 'OrdinalRankTeam2']
X_mo = games[features_mo]
y_mo = games['target']
X_train_mo, X_test_mo, y_train_mo, y_test_mo = train_test_split(X_mo, y_mo, random_state = 4)



In [11]:
xgb_mo = XGBClassifier()
xgb_mo.fit(X_train_mo, y_train_mo)
preds_mo = xgb_mo.predict_proba(X_test_mo)
print(log_loss(y_test_mo, preds_mo))

0.5481320249401591


In [12]:
# Compare to performance with only Seeds
features_seeds = ['seed_numTeam1', 'seed_numTeam2']
X_seeds = games[features_seeds]
y_seeds = games['target']
X_train_seeds, X_test_seeds, y_train_seeds, y_test_seeds = train_test_split(X_seeds, y_seeds, random_state = 4)

In [13]:
xgb_seeds = XGBClassifier()
xgb_seeds.fit(X_train_seeds, y_train_seeds)
preds_seeds = xgb_seeds.predict_proba(X_test_seeds)

In [14]:
print(log_loss(y_test_seeds, preds_seeds))

0.5174880694979522


In [10]:
# Test if linear regression model works better
reg = LinearRegression()
reg.fit(X_train, y_train)
preds_linear = reg.predict(X_test)

In [11]:
print(log_loss(y_test, preds_linear))

0.5989370351024622


In [10]:
# Next, test logistic regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
preds_log = log_reg.predict(X_test)

In [11]:
print(log_loss(y_test, preds_log))

10.380825624170734


In [10]:
seeds.tail()

Unnamed: 0,Season,Seed,TeamID,seed_num
2281,2019,Z12,1332,12
2282,2019,Z13,1414,13
2283,2019,Z14,1330,14
2284,2019,Z15,1159,15
2285,2019,Z16,1205,16


In [13]:
# Add columns with individual titles to add seed and MO data
sub['Season'], sub['Team1'], sub['Team2'] = sub['ID'].str.split('_').str

# Make values numerical
sub[['Season', 'Team1', 'Team2']] = sub[['Season', 'Team1', 'Team2']].apply(pd.to_numeric)

# Add Seed data to submission
sub = pd.merge(sub, seeds, how='left', left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
sub = pd.merge(sub, seeds, how='left', left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'], suffixes=('Team1', 'Team2'))
sub.head()


Unnamed: 0,ID,Pred,Season,Team1,Team2,SeedTeam1,TeamIDTeam1,seed_numTeam1,SeedTeam2,TeamIDTeam2,seed_numTeam2
0,2019_1101_1113,0.5,2019,1101,1113,Y15,1101,15,X11a,1113,11
1,2019_1101_1120,0.5,2019,1101,1120,Y15,1101,15,Y05,1120,5
2,2019_1101_1124,0.5,2019,1101,1124,Y15,1101,15,X09,1124,9
3,2019_1101_1125,0.5,2019,1101,1125,Y15,1101,15,W11a,1125,11
4,2019_1101_1133,0.5,2019,1101,1133,Y15,1101,15,W15,1133,15


In [14]:
mo_133 = mo_133.groupby(['TeamID']).median()
mo_133.head()

Unnamed: 0_level_0,Season,RankingDayNum,OrdinalRank
TeamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1101,2019.0,133.0,142.0
1102,2019.0,133.0,226.0
1103,2019.0,133.0,128.0
1104,2019.0,133.0,58.0
1105,2019.0,133.0,347.0


In [15]:
# Add Massey Ordinal data to submission
sub = sub.merge(mo_133, left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
sub = sub.merge(mo_133, left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'], suffixes=['Team1', 'Team2'])

sub.head()

Unnamed: 0,ID,Pred,Season,Team1,Team2,SeedTeam1,TeamIDTeam1,seed_numTeam1,SeedTeam2,TeamIDTeam2,seed_numTeam2,RankingDayNumTeam1,OrdinalRankTeam1,RankingDayNumTeam2,OrdinalRankTeam2
0,2019_1101_1113,0.5,2019,1101,1113,Y15,1101,15,X11a,1113,11,133.0,142.0,133.0,54.0
1,2019_1101_1120,0.5,2019,1101,1120,Y15,1101,15,Y05,1120,5,133.0,142.0,133.0,14.0
2,2019_1113_1120,0.5,2019,1113,1120,X11a,1113,11,Y05,1120,5,133.0,54.0,133.0,14.0
3,2019_1101_1124,0.5,2019,1101,1124,Y15,1101,15,X09,1124,9,133.0,142.0,133.0,46.0
4,2019_1113_1124,0.5,2019,1113,1124,X11a,1113,11,X09,1124,9,133.0,54.0,133.0,46.0


In [16]:
# Retrieve feature data from submission 
X_sub = sub[features]
xgb = XGBClassifier()

#XGB model using both seeds and ordinal data had better log loss value than either unique
xgb.fit(X, y)

# Log loss punishes utmost certainty with a prediction, so make sure no values equal exactly 1 or 0
sub_preds = xgb.predict_proba(X_sub).clip(0.025, .975)


# Seperate prediction values for teams1,2. We want the values in 'win1'
sub_preds_df = pd.DataFrame(data=sub_preds, columns=['win2', 'win1'])

sub['pred'] = sub_preds_df['win1']
sub['id'] = sub['ID']

# Write out submission
sub[['id', 'pred']].to_csv('submission.csv', index = False)


In [17]:
sub.head(20)

Unnamed: 0,ID,Pred,Season,Team1,Team2,SeedTeam1,TeamIDTeam1,seed_numTeam1,SeedTeam2,TeamIDTeam2,seed_numTeam2,RankingDayNumTeam1,OrdinalRankTeam1,RankingDayNumTeam2,OrdinalRankTeam2,pred,id
0,2019_1101_1113,0.5,2019,1101,1113,Y15,1101,15,X11a,1113,11,133.0,142.0,133.0,54.0,0.07316,2019_1101_1113
1,2019_1101_1120,0.5,2019,1101,1120,Y15,1101,15,Y05,1120,5,133.0,142.0,133.0,14.0,0.088839,2019_1101_1120
2,2019_1113_1120,0.5,2019,1113,1120,X11a,1113,11,Y05,1120,5,133.0,54.0,133.0,14.0,0.480709,2019_1113_1120
3,2019_1101_1124,0.5,2019,1101,1124,Y15,1101,15,X09,1124,9,133.0,142.0,133.0,46.0,0.095681,2019_1101_1124
4,2019_1113_1124,0.5,2019,1113,1124,X11a,1113,11,X09,1124,9,133.0,54.0,133.0,46.0,0.462375,2019_1113_1124
5,2019_1120_1124,0.5,2019,1120,1124,Y05,1120,5,X09,1124,9,133.0,14.0,133.0,46.0,0.607613,2019_1120_1124
6,2019_1101_1125,0.5,2019,1101,1125,Y15,1101,15,W11a,1125,11,133.0,142.0,133.0,51.0,0.069941,2019_1101_1125
7,2019_1113_1125,0.5,2019,1113,1125,X11a,1113,11,W11a,1125,11,133.0,54.0,133.0,51.0,0.376021,2019_1113_1125
8,2019_1120_1125,0.5,2019,1120,1125,Y05,1120,5,W11a,1125,11,133.0,14.0,133.0,51.0,0.582517,2019_1120_1125
9,2019_1124_1125,0.5,2019,1124,1125,X09,1124,9,W11a,1125,11,133.0,46.0,133.0,51.0,0.316473,2019_1124_1125
