In [1]:
# In this Kernel, I will build a model that takes into account the rating difference inherent in the Massey Ordinal system.
# Using only the ordinal rankings of Massey Ordinals doesn't take into account that the difference between
# Teams #1 vs #10 is more significant than #30 vs #40

# Partially inspired by discussion: https://www.kaggle.com/c/mens-machine-learning-competition-2019/discussion/80692

# In this kernel, I will use Jeff Sonas' Chessmetrics formula to convert from ordinal rank to true relative strength
# https://www.kaggle.com/c/march-machine-learning-mania-2014/discussion/6777

import numpy as np
import pandas as pd

# Root folder of the competition data on the local machine
PATH = '/Users/kelson/Kaggle/MarchMadness/Data/'

# Sample submission; its ID column has the form Season_Team1_Team2
sub = pd.read_csv(PATH + 'SampleSubmissionStage2.csv')

# Basic (compact) tournament results
tourney_compact = pd.read_csv(PATH + 'DataFiles/NCAATourneyCompactResults.csv')

# Massey Ordinals through day 128 of 2019
mo_up_to_128 = pd.read_csv(PATH + 'MasseyOrdinals_thru_2019_day_128.csv')

# Kaggle released the 2019 day-133 ordinals separately. Only that file is
# loaded into `mo`; combining it with `mo_up_to_128` is currently disabled:
# mo = pd.concat([mo_up_to_128, mo])
mo = pd.read_csv(PATH + 'MasseyOrdinals_2019_only_day_133_61systems.csv')

mo.head()


Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
0,2019,133,7OT,1101,156
1,2019,133,7OT,1102,205
2,2019,133,7OT,1103,82
3,2019,133,7OT,1104,40
4,2019,133,7OT,1105,346


In [2]:
# Convert each ordinal rank into a rating that reflects relative team strength.
# Sonas' Chessmetrics formula: rating = 100 - 4*ln(rank + 1) - rank/22
ordinal_rank = mo['OrdinalRank']
mo['Rating'] = 100 - 4 * np.log(ordinal_rank + 1) - (ordinal_rank / 22)
mo.head()

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank,Rating
0,2019,133,7OT,1101,156,72.684108
1,2019,133,7OT,1102,205,69.370314
2,2019,133,7OT,1103,82,78.597365
3,2019,133,7OT,1104,40,83.32753
4,2019,133,7OT,1105,346,60.875428


In [3]:
# Take the median rating for each team across all ranking systems.
# The Rating column is selected explicitly rather than dropping a few columns and
# calling .median() on whatever is left: the frame still contains the text column
# SystemName, and recent pandas versions (numeric_only defaults to False) raise a
# TypeError when asked for the median of a non-numeric column.
# Result: a one-column frame (Rating) indexed by TeamID.
mo = mo.groupby('TeamID')[['Rating']].median()
mo.head()

Unnamed: 0_level_0,Rating
TeamID,Unnamed: 1_level_1
1101,73.694174
1102,68.027473
1103,74.742569
1104,81.053487
1105,60.818463


In [4]:
# Break the submission ID ("Season_Team1_Team2") into separate numeric columns
# so the ratings can be joined on Team1 / Team2.
# expand=True returns one column per ID part; this replaces tuple-unpacking the
# `.str` accessor, which depended on iterating the accessor and is no longer
# supported by recent pandas versions.
id_parts = sub['ID'].str.split('_', expand=True)
for position, column in enumerate(['Season', 'Team1', 'Team2']):
    # The split pieces are strings; make values numerical
    sub[column] = pd.to_numeric(id_parts[position])

sub.head()

Unnamed: 0,ID,Pred,Season,Team1,Team2
0,2019_1101_1113,0.5,2019,1101,1113
1,2019_1101_1120,0.5,2019,1101,1120
2,2019_1101_1124,0.5,2019,1101,1124
3,2019_1101_1125,0.5,2019,1101,1125
4,2019_1101_1133,0.5,2019,1101,1133


In [5]:
# Attach the median rating of each side of the matchup.
# The first join adds Team1's rating as `Rating`; the second join adds Team2's
# rating and the suffixes rename the pair to RatingW (Team1) and RatingL (Team2).
sub = (
    sub
    .merge(mo, how='left', left_on='Team1', right_on='TeamID')
    .merge(mo, how='left', left_on='Team2', right_on='TeamID', suffixes=['W', 'L'])
)
sub.head()

Unnamed: 0,ID,Pred,Season,Team1,Team2,RatingW,RatingL
0,2019_1101_1113,0.5,2019,1101,1113,73.694174,81.516122
1,2019_1101_1120,0.5,2019,1101,1120,73.694174,88.531436
2,2019_1101_1124,0.5,2019,1101,1124,73.694174,82.508501
3,2019_1101_1125,0.5,2019,1101,1125,73.694174,81.876843
4,2019_1101_1133,0.5,2019,1101,1133,73.694174,72.684108


In [7]:
# Probability that Team1 beats Team2, per the Chessmetrics formula:
#   winning prob = 1 / (1 + 10^(-RatingDiff / 15)), RatingDiff = RatingW - RatingL
rating_gap = sub['RatingW'] - sub['RatingL']
sub['pred'] = 1 / (1 + 10 ** (-rating_gap / 15))

# Log loss punishes utmost certainty, so predictions of exactly 0 or 1 are risky.
# No clipping is applied here; the line below is disabled (and .clip returns a
# new Series, so it would also need to be assigned back to take effect).
# sub['pred'].clip(0.025, .975)

sub.head(20)



Unnamed: 0,ID,Pred,Season,Team1,Team2,RatingW,RatingL,pred
0,2019_1101_1113,0.5,2019,1101,1113,73.694174,81.516122,0.231348
1,2019_1101_1120,0.5,2019,1101,1120,73.694174,88.531436,0.092995
2,2019_1101_1124,0.5,2019,1101,1124,73.694174,82.508501,0.205372
3,2019_1101_1125,0.5,2019,1101,1125,73.694174,81.876843,0.221648
4,2019_1101_1133,0.5,2019,1101,1133,73.694174,72.684108,0.538685
5,2019_1101_1138,0.5,2019,1101,1138,73.694174,87.665786,0.104825
6,2019_1101_1153,0.5,2019,1101,1153,73.694174,86.033588,0.13077
7,2019_1101_1159,0.5,2019,1101,1159,73.694174,74.78086,0.458393
8,2019_1101_1181,0.5,2019,1101,1181,73.694174,95.514642,0.033909
9,2019_1101_1192,0.5,2019,1101,1192,73.694174,69.370502,0.660093


In [8]:
# Export the submission: copy ID into a lowercase `id` column and write it
# together with `pred` to CSV, without the DataFrame index.
sub['id'] = sub['ID']
submission_columns = ['id', 'pred']
sub[submission_columns].to_csv('submission_mo.csv', index=False)