In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV

import warnings
# Suppress FutureWarning messages for the remainder of the notebook.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Directory holding the men's CSV data files; file names are appended to it
# with plain string concatenation, so the trailing slash is required.
mens_dir = "../data/MDataFiles_Stage1/"

In [3]:
# Let pandas display every column of wide frames (up to 999) instead of eliding them.
pd.set_option('display.max_columns', 999)

# Tournament seeds and compact tournament results, read from the men's data directory.
M_tour = pd.read_csv(mens_dir + "MNCAATourneyCompactResults.csv")
M_seeds = pd.read_csv(mens_dir + "MNCAATourneySeeds.csv")

# Load the training data
For this second Kaggle attempt I will keep things relatively simple, just like the first try, and use only a handful of files to calculate the features for the models.

Loading detailed season data to calculate season average statistics for each team.

In [5]:
# Per-game regular-season detailed results; each row carries W*-prefixed columns
# for the winning team and L*-prefixed columns for the losing team.
M_reg_season_detailed = pd.read_csv(mens_dir + "MRegularSeasonDetailedResults.csv")
M_reg_season_detailed.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,3,14,11,18,14,24,13,23,7,1,22,22,53,2,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,8,20,10,19,15,28,16,13,4,4,18,24,67,6,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,8,18,17,29,17,26,15,10,5,2,25,22,73,3,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,3,9,17,31,6,19,11,12,14,2,18,18,49,6,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,6,14,11,13,17,22,12,14,4,4,20,24,62,6,16,17,27,21,15,12,10,7,1,14


Drop the columns I won't be using.

In [6]:
# Box-score columns that are not needed for the features computed below; what
# remains is the season, day, team ids, scores, field goals made/attempted and
# turnovers for each side.
unused_columns = [
    'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WDR', 'WAst',
    'WStl', 'WBlk', 'WPF', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LDR',
    'LAst', 'LStl', 'LBlk', 'LPF', 'WLoc', 'NumOT', 'WOR', 'LOR',
]
M_reg_season_detailed = M_reg_season_detailed.drop(columns=unused_columns)
M_reg_season_detailed.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WFGM,WFGA,WTO,LFGM,LFGA,LTO
0,2003,10,1104,68,1328,62,27,58,23,22,53,18
1,2003,10,1272,70,1393,63,26,62,13,24,67,12
2,2003,11,1266,73,1437,61,24,58,10,22,73,12
3,2003,11,1296,56,1457,50,18,38,12,18,49,19
4,2003,11,1400,77,1208,71,30,61,14,24,62,10


In [7]:
# Team lookup table; its TeamID column is what later cells iterate over.
M_teams = pd.read_csv(mens_dir + "MTeams.csv")
M_teams.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2020
1,1102,Air Force,1985,2020
2,1103,Akron,1985,2020
3,1104,Alabama,1985,2020
4,1105,Alabama A&M,2000,2020


In [8]:
# Build one row of regular-season summary statistics per (season, team):
# number of wins, field-goal percentage, average turnovers, points per game
# and opponent points per game.
#
# NOTE(review): range() excludes its end value, so 2018 is the last season
# processed -- confirm that leaving out 2019 is intentional.
year_list = range(2003, 2019)

# M_teams is already loaded from MTeams.csv in an earlier cell, so the file
# is not read a second time here.
teamIDs = M_teams['TeamID'].tolist()

# Columns kept for a team once the W*/L* prefixes have been normalised away.
stat_columns = ['TeamID', 'FGM', 'FGA', 'TO', 'Score', 'OppScore']

rows = list()

for year in year_list:
    # The season filter does not depend on the team, so compute it once per
    # year instead of once per (year, team) pair.
    df_curr_season = M_reg_season_detailed[M_reg_season_detailed.Season == year]

    for team in teamIDs:
        df_curr_team_wins = df_curr_season[df_curr_season.WTeamID == team]
        df_curr_team_losses = df_curr_season[df_curr_season.LTeamID == team]

        # no games played by team this season: nothing to summarise
        if df_curr_team_wins.shape[0] == 0 and df_curr_team_losses.shape[0] == 0:
            continue

        # Games the team won: its own numbers are in the W* columns and the
        # opponent's score is LScore.
        df_winteam = df_curr_team_wins.rename(columns={'WTeamID':'TeamID', 'WFGM':'FGM',
                    'WFGA':'FGA', 'WTO':'TO', 'WScore':'Score', 'LScore':'OppScore'})
        df_winteam = df_winteam[stat_columns]

        # Games the team lost: its own numbers are in the L* columns and the
        # opponent's score is WScore.
        df_loseteam = df_curr_team_losses.rename(columns={'LTeamID':'TeamID', 'LFGM':'FGM',
                    'LFGA':'FGA', 'LTO':'TO', 'LScore':'Score', 'WScore':'OppScore'})
        df_loseteam = df_loseteam[stat_columns]

        # dataframe w/ all relevant stats from current year for current team
        df_curr_team = pd.concat((df_winteam, df_loseteam))
        games_played = len(df_curr_team)

        wins = df_winteam.shape[0]
        # Season-level shooting percentage: total makes over total attempts
        # (not the mean of per-game percentages).
        FGPercent = df_curr_team['FGM'].sum() / df_curr_team['FGA'].sum()
        TurnoverAvg = df_curr_team['TO'].sum() / games_played
        PPG = df_curr_team['Score'].sum() / games_played
        OppPPG = df_curr_team['OppScore'].sum() / games_played

        # collect all data in the rows list first for efficiency; the
        # DataFrame is built once after the loops
        rows.append([year, team, wins, FGPercent, TurnoverAvg, PPG, OppPPG])

M_training_data = pd.DataFrame(rows, columns=['Season', 'TeamID', 'Wins', 'FGPercent',
                                               'TOAvg', 'PPG', 'OppPPG'])
M_training_data.head()

Unnamed: 0,Season,TeamID,Wins,FGPercent,TOAvg,PPG,OppPPG
0,2003,1102,12,0.481149,11.428571,57.25,57.0
1,2003,1103,13,0.486074,12.62963,78.777778,78.148148
2,2003,1104,17,0.420362,13.285714,69.285714,65.0
3,2003,1105,7,0.395755,18.653846,71.769231,76.653846
4,2003,1106,13,0.423773,17.035714,63.607143,63.75


In [9]:
# Preview the tournament seeds read from MNCAATourneySeeds.csv.
M_seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [10]:
# Preview the tournament results read from MNCAATourneyCompactResults.csv.
M_tour.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [11]:
def seed_to_int(seed):
    """Return the numeric part of a seed label as an int.

    Only the characters at positions 1 and 2 of ``seed`` are parsed, so
    ``"W01"`` gives ``1``; the leading character and anything from
    position 3 onward are ignored.
    """
    return int(seed[1:3])
# Convert every seed label to its integer value, then discard the label column.
M_seeds['seed_int'] = M_seeds['Seed'].map(seed_to_int)
M_seeds = M_seeds.drop(columns=['Seed'])  # This is the string label
M_seeds.head()

Unnamed: 0,Season,TeamID,seed_int
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5


In [12]:
# Only the season and the two team ids are needed from the tournament results.
M_tour = M_tour.drop(columns=['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'])
M_tour.head()

Unnamed: 0,Season,WTeamID,LTeamID
0,1985,1116,1234
1,1985,1120,1345
2,1985,1207,1250
3,1985,1229,1425
4,1985,1242,1325


In [13]:
# Two views of the seed table, keyed for the winning and the losing team.
M_winseeds = M_seeds.rename(columns={"TeamID": "WTeamID", "seed_int": "WSeed"})
M_lossseeds = M_seeds.rename(columns={"TeamID": "LTeamID", "seed_int": "LSeed"})

# Attach the winner's seed (left join), then the loser's seed (default inner join).
M_dummy = M_tour.merge(M_winseeds, how="left", on=["Season", "WTeamID"])
M_merged = M_dummy.merge(M_lossseeds, on=["Season", "LTeamID"])

# Winner seed minus loser seed.
M_merged["SeedDiff"] = M_merged["WSeed"] - M_merged["LSeed"]
M_merged.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,SeedDiff
0,1985,1116,1234,9,8,1
1,1985,1120,1345,11,6,5
2,1985,1207,1250,1,16,-15
3,1985,1229,1425,9,8,1
4,1985,1242,1325,3,14,-11


In [14]:
# Season statistics renamed so they can be joined once for the winner (W*)
# and once for the loser (L*) of every tournament game.
M_winstats = M_training_data.rename(columns={
    "TeamID": "WTeamID", "FGPercent": "WFGPercent", "TOAvg": "WTOAvg",
    "PPG": "WPPG", "OppPPG": "WOppPPG", "Wins": "WWins",
})
M_lossstats = M_training_data.rename(columns={
    "TeamID": "LTeamID", "FGPercent": "LFGPercent", "TOAvg": "LTOAvg",
    "PPG": "LPPG", "OppPPG": "LOppPPG", "Wins": "LWins",
})

# Inner joins: games without season statistics for either team are dropped.
df_dummy = M_merged.merge(M_winstats, on=["Season", "WTeamID"])
M_merged = df_dummy.merge(M_lossstats, on=["Season", "LTeamID"])

# Every feature is the winner's value minus the loser's value.
M_merged['FGPercentDiff'] = M_merged['WFGPercent'] - M_merged['LFGPercent']
M_merged['TOAvgDiff'] = M_merged['WTOAvg'] - M_merged['LTOAvg']
M_merged['PPGDiff'] = M_merged['WPPG'] - M_merged['LPPG']
M_merged['OppPPGDiff'] = M_merged['WOppPPG'] - M_merged['LOppPPG']

# Average scoring margin of each side (points scored minus points allowed).
winner_margin = M_merged['WPPG'] - M_merged['WOppPPG']
loser_margin = M_merged['LPPG'] - M_merged['LOppPPG']
M_merged['WinMarginDiff'] = winner_margin - loser_margin
M_merged['WinDiff'] = M_merged['WWins'] - M_merged['LWins']

# drop all columns except the ones we are using
M_merged = M_merged[['Season', 'WTeamID', 'LTeamID', 'SeedDiff', 'FGPercentDiff',
                     'TOAvgDiff', 'PPGDiff', 'OppPPGDiff', 'WinMarginDiff', 'WinDiff']]


M_merged.head()

Unnamed: 0,Season,WTeamID,LTeamID,SeedDiff,FGPercentDiff,TOAvgDiff,PPGDiff,OppPPGDiff,WinMarginDiff,WinDiff
0,2003,1421,1411,0,-0.018262,0.973563,-1.593103,7.614943,-9.208046,-5
1,2003,1112,1436,-15,0.016969,0.716749,17.421182,7.112069,10.309113,6
2,2003,1112,1211,-8,-0.008628,0.237327,8.14977,2.056452,6.093318,2
3,2003,1112,1323,-4,0.012716,2.011521,5.117512,-0.943548,6.06106,3
4,2003,1113,1272,3,0.040251,0.206897,1.448276,3.344828,-1.896552,-5


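**Create positive and negative versions of the data: the winner-minus-loser differences labelled 1, and the same differences negated (loser's point of view) labelled 0, so the algorithms have sample data of each class to classify.**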

In [15]:
# Winner's point of view: the winner-minus-loser differences as they are,
# labelled with Result = 1.
M_wins = M_merged[['SeedDiff', 'FGPercentDiff', 'TOAvgDiff', 'PPGDiff',
                   'OppPPGDiff', 'WinMarginDiff', 'WinDiff']].copy()
M_wins['Result'] = 1


In [16]:
# Loser's point of view: every difference with its sign flipped,
# labelled with Result = 0.
M_losses = -M_merged[['SeedDiff', 'FGPercentDiff', 'TOAvgDiff', 'PPGDiff',
                      'OppPPGDiff', 'WinMarginDiff', 'WinDiff']]
M_losses['Result'] = 0

In [17]:
# Stack the Result = 1 rows on top of the Result = 0 rows; the original row
# labels are kept, so each label appears once per half.
M_preds = pd.concat([M_wins, M_losses])
M_preds.head()

Unnamed: 0,SeedDiff,FGPercentDiff,TOAvgDiff,PPGDiff,OppPPGDiff,WinMarginDiff,WinDiff,Result
0,0,-0.018262,0.973563,-1.593103,7.614943,-9.208046,-5,1
1,-15,0.016969,0.716749,17.421182,7.112069,10.309113,6,1
2,-8,-0.008628,0.237327,8.14977,2.056452,6.093318,2,1
3,-4,0.012716,2.011521,5.117512,-0.943548,6.06106,3,1
4,3,0.040251,0.206897,1.448276,3.344828,-1.896552,-5,1


In [18]:
# Show the last rows of M_preds, which come from M_losses (concatenated second).
M_preds.tail()

Unnamed: 0,SeedDiff,FGPercentDiff,TOAvgDiff,PPGDiff,OppPPGDiff,WinMarginDiff,WinDiff,Result
1043,-5,0.00287,-1.533929,6.971429,3.301786,3.669643,5,0
1044,-15,0.02322,-3.101662,-4.938416,-17.573803,12.635386,10,0
1045,7,0.043533,0.400609,-2.238337,-2.965517,0.727181,0,0
1046,8,0.033658,1.827986,3.988414,10.090909,-6.102496,-1,0
1047,15,-0.049303,0.193405,-6.625668,5.250446,-11.876114,-13,0


In [19]:
# Feature matrix and label vector for the classifiers.
# Selecting the columns directly yields the same 2-D array as zipping the
# individual column arrays row by row.
feature_columns = ['SeedDiff', 'FGPercentDiff', 'TOAvgDiff', 'PPGDiff',
                   'OppPPGDiff', 'WinMarginDiff', 'WinDiff']
X_train = np.array(M_preds[feature_columns].values)
y_train = M_preds.Result.values

# M_preds holds all Result = 1 rows followed by all Result = 0 rows, so the
# rows are shuffled before cross-validation. The seed is fixed so that the
# row order, and with it the cross-validation scores, are reproducible.
SHUFFLE_SEED = 42
X_train, y_train = shuffle(X_train, y_train, random_state=SHUFFLE_SEED)

In [None]:
# Grid-search five classifier families on (X_train, y_train) and print the
# best cross-validated score of each. Scoring is 'neg_log_loss', so scores
# are negative and values closer to 0 are better. `clf` is rebound for every
# model; after this cell it refers to the logistic-regression search.

# Gradient Boosted Classifier
GBC = GradientBoostingClassifier()
param_grid_GBC = {
    "n_estimators" : [1000],
    "learning_rate" : [0.1, 0.05, 0.02, 0.01],
    "max_depth" : [1,2,3],
    "min_samples_leaf" : [1,3,5],
    # 'auto' meant sqrt(n_features) for classifiers; it was deprecated in
    # scikit-learn 1.1 and removed in 1.3, so the equivalent 'sqrt' is spelled
    # out. Fractions such as [1.0, 0.3, 0.1] are an alternative to try.
    "max_features" : ["sqrt"]
}
clf = GridSearchCV(GBC, param_grid_GBC, scoring='neg_log_loss')
clf.fit(X_train, y_train)
print('Best Gradient Boosting Classifier: {}'.format(clf.best_score_))

# Random Forest Classifier
RFC = RandomForestClassifier()
param_grid_RFC = {
    'n_estimators': [60, 120, 240, 480],
    # 'auto' was an alias of 'sqrt' for this classifier (and is removed in
    # scikit-learn 1.3), so listing it only duplicated a candidate.
    'max_features': ['sqrt', 'log2']
}
clf = GridSearchCV(RFC, param_grid_RFC, scoring='neg_log_loss')
clf.fit(X_train, y_train)
print('Best Random Forest Classifier: {}'.format(clf.best_score_))

# K Nearest Neighbors Classifier
knn = KNeighborsClassifier()
k = np.arange(80)+1  # candidate neighbour counts 1..80
parameters = {'n_neighbors': k}
clf = GridSearchCV(knn, parameters, scoring='neg_log_loss')
clf.fit(X_train, y_train)
print('Best K-Nearest Neighbors Classifier: {}'.format(clf.best_score_))

# SVC
# probability=True is needed so the log-loss scorer can obtain class probabilities.
SVC = svm.SVC(probability=True)
# Full grid, kept for reference; it is not passed to the search below.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
# Single preselected configuration that is actually searched.
tuned_parameters_preselected = [{'kernel': ['linear'], 'C': [10]}]
clf = GridSearchCV(SVC, tuned_parameters_preselected, scoring='neg_log_loss')
clf.fit(X_train, y_train)
print('Best Support Vector Classification: {}'.format(clf.best_score_))

# Logistic Regression
logreg = LogisticRegression()
# 31 values of C, one per power of ten from 1e-15 to 1e15.
params = {'C': np.logspace(start=-15, stop=15, num=31)}
clf = GridSearchCV(logreg, params, scoring='neg_log_loss', refit=True) #sklearn model selection
clf.fit(X_train, y_train)
# The format string has two placeholders, so exactly two values are passed.
print("Best Logistic Regression: {}, w/best C: {}".format(clf.best_score_,
                                                          clf.best_params_['C']))