In [44]:
import numpy as np
import pandas as pd
from sklearn import metrics
import math
import tabulate
import time

from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn import svm, neighbors
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier #RandomizedLasso
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessRegressor 
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.semi_supervised import LabelPropagation
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# data processing functions

In [45]:
def fix_seed(string_seed):
    result = ""
    for char in string_seed:
        if char.isdigit():
            result += char
    return int(result)

In [46]:
def load_data_2018():
    ext_data_matchups = ['PomeroyRank', 'Conf', 'AdjEM', 'AdjO', 'AdjD', 'AdjT', 'Luck', 'SOSAdjEM', 'OppO', 'OppD', 'NCSOSAdjEM', 'MooreRank', 'MooreSOS', 'MoorePR', 'OppPomeroyRank', 'OppConf', 'OppAdjEM', 'OppAdjO', 'OppAdjD', 'OppAdjT', 'OppLuck', 'OppSOSAdjEM', 'OppOppO', 'OppOppD', 'OppNCSOSAdjEM', 'OppMooreRank', 'OppMooreSOS', 'OppMoorePR']
    ext_data_team = ['PomeroyRank', 'Conf', 'AdjEM', 'AdjO', 'AdjD', 'AdjT', 'Luck', 'SOSAdjEM', 'OppO', 'OppD', 'NCSOSAdjEM', 'MooreRank', 'MooreSOS', 'MoorePR']
    
    train = pd.read_csv('train_2010_2017.csv')
    #train = train.drop(labels=ext_data_matchups, axis=1)
    #train['TeamSeed'] = train['TeamSeed'].apply(fix_seed)
    #train['OppTeamSeed'] = train['OppTeamSeed'].apply(fix_seed)
    
    train_Y = train['Outcome']
    train_X = train.drop(labels=['Outcome'], axis=1)
    
    team_data = pd.read_csv('team_info_2018.csv')
    #team_data = team_data.drop(labels=ext_data_team, axis=1)
    #team_data['Seed'] = team_data['Seed'].apply(fix_seed)
    
    return train_X, train_Y, team_data

In [47]:
def load_data_2017():
    ext_data_matchups = ['PomeroyRank', 'Conf', 'AdjEM', 'AdjO', 'AdjD', 'AdjT', 'Luck', 'SOSAdjEM', 'OppO', 'OppD', 'NCSOSAdjEM', 'MooreRank', 'MooreSOS', 'MoorePR', 'OppPomeroyRank', 'OppConf', 'OppAdjEM', 'OppAdjO', 'OppAdjD', 'OppAdjT', 'OppLuck', 'OppSOSAdjEM', 'OppOppO', 'OppOppD', 'OppNCSOSAdjEM', 'OppMooreRank', 'OppMooreSOS', 'OppMoorePR']
    ext_data_team = ['PomeroyRank', 'Conf', 'AdjEM', 'AdjO', 'AdjD', 'AdjT', 'Luck', 'SOSAdjEM', 'OppO', 'OppD', 'NCSOSAdjEM', 'MooreRank', 'MooreSOS', 'MoorePR']
    
    train = pd.read_csv('train_2010_2016.csv')
    #train = train.drop(labels=ext_data_matchups, axis=1)
    #train['TeamSeed'] = train['TeamSeed'].apply(fix_seed)
    #train['OppTeamSeed'] = train['OppTeamSeed'].apply(fix_seed)
    
    train_Y = train['Outcome']
    train_X = train.drop(labels=['Outcome'], axis=1)
    
    team_data = pd.read_csv('team_info_2017.csv')
    #team_data = team_data.drop(labels=ext_data_team, axis=1)
    #team_data['Seed'] = team_data['Seed'].apply(fix_seed)
    
    return train_X, train_Y, team_data

In [48]:
def check_coef(lr, train_X):
    adj = []
    coefs = lr.coef_
    for c in coefs[0]:
        adj.append(math.exp(c))

    features = pd.DataFrame(data=list(train_X))
    weights = pd.DataFrame(data=adj)

    feature_weights = pd.concat([features, weights], axis=1)
    feature_weights.columns = ['Feature', 'Weight']
    feature_weights = feature_weights.sort_values(by='Weight', ascending=False)
    return feature_weights

# helper functions for matchups

In [49]:
def winners_to_matchups(winners):
    matchups = []
    for i in xrange(0,len(winners),2):
        team1 = winners[i]
        team2 = winners[i+1]
        matchups.append([team1, team2])
    return matchups

In [50]:
def create_matchups(team_data, pairings, rd, season):
    opp_prefixes = ['Season', 'OppTeamID', 'OppW', 'OppL', 'OppAvgScore', 'OppAvgFGM', 'OppAvgFGA', 'OppAvgFGM3', 'OppAvgFGA3', 'OppAvgFTM', 'OppAvgFTA', 'OppAvgOR', 'OppAvgDR', 'OppAvgAst', 'OppAvgTO', 'OppAvgStl', 'OppAvgBlk', 'OppAvgPF', 'OppAvgOppScore', 'OppAvgOppFGM', 'OppAvgOppFGA', 'OppAvgOppFGM3', 'OppAvgOppFGA3', 'OppAvgOppFTM', 'OppAvgOppFTA', 'OppAvgOppOR', 'OppAvgOppDR', 'OppAvgOppAst', 'OppAvgOppTO', 'OppAvgOppStl', 'OppAvgOppBlk', 'OppAvgOppPF', 'OppSeed']
    
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    
    for p in pairings:        
        team_1 = p[0]
        team_1_data = team_data[(team_data['Season'] == season) & (team_data['TeamID'] == team_1)]
        team_1_data_opp = team_1_data.copy()
        team_1_data_opp.columns = opp_prefixes
        
        team_2 = p[1]
        team_2_data = team_data[(team_data['Season'] == season) & (team_data['TeamID'] == team_2)]
        team_2_data_opp = team_2_data.copy()
        team_2_data_opp.columns = opp_prefixes
        
        team1_v_team2 = team_1_data.merge(team_2_data_opp, how='outer', on='Season')
        team2_v_team1 = team_2_data.merge(team_1_data_opp, how='outer', on='Season')
        
        df1 = df1.append(team1_v_team2, ignore_index=True)
        df2 = df2.append(team2_v_team1, ignore_index=True)
        
    df = df1.append(df2, ignore_index=True)
    df['Round'] = rd
    df = df.rename(columns={'Seed': 'TeamSeed', 'OppSeed': 'OppTeamSeed', 'AvgScore': 'AvgPoints', 'AvgOppScore': 'AvgOppPoints', 'OppAvgScore': 'OppAvgPoints'})
    df = df[['Season', 'Round', 'TeamID', 'OppTeamID', 'TeamSeed', 'OppTeamSeed', 'W', 'L', 'AvgPoints', 'AvgFGM', 'AvgFGA', 'AvgFGM3', 'AvgFGA3', 'AvgFTM', 'AvgFTA', 'AvgOR', 'AvgDR', 'AvgAst', 'AvgTO', 'AvgStl', 'AvgBlk', 'AvgPF', 'AvgOppPoints', 'AvgOppFGM', 'AvgOppFGA', 'AvgOppFGM3', 'AvgOppFGA3', 'AvgOppFTM', 'AvgOppFTA', 'AvgOppOR', 'AvgOppDR', 'AvgOppAst', 'AvgOppTO', 'AvgOppStl', 'AvgOppBlk', 'AvgOppPF', 'OppW', 'OppL', 'OppAvgPoints', 'OppAvgFGM', 'OppAvgFGA', 'OppAvgFGM3', 'OppAvgFGA3', 'OppAvgFTM', 'OppAvgFTA', 'OppAvgOR', 'OppAvgDR', 'OppAvgAst', 'OppAvgTO', 'OppAvgStl', 'OppAvgBlk', 'OppAvgPF', 'OppAvgOppScore', 'OppAvgOppFGM', 'OppAvgOppFGA', 'OppAvgOppFGM3', 'OppAvgOppFGA3', 'OppAvgOppFTM', 'OppAvgOppFTA', 'OppAvgOppOR', 'OppAvgOppDR', 'OppAvgOppAst', 'OppAvgOppTO', 'OppAvgOppStl', 'OppAvgOppBlk', 'OppAvgOppPF']]
    
    return df

# baseline predictor - always pick higher seed

In [51]:
def baseline_predictor(matchups):
    winners = []
    losers = []
    
    for m in matchups:
        team1 = m[0]
        team2 = m[1]
        
        seed1 = seeds_dict[team1]
        seed2 = seeds_dict[team2]
        
        if (seed1 > seed2):
            winners.append(team2)
            losers.append(team1)
        else:
            winners.append(team1)
            losers.append(team2)
            
    return losers, winners

In [52]:
def predict_bracket_baseline(matchups):
    winners = []
    losers = []
    
    for r in rounds:
        (loser_ids, winner_ids) = baseline_predictor(matchups)
        winners.append(winner_ids)
        losers.append(loser_ids)
        winner_names = [team_dict[team_id] for team_id in winner_ids]
        #print winner_names
        #print
        
        if (r < 6):
            matchups = winners_to_matchups(winner_ids)
            
    return losers, winners

# normal probability-based prediction

In [53]:
def predict_with_prob(classifier, matchups):
    split = len(matchups) / 2
    
    # get win probabilities
    teams = matchups[['TeamID', 'OppTeamID']]
    win_probs = pd.DataFrame(data=classifier.predict_proba(matchups), columns=['Loss', 'Win'])
    results = pd.concat([teams, win_probs], axis=1)

    # compare predictions for each matchup from each POV
    results_1 = results.iloc[:split]
    results_1.loc[:,'Matchup'] = results_1.index
    results_2 = results.iloc[split:].reset_index()
    results_2.loc[:,'Matchup'] = results_2.index
    results_concat = results_1.join(results_2, on='Matchup', lsuffix='1', rsuffix='2')
    results_concat = results_concat[['TeamID1', 'OppTeamID1', 'Win1', 'Win2']]
    results_concat.columns = ['Team1', 'Team2', 'Win1', 'Win2']
    
    # standardize probabilities
    results_concat['Sum'] = results_concat['Win1'] + results_concat['Win2']
    results_concat['Win1Adj'] = results_concat['Win1'] / results_concat['Sum']
    results_concat['Win2Adj'] = results_concat['Win2'] / results_concat['Sum']

    # make predictions
    results_concat['Team1WinPred'] = np.where(results_concat['Win1Adj'] > results_concat['Win2Adj'], 1, 0)
    # print results_concat
    
    pred_winners = np.where(results_concat['Team1WinPred'] == 1, results_concat['Team1'], results_concat['Team2'])
    pred_losers = np.where(results_concat['Team1WinPred'] == 1, results_concat['Team2'], results_concat['Team1'])
    return pred_losers, pred_winners

In [54]:
def predict_first_round(classifier, matchups):
    split = len(matchups) / 2
    
    # get win probabilities
    teams = matchups[['TeamID', 'OppTeamID']]
    win_probs = pd.DataFrame(data=classifier.predict_proba(matchups), columns=['Loss', 'Win'])
    results = pd.concat([teams, win_probs], axis=1)

    # compare predictions for each matchup from each POV
    results_1 = results.iloc[:split]
    results_1.loc[:,'Matchup'] = results_1.index
    results_2 = results.iloc[split:].reset_index()
    results_2.loc[:,'Matchup'] = results_2.index
    results_concat = results_1.join(results_2, on='Matchup', lsuffix='1', rsuffix='2')
    results_concat = results_concat[['TeamID1', 'OppTeamID1', 'Win1', 'Win2']]
    results_concat.columns = ['Team1', 'Team2', 'Win1', 'Win2']
    
    # standardize probabilities
    results_concat['Sum'] = results_concat['Win1'] + results_concat['Win2']
    results_concat['Win1Adj'] = results_concat['Win1'] / results_concat['Sum']
    results_concat['Win2Adj'] = results_concat['Win2'] / results_concat['Sum']

    # make predictions
    results_concat['Team1WinPred'] = np.where(results_concat['Win1Adj'] > results_concat['Win2Adj'], 1, 0)
    
    results = results_concat['Team1WinPred'].values.tolist()
    return results

In [55]:
def predict_bracket(team_data, matchups, classifier, season):
    winners = []
    losers = []
    
    for r in rounds:
        matchups_with_data = create_matchups(team_data, matchups, r, season)
        (loser_ids, winner_ids) = predict_with_prob(classifier, matchups_with_data)
        winners.append(winner_ids)
        losers.append(loser_ids)
        winner_names = [team_dict[team_id] for team_id in winner_ids]
        #print winner_names
        #print

        if (r < 6):
            matchups = winners_to_matchups(winner_ids)
    return losers, winners

# prediction with upset bonus

In [56]:
def predict_with_upset_bonus(classifier, matchups, round_num):
    points = {1:1, 2:2, 3:4, 4:8, 5:16, 6:32}
    upset_bonus = 2
    round_points = points[round_num]
    
    split = len(matchups) / 2
    
    # get win probabilities
    teams = matchups[['TeamID', 'OppTeamID']]
    win_probs = pd.DataFrame(data=classifier.predict_proba(matchups), columns=['Loss', 'Win'])
    results = pd.concat([teams, win_probs], axis=1)

    # compare predictions for each matchup from each POV
    results_1 = results.iloc[:split]
    results_1.loc[:,'Matchup'] = results_1.index
    results_2 = results.iloc[split:].reset_index()
    results_2.loc[:,'Matchup'] = results_2.index
    results_concat = results_1.join(results_2, on='Matchup', lsuffix='1', rsuffix='2')
    results_concat = results_concat[['TeamID1', 'OppTeamID1', 'Win1', 'Win2']]
    results_concat.columns = ['Team1', 'Team2', 'Win1', 'Win2']
    
    # standardize probabilities
    results_concat['Sum'] = results_concat['Win1'] + results_concat['Win2']
    results_concat['Win1Adj'] = results_concat['Win1'] / results_concat['Sum']
    results_concat['Win2Adj'] = results_concat['Win2'] / results_concat['Sum']
    
    # calculate expected values
    results_concat['Team1Seed'] = results_concat['Team1'].map(seeds_dict)
    results_concat['Team2Seed'] = results_concat['Team2'].map(seeds_dict)
    
    results_concat['Team1WinVal'] = np.where(results_concat['Team1Seed'] > results_concat['Team2Seed'], round_points + upset_bonus, round_points)
    results_concat['Team2WinVal'] = np.where(results_concat['Team1Seed'] < results_concat['Team2Seed'], round_points + upset_bonus, round_points)
    
    results_concat['Team1ExpVal'] = results_concat['Win1Adj'] * results_concat['Team1WinVal']
    results_concat['Team2ExpVal'] = results_concat['Win2Adj'] * results_concat['Team2WinVal']
    
    #results_concat = results_concat[['Team1', 'Win1Adj', 'Team1Seed', 'Team1WinVal', 'Team1ExpVal', 'Team2', 'Win2Adj', 'Team2Seed', 'Team2WinVal', 'Team2ExpVal']]
    #print results_concat
    
    pred_winners = np.where(results_concat['Team1ExpVal'] > results_concat['Team2ExpVal'], results_concat['Team1'], results_concat['Team2'])
    pred_losers = np.where(results_concat['Team1ExpVal'] < results_concat['Team2ExpVal'], results_concat['Team1'], results_concat['Team2'])
    return pred_winners, pred_losers

In [57]:
def predict_bracket_upsets(team_data, matchups, classifier, season):
    winners = []
    losers = []
    
    for r in rounds:
        matchups_with_data = create_matchups(team_data, matchups, r, season)
        (loser_ids, winner_ids) = predict_with_upset_bonus(classifier, matchups_with_data, r)
        winners.append(winner_ids)
        losers.append(loser_ids)
        winner_names = [team_dict[team_id] for team_id in winner_ids]
        #print winner_names
        #print

        if (r < 6):
            matchups = winners_to_matchups(winner_ids)
    
    return losers, winners

# scoring metrics

In [58]:
def score_bracket_upsets(results, pred_winners, pred_losers):
    # see https://www.nytimes.com/2015/03/16/upshot/heres-how-our-ncaa-bracket-works.html
    points = [1, 2, 4, 8, 16, 32]
    total_pts = 0
    upset_bonus = 5
    num_upsets = 0
    
    for rd, pts, pred_win, pred_lose, act_win in zip(rounds, points, pred_winners, pred_losers, results):
        num_correct = 0
        
        for pred_w, pred_l, act_w in zip(pred_win, pred_lose, act_win):
            if (pred_w == act_w):
                num_correct += 1
                if (seeds_dict[pred_w] > seeds_dict[pred_l]):
                    num_upsets += 1
                
        rd_pts = pts * num_correct
        total_pts += rd_pts
    
    total_pts += (num_upsets * upset_bonus)
    return total_pts

In [59]:
def score_bracket_espn(results, prediction):
    # see http://games.espn.com/tournament-challenge-bracket/2018/en/story?pageName=tcmen%5Chowtoplay
    points = [10, 20, 40, 80, 160, 320]
    total_pts = 0
    
    for rd, pts, pred_winners, act_winners in zip(rounds, points, prediction, results):
        num_correct = 0
        
        for pred, act in zip(pred_winners, act_winners):
            if (pred == act):
                num_correct += 1
        
        rd_pts = pts * num_correct
        total_pts += rd_pts
    
    return total_pts

# predict 2018

In [60]:
(train_X, train_Y, team_data) = load_data_2018()
teams = pd.read_csv('DataFiles/Teams.csv')
team_dict = pd.Series(teams.TeamName.values,index=teams.TeamID).to_dict()
rounds = [1, 2, 3, 4, 5, 6]

matchups = [[1438,1420], [1166, 1243], [1246, 1172], [1112, 1138], [1274, 1260], [1397, 1460], [1305, 1400], [1153, 1209], [1462, 1411], [1281, 1199], [1326, 1355], [1211, 1422], [1222, 1361], [1276, 1285], [1401, 1344], [1314, 1252], [1437, 1347], [1439, 1104], [1452, 1293], [1455, 1267], [1196, 1382], [1403, 1372], [1116, 1139], [1345, 1168], [1242, 1335], [1371, 1301], [1155, 1308], [1120, 1158], [1395, 1393], [1277, 1137], [1348, 1328], [1181, 1233]]    
tournament_results = [[1420, 1243, 1246, 1138, 1260, 1397, 1305, 1153, 1462, 1199, 1326, 1211, 1222, 1276, 1401, 1314, 1437, 1104, 1452, 1267, 1196, 1403, 1139, 1345, 1242, 1371, 1155, 1120, 1393, 1277, 1348, 1181],[1243, 1246, 1260, 1305, 1199, 1211, 1276, 1401, 1437, 1452, 1403, 1345, 1242, 1155, 1393, 1181],[1243, 1260, 1199, 1276, 1437, 1403, 1242, 1181],[1260, 1276, 1437, 1242],[1276, 1437],[1437]]

In [61]:
seeds = pd.read_csv('Stage2UpdatedDataFiles/NCAATourneySeeds.csv')
seeds = seeds[seeds['Season'] == 2018]
seeds['Seed'] = seeds['Seed'].apply(fix_seed)

seeds_dict = pd.Series(seeds.Seed.values,index=seeds.TeamID).to_dict()

In [62]:
models = ["GNB", "LDA","SVM_L", "5NN", "LR2", "SGD","ADA", "DT", "RF", "DPGMM", "ET", "GMM", "MLP"] #"SVM_L", "SVM_G", "P2", "DT",  "ADA_R", 
classifiers = [
            GaussianNB(), \
            LinearDiscriminantAnalysis(), \
            svm.SVC(kernel = 'linear', probability=True), \
            neighbors.KNeighborsClassifier(n_neighbors=5), \
            LogisticRegression(), \
            SGDClassifier(loss='log', tol=0.0001, power_t=0.4, average=True), \
            AdaBoostClassifier(base_estimator=None, n_estimators=100), \
            DecisionTreeClassifier(), \
            RandomForestClassifier(),  \
            BayesianGaussianMixture(n_components=2,max_iter=1000, weight_concentration_prior_type='dirichlet_process', tol=0.0001), \
            ExtraTreesClassifier(bootstrap=True, n_estimators=4), \
            GaussianMixture(n_components=2, tol=0.0001, max_iter=1000, n_init=2), \
            MLPClassifier(activation='relu', alpha=0.00001, max_iter=1000)]

In [None]:
results = []

(baseline_losers, baseline_winners) = predict_bracket_baseline(matchups)
baseline_winner = team_dict[baseline_winners[5][0]]
baseline_espn_score = score_bracket_espn(tournament_results, baseline_winners)
baseline_upset_score = score_bracket_upsets(tournament_results, baseline_winners, baseline_losers)
results.append(["Baseline", baseline_winner, baseline_espn_score, baseline_upset_score])

for m,c in zip(models,classifiers):
    c.fit(train_X, train_Y)
    (losers, winners) = predict_bracket(team_data, matchups, c, 2018)
    
    champion = team_dict[winners[5][0]]
    espn_score = score_bracket_espn(tournament_results, winners)
    upset_score = score_bracket_upsets(tournament_results, winners, losers)
    
    results.append([m, champion, espn_score, upset_score])

In [None]:
print tabulate.tabulate(results, headers=['Model', 'Champion', 'ESPN Score', 'Upset Score'])

In [None]:
#lr = LogisticRegression()
#lr.fit(train_X, train_Y)
#(losers, winners) = predict_bracket_upsets(team_data, matchups, lr, 2018)
#upset_score = score_bracket_upsets(results, winners, losers)
#print upset_score
#espn_score = score_bracket_espn(results, winners)
#print espn_score

# predict 2017

In [None]:
(train_X_2017, train_Y_2017, team_data_2017) = load_data_2017()
teams = pd.read_csv('DataFiles/Teams.csv')
team_dict = pd.Series(teams.TeamName.values,index=teams.TeamID).to_dict()
rounds = [1, 2, 3, 4, 5, 6]
matchups_2017 = [[1437, 1291], [1458, 1439], [1438, 1423], [1196, 1190], [1374, 1425], [1124, 1308], [1376, 1266], [1181, 1407], [1211, 1355], [1321, 1435], [1323, 1343], [1452, 1137], [1268, 1462], [1199, 1195], [1388, 1433], [1112, 1315], [1242, 1413], [1274, 1277], [1235, 1305], [1345, 1436], [1166, 1348], [1332, 1233], [1276, 1329], [1257, 1240], [1314, 1411], [1116, 1371], [1278, 1292], [1139, 1457], [1153, 1243], [1417, 1245], [1173, 1455], [1246, 1297]]
tournament_results_2017 = [[1437, 1458, 1438, 1196, 1425, 1124, 1376, 1181, 1211, 1321, 1323, 1452, 1462, 1199, 1388, 1112, 1242, 1277, 1235, 1345, 1348, 1332, 1276, 1257, 1314, 1116, 1292, 1139, 1153, 1417, 1455, 1246], [1458, 1196, 1124, 1376, 1211, 1452, 1462, 1112, 1242, 1345, 1332, 1276, 1314, 1139, 1417, 1246], [1196, 1376, 1211, 1462, 1242, 1332, 1314, 1246], [1376, 1211, 1332, 1314], [1211, 1314], [1314]]

In [None]:
seeds = pd.read_csv('Stage2UpdatedDataFiles/NCAATourneySeeds.csv')
seeds = seeds[seeds['Season'] == 2017]
seeds['Seed'] = seeds['Seed'].apply(fix_seed)

seeds_dict = pd.Series(seeds.Seed.values,index=seeds.TeamID).to_dict()

In [None]:
models = ["GNB", "LDA","SVM_L", "5NN", "LR2", "SGD","ADA", "DT", "RF", "DPGMM", "ET", "GMM", "MLP"] #"SVM_L", "SVM_G", "P2", "DT",  "ADA_R", 
classifiers = [
            GaussianNB(), \
            LinearDiscriminantAnalysis(), \
            svm.SVC(kernel = 'linear', probability=True), \
            neighbors.KNeighborsClassifier(n_neighbors=5), \
            LogisticRegression(), \
            SGDClassifier(loss='log', tol=0.0001, power_t=0.4, average=True), \
            AdaBoostClassifier(base_estimator=None, n_estimators=100), \
            DecisionTreeClassifier(), \
            RandomForestClassifier(),  \
            BayesianGaussianMixture(n_components=2,max_iter=1000, weight_concentration_prior_type='dirichlet_process', tol=0.0001), \
            ExtraTreesClassifier(bootstrap=True, n_estimators=4), \
            GaussianMixture(n_components=2, tol=0.0001, max_iter=1000, n_init=2), \
            MLPClassifier(activation='relu', alpha=0.00001, max_iter=1000)]

In [None]:
results_2017 = []

(baseline_losers, baseline_winners) = predict_bracket_baseline(matchups_2017)
baseline_winner = team_dict[baseline_winners[5][0]]
baseline_espn_score = score_bracket_espn(tournament_results_2017, baseline_winners)
baseline_upset_score = score_bracket_upsets(tournament_results_2017, baseline_winners, baseline_losers)
results_2017.append(["Baseline", baseline_winner, baseline_espn_score, baseline_upset_score])

for m,c in zip(models,classifiers):
    c.fit(train_X_2017, train_Y_2017)
    (losers, winners) = predict_bracket(team_data_2017, matchups_2017, c, 2017)
    
    champion = team_dict[winners[5][0]]
    espn_score = score_bracket_espn(tournament_results_2017, winners)
    upset_score = score_bracket_upsets(tournament_results_2017, winners, losers)
    
    results_2017.append([m, champion, espn_score, upset_score])

In [None]:
print tabulate.tabulate(results_2017, headers=['Model', 'Champion', 'ESPN Score', 'Upset Score'])

In [None]:
df_2017 = pd.DataFrame(results_2017, columns=['Model', '2017 Winner', '2017 ESPN Score', '2017 Upset Score'])
df_2018 = pd.DataFrame(results, columns=['Model', '2018 Winner', '2018 ESPN Score', '2018 Upset Score'])
df = df_2017.merge(df_2018, on='Model')
df = df[['Model', '2017 Winner', '2018 Winner', '2017 ESPN Score', '2018 ESPN Score', '2017 Upset Score', '2018 Upset Score']]
#print df.to_latex(index=False)

## predict first round results

In [88]:
def bin_test(x_data, x_labels, y_data, y_labels):
    starttime = time.time()

    # binary models
    models = ["GNB", "LDA","SVM_L", "5NN", "LR2", "SGD","ADA", "DT", "RF", "DPGMM", "ET", "GMM", "MLP"] #"SVM_L", "SVM_G", "P2", "DT",  "ADA_R", 
    clfs = [
                GaussianNB(), \
                LinearDiscriminantAnalysis(), \
                svm.SVC(kernel = 'linear', probability=True), \
                neighbors.KNeighborsClassifier(n_neighbors=5), \
                LogisticRegression(), \
                SGDClassifier(loss='log', tol=0.0001, power_t=0.4, average=True), \
                AdaBoostClassifier(base_estimator=None, n_estimators=100), \
                DecisionTreeClassifier(), \
                RandomForestClassifier(),  \
                BayesianGaussianMixture(n_components=2,max_iter=1000, weight_concentration_prior_type='dirichlet_process', tol=0.0001), \
                ExtraTreesClassifier(bootstrap=True, n_estimators=4), \
                GaussianMixture(n_components=2, tol=0.0001, max_iter=1000, n_init=2), \
                MLPClassifier(activation='relu', alpha=0.00001, max_iter=1000)]

    results = []

    for i in range(len(clfs)):
        print "model being tested: {0}".format(models[i])
        time_start = time.time()
        clf = clfs[i].fit(x_data, x_labels)
        predict = predict_first_round(clf, y_data)
        runtime = time.time() - time_start
        a = metrics.accuracy_score(y_labels, predict)

        results.append([models[i], a])
    print tabulate.tabulate(results, headers=['Model', 'Accuracy'])
    print "Binary test took {0} secs".format(time.time() - starttime)
    return pd.DataFrame(data=results, columns=['Model', 'Accuracy'])

## predict 2018 first round

In [None]:
(train_X_2018, train_Y_2018, team_data_2018) = load_data_2018()

test_2018 = pd.read_csv('test_2018.csv')
ext_data_matchups = ['PomeroyRank', 'Conf', 'AdjEM', 'AdjO', 'AdjD', 'AdjT', 'Luck', 'SOSAdjEM', 'OppO', 'OppD', 'NCSOSAdjEM', 'MooreRank', 'MooreSOS', 'MoorePR', 'OppPomeroyRank', 'OppConf', 'OppAdjEM', 'OppAdjO', 'OppAdjD', 'OppAdjT', 'OppLuck', 'OppSOSAdjEM', 'OppOppO', 'OppOppD', 'OppNCSOSAdjEM', 'OppMooreRank', 'OppMooreSOS', 'OppMoorePR']
test_2018 = test_2018.drop(ext_data_matchups, axis=1)  
test_2018['TeamSeed'] = test_2018['TeamSeed'].apply(fix_seed)
test_2018['OppTeamSeed'] = test_2018['OppTeamSeed'].apply(fix_seed)
test_Y_2018 = test_2018['Outcome'][0:32]
test_X_2018 = test_2018.drop(['Outcome'], axis=1)

fr_2018 = bin_test(train_X_2018, train_Y_2018, test_X_2018, test_Y_2018)

In [None]:
#fr_2018 = fr_2018.round(decimals=3)
#df = df.merge(fr_2018, on='Model')
#df.columns = ['Model', '2017 Winner', '2018 Winner', '2017 ESPN Score', '2018 ESPN Score', '2017 Upset Score', '2018 Upset Score', '2018 FR Accuracy']
#print df

## predict 2017 first round

In [None]:
fr_2017_matchups = create_matchups(team_data_2017, matchups_2017, 1, 2017)

fr_2017_winners = tournament_results_2017[0]

outcomes = []
teams = fr_2017_matchups['TeamID'].values.tolist()
for t in teams:
    if t in fr_2017_winners:
        outcomes.append(1)
    else:
        outcomes.append(0)
        
outcomes = outcomes[:32]

fr_2017 = bin_test(train_X_2017, train_Y_2017, fr_2017_matchups, outcomes)

In [None]:
#fr_2017 = fr_2017.round(decimals=3)
#df = df.merge(fr_2017, on='Model')
#df.columns = ['Model', '2017 Winner', '2018 Winner', '2017 ESPN Score', '2018 ESPN Score', '2017 Upset Score', '2018 Upset Score', '2018 FR Acc', '2017 FR Acc']
#print df

In [None]:
#print df.to_latex(index=False)

## check LR coefficients for 2018 predictions

In [None]:
#(train_X, train_Y, team_data) = load_data_2018()
#scaled_X = StandardScaler().fit_transform(train_X)
#lr = LogisticRegression()
#lr.fit(scaled_X, train_Y)
#weights = check_coef(lr, train_X)
#print weights

## ensemble classifier

In [35]:
def find_outliers(x_data, x_labels, y_data, y_labels):
    starttime = time.time()

    # binary models
    models = ["GNB", "LDA","SVM_L", "5NN", "LR2", "SGD","ADA", "DT", "RF", "DPGMM", "ET", "GMM", "MLP"] #"SVM_L", "SVM_G", "P2", "DT",  "ADA_R", 
    clfs = [
                GaussianNB(), \
                LinearDiscriminantAnalysis(), \
                svm.SVC(kernel = 'linear', probability=True), \
                neighbors.KNeighborsClassifier(n_neighbors=5), \
                LogisticRegression(), \
                SGDClassifier(loss='log', tol=0.0001, power_t=0.4, average=True), \
                AdaBoostClassifier(base_estimator=None, n_estimators=100), \
                DecisionTreeClassifier(), \
                RandomForestClassifier(),  \
                BayesianGaussianMixture(n_components=2,max_iter=1000, weight_concentration_prior_type='dirichlet_process', tol=0.0001), \
                ExtraTreesClassifier(bootstrap=True, n_estimators=4), \
                GaussianMixture(n_components=2, tol=0.0001, max_iter=1000, n_init=2), \
                MLPClassifier(activation='relu', alpha=0.00001, max_iter=1000)]

    misclassified = dict.fromkeys(range(0, 33), 0)
    
    for i in range(len(clfs)):
        print "model being tested: {0}".format(models[i])
        time_start = time.time()
        clf = clfs[i].fit(x_data, x_labels)
        predict = predict_first_round(clf, y_data)
        
        i = 0
        for p,l in zip(predict, y_labels):
            if (p != l):
                misclassified[i] += 1
            i += 1
            
    return misclassified

In [36]:
(train_X_2018, train_Y_2018, team_data_2018) = load_data_2018()

test_2018 = pd.read_csv('test_2018.csv')
ext_data_matchups = ['PomeroyRank', 'Conf', 'AdjEM', 'AdjO', 'AdjD', 'AdjT', 'Luck', 'SOSAdjEM', 'OppO', 'OppD', 'NCSOSAdjEM', 'MooreRank', 'MooreSOS', 'MoorePR', 'OppPomeroyRank', 'OppConf', 'OppAdjEM', 'OppAdjO', 'OppAdjD', 'OppAdjT', 'OppLuck', 'OppSOSAdjEM', 'OppOppO', 'OppOppD', 'OppNCSOSAdjEM', 'OppMooreRank', 'OppMooreSOS', 'OppMoorePR']
test_2018 = test_2018.drop(ext_data_matchups, axis=1)  
test_2018['TeamSeed'] = test_2018['TeamSeed'].apply(fix_seed)
test_2018['OppTeamSeed'] = test_2018['OppTeamSeed'].apply(fix_seed)
test_Y_2018 = test_2018['Outcome'][0:32]
test_X_2018 = test_2018.drop(['Outcome'], axis=1)

outliers_fr_2018 = find_outliers(train_X_2018, train_Y_2018, test_X_2018, test_Y_2018)

model being tested: GNB
model being tested: LDA
model being tested: SVM_L
model being tested: 5NN
model being tested: LR2
model being tested: SGD
model being tested: ADA
model being tested: DT
model being tested: RF
model being tested: DPGMM
model being tested: ET
model being tested: GMM
model being tested: MLP


In [63]:
for key, value in sorted(outliers_fr_2018.iteritems(), key=lambda (k,v): (v,k)):
    print "%s: %s" % (key, value)

32: 0
15: 1
2: 2
8: 2
17: 2
19: 2
24: 2
4: 3
5: 3
7: 3
9: 3
13: 3
22: 3
25: 3
26: 3
29: 3
31: 3
1: 4
3: 4
30: 4
6: 5
10: 5
21: 5
27: 6
14: 7
23: 7
0: 8
16: 8
28: 8
11: 11
12: 11
18: 12
20: 13


In [None]:
# get outlier data
# 1 12 13 15 17 19 21 24 29

In [95]:
(train_X_2018, train_Y_2018, team_data_2018) = load_data_2018()

test_2018_outliers = pd.read_csv('test_2018_outliers.csv')
ext_data_matchups = ['PomeroyRank', 'Conf', 'AdjEM', 'AdjO', 'AdjD', 'AdjT', 'Luck', 'SOSAdjEM', 'OppO', 'OppD', 'NCSOSAdjEM', 'MooreRank', 'MooreSOS', 'MoorePR', 'OppPomeroyRank', 'OppConf', 'OppAdjEM', 'OppAdjO', 'OppAdjD', 'OppAdjT', 'OppLuck', 'OppSOSAdjEM', 'OppOppO', 'OppOppD', 'OppNCSOSAdjEM', 'OppMooreRank', 'OppMooreSOS', 'OppMoorePR']
test_2018_outliers = test_2018_outliers.drop(ext_data_matchups, axis=1)  
test_2018_outliers['TeamSeed'] = test_2018_outliers['TeamSeed'].apply(fix_seed)
test_2018_outliers['OppTeamSeed'] = test_2018_outliers['OppTeamSeed'].apply(fix_seed)
test_Y_2018 = test_2018_outliers['Outcome'][0:9]
test_X_2018 = test_2018_outliers.drop(['Outcome'], axis=1)

fr_2018_outliers = bin_test(train_X_2018, train_Y_2018, test_X_2018, test_Y_2018)
fr_2018_outliers = fr_2018_outliers.round(decimals=3)
print fr_2018_outliers

model being tested: GNB
9
model being tested: LDA
9
model being tested: SVM_L
9
model being tested: 5NN
9
model being tested: LR2
9
model being tested: SGD
9
model being tested: ADA
9
model being tested: DT
9
model being tested: RF
9
model being tested: DPGMM
9
model being tested: ET
9
model being tested: GMM
9
model being tested: MLP
9
Model      Accuracy
-------  ----------
GNB        0.777778
LDA        0.888889
SVM_L      0.888889
5NN        0.666667
LR2        0.888889
SGD        0.111111
ADA        0.777778
DT         0.666667
RF         0.888889
DPGMM      0.222222
ET         0.777778
GMM        0.444444
MLP        0.777778
Binary test took 67.5949790478 secs
    Model  Accuracy
0     GNB     0.778
1     LDA     0.889
2   SVM_L     0.889
3     5NN     0.667
4     LR2     0.889
5     SGD     0.111
6     ADA     0.778
7      DT     0.667
8      RF     0.889
9   DPGMM     0.222
10     ET     0.778
11    GMM     0.444
12    MLP     0.778


In [158]:
print fr_2018_outliers.to_latex(index=False)

\begin{tabular}{lr}
\toprule
 Model &  Accuracy \\
\midrule
   GNB &     0.778 \\
   LDA &     0.889 \\
 SVM\_L &     0.889 \\
   5NN &     0.667 \\
   LR2 &     0.889 \\
   SGD &     0.111 \\
   ADA &     0.778 \\
    DT &     0.667 \\
    RF &     0.889 \\
 DPGMM &     0.222 \\
    ET &     0.778 \\
   GMM &     0.444 \\
   MLP &     0.778 \\
\bottomrule
\end{tabular}



In [159]:
ens_names = ['LDA', 'SVM_L', 'LR2', 'RF', 'ADA', 'ET']
ens_clfs = [LinearDiscriminantAnalysis(), \
            svm.SVC(kernel = 'linear', probability=True), \
            LogisticRegression(), \
            RandomForestClassifier(), \
            AdaBoostClassifier(base_estimator=None, n_estimators=100), \
            ExtraTreesClassifier(bootstrap=True, n_estimators=4)
           ]

ens_models = []
for n,c in zip(ens_names, ens_clfs):
    ens_models.append([n,c])
    
ens_clf = VotingClassifier(ens_models, voting='soft')

In [160]:
(train_X, train_Y, team_data) = load_data_2018()
teams = pd.read_csv('DataFiles/Teams.csv')
team_dict = pd.Series(teams.TeamName.values,index=teams.TeamID).to_dict()
rounds = [1, 2, 3, 4, 5, 6]

matchups = [[1438,1420], [1166, 1243], [1246, 1172], [1112, 1138], [1274, 1260], [1397, 1460], [1305, 1400], [1153, 1209], [1462, 1411], [1281, 1199], [1326, 1355], [1211, 1422], [1222, 1361], [1276, 1285], [1401, 1344], [1314, 1252], [1437, 1347], [1439, 1104], [1452, 1293], [1455, 1267], [1196, 1382], [1403, 1372], [1116, 1139], [1345, 1168], [1242, 1335], [1371, 1301], [1155, 1308], [1120, 1158], [1395, 1393], [1277, 1137], [1348, 1328], [1181, 1233]]    
tournament_results = [[1420, 1243, 1246, 1138, 1260, 1397, 1305, 1153, 1462, 1199, 1326, 1211, 1222, 1276, 1401, 1314, 1437, 1104, 1452, 1267, 1196, 1403, 1139, 1345, 1242, 1371, 1155, 1120, 1393, 1277, 1348, 1181],[1243, 1246, 1260, 1305, 1199, 1211, 1276, 1401, 1437, 1452, 1403, 1345, 1242, 1155, 1393, 1181],[1243, 1260, 1199, 1276, 1437, 1403, 1242, 1181],[1260, 1276, 1437, 1242],[1276, 1437],[1437]]

In [161]:
seeds = pd.read_csv('Stage2UpdatedDataFiles/NCAATourneySeeds.csv')
seeds = seeds[seeds['Season'] == 2018]
seeds['Seed'] = seeds['Seed'].apply(fix_seed)

seeds_dict = pd.Series(seeds.Seed.values,index=seeds.TeamID).to_dict()

In [162]:
ens_clf.fit(train_X, train_Y)
(losers, winners) = predict_bracket(team_data, matchups, ens_clf, 2018)

champion = team_dict[winners[5][0]]
print champion
espn_score = score_bracket_espn(tournament_results, winners)
print espn_score
upset_score = score_bracket_upsets(tournament_results, winners, losers)
print upset_score

Villanova
1100
145


In [163]:
test_2018 = pd.read_csv('test_2018.csv')
ext_data_matchups = ['PomeroyRank', 'Conf', 'AdjEM', 'AdjO', 'AdjD', 'AdjT', 'Luck', 'SOSAdjEM', 'OppO', 'OppD', 'NCSOSAdjEM', 'MooreRank', 'MooreSOS', 'MoorePR', 'OppPomeroyRank', 'OppConf', 'OppAdjEM', 'OppAdjO', 'OppAdjD', 'OppAdjT', 'OppLuck', 'OppSOSAdjEM', 'OppOppO', 'OppOppD', 'OppNCSOSAdjEM', 'OppMooreRank', 'OppMooreSOS', 'OppMoorePR']
test_2018 = test_2018.drop(ext_data_matchups, axis=1)  
test_2018['TeamSeed'] = test_2018['TeamSeed'].apply(fix_seed)
test_2018['OppTeamSeed'] = test_2018['OppTeamSeed'].apply(fix_seed)
test_Y_2018 = test_2018['Outcome'][0:32]
test_X_2018 = test_2018.drop(['Outcome'], axis=1)

predict = predict_first_round(ens_clf, test_X_2018)
a = metrics.accuracy_score(test_Y_2018, predict)
print a

0.8125


In [164]:
ens_names = ['LDA', 'SVM_L', 'LR2', 'RF', 'ADA', 'ET']
ens_clfs = [LinearDiscriminantAnalysis(), \
            svm.SVC(kernel = 'linear', probability=True), \
            LogisticRegression(), \
            RandomForestClassifier(), \
            AdaBoostClassifier(base_estimator=None, n_estimators=100), \
            ExtraTreesClassifier(bootstrap=True, n_estimators=4)
           ]

ens_models = []
for n,c in zip(ens_names, ens_clfs):
    ens_models.append([n,c])
    
ens_clf = VotingClassifier(ens_models, voting='soft')

In [165]:
(train_X_2017, train_Y_2017, team_data_2017) = load_data_2017()
teams = pd.read_csv('DataFiles/Teams.csv')
team_dict = pd.Series(teams.TeamName.values,index=teams.TeamID).to_dict()
rounds = [1, 2, 3, 4, 5, 6]
matchups_2017 = [[1437, 1291], [1458, 1439], [1438, 1423], [1196, 1190], [1374, 1425], [1124, 1308], [1376, 1266], [1181, 1407], [1211, 1355], [1321, 1435], [1323, 1343], [1452, 1137], [1268, 1462], [1199, 1195], [1388, 1433], [1112, 1315], [1242, 1413], [1274, 1277], [1235, 1305], [1345, 1436], [1166, 1348], [1332, 1233], [1276, 1329], [1257, 1240], [1314, 1411], [1116, 1371], [1278, 1292], [1139, 1457], [1153, 1243], [1417, 1245], [1173, 1455], [1246, 1297]]
tournament_results_2017 = [[1437, 1458, 1438, 1196, 1425, 1124, 1376, 1181, 1211, 1321, 1323, 1452, 1462, 1199, 1388, 1112, 1242, 1277, 1235, 1345, 1348, 1332, 1276, 1257, 1314, 1116, 1292, 1139, 1153, 1417, 1455, 1246], [1458, 1196, 1124, 1376, 1211, 1452, 1462, 1112, 1242, 1345, 1332, 1276, 1314, 1139, 1417, 1246], [1196, 1376, 1211, 1462, 1242, 1332, 1314, 1246], [1376, 1211, 1332, 1314], [1211, 1314], [1314]]

In [166]:
seeds = pd.read_csv('Stage2UpdatedDataFiles/NCAATourneySeeds.csv')
seeds = seeds[seeds['Season'] == 2017]
seeds['Seed'] = seeds['Seed'].apply(fix_seed)

seeds_dict = pd.Series(seeds.Seed.values,index=seeds.TeamID).to_dict()

In [167]:
ens_clf.fit(train_X_2017, train_Y_2017)
(losers, winners) = predict_bracket(team_data_2017, matchups_2017, ens_clf, 2017)

champion = team_dict[winners[5][0]]
print champion
espn_score = score_bracket_espn(tournament_results_2017, winners)
print espn_score
upset_score = score_bracket_upsets(tournament_results_2017, winners, losers)
print upset_score

Villanova
790
104


In [168]:
fr_2017_matchups = create_matchups(team_data_2017, matchups_2017, 1, 2017)

fr_2017_winners = tournament_results_2017[0]

outcomes = []
teams = fr_2017_matchups['TeamID'].values.tolist()
for t in teams:
    if t in fr_2017_winners:
        outcomes.append(1)
    else:
        outcomes.append(0)
        
outcomes = outcomes[:32]

predict = predict_first_round(ens_clf, fr_2017_matchups)
a = metrics.accuracy_score(outcomes, predict)
print a

0.84375
