import numpy as np
import pandas as pd
from sklearn import metrics
import math
import tabulate

from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn import svm, neighbors
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier #RandomizedLasso
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessRegressor 
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.semi_supervised import LabelPropagation
from sklearn.neural_network import MLPClassifier

# data processing functions

In [43]:
def fix_seed(string_seed):
    """Return the numeric part of a seed label as an int.

    Every non-digit character of ``string_seed`` is discarded and the
    remaining digits are read as a single base-10 number, e.g. ``"W01"``
    becomes ``1``.  A label without any digit raises ``ValueError`` because
    ``int("")`` is invalid.
    """
    digits_only = "".join(ch for ch in string_seed if ch.isdigit())
    return int(digits_only)

In [44]:
def load_data():
    """Load the training matchups and the 2018 per-team table from CSV.

    Reads ``train_2010_2017.csv`` and ``team_info_2018.csv`` from the current
    working directory, drops the external rating columns listed below and
    converts the seed labels to integers with ``fix_seed``.

    Returns:
        A ``(train_X, train_Y, team_data)`` tuple: the training frame without
        its ``Outcome`` column, the ``Outcome`` column on its own, and the
        cleaned per-team frame.
    """
    # External rating columns carried by the per-team file.
    ext_data_team = [
        'PomeroyRank', 'Conf', 'AdjEM', 'AdjO', 'AdjD', 'AdjT', 'Luck',
        'SOSAdjEM', 'OppO', 'OppD', 'NCSOSAdjEM',
        'MooreRank', 'MooreSOS', 'MoorePR',
    ]
    # The matchup file holds the same columns twice: once as-is for the team
    # and once with an 'Opp' prefix for its opponent.
    ext_data_matchups = ext_data_team + ['Opp' + column for column in ext_data_team]

    matchup_frame = pd.read_csv('train_2010_2017.csv')
    matchup_frame = matchup_frame.drop(labels=ext_data_matchups, axis=1)
    for seed_column in ('TeamSeed', 'OppTeamSeed'):
        matchup_frame[seed_column] = matchup_frame[seed_column].apply(fix_seed)

    # Separate the target from the feature columns.
    train_Y = matchup_frame['Outcome']
    train_X = matchup_frame.drop(labels=['Outcome'], axis=1)

    team_data = pd.read_csv('team_info_2018.csv').drop(labels=ext_data_team, axis=1)
    team_data['Seed'] = team_data['Seed'].apply(fix_seed)

    return train_X, train_Y, team_data

In [45]:
def check_coef(lr, feature_names=None):
    """Rank features by the exponential of their fitted coefficient.

    Args:
        lr: Fitted estimator exposing ``coef_``; only the first row
            (``coef_[0]``) is used.
        feature_names: Optional sequence of labels, one per coefficient.
            When omitted, the column labels of the module-level ``test_X``
            are used, which is what this function always did before.

    Returns:
        A DataFrame with columns ``Feature`` and ``Weight`` where ``Weight``
        is ``exp(coefficient)``, sorted by ``Weight`` in descending order.
        (Previously the table was built and then discarded.)

    Raises:
        ValueError: If the number of labels differs from the number of
            coefficients.
    """
    if feature_names is None:
        # Legacy behaviour: take the labels from the global test frame.
        feature_names = list(test_X)

    weights = [math.exp(c) for c in lr.coef_[0]]

    feature_weights = pd.DataFrame(
        {'Feature': list(feature_names), 'Weight': weights},
        columns=['Feature', 'Weight'],
    )
    return feature_weights.sort_values(by='Weight', ascending=False)

# helper functions for matchups

In [46]:
def winners_to_matchups(winners):
    """Pair consecutive winners into the next round's matchups.

    Args:
        winners: Indexable sequence of team ids in bracket order.

    Returns:
        A list of ``[team1, team2]`` lists built from positions
        ``(0, 1), (2, 3), ...``.  An empty input yields an empty list.

    Raises:
        IndexError: If ``winners`` has an odd number of entries, because the
            last team has no partner.
    """
    # ``range`` (not the Python 2-only ``xrange``) keeps this runnable on
    # both Python 2 and Python 3.
    return [[winners[i], winners[i + 1]] for i in range(0, len(winners), 2)]

In [47]:
def create_matchups(team_data, pairings, round):
    """Build the model input rows for a list of pairings.

    Every pairing produces two rows, one from each team's point of view.
    The returned frame lists all first-team rows (in pairing order) followed
    by all second-team rows in the same order, so row ``i`` and row
    ``i + len(pairings)`` describe the same game.

    Args:
        team_data: Per-team frame whose columns are, in order, ``Season``,
            ``TeamID``, ``W``, ``L``, the ``Avg*`` stats, the ``AvgOpp*``
            stats and ``Seed`` (33 columns); the positional relabelling
            below relies on that order.
        pairings: Iterable of ``[team1_id, team2_id]`` pairs.
        round: Accepted for the callers' benefit but currently unused.

    Returns:
        A DataFrame with one row per (team, opponent) point of view and the
        66 columns listed in ``output_columns``.

    Raises:
        ValueError: If ``pairings`` is empty.
    """
    box_stats = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
                 'Ast', 'TO', 'Stl', 'Blk', 'PF']

    # Column labels for a team row re-read as "the opponent".
    opp_prefixes = (
        ['Season', 'OppTeamID', 'OppW', 'OppL', 'OppAvgScore']
        + ['OppAvg' + stat for stat in box_stats]
        + ['OppAvgOppScore']
        + ['OppAvgOpp' + stat for stat in box_stats]
        + ['OppSeed']
    )

    # Final column order handed to the classifier.
    output_columns = (
        ['Season', 'Round', 'TeamID', 'OppTeamID', 'TeamSeed', 'OppTeamSeed',
         'W', 'L', 'AvgPoints']
        + ['Avg' + stat for stat in box_stats]
        + ['AvgOppPoints']
        + ['AvgOpp' + stat for stat in box_stats]
        + ['OppW', 'OppL', 'OppAvgPoints']
        + ['OppAvg' + stat for stat in box_stats]
        + ['OppAvgOppScore']
        + ['OppAvgOpp' + stat for stat in box_stats]
    )

    # The season filter does not depend on the pairing, so apply it once.
    season_rows = team_data[team_data['Season'] == 2018]

    def _team_views(team_id):
        """Return the team's row(s) as-is and relabelled as the opponent."""
        own = season_rows[season_rows['TeamID'] == team_id]
        as_opponent = own.copy()
        as_opponent.columns = opp_prefixes
        return own, as_opponent

    team1_pov = []
    team2_pov = []
    for p in pairings:
        team_1_data, team_1_data_opp = _team_views(p[0])
        team_2_data, team_2_data_opp = _team_views(p[1])

        team1_pov.append(team_1_data.merge(team_2_data_opp, how='outer', on='Season'))
        team2_pov.append(team_2_data.merge(team_1_data_opp, how='outer', on='Season'))

    if not team1_pov:
        raise ValueError("pairings must contain at least one matchup")

    # One concat instead of repeated DataFrame.append: append copied the
    # whole frame on every call and no longer exists in pandas >= 2.0.
    df = pd.concat(team1_pov + team2_pov, ignore_index=True)

    # NOTE(review): the ``round`` argument is ignored and every row is tagged
    # as round 1 -- confirm whether this should be ``df['Round'] = round``.
    df['Round'] = 1
    df = df.rename(columns={
        'Seed': 'TeamSeed',
        'OppSeed': 'OppTeamSeed',
        'AvgScore': 'AvgPoints',
        'AvgOppScore': 'AvgOppPoints',
        'OppAvgScore': 'OppAvgPoints',
    })
    return df[output_columns]

# baseline predictor - always pick higher seed

In [48]:
def baseline_predictor(matchups):
    """Pick the team with the lower seed number in every matchup.

    Seeds are looked up in the module-level ``seeds_dict``.  When both teams
    carry the same seed the second team of the pair is picked.

    Args:
        matchups: Iterable of ``[team1, team2]`` pairs.

    Returns:
        A ``(losers, winners)`` tuple of lists, aligned with ``matchups``.
    """
    winners, losers = [], []

    for pair in matchups:
        first, second = pair[0], pair[1]
        first_is_favorite = seeds_dict[first] < seeds_dict[second]

        picked, dropped = (first, second) if first_is_favorite else (second, first)
        winners.append(picked)
        losers.append(dropped)

    return losers, winners

In [49]:
def predict_bracket_baseline():
    """Play out the whole bracket with ``baseline_predictor``.

    Starts from the hard-coded first-round pairings, iterates over the
    module-level ``rounds`` and, for every round numbered below 6, pairs the
    winners up for the next round.

    Returns:
        A ``(losers, winners)`` tuple; each is a list with one entry per
        round holding that round's team ids.
    """
    # First-round pairings as [team1, team2] id pairs.
    matchups = [
        [1438, 1420], [1166, 1243], [1246, 1172], [1112, 1138],
        [1274, 1260], [1397, 1460], [1305, 1400], [1153, 1209],
        [1462, 1411], [1281, 1199], [1326, 1355], [1211, 1422],
        [1222, 1361], [1276, 1285], [1401, 1344], [1314, 1252],
        [1437, 1347], [1439, 1104], [1452, 1293], [1455, 1267],
        [1196, 1382], [1403, 1372], [1116, 1139], [1345, 1168],
        [1242, 1335], [1371, 1301], [1155, 1308], [1120, 1158],
        [1395, 1393], [1277, 1137], [1348, 1328], [1181, 1233],
    ]

    losers = []
    winners = []

    for r in rounds:
        loser_ids, winner_ids = baseline_predictor(matchups)
        losers.append(loser_ids)
        winners.append(winner_ids)

        # Names are resolved (for ad-hoc printing while debugging) but not
        # used any further.
        advancing_names = [team_dict[team_id] for team_id in winner_ids]

        if r >= 6:
            # Nothing follows the last round.
            continue
        matchups = winners_to_matchups(winner_ids)

    return losers, winners

# normal probability-based prediction

In [50]:
def predict_with_prob(classifier, matchups):
    """Pick each game's winner from the classifier's win probabilities.

    ``matchups`` must hold every game twice: the first half of the rows from
    one team's point of view and the second half, in the same game order,
    from the other team's point of view.  The two "win" probabilities of a
    game are rescaled to sum to one and the larger one decides the game; on
    a tie the second team is picked.

    Args:
        classifier: Fitted estimator whose ``predict_proba`` returns two
            columns, read here as (loss, win).
        matchups: Feature frame with ``TeamID`` and ``OppTeamID`` columns.
            Assumes a default 0..n-1 row index, since the probabilities are
            aligned to it by index.

    Returns:
        A ``(pred_losers, pred_winners)`` tuple of arrays with one entry
        per game.
    """
    # Floor division keeps the split an int on Python 3 as well; with ``/``
    # it becomes a float there and ``iloc`` rejects it.
    split = len(matchups) // 2

    # get win probabilities
    teams = matchups[['TeamID', 'OppTeamID']]
    win_probs = pd.DataFrame(data=classifier.predict_proba(matchups), columns=['Loss', 'Win'])
    results = pd.concat([teams, win_probs], axis=1)

    # Line the two points of view up by game number.  Joining on the index
    # avoids writing a helper column into a slice of ``results``.
    first_pov = results.iloc[:split]
    second_pov = results.iloc[split:].reset_index(drop=True)
    paired = first_pov.join(second_pov, lsuffix='1', rsuffix='2')

    team1 = paired['TeamID1']
    team2 = paired['OppTeamID1']

    # standardize probabilities
    total = paired['Win1'] + paired['Win2']
    win1_adj = paired['Win1'] / total
    win2_adj = paired['Win2'] / total

    # make predictions
    team1_wins = win1_adj > win2_adj
    pred_winners = np.where(team1_wins, team1, team2)
    pred_losers = np.where(team1_wins, team2, team1)
    return pred_losers, pred_winners

In [51]:
def predict_bracket(classifier):
    """Play out the whole bracket with ``predict_with_prob``.

    Starts from the hard-coded first-round pairings, iterates over the
    module-level ``rounds``, builds the feature rows for the current
    pairings from the module-level ``team_data`` and, for every round
    numbered below 6, pairs the predicted winners up for the next round.

    Args:
        classifier: Fitted estimator passed through to ``predict_with_prob``.

    Returns:
        A ``(losers, winners)`` tuple; each is a list with one entry per
        round holding that round's predicted team ids.
    """
    # First-round pairings as [team1, team2] id pairs.
    matchups = [
        [1438, 1420], [1166, 1243], [1246, 1172], [1112, 1138],
        [1274, 1260], [1397, 1460], [1305, 1400], [1153, 1209],
        [1462, 1411], [1281, 1199], [1326, 1355], [1211, 1422],
        [1222, 1361], [1276, 1285], [1401, 1344], [1314, 1252],
        [1437, 1347], [1439, 1104], [1452, 1293], [1455, 1267],
        [1196, 1382], [1403, 1372], [1116, 1139], [1345, 1168],
        [1242, 1335], [1371, 1301], [1155, 1308], [1120, 1158],
        [1395, 1393], [1277, 1137], [1348, 1328], [1181, 1233],
    ]
    losers = []
    winners = []

    for r in rounds:
        round_features = create_matchups(team_data, matchups, r)
        loser_ids, winner_ids = predict_with_prob(classifier, round_features)
        winners.append(winner_ids)
        losers.append(loser_ids)

        # Names are resolved (for ad-hoc printing while debugging) but not
        # used any further.
        advancing_names = [team_dict[team_id] for team_id in winner_ids]

        if r >= 6:
            # Nothing follows the last round.
            continue
        matchups = winners_to_matchups(winner_ids)

    return losers, winners

# prediction with upset bonus

In [52]:
def predict_with_upset_bonus(classifier, matchups, round_num, upset_bonus=2):
    """Pick each game's winner by expected bracket points, rewarding upsets.

    ``matchups`` must hold every game twice: the first half of the rows from
    one team's point of view and the second half, in the same game order,
    from the other team's point of view.  The two "win" probabilities of a
    game are rescaled to sum to one, multiplied by the points a correct pick
    would earn (round points, plus ``upset_bonus`` for the team with the
    higher seed number) and the larger expected value decides the game.  On
    a tie the second team is picked.

    Args:
        classifier: Fitted estimator whose ``predict_proba`` returns two
            columns, read here as (loss, win).
        matchups: Feature frame with ``TeamID`` and ``OppTeamID`` columns.
            Assumes a default 0..n-1 row index, since the probabilities are
            aligned to it by index.
        round_num: Round number, 1 through 6.
        upset_bonus: Extra points credited for picking the worse-seeded
            team.  NOTE(review): ``score_bracket_upsets`` uses a bonus of 5;
            confirm which value is intended.

    Returns:
        A ``(pred_losers, pred_winners)`` tuple of arrays with one entry per
        game -- the same order ``predict_with_prob`` returns and the order
        ``predict_bracket_upsets`` unpacks.

    Raises:
        KeyError: If ``round_num`` is not in 1..6.
    """
    points = {1: 1, 2: 2, 3: 4, 4: 8, 5: 16, 6: 32}
    round_points = points[round_num]

    # Floor division keeps the split an int on Python 3 as well.
    split = len(matchups) // 2

    # get win probabilities
    teams = matchups[['TeamID', 'OppTeamID']]
    win_probs = pd.DataFrame(data=classifier.predict_proba(matchups), columns=['Loss', 'Win'])
    results = pd.concat([teams, win_probs], axis=1)

    # Line the two points of view up by game number.  Joining on the index
    # avoids writing a helper column into a slice of ``results``.
    first_pov = results.iloc[:split]
    second_pov = results.iloc[split:].reset_index(drop=True)
    paired = first_pov.join(second_pov, lsuffix='1', rsuffix='2')

    team1 = paired['TeamID1']
    team2 = paired['OppTeamID1']

    # standardize probabilities
    total = paired['Win1'] + paired['Win2']
    win1_adj = paired['Win1'] / total
    win2_adj = paired['Win2'] / total

    # calculate expected values
    team1_seed = team1.map(seeds_dict)
    team2_seed = team2.map(seeds_dict)

    team1_win_val = np.where(team1_seed > team2_seed, round_points + upset_bonus, round_points)
    team2_win_val = np.where(team1_seed < team2_seed, round_points + upset_bonus, round_points)

    team1_exp_val = win1_adj * team1_win_val
    team2_exp_val = win2_adj * team2_win_val

    # A single mask drives both outputs so that a tie cannot report the same
    # team as winner and loser.
    team1_wins = team1_exp_val > team2_exp_val
    pred_winners = np.where(team1_wins, team1, team2)
    pred_losers = np.where(team1_wins, team2, team1)
    return pred_losers, pred_winners

In [53]:
def predict_bracket_upsets(classifier):
    """Play out the whole bracket with ``predict_with_upset_bonus``.

    Starts from the hard-coded first-round pairings, iterates over the
    module-level ``rounds``, builds the feature rows for the current
    pairings from the module-level ``team_data`` and, for every round
    numbered below 6, pairs the advancing teams up for the next round.  The
    first value returned by ``predict_with_upset_bonus`` is treated as the
    losers and the second as the winners.

    Args:
        classifier: Fitted estimator passed through to
            ``predict_with_upset_bonus``.

    Returns:
        A ``(losers, winners)`` tuple; each is a list with one entry per
        round holding that round's team ids.
    """
    # First-round pairings as [team1, team2] id pairs.
    matchups = [
        [1438, 1420], [1166, 1243], [1246, 1172], [1112, 1138],
        [1274, 1260], [1397, 1460], [1305, 1400], [1153, 1209],
        [1462, 1411], [1281, 1199], [1326, 1355], [1211, 1422],
        [1222, 1361], [1276, 1285], [1401, 1344], [1314, 1252],
        [1437, 1347], [1439, 1104], [1452, 1293], [1455, 1267],
        [1196, 1382], [1403, 1372], [1116, 1139], [1345, 1168],
        [1242, 1335], [1371, 1301], [1155, 1308], [1120, 1158],
        [1395, 1393], [1277, 1137], [1348, 1328], [1181, 1233],
    ]
    losers = []
    winners = []

    for r in rounds:
        round_features = create_matchups(team_data, matchups, r)
        loser_ids, winner_ids = predict_with_upset_bonus(classifier, round_features, r)
        winners.append(winner_ids)
        losers.append(loser_ids)

        # Names are resolved (for ad-hoc printing while debugging) but not
        # used any further.
        advancing_names = [team_dict[team_id] for team_id in winner_ids]

        if r >= 6:
            # Nothing follows the last round.
            continue
        matchups = winners_to_matchups(winner_ids)

    return losers, winners

# scoring metrics

In [54]:
def score_bracket_upsets(results, pred_winners, pred_losers):
    """Score a bracket with doubling round points plus a flat upset bonus.

    See https://www.nytimes.com/2015/03/16/upshot/heres-how-our-ncaa-bracket-works.html

    A pick is correct when it equals the actual winner at the same position
    of the same round.  Correct picks earn 1, 2, 4, 8, 16 or 32 points
    depending on the round; every correct pick whose winner has a higher
    seed number (per the module-level ``seeds_dict``) than the predicted
    loser earns 5 more.  Only as many rounds as the module-level ``rounds``
    lists are scored.

    Args:
        results: Actual winners, one sequence per round.
        pred_winners: Predicted winners, one sequence per round.
        pred_losers: Predicted losers, aligned with ``pred_winners``.

    Returns:
        The total score as an int.
    """
    points = [1, 2, 4, 8, 16, 32]
    upset_bonus = 5
    total_pts = 0
    num_upsets = 0

    for _, pts, round_winners, round_losers, round_actual in zip(
            rounds, points, pred_winners, pred_losers, results):
        # (winner, loser) pairs of the picks that came true this round.
        hits = [
            (won, lost)
            for won, lost, actual in zip(round_winners, round_losers, round_actual)
            if won == actual
        ]
        total_pts += pts * len(hits)
        num_upsets += sum(1 for won, lost in hits if seeds_dict[won] > seeds_dict[lost])

    return total_pts + num_upsets * upset_bonus

In [55]:
def score_bracket_espn(results, prediction):
    """Score a bracket with 10/20/40/80/160/320 points per correct pick.

    See http://games.espn.com/tournament-challenge-bracket/2018/en/story?pageName=tcmen%5Chowtoplay

    A pick is correct when it equals the actual winner at the same position
    of the same round.  Only as many rounds as the module-level ``rounds``
    lists are scored.

    Args:
        results: Actual winners, one sequence per round.
        prediction: Predicted winners, one sequence per round.

    Returns:
        The total score as an int.
    """
    points = [10, 20, 40, 80, 160, 320]
    total_pts = 0

    for _, pts, round_picks, round_actual in zip(rounds, points, prediction, results):
        hits = sum(1 for picked, actual in zip(round_picks, round_actual) if picked == actual)
        total_pts += pts * hits

    return total_pts

# main

In [56]:
# Training features/labels and the per-team table used to build matchups.
train_X, train_Y, team_data = load_data()

# TeamID -> TeamName lookup.
teams = pd.read_csv('DataFiles/Teams.csv')
team_dict = pd.Series(teams.TeamName.values, index=teams.TeamID).to_dict()

# Round numbers iterated by the bracket and scoring functions.
rounds = list(range(1, 7))

# Winners per round; the scoring functions compare these position by
# position with the predicted winners.
tournament_results = [
    [1420, 1243, 1246, 1138, 1260, 1397, 1305, 1153,
     1462, 1199, 1326, 1211, 1222, 1276, 1401, 1314,
     1437, 1104, 1452, 1267, 1196, 1403, 1139, 1345,
     1242, 1371, 1155, 1120, 1393, 1277, 1348, 1181],
    [1243, 1246, 1260, 1305, 1199, 1211, 1276, 1401,
     1437, 1452, 1403, 1345, 1242, 1155, 1393, 1181],
    [1243, 1260, 1199, 1276, 1437, 1403, 1242, 1181],
    [1260, 1276, 1437, 1242],
    [1276, 1437],
    [1437],
]

In [57]:
# Seeds for the 2018 season, with the seed labels reduced to integers.
seeds = pd.read_csv('Stage2UpdatedDataFiles/NCAATourneySeeds.csv')
# .copy() detaches the filtered rows so the column assignment below writes to
# an independent frame instead of a slice (avoids SettingWithCopyWarning).
seeds = seeds[seeds['Season'] == 2018].copy()
seeds['Seed'] = seeds['Seed'].apply(fix_seed)

# TeamID -> integer seed lookup used by the predictors and the upset scoring.
seeds_dict = pd.Series(seeds.Seed.values, index=seeds.TeamID).to_dict()

In [58]:
# Short labels for the results table; position i labels classifiers[i].
# Labels not in use at the moment: "SVM_G", "P2", "ADA_R".
models = [
    "BNB", "GNB", "LDA", "SVM_L", "5NN", "LR2", "SGD",
    "ADA", "DT", "RF", "DPGMM", "ET", "GMM", "MLP",
]

# NOTE(review): the two mixture models are fitted with labels like the rest
# but their predict_proba columns are still read as (loss, win) -- confirm
# that this is intended.
classifiers = [
    BernoulliNB(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    svm.SVC(kernel = 'linear', probability=True),
    neighbors.KNeighborsClassifier(n_neighbors=5),
    LogisticRegression(),
    SGDClassifier(loss='log', tol=0.0001, power_t=0.4, average=True),
    AdaBoostClassifier(base_estimator=None, n_estimators=100),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    BayesianGaussianMixture(n_components=2,max_iter=1000, weight_concentration_prior_type='dirichlet_process', tol=0.0001),
    ExtraTreesClassifier(bootstrap=True, n_estimators=4),
    GaussianMixture(n_components=2, tol=0.0001, max_iter=1000, n_init=2),
    MLPClassifier(activation='relu', alpha=0.00001, max_iter=1000),
]

In [59]:
# Rows of [model label, predicted champion, ESPN score, upset score].
results = []

# Reference row: the seed-only baseline bracket.
baseline_losers, baseline_winners = predict_bracket_baseline()
baseline_winner = team_dict[baseline_winners[5][0]]
baseline_espn_score = score_bracket_espn(tournament_results, baseline_winners)
baseline_upset_score = score_bracket_upsets(tournament_results, baseline_winners, baseline_losers)
results += [["Baseline", baseline_winner, baseline_espn_score, baseline_upset_score]]

# Fit every classifier on the training data, predict the bracket and score it.
for m, c in zip(models, classifiers):
    c.fit(train_X, train_Y)
    losers, winners = predict_bracket(c)

    # The only entry of the sixth round is the predicted champion.
    champion = team_dict[winners[5][0]]
    espn_score = score_bracket_espn(tournament_results, winners)
    upset_score = score_bracket_upsets(tournament_results, winners, losers)

    results += [[m, champion, espn_score, upset_score]]

In [60]:
# Render the comparison table.  The single-argument call form of print works
# on both Python 2 and Python 3, unlike the Python 2-only print statement.
print(tabulate.tabulate(results, headers=['Model', 'Champion', 'ESPN Score', 'Upset Score']))

Model     Champion       ESPN Score    Upset Score
--------  -----------  ------------  -------------
Baseline  Kansas                650             65
BNB       Iona                  130             58
GNB       Michigan St           630             98
LDA       Villanova            1200            165
SVM_L     Villanova            1130            143
5NN       Wichita St            430             63
LR2       Villanova            1200            165
SGD       Auburn                320             67
ADA       Villanova            1590            234
DT        Villanova            1920            292
RF        Villanova            1920            292
DPGMM     Syracuse              200             35
ET        Villanova            1760            271
GMM       Alabama               520            107
MLP       Virginia              550             80


In [61]:
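# Optional experiment (disabled): score the upset-aware bracket produced by a
# logistic regression. Scores are computed against tournament_results (the
# actual winners), not against the `results` table built above.
#lr = LogisticRegression()
#lr.fit(train_X, train_Y)
#(losers, winners) = predict_bracket_upsets(lr)
#upset_score = score_bracket_upsets(tournament_results, winners, losers)
#print(upset_score)
#espn_score = score_bracket_espn(tournament_results, winners)
#print(espn_score)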