In [1]:
import pandas as pd
import numpy as np
import math
import csv
import random
from sklearn import cross_validation, linear_model, model_selection
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle



In [2]:
folder = 'input'
season_data = pd.read_csv(folder + '/RegularSeasonDetailedResults.csv')
tourney_data = pd.read_csv(folder + '/NCAATourneyDetailedResults.csv')
ratings = pd.read_csv(folder + '/MasseyOrdinals_thruSeason2018_Day128.csv')
seeds = pd.read_csv(folder + '/NCAATourneySeeds.csv')
frames = [season_data, tourney_data]
all_data = pd.concat(frames)
stat_fields = ['score', 'fga', 'fgp', 'fga3', '3pp', 'ftp', 'or', 'dr',
                   'ast', 'to', 'stl', 'blk', 'pf']
stat_ratings = ['POM']
ratings = ratings[ratings['SystemName'].isin(stat_ratings)]
prediction_year = 2018
base_elo = 1600
BASE_ELO = 1500
K = 20.
HOME_ADVANTAGE = 100.
team_elos = {}
team_stats = {}
team_ratings = {}
X = []
y = []
submission_data = []
def initialize_data():
    for i in range(2003, prediction_year+1):
        team_elos[i] = {}
        team_stats[i] = {}
        team_ratings[i] = {}
initialize_data()

In [4]:
# Define Helper Functions
def get_elo(season, team):
    try:
        return team_elos[season][team]
    except:
        try:
            # Get the previous season's ending value.
            team_elos[season][team] = team_elos[season-1][team]
            return team_elos[season][team]
        except:
            # Get the starter elo.
            team_elos[season][team] = base_elo
            return team_elos[season][team]
        
def calc_elo(win_team, lose_team, season):
    winner_rank = get_elo(season, win_team)
    loser_rank = get_elo(season, lose_team)
    rank_diff = winner_rank - loser_rank
    exp = (rank_diff * -1) / 400
    odds = 1 / (1 + math.pow(10, exp))
    k = 20
    new_winner_rank = round(winner_rank + (k * (1 - odds)))
    new_rank_diff = new_winner_rank - winner_rank
    new_loser_rank = loser_rank - new_rank_diff
    return new_winner_rank, new_loser_rank

def get_stat(season, team, field):
    try:
        stat_results = team_stats[season][team][field]
        return sum(stat_results) / float(len(stat_results))
    except:
        return 0
    
def get_rating(season, team, day, rating):
    try:
        stat_results = team_ratings[season][team][rating]
        stat_day, stat_ranking = tuple([int(x) for x in stat_result.split('-')])
        if ((day > 0) and ((day - stat_day) >= 7)):
            # Need to update
            new_stat_result = ratings[(ratings['Season'] == season) & (ratings['TeamID'] == team) & (ratings['RankingDayNum'] <= day)].tail(1)
            update_rating(season, team, day, rating, new_stat_result['OrdinalRank'].values[0])
            return ranking
        return stat_ranking
    except:
        stat_result = ratings[(ratings['Season'] == season) & (ratings['TeamID'] == team) & (ratings['RankingDayNum'] <= day)].tail(1)
        if stat_result.shape[0] > 0:
            ranking = stat_result['OrdinalRank'].values[0]
            update_rating(season, team, day, rating, ranking)
            return ranking
        return 0
    
def update_rating(season, team, day, rating, ranking):
    if team not in team_ratings[season]:
        team_ratings[season][team] = {}
    team_ratings[season][team][rating] = str(day) + '-' + str(rating)
    
def update_stats(season, team, fields):
    if team not in team_stats[season]:
        team_stats[season][team] = {}
    for key, value in fields.items():
        # Make sure we have the field.
        if key not in team_stats[season][team]:
            team_stats[season][team][key] = []
        if len(team_stats[season][team][key]) >= 9:
            team_stats[season][team][key].pop()
        team_stats[season][team][key].append(value)
        
def build_test_feature(team_1, team_2, model, season, stat_fields, stat_ratings):
    features = []
    # Team 1
    features.append(get_elo(season, team_1))
    for stat in stat_fields:
        features.append(get_stat(season, team_1, stat))
    for rating in stat_ratings:
        features.append(get_rating(season, team_1, 0, rating))
    # Team 2
    features.append(get_elo(season, team_2))
    for stat in stat_fields:
        features.append(get_stat(season, team_2, stat))
    for rating in stat_ratings:
        features.append(get_rating(season, team_2, 0, rating))
    return features
#     model_prediction = model.predict_proba([features])
#     return np.clip(model_prediction[0][0], 0.05, 0.95)

wfgm :  field goals made
wfga :  field goals attempted
wfgm3 :  three pointers made
wfga3 :  three pointers attempted
wftm :  free throws made
wfta :  free throws attempted
wor :  offensive rebounds
wdr :  defensive rebounds
wast :  assists
wto :  turnovers
wstl :  steals
wblk :  blocks
wpf :  personal fouls

In [5]:
def build_season_data(all_data):
    # Calculate the elo for every game for every team, each season.
    # Store the elo per season so we can retrieve their end elo
    # later in order to predict the tournaments without having to
    # inject the prediction into this loop.
    for index, row in all_data.iterrows():
        # Used to skip matchups where we don't have usable stats yet.
        skip = 0
        # Get starter or previous elos.
        team_1_elo = get_elo(row['Season'], row['WTeamID'])
        team_2_elo = get_elo(row['Season'], row['LTeamID'])
        # Add 100 to the home team (# taken from Nate Silver analysis.)
        if row['WLoc'] == 'H':
            team_1_elo += 100
        elif row['WLoc'] == 'A':
            team_2_elo += 100         
        # We'll create some arrays to use later.
        team_1_features = [team_1_elo]
        team_2_features = [team_2_elo]
        # Build arrays out of the stats we're tracking..
        for field in stat_fields:
            team_1_stat = get_stat(row['Season'], row['WTeamID'], field)
            team_2_stat = get_stat(row['Season'], row['LTeamID'], field)
            if team_1_stat is not 0 and team_2_stat is not 0:
                team_1_features.append(team_1_stat)
                team_2_features.append(team_2_stat)
            else:
                skip = 1
        for rating in stat_ratings:
            team_1_rating = get_rating(row['Season'], row['WTeamID'], row['DayNum'], rating)
            team_2_rating = get_rating(row['Season'], row['LTeamID'], row['DayNum'], rating)
            if team_1_rating is not 0 and team_2_rating is not 0:
                team_1_features.append(team_1_rating)
                team_2_features.append(team_2_rating)
            else:
                skip = 1
        if skip == 0:  # Make sure we have stats.
            # Randomly select left and right and 0 or 1 so we can train
            # for multiple classes.
            if random.random() > 0.5:
                X.append(team_1_features + team_2_features)
                y.append(1)
            else:
                X.append(team_2_features + team_1_features)
                y.append(0)
        # AFTER we add the current stuff to the prediction, update for
        # next time. Order here is key so we don't fit on data from the
        # same game we're trying to predict.
        if row['WFTA'] != 0 and row['LFTA'] != 0:
            stat_1_fields = {
                'score': row['WScore'],
                'fgp': row['WFGM'] / row['WFGA'] * 100,
                'fga': row['WFGA'],
                'fga3': row['WFGA3'],
                '3pp': row['WFGM3'] / row['WFGA3'] * 100,
                'ftp': row['WFTM'] / row['WFTA'] * 100,
                'or': row['WOR'],
                'dr': row['WDR'],
                'ast': row['WAst'],
                'to': row['WTO'],
                'stl': row['WStl'],
                'blk': row['WBlk'],
                'pf': row['WPF']
            }            
            stat_2_fields = {
                'score': row['LScore'],
                'fgp': row['LFGM'] / row['LFGA'] * 100,
                'fga': row['LFGA'],
                'fga3': row['LFGA3'],
                '3pp': row['LFGM3'] / row['LFGA3'] * 100,
                'ftp': row['LFTM'] / row['LFTA'] * 100,
                'or': row['LOR'],
                'dr': row['LDR'],
                'ast': row['LAst'],
                'to': row['LTO'],
                'stl': row['LStl'],
                'blk': row['LBlk'],
                'pf': row['LPF']
            }
            update_stats(row['Season'], row['WTeamID'], stat_1_fields)
            update_stats(row['Season'], row['LTeamID'], stat_2_fields)
        # Now that we've added them, calc the new elo.
        new_winner_rank, new_loser_rank = calc_elo(
            row['WTeamID'], row['LTeamID'], row['Season'])
        team_elos[row['Season']][row['WTeamID']] = new_winner_rank
        team_elos[row['Season']][row['LTeamID']] = new_loser_rank
    return X, y
X, y = build_season_data(all_data)

In [6]:
X_train, y_train = shuffle(X, y)

In [7]:
logreg = LogisticRegression()
params = {'C': np.logspace(start=-5, stop=3, num=9)}
clf = GridSearchCV(logreg, params, scoring='neg_log_loss', refit=True)
clf.fit(X_train, y_train)
print('Best log_loss: {:.4}, with best C: {}'.format(clf.best_score_, clf.best_params_['C']))

Best log_loss: -0.5284, with best C: 0.001


In [8]:
df_sample_sub = pd.read_csv('./SampleSubmissionStage1.csv')
n_test_games = len(df_sample_sub)

def get_year_t1_t2(ID):
    """Return a tuple with ints `year`, `team1` and `team2`."""
    return (int(x) for x in ID.split('_'))

In [9]:
X_test = []
for ii, row in df_sample_sub.iterrows():
    year, team_1, team_2 = get_year_t1_t2(row.ID)
    feature = build_test_feature(team_1, team_2, clf, year, stat_fields, stat_ratings)
    X_test.append(feature)

In [10]:
predictions = clf.predict_proba(X_test)[:,1]
clipped_preds = np.clip(predictions, 0.05, 0.95)

In [11]:
df_sample_sub.Pred = clipped_preds
df_sample_sub.head()

Unnamed: 0,ID,Pred
0,2014_1107_1110,0.489155
1,2014_1107_1112,0.110207
2,2014_1107_1113,0.229124
3,2014_1107_1124,0.155952
4,2014_1107_1140,0.243963


In [12]:
df_sample_sub.to_csv('SubmissionStage1.csv', index=False)

In [13]:
df_sample_sub = pd.read_csv('./SampleSubmissionStage2.csv')
n_test_games = len(df_sample_sub)

In [14]:
X_test = []
for ii, row in df_sample_sub.iterrows():
    year, team_1, team_2 = get_year_t1_t2(row.ID)
    feature = build_test_feature(team_1, team_2, clf, year, stat_fields, stat_ratings)
    X_test.append(feature)
predictions = clf.predict_proba(X_test)[:,1]
clipped_preds = np.clip(predictions, 0.05, 0.95)
df_sample_sub.Pred = clipped_preds
df_sample_sub.head()
df_sample_sub.to_csv('SubmissionStage2.csv', index=False)