In [1]:
# Import all the libraries needed
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from keras.layers import Input, Dense, Dropout, Flatten, Embedding, merge
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.models import Model
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.constraints import maxnorm
from pandas import read_csv, DataFrame
from numpy.random import seed
from sklearn.preprocessing import scale
from keras.models import Sequential
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers import Dense, Merge
from keras.layers.normalization import BatchNormalization
import autotime
%load_ext autotime

Using TensorFlow backend.


In [2]:
# Initial setup and set random seed
np.random.seed(42)
%cd '/home/jsahewal/MarchMadness/scripts/'
os.chdir('/home/jsahewal/MarchMadness/scripts/')
data_inp_dir = '../input/'
data_out_dir = '../output/'
data_out_file = data_out_dir + 'keras_final.csv'

/local/home/jsahewal/MarchMadness/scripts
time: 5.29 ms


In [3]:
# Read every CSV used by the notebook from the input directory.
def _read_input_csv(file_name):
    """Load the CSV called `file_name` from `data_inp_dir` into a DataFrame."""
    return pd.read_csv(data_inp_dir + file_name)


seeds_df = _read_input_csv('TourneySeeds.csv')
tour_compact_results_df = _read_input_csv('TourneyCompactResults.csv')
tour_detailed_results_df = _read_input_csv('TourneyDetailedResults.csv')
season_compact_results_df = _read_input_csv('RegularSeasonCompactResults.csv')
season_detailed_results_df = _read_input_csv('RegularSeasonDetailedResults.csv')
teams_df = _read_input_csv('Teams.csv')
seasons_df = _read_input_csv('Seasons.csv')
submission_df = _read_input_csv('SampleSubmission.csv')
tour_slots_df = _read_input_csv('TourneySlots.csv')

time: 316 ms


In [4]:
# Tournament games -> training rows.
# Step 1: attach the seed of the winning team, then the seed of the losing team.
tour_merged_df = pd.merge(
    left=tour_compact_results_df, right=seeds_df, how='left',
    left_on=['Season', 'Wteam'], right_on=['Season', 'Team']
).rename(columns={'Seed': 'Wseed', 'Team': 'W_team'})
tour_merged_df = pd.merge(
    left=tour_merged_df, right=seeds_df, how='left',
    left_on=['Season', 'Lteam'], right_on=['Season', 'Team']
).rename(columns={'Seed': 'Lseed', 'Team': 'L_team'})

# Step 2: keep only the needed columns and convert each seed code to the
# integer found at character positions 1-2 of the code.
tour_cleaned_df = tour_merged_df.loc[:, ['Season', 'Wteam', 'Lteam', 'Wseed', 'Lseed']].copy()
for seed_col in ('Wseed', 'Lseed'):
    tour_cleaned_df[seed_col] = tour_cleaned_df[seed_col].map(lambda seed: int(seed[1:3]))
tour_cleaned_df['Seed_diff'] = tour_cleaned_df['Wseed'].sub(tour_cleaned_df['Lseed'])

# Step 3: emit every game twice. Winner listed first -> result 1; loser listed
# first -> result 0 with the seed difference sign flipped.
tour_winning_df = (
    tour_cleaned_df[['Wteam', 'Lteam', 'Seed_diff']]
    .rename(columns={'Wteam': 'Team1', 'Lteam': 'Team2'})
    .assign(result=1)
)
tour_losing_df = (
    tour_cleaned_df[['Lteam', 'Wteam', 'Seed_diff']]
    .rename(columns={'Lteam': 'Team1', 'Wteam': 'Team2'})
    .assign(Seed_diff=lambda games: -games['Seed_diff'], result=0)
)
tour_train_final_df = pd.concat([tour_winning_df, tour_losing_df])

time: 45.8 ms


In [5]:
# Regular-season games -> training rows.
# Step 1: attach the seed of the winning team, then the seed of the losing team.
season_merged_df = pd.merge(
    left=season_compact_results_df, right=seeds_df, how='left',
    left_on=['Season', 'Wteam'], right_on=['Season', 'Team']
).rename(columns={'Seed': 'Wseed', 'Team': 'W_team'})
season_merged_df = pd.merge(
    left=season_merged_df, right=seeds_df, how='left',
    left_on=['Season', 'Lteam'], right_on=['Season', 'Team']
).rename(columns={'Seed': 'Lseed', 'Team': 'L_team'})

# Step 2: keep only the needed columns. The left joins leave a missing seed for
# teams without an entry in seeds_df for that season; those are filled with the
# placeholder code 'T17', which parses to seed number 17 below.
season_cleaned_df = season_merged_df.loc[:, ['Season', 'Wteam', 'Lteam', 'Wseed', 'Lseed']].fillna('T17')
for seed_col in ('Wseed', 'Lseed'):
    season_cleaned_df[seed_col] = season_cleaned_df[seed_col].map(lambda seed: int(seed[1:3]))
season_cleaned_df['Seed_diff'] = season_cleaned_df['Wseed'].sub(season_cleaned_df['Lseed'])

# Step 3: emit every game twice. Winner listed first -> result 1; loser listed
# first -> result 0 with the seed difference sign flipped.
season_winning_df = (
    season_cleaned_df[['Wteam', 'Lteam', 'Seed_diff']]
    .rename(columns={'Wteam': 'Team1', 'Lteam': 'Team2'})
    .assign(result=1)
)
season_losing_df = (
    season_cleaned_df[['Lteam', 'Wteam', 'Seed_diff']]
    .rename(columns={'Lteam': 'Team1', 'Wteam': 'Team2'})
    .assign(Seed_diff=lambda games: -games['Seed_diff'], result=0)
)
season_train_final_df = pd.concat([season_winning_df, season_losing_df])

time: 419 ms


In [6]:
# Stack the season and tournament rows into one training frame, then replace
# the raw team ids with consecutive integer codes. Codes are numbered in order
# of first appearance in the Team1 column.
input_train_final_df = pd.concat([season_train_final_df, tour_train_final_df], ignore_index=True)
known_teams = input_train_final_df['Team1'].unique()
team_dict = dict(zip(known_teams, range(len(known_teams))))
for team_col in ('Team1', 'Team2'):
    # A team id missing from team_dict raises KeyError.
    input_train_final_df[team_col] = input_train_final_df[team_col].map(lambda team: team_dict[team])

time: 252 ms


In [7]:
# Standardised (zero mean, unit variance per column) input matrices, one per
# branch of the network.
team_pair_cols = ['Team1', 'Team2']
team2_seed_cols = ['Team2', 'Seed_diff']
X1 = scale(input_train_final_df[team_pair_cols])
# NOTE(review): X2 and X3 are built from exactly the same columns, so two of
# the three branches receive identical inputs - confirm whether one of them
# was meant to use 'Team1' instead of 'Team2'.
X2 = scale(input_train_final_df[team2_seed_cols])
X3 = scale(input_train_final_df[team2_seed_cols])

time: 30.3 ms


In [8]:
# Deep Learning Model
# Three small fully-connected branches (one per input matrix X1, X2, X3) are
# summed element-wise and fed to a single sigmoid unit. The target is the
# 'result' column, which is 1 when Team1 won the game and 0 otherwise.
# The branches differ only in where the BatchNormalization layers sit.

# Branch 1: Dense -> Dense -> BatchNorm -> Dense -> BatchNorm
branch1 = Sequential()
branch1.add(Dense(X1.shape[1], input_shape =  (X1.shape[1],), init = 'normal', activation = 'relu'))
branch1.add(Dense(X1.shape[1], init = 'normal', activation = 'relu', W_constraint = maxnorm(5)))
branch1.add(BatchNormalization())
branch1.add(Dense(X1.shape[1], init = 'normal', activation = 'relu', W_constraint = maxnorm(5)))
branch1.add(BatchNormalization())

# Branch 2: Dense -> BatchNorm -> Dense -> Dense -> BatchNorm
branch2 = Sequential()
branch2.add(Dense(X2.shape[1], input_shape =  (X2.shape[1],), init = 'normal', activation = 'relu'))
branch2.add(BatchNormalization())
branch2.add(Dense(X2.shape[1], init = 'normal', activation = 'relu', W_constraint = maxnorm(5)))
branch2.add(Dense(X2.shape[1], init = 'normal', activation = 'relu', W_constraint = maxnorm(5)))
branch2.add(BatchNormalization())

# Branch 3: Dense -> BatchNorm -> Dense -> BatchNorm -> Dense
branch3 = Sequential()
branch3.add(Dense(X3.shape[1], input_shape =  (X3.shape[1],), init = 'normal', activation = 'relu'))
branch3.add(BatchNormalization())
branch3.add(Dense(X3.shape[1], init = 'normal', activation = 'relu', W_constraint = maxnorm(5)))
branch3.add(BatchNormalization())
branch3.add(Dense(X3.shape[1], init = 'normal', activation = 'relu', W_constraint = maxnorm(5)))

# mode='sum' requires all branch outputs to have the same width (2 columns each).
model = Sequential()
model.add(Merge([branch1, branch2, branch3], mode = 'sum'))
model.add(Dense(1, init = 'normal', activation = 'sigmoid'))
model.summary()

# Compile exactly once: every compile() call replaces the optimizer, loss and
# metrics of the previous one, so an earlier SGD compile would have no effect
# and its accuracy metric would be lost. Adam(0.001) is the optimizer that is
# actually used for training; accuracy is reported alongside the loss.
model.compile(optimizer = Adam(0.001), loss = 'binary_crossentropy', metrics = ['accuracy'])

history = model.fit([X1, X2, X3], input_train_final_df['result'].values, batch_size = 64, nb_epoch = 10, verbose = 2)
# Predicted win probabilities on the training rows (displayed as the cell output).
model.predict([X1, X2, X3])

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_1 (Dense)                  (None, 2)             6           dense_input_1[0][0]              
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 2)             6           dense_1[0][0]                    
____________________________________________________________________________________________________
batchnormalization_1 (BatchNormal(None, 2)             4           dense_2[0][0]                    
____________________________________________________________________________________________________
dense_3 (Dense)                  (None, 2)             6           batchnormalization_1[0][0]       
___________________________________________________________________________________________

array([[ 0.34172741],
       [ 0.53873098],
       [ 0.82313931],
       ..., 
       [ 0.15761016],
       [ 0.47347084],
       [ 0.57613689]], dtype=float32)

time: 4min 46s


In [9]:
# Split each submission Id on "_" into its three integer fields, used here as
# Season, Team1 and Team2 (in that order).
def _id_field(position):
    """Return the integer found at `position` of every underscore-separated Id."""
    return submission_df['Id'].apply(lambda game_id: int(game_id.split("_")[position]))


test_df = pd.DataFrame(
    {'Season': _id_field(0), 'Team1': _id_field(1), 'Team2': _id_field(2)},
    columns=['Season', 'Team1', 'Team2']
)

time: 17.6 ms


In [10]:
# Build the test features the same way as the training rows: look up the seed
# of each team, convert it to a number, take the difference and encode the
# team ids with team_dict.
test_merged_df = pd.merge(
    left=test_df, right=seeds_df, how='left',
    left_on=['Season', 'Team1'], right_on=['Season', 'Team']
).rename(columns={'Seed': 'Seed1', 'Team': 'team_1'})
test_merged_df = pd.merge(
    left=test_merged_df, right=seeds_df, how='left',
    left_on=['Season', 'Team2'], right_on=['Season', 'Team']
).rename(columns={'Seed': 'Seed2', 'Team': 'team_2'})

test_cleaned_df = test_merged_df.loc[:, ['Season', 'Team1', 'Team2', 'Seed1', 'Seed2']].copy()
for seed_col in ('Seed1', 'Seed2'):
    # The seed number is the integer at character positions 1-2 of the code.
    test_cleaned_df[seed_col] = test_cleaned_df[seed_col].map(lambda seed: int(seed[1:3]))
test_cleaned_df['Seed_diff'] = test_cleaned_df['Seed1'].sub(test_cleaned_df['Seed2'])

for team_col in ('Team1', 'Team2'):
    # A team id missing from team_dict raises KeyError.
    test_cleaned_df[team_col] = test_cleaned_df[team_col].map(lambda team: team_dict[team])

time: 27.6 ms


In [11]:
# Standardise the test features with the TRAINING mean and standard deviation.
# The network was fitted on scale(input_train_final_df[...]); standardising the
# test rows with their own statistics would put them on a different scale than
# the one the weights were learned on.
def _scale_like_training(feature_cols):
    """Standardise `test_cleaned_df[feature_cols]` using training statistics.

    The per-column mean and population standard deviation (ddof=0, the same
    estimator sklearn's `scale` uses) are computed from
    `input_train_final_df[feature_cols]` and applied to the test rows.

    Returns a float ndarray of shape (n_test_rows, len(feature_cols)).
    """
    train_values = input_train_final_df[feature_cols].values.astype(float)
    center = train_values.mean(axis=0)
    spread = train_values.std(axis=0)
    # A constant training column has zero spread; divide by 1 instead of 0.
    spread[spread == 0.0] = 1.0
    test_values = test_cleaned_df[feature_cols].values.astype(float)
    return (test_values - center) / spread


# Column sets mirror the ones used for X1, X2 and X3.
Z1 = _scale_like_training(['Team1', 'Team2'])
Z2 = _scale_like_training(['Team2', 'Seed_diff'])
Z3 = _scale_like_training(['Team2', 'Seed_diff'])

time: 6.87 ms


In [12]:
# Score the submission match-ups and write them out as Id / Pred rows.
test_pred_prob = model.predict([Z1, Z2, Z3])
final_output = submission_df[['Id']].copy()
final_output['Pred'] = test_pred_prob
final_output.to_csv(data_out_file, index=False)

time: 91.5 ms
