In [1]:
# Import all the libraries needed
import os
import autotime
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
%load_ext autotime

In [2]:
# Initial setup and set random seed
np.random.seed(42)
%cd '/home/jsahewal/MarchMadness/scripts/'
os.chdir('/home/jsahewal/MarchMadness/scripts/')
data_inp_dir = '../input/'
data_out_dir = '../output/'
data_out_file = data_out_dir + 'randomForest_final.csv'

/local/home/jsahewal/MarchMadness/scripts
time: 4.73 ms


In [3]:
# Load all the dataframes
seeds_df = pd.read_csv(data_inp_dir + 'TourneySeeds.csv')
tour_compact_results_df = pd.read_csv(data_inp_dir + 'TourneyCompactResults.csv')
tour_detailed_results_df = pd.read_csv(data_inp_dir + 'TourneyDetailedResults.csv')
season_compact_results_df = pd.read_csv(data_inp_dir + 'RegularSeasonCompactResults.csv')
season_detailed_results_df = pd.read_csv(data_inp_dir + 'RegularSeasonDetailedResults.csv')
teams_df = pd.read_csv(data_inp_dir + 'Teams.csv')
seasons_df = pd.read_csv(data_inp_dir + 'Seasons.csv')
submission_df = pd.read_csv(data_inp_dir + 'SampleSubmission.csv')
tour_slots_df = pd.read_csv(data_inp_dir + 'TourneySlots.csv')

time: 317 ms


In [4]:
# Transform the tournament data, get the seed number for winning and losing team and remove unnecessary columns
tour_merged_df = pd.merge(left=tour_compact_results_df, right=seeds_df, how='left', left_on=['Season', 'Wteam'], right_on=['Season', 'Team'])
tour_merged_df.rename(columns={'Seed' : 'Wseed', 'Team' : 'W_team'}, inplace=True)
tour_merged_df = pd.merge(left=tour_merged_df, right=seeds_df, how='left', left_on=['Season', 'Lteam'], right_on=['Season', 'Team'])
tour_merged_df.rename(columns={'Seed' : 'Lseed', 'Team' : 'L_team'}, inplace=True)
tour_cleaned_df = tour_merged_df[['Season', 'Wteam', 'Lteam', 'Wseed', 'Lseed']].copy()
tour_cleaned_df['Wseed'] = tour_cleaned_df['Wseed'].map(lambda x: int(x[1:3]))
tour_cleaned_df['Lseed'] = tour_cleaned_df['Lseed'].map(lambda x: int(x[1:3]))
tour_cleaned_df['Seed_diff'] = tour_cleaned_df['Wseed'] - tour_cleaned_df['Lseed']
tour_winning_df = pd.DataFrame()
tour_winning_df[['Team1', 'Team2', 'Seed_diff']] = tour_cleaned_df[['Wteam', 'Lteam', 'Seed_diff']].copy()
tour_winning_df['result'] = 1
tour_losing_df = pd.DataFrame()
tour_losing_df[['Team1', 'Team2', 'Seed_diff']] = tour_cleaned_df[['Lteam', 'Wteam', 'Seed_diff']].copy()
tour_losing_df['Seed_diff'] = -tour_losing_df['Seed_diff']
tour_losing_df['result'] = 0
tour_train_final_df = pd.concat((tour_winning_df, tour_losing_df))

time: 42.1 ms


In [5]:
# Transform the season data, get the seed number for winning and losing team and remove unnecessary columns
season_merged_df = pd.merge(left=season_compact_results_df, right=seeds_df, how='left', left_on=['Season', 'Wteam'], right_on=['Season', 'Team'])
season_merged_df.rename(columns={'Seed' : 'Wseed', 'Team' : 'W_team'}, inplace=True)
season_merged_df = pd.merge(left=season_merged_df, right=seeds_df, how='left', left_on=['Season', 'Lteam'], right_on=['Season', 'Team'])
season_merged_df.rename(columns={'Seed' : 'Lseed', 'Team' : 'L_team'}, inplace=True)
season_cleaned_df = season_merged_df[['Season', 'Wteam', 'Lteam', 'Wseed', 'Lseed']].copy()
season_cleaned_df.fillna('T17', inplace=True)
season_cleaned_df['Wseed'] = season_cleaned_df['Wseed'].map(lambda x: int(x[1:3]))
season_cleaned_df['Lseed'] = season_cleaned_df['Lseed'].map(lambda x: int(x[1:3]))
season_cleaned_df['Seed_diff'] = season_cleaned_df['Wseed'] - season_cleaned_df['Lseed']
season_winning_df = pd.DataFrame()
season_winning_df[['Team1', 'Team2', 'Seed_diff']] = season_cleaned_df[['Wteam', 'Lteam', 'Seed_diff']].copy()
season_winning_df['result'] = 1
season_losing_df = pd.DataFrame()
season_losing_df[['Team1', 'Team2', 'Seed_diff']] = season_cleaned_df[['Lteam', 'Wteam', 'Seed_diff']].copy()
season_losing_df['Seed_diff'] = -season_losing_df['Seed_diff']
season_losing_df['result'] = 0
season_train_final_df = pd.concat((season_winning_df, season_losing_df))

time: 421 ms


In [6]:
# Create final input dataframe
input_train_final_df = pd.concat((season_train_final_df, tour_train_final_df), ignore_index=True)
team_dict = {t: i for i, t in enumerate(input_train_final_df.Team1.unique())}

time: 12.9 ms


In [7]:
X_train = input_train_final_df[['Team1', 'Team2', 'Seed_diff']].copy()
X_train['Team1'] = X_train['Team1'].apply(lambda x: team_dict[x])
X_train['Team2'] = X_train['Team2'].apply(lambda x: team_dict[x])
X_train['Seed_diff'] = X_train['Seed_diff'].values.reshape(-1,1)
Y_train = input_train_final_df.result.values
X_train, Y_train = shuffle(X_train, Y_train)

time: 476 ms


In [8]:
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2)
sss.get_n_splits(X_train, Y_train)
pipe = Pipeline(steps=[('clf', RandomForestClassifier())])
search_parameters = {'clf__criterion': ['entropy'],
                         'clf__max_features': ['auto'],
                         'clf__max_depth': [128, 256],
                         'clf__min_samples_split': [64, 128],
                         'clf__n_estimators': [70, 80]}
grid_search = GridSearchCV(pipe, param_grid=search_parameters, cv=sss, scoring='neg_log_loss', n_jobs=-1)
grid_search.fit(X_train, Y_train)

# Score of best_estimator on the left out data
print("best score is {0}".format(grid_search.best_score_))

# Print the optimized parameters used in the model selected from grid search
print "Params: ", grid_search.best_params_

### Assign the best estimator to final Random Forest classifier
clf = grid_search.best_estimator_

best score is -0.587968312325
Params:  {'clf__criterion': 'entropy', 'clf__max_depth': 128, 'clf__max_features': 'auto', 'clf__min_samples_split': 128, 'clf__n_estimators': 80}
time: 3min 21s


In [9]:
test_df = pd.DataFrame()
test_df['Season'] = submission_df['Id'].apply(lambda x: int(x.split("_")[0]))
test_df['Team1'] = submission_df['Id'].apply(lambda x: int(x.split("_")[1]))
test_df['Team2'] = submission_df['Id'].apply(lambda x: int(x.split("_")[2]))

time: 19.3 ms


In [10]:
test_merged_df = pd.merge(left=test_df, right=seeds_df, how='left',
                           left_on=['Season', 'Team1'], right_on=['Season', 'Team'])
test_merged_df.rename(columns={'Seed' : 'Seed1', 'Team' : 'team_1'}, inplace=True)
test_merged_df = pd.merge(left=test_merged_df, right=seeds_df, how='left',
                           left_on=['Season', 'Team2'], right_on=['Season', 'Team'])
test_merged_df.rename(columns={'Seed' : 'Seed2', 'Team' : 'team_2'}, inplace=True)
test_cleaned_df = test_merged_df[['Season', 'Team1', 'Team2', 'Seed1', 'Seed2']].copy()
test_cleaned_df['Seed1'] = test_cleaned_df['Seed1'].map(lambda x: int(x[1:3]))
test_cleaned_df['Seed2'] = test_cleaned_df['Seed2'].map(lambda x: int(x[1:3]))
test_cleaned_df['Seed_diff'] = test_cleaned_df['Seed1'] - test_cleaned_df['Seed2']

time: 24.3 ms


In [11]:
X_test = test_cleaned_df[['Team1', 'Team2', 'Seed_diff']].copy()
X_test['Team1'] = X_test['Team1'].apply(lambda x: team_dict[x])
X_test['Team2'] = X_test['Team2'].apply(lambda x: team_dict[x])
X_test['Seed_diff'] = X_test['Seed_diff'].values.reshape(-1,1)

time: 8.75 ms


In [12]:
test_pred_prob = clf.predict_proba(X_test)[:,1]

time: 84 ms


In [13]:
final_output = pd.DataFrame()
final_output['Id'] = submission_df['Id']
final_output['Pred'] = test_pred_prob
final_output.to_csv(data_out_file, index=False)

time: 8.81 ms
