In [1]:
# Import all the libraries needed
import os
import autotime
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
%load_ext autotime

In [2]:
# Initial setup: seed NumPy's global RNG and switch to the scripts directory
# so that the relative data paths below resolve.
np.random.seed(42)
# The working directory is named once and changed once; previously a `%cd`
# magic and an os.chdir call both switched to the same repeated literal.
# NOTE(review): this absolute path is machine-specific — adjust per environment.
scripts_dir = '/home/jsahewal/MarchMadness/scripts/'
os.chdir(scripts_dir)
print(os.getcwd())  # keep the working-directory echo that `%cd` used to print
data_inp_dir = '../input/'
data_out_dir = '../output/'

/local/home/jsahewal/MarchMadness/scripts
time: 3.99 ms


In [3]:
# Load all the dataframes: one CSV from the input directory per dataframe.
def _read_input(csv_name):
    """Read the CSV file called ``csv_name`` from ``data_inp_dir``."""
    return pd.read_csv(data_inp_dir + csv_name)

# Tournament seeds and results
seeds_df = _read_input('TourneySeeds.csv')
tour_compact_results_df = _read_input('TourneyCompactResults.csv')
tour_detailed_results_df = _read_input('TourneyDetailedResults.csv')
# Regular-season results
season_compact_results_df = _read_input('RegularSeasonCompactResults.csv')
season_detailed_results_df = _read_input('RegularSeasonDetailedResults.csv')
# Reference tables, the sample submission and the tournament slots
teams_df = _read_input('Teams.csv')
seasons_df = _read_input('Seasons.csv')
submission_df = _read_input('sample_submission.csv')
tour_slots_df = _read_input('TourneySlots.csv')

time: 298 ms


In [4]:
# Transform the data: attach the winning and losing teams' seed numbers to each
# tournament game, then build a mirrored training set with every game listed
# once per team order.
def _seed_number(seed):
    """Return characters 1-2 of a seed string as an integer."""
    return int(seed[1:3])

# Two left joins against the seeds table: first for the winner, then the loser.
# The joined 'Team' column is renamed after each merge so the second merge
# does not collide with it.
tour_merged_df = (
    tour_compact_results_df
    .merge(seeds_df, how='left',
           left_on=['Season', 'Wteam'], right_on=['Season', 'Team'])
    .rename(columns={'Seed': 'Wseed', 'Team': 'W_team'})
    .merge(seeds_df, how='left',
           left_on=['Season', 'Lteam'], right_on=['Season', 'Team'])
    .rename(columns={'Seed': 'Lseed', 'Team': 'L_team'})
)

# Keep only the columns needed downstream and turn the seed strings into numbers.
tour_cleaned_df = tour_merged_df[['Season', 'Wteam', 'Lteam', 'Wseed', 'Lseed']].copy()
tour_cleaned_df['Wseed'] = tour_cleaned_df['Wseed'].map(_seed_number)
tour_cleaned_df['Lseed'] = tour_cleaned_df['Lseed'].map(_seed_number)
tour_cleaned_df['Seed_diff'] = tour_cleaned_df['Wseed'] - tour_cleaned_df['Lseed']

# Winner listed first -> result 1.
tour_winning_df = (
    tour_cleaned_df[['Wteam', 'Lteam', 'Seed_diff']]
    .rename(columns={'Wteam': 'Team1', 'Lteam': 'Team2'})
    .assign(result=1)
)

# Loser listed first -> result 0, with the seed difference sign flipped to match.
tour_losing_df = (
    tour_cleaned_df[['Lteam', 'Wteam', 'Seed_diff']]
    .rename(columns={'Lteam': 'Team1', 'Wteam': 'Team2'})
)
tour_losing_df['Seed_diff'] = -tour_losing_df['Seed_diff']
tour_losing_df['result'] = 0

tour_train_final_df = pd.concat([tour_winning_df, tour_losing_df])

# Integer code per team. Every team in the results occurs as Team1 at least
# once because each game appears in both orders.
unique_teams = tour_train_final_df['Team1'].unique()
team_dict = dict(zip(unique_teams, range(len(unique_teams))))

time: 43.6 ms


In [5]:
# Build the training features (integer-coded teams plus seed difference) and
# the matching label vector, then shuffle both together.
X_train = tour_train_final_df[['Team1', 'Team2', 'Seed_diff']].copy()
X_train['Team1'] = X_train['Team1'].apply(lambda x: team_dict[x])
X_train['Team2'] = X_train['Team2'].apply(lambda x: team_dict[x])
# 'Seed_diff' is used as-is; the earlier reshape(-1, 1) round-trip wrote the
# same values back through a 2-D array and has been dropped.
Y_train = tour_train_final_df.result.values
# No random_state is passed, so the permutation comes from NumPy's global RNG.
X_train, Y_train = shuffle(X_train, Y_train)

time: 13.2 ms


In [6]:
# Tune a random forest with an exhaustive grid search, scored by negative
# log loss over 1000 stratified shuffle splits that each hold out 20%.
# NOTE(review): neither the splitter nor the forest is given a random_state,
# so results depend on the global NumPy RNG state — consider passing one.
sss_rf = StratifiedShuffleSplit(n_splits=1000, test_size=0.2)
pipe_rf = Pipeline(steps=[('clf', RandomForestClassifier())])
# NOTE(review): max_features='auto' is deprecated/removed in recent
# scikit-learn releases (it means 'sqrt' for classifiers) — confirm against
# the installed version before upgrading.
search_parameters_rf = {'clf__criterion': ['gini', 'entropy'],
                         'clf__max_features': [3, 'auto'],
                         'clf__max_depth': [5, 10, 15, 20, 25],
                         'clf__min_samples_split': [2, 4, 8, 16, 32]}
grid_search_rf = GridSearchCV(pipe_rf, param_grid=search_parameters_rf, cv=sss_rf, scoring='neg_log_loss', n_jobs=-1)
grid_search_rf.fit(X_train, Y_train)

# Mean cross-validated score of the best parameter combination
print("best score is {0}".format(grid_search_rf.best_score_))

# Print the optimized parameters used in the model selected from grid search.
# Uses the print() function so the cell runs on both Python 2 and Python 3;
# the two spaces keep the output identical to the old print statement.
print("Params:  {0}".format(grid_search_rf.best_params_))

### Assign the best estimator to the final random forest classifier
clf_rf = grid_search_rf.best_estimator_

best score is -0.5379848968
Params:  {'clf__criterion': 'entropy', 'clf__max_depth': 5, 'clf__max_features': 'auto', 'clf__min_samples_split': 32}
time: 10min 7s


In [7]:
# Split each submission id on "_" once; the first three pieces become the
# integer Season, Team1 and Team2 columns.
id_parts = submission_df['id'].str.split("_", expand=True)
test_rf_df = pd.DataFrame()
test_rf_df['Season'] = id_parts[0].astype('int64')
test_rf_df['Team1'] = id_parts[1].astype('int64')
test_rf_df['Team2'] = id_parts[2].astype('int64')

time: 52.3 ms


In [8]:
# Attach both teams' seeds to every test matchup (two left joins against the
# seeds table), then derive the numeric seed difference.
test_rf_merged_df = (
    test_rf_df
    .merge(seeds_df, how='left',
           left_on=['Season', 'Team1'], right_on=['Season', 'Team'])
    .rename(columns={'Seed': 'Seed1', 'Team': 'team_1'})
    .merge(seeds_df, how='left',
           left_on=['Season', 'Team2'], right_on=['Season', 'Team'])
    .rename(columns={'Seed': 'Seed2', 'Team': 'team_2'})
)

test_rf_cleaned_df = test_rf_merged_df[['Season', 'Team1', 'Team2', 'Seed1', 'Seed2']].copy()
# Characters 1-2 of each seed string hold the seed number.
for seed_col in ('Seed1', 'Seed2'):
    test_rf_cleaned_df[seed_col] = test_rf_cleaned_df[seed_col].map(lambda seed: int(seed[1:3]))
test_rf_cleaned_df['Seed_diff'] = test_rf_cleaned_df['Seed1'] - test_rf_cleaned_df['Seed2']

time: 36.8 ms


In [9]:
# Build the test feature matrix using the integer team codes in team_dict.
X_rf_test = test_rf_cleaned_df[['Team1', 'Team2', 'Seed_diff']].copy()
# Fail up front with the complete list of teams that have no code, instead of
# a bare KeyError on the first one from inside the lambda below.
unknown_teams = sorted((set(X_rf_test['Team1']) | set(X_rf_test['Team2'])) - set(team_dict))
if unknown_teams:
    raise KeyError("Teams missing from team_dict: {0}".format(unknown_teams))
X_rf_test['Team1'] = X_rf_test['Team1'].apply(lambda x: team_dict[x])
X_rf_test['Team2'] = X_rf_test['Team2'].apply(lambda x: team_dict[x])
# 'Seed_diff' is used as-is; the earlier reshape(-1, 1) round-trip wrote the
# same values back through a 2-D array and has been dropped.

time: 14.4 ms


In [10]:
# Per-class probabilities for every test matchup, one column per class.
class_probabilities = clf_rf.predict_proba(X_rf_test)
# Keep column 1 only.
# assumes column 1 corresponds to result == 1 (Team1 wins) — TODO confirm via clf_rf.classes_
test_rf_pred_prob = class_probabilities[:, 1]

time: 14.1 ms


In [11]:
# Assemble the submission: the original ids paired with the predicted
# probabilities, written without the index column.
final_rf_output = pd.DataFrame(
    {'id': submission_df['id'], 'pred': test_rf_pred_prob},
    columns=['id', 'pred'],
)
# Use the output directory configured in the setup cell rather than a second
# hard-coded copy of the same path.
final_rf_output.to_csv(data_out_dir + 'rf.csv', index=False)

time: 16.7 ms
