In [11]:
%matplotlib inline
import os
from subprocess import check_output
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from sklearn.linear_model import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Notebook-wide configuration: data locations first, then the random seed
data_inp_dir = '../input/'
data_out_dir = '../output/'
# The submission file is written inside the output directory
data_out_file = ''.join([data_out_dir, 'ETC_final.csv'])
# Seed NumPy's global random state so later draws from it are repeatable
np.random.seed(15)

In [13]:
# Load every input CSV into its own DataFrame
def _read_input_csv(file_name):
    """Read `file_name` from `data_inp_dir` with pandas.read_csv and return the DataFrame."""
    return pd.read_csv(data_inp_dir + file_name)

# Tournament seeds and results
seeds_df = _read_input_csv('TourneySeeds.csv')
tour_compact_results_df = _read_input_csv('TourneyCompactResults.csv')
tour_detailed_results_df = _read_input_csv('TourneyDetailedResults.csv')
# Regular-season results
season_compact_results_df = _read_input_csv('RegularSeasonCompactResults.csv')
season_detailed_results_df = _read_input_csv('RegularSeasonDetailedResults.csv')
# Reference tables, the sample submission and the tournament slots
teams_df = _read_input_csv('Teams.csv')
seasons_df = _read_input_csv('Seasons.csv')
submission_df = _read_input_csv('SampleSubmission.csv')
tour_slots_df = _read_input_csv('TourneySlots.csv')

In [14]:
# Tournament games -> training rows.
# Step 1: left-join the seeds twice, once for the winner and once for the loser,
# renaming the joined columns after each join so the second join does not clash.
tour_merged_df = (
    tour_compact_results_df
    .merge(seeds_df, how='left', left_on=['Season', 'Wteam'], right_on=['Season', 'Team'])
    .rename(columns={'Seed': 'Wseed', 'Team': 'W_team'})
    .merge(seeds_df, how='left', left_on=['Season', 'Lteam'], right_on=['Season', 'Team'])
    .rename(columns={'Seed': 'Lseed', 'Team': 'L_team'})
)

# Step 2: keep only the columns used below and turn each seed string into an integer
tour_cleaned_df = tour_merged_df[['Season', 'Wteam', 'Lteam', 'Wseed', 'Lseed']].copy()
for seed_col in ('Wseed', 'Lseed'):
    # characters at positions 1 and 2 of the seed string are parsed as the seed number
    tour_cleaned_df[seed_col] = tour_cleaned_df[seed_col].map(lambda seed_code: int(seed_code[1:3]))
tour_cleaned_df['Seed_diff'] = tour_cleaned_df['Wseed'].sub(tour_cleaned_df['Lseed'])

# Step 3: one row per game from the winner's side (result = 1) ...
tour_winning_df = (
    tour_cleaned_df[['Wteam', 'Lteam', 'Seed_diff']]
    .rename(columns={'Wteam': 'Team1', 'Lteam': 'Team2'})
    .assign(result=1)
)

# ... and one mirrored row from the loser's side (teams swapped, Seed_diff negated, result = 0)
tour_losing_df = (
    tour_cleaned_df[['Lteam', 'Wteam', 'Seed_diff']]
    .rename(columns={'Lteam': 'Team1', 'Wteam': 'Team2'})
    .assign(Seed_diff=lambda games: -games['Seed_diff'])
    .assign(result=0)
)

# Step 4: stack both perspectives (the original row labels are kept)
tour_train_final_df = pd.concat([tour_winning_df, tour_losing_df])

In [15]:
# Regular-season games -> training rows.
# Step 1: left-join the seeds twice, once for the winner and once for the loser,
# renaming the joined columns after each join so the second join does not clash.
season_merged_df = (
    season_compact_results_df
    .merge(seeds_df, how='left', left_on=['Season', 'Wteam'], right_on=['Season', 'Team'])
    .rename(columns={'Seed': 'Wseed', 'Team': 'W_team'})
    .merge(seeds_df, how='left', left_on=['Season', 'Lteam'], right_on=['Season', 'Team'])
    .rename(columns={'Seed': 'Lseed', 'Team': 'L_team'})
)

# Step 2: keep only the columns used below; rows left without a seed by the left joins
# get the placeholder 'T17', which the parsing below turns into seed number 17
season_cleaned_df = season_merged_df[['Season', 'Wteam', 'Lteam', 'Wseed', 'Lseed']].fillna('T17')
for seed_col in ('Wseed', 'Lseed'):
    # characters at positions 1 and 2 of the seed string are parsed as the seed number
    season_cleaned_df[seed_col] = season_cleaned_df[seed_col].map(lambda seed_code: int(seed_code[1:3]))
season_cleaned_df['Seed_diff'] = season_cleaned_df['Wseed'].sub(season_cleaned_df['Lseed'])

# Step 3: one row per game from the winner's side (result = 1) ...
season_winning_df = (
    season_cleaned_df[['Wteam', 'Lteam', 'Seed_diff']]
    .rename(columns={'Wteam': 'Team1', 'Lteam': 'Team2'})
    .assign(result=1)
)

# ... and one mirrored row from the loser's side (teams swapped, Seed_diff negated, result = 0)
season_losing_df = (
    season_cleaned_df[['Lteam', 'Wteam', 'Seed_diff']]
    .rename(columns={'Lteam': 'Team1', 'Wteam': 'Team2'})
    .assign(Seed_diff=lambda games: -games['Seed_diff'])
    .assign(result=0)
)

# Step 4: stack both perspectives (the original row labels are kept)
season_train_final_df = pd.concat([season_winning_df, season_losing_df])

In [16]:
# Stack season and tournament rows into one training frame with a fresh 0..n-1 index
input_train_final_df = pd.concat([season_train_final_df, tour_train_final_df], ignore_index=True)
# Map every distinct Team1 id to its position in order of first appearance
unique_team_ids = input_train_final_df['Team1'].unique()
team_dict = dict(zip(unique_team_ids, range(len(unique_team_ids))))

In [17]:
# Feature matrix: the two team columns re-coded through team_dict, plus Seed_diff
X_train = input_train_final_df[['Team1', 'Team2', 'Seed_diff']].copy()
for team_col in ('Team1', 'Team2'):
    # a team id missing from team_dict raises KeyError here
    X_train[team_col] = X_train[team_col].apply(lambda team_id: team_dict[team_id])
# Re-assign Seed_diff from its own values reshaped to a single column; the values are unchanged
X_train['Seed_diff'] = X_train['Seed_diff'].values.reshape(-1, 1)
# Labels: the `result` column as a NumPy array
Y_train = input_train_final_df['result'].values
# Shuffle features and labels together so rows stay paired
X_train, Y_train = shuffle(X_train, Y_train)

In [18]:
%%time
X_train, X_test, y_train, y_test = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=5)

Wall time: 423 ms


In [None]:
%%time
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
#from sklearn.grid_search import GridSearchCV

n_estimators_list = [1,5,10,50,100,400,500,550,600,1000]

etc = ExtraTreesClassifier(criterion="entropy",random_state=47, warm_start=True)
grid = GridSearchCV(estimator=etc, param_grid=dict(n_estimators=n_estimators_list))
grid.fit(X_train, y_train)

# Score of best_estimator on the left out data
print("best score is {0}".format(grid.best_score_))

# Print the optimized parameters used in the model selected from grid search
print "Params: ", grid.best_params_

### Assign the best estimator to final Extra Tree classifier
et_clf = grid.best_estimator_

In [None]:
# Split each submission Id on '_' into three integer columns:
# field 0 -> Season, field 1 -> Team1, field 2 -> Team2
test_df = pd.DataFrame()
for field_pos, column_name in enumerate(['Season', 'Team1', 'Team2']):
    test_df[column_name] = submission_df['Id'].apply(
        lambda matchup_id, pos=field_pos: int(matchup_id.split('_')[pos]))

In [None]:
# Attach the seed of each team to every submission matchup (two left joins on Season + team),
# renaming the joined columns after each join so the second join does not clash
test_merged_df = (
    test_df
    .merge(seeds_df, how='left', left_on=['Season', 'Team1'], right_on=['Season', 'Team'])
    .rename(columns={'Seed': 'Seed1', 'Team': 'team_1'})
    .merge(seeds_df, how='left', left_on=['Season', 'Team2'], right_on=['Season', 'Team'])
    .rename(columns={'Seed': 'Seed2', 'Team': 'team_2'})
)

# Keep the matchup columns and convert both seed strings to integers
test_cleaned_df = test_merged_df[['Season', 'Team1', 'Team2', 'Seed1', 'Seed2']].copy()
for seed_col in ('Seed1', 'Seed2'):
    # characters at positions 1 and 2 of the seed string are parsed as the seed number
    test_cleaned_df[seed_col] = test_cleaned_df[seed_col].map(lambda seed_code: int(seed_code[1:3]))
test_cleaned_df['Seed_diff'] = test_cleaned_df['Seed1'].sub(test_cleaned_df['Seed2'])

In [None]:
# Submission feature matrix, using the same three columns and team_dict coding as X_train.
# This rebinds X_test, replacing the hold-out split stored under the same name.
X_test = test_cleaned_df[['Team1', 'Team2', 'Seed_diff']].copy()
for team_col in ('Team1', 'Team2'):
    # a team id missing from team_dict raises KeyError here
    X_test[team_col] = X_test[team_col].apply(lambda team_id: team_dict[team_id])
# Re-assign Seed_diff from its own values reshaped to a single column; the values are unchanged
X_test['Seed_diff'] = X_test['Seed_diff'].values.reshape(-1, 1)

In [None]:
# Predict class probabilities for every submission row and keep the column at index 1
class_probabilities = et_clf.predict_proba(X_test)
test_pred_prob = class_probabilities[:, 1]

In [None]:
# Build the submission table: the original Ids paired with the predicted probabilities
final_output = pd.DataFrame()
final_output['Id'] = submission_df['Id']
final_output['Pred'] = test_pred_prob
# to_csv does not create missing folders, so make sure the target directory exists first
output_folder = os.path.dirname(data_out_file)
if output_folder and not os.path.isdir(output_folder):
    os.makedirs(output_folder)
final_output.to_csv(data_out_file, index=False)