In [None]:
import os

import numpy as np
import pandas as pd

%matplotlib inline
from matplotlib import pylab as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
from dota_ml.data import DATA_URL, transform_data
from dota_ml.utils import generate_grid, make_submission, plot_feature_ranking

In [None]:
if not os.path.exists('data/'):
    !wget $DATA_URL -q --show-progress
    !tar -xvf data.tar.gz
else:
    print('Data already exists!')

In [None]:
# Names of the feature switches to turn on. Every listed switch is passed to
# transform_data as a keyword argument set to True.
enabled_features = [
    # gold-related switches
    'last_gold_by_player', 'last_gold_by_team',
    'gold_speed_by_player', 'gold_speed_by_team',
    'max_gold_by_player', 'max_gold_by_team',

    # lh-related switches
    'last_lh_by_player', 'last_lh_by_team',
    'lh_speed_by_player', 'lh_speed_by_team',
    'max_lh_by_player', 'max_lh_by_team',
]

# dict.fromkeys keeps the list order, so the mapping has the same keys in the
# same order as a hand-written literal with every value set to True.
data_params = dict.fromkeys(enabled_features, True)

# Build the train/test frames from the files under 'data/'.
train_df, test_df = transform_data('data/', **data_params)

In [None]:
# Split the training frame into the target column and the feature matrix.
# drop() returns a new frame, so train_df itself is left untouched.
y_train = train_df['radiant_won']
X_train = train_df.drop(labels=['radiant_won'], axis='columns')

In [None]:
# Candidate hyper-parameter values for the random forest search.
# Each key maps to the list of values the search may pick from.
model_param_grid = dict(
    n_estimators=[100, 1000, 5000],
    max_depth=[None, 2, 4, 6, 10],
    max_features=[None, 'sqrt', 'log2'],
    min_samples_leaf=[1, 2, 5, 100],
    criterion=['gini', 'entropy'],

    # Single-value entry: every candidate forest is built with the same seed.
    random_state=[0],
)

In [None]:
# Search budget and evaluation protocol.
n_iter = 10          # number of parameter combinations sampled from the grid
scoring = 'roc_auc'  # metric used to rank the sampled combinations
k_folds = 3          # number of cross-validation folds per combination
# Seed for the random sampling of combinations. Without it, every
# Restart & Run All evaluates a different subset of the grid, so the reported
# best score and best params are not reproducible.
search_seed = 0

# Randomized search: only n_iter combinations from model_param_grid are
# evaluated instead of the full grid.
# refit=True retrains the best combination on the whole training set
# (exposed as gs.best_estimator_); n_jobs=-1 uses all available processors.
gs = RandomizedSearchCV(RandomForestClassifier(), model_param_grid,
                        scoring=scoring, cv=k_folds, n_iter=n_iter,
                        refit=True, n_jobs=-1, verbose=1,
                        random_state=search_seed)
gs.fit(X_train, y_train)

In [None]:
# Print one line per evaluated parameter combination: mean cross-validated
# score, its standard deviation across folds, and the parameters used.
cv_results = gs.cv_results_
row_template = '- score={:.5}, std={:.5} | params={}'

for score, std, params in zip(cv_results['mean_test_score'],
                              cv_results['std_test_score'],
                              cv_results['params']):
    print(row_template.format(score, std, params))

In [None]:
# Pull the winning configuration out of the fitted search object.
# best_estimator and best_params are reused when writing the submission.
best_score, best_estimator, best_params = (
    gs.best_score_,
    gs.best_estimator_,
    gs.best_params_,
)

# Report the best cross-validated score and the parameters that produced it.
print('best_score: {}'.format(best_score))
print('best params: {}'.format(best_params))

In [None]:
# Combine the feature switches and the selected model hyper-parameters into
# one mapping; on a key clash the best_params value wins.
submission_params = {**data_params, **best_params}

# Hand the refitted best model and the test frame to the project's submission
# helper. NOTE(review): presumably writes a file under 'submissions/' tagged
# 'rfc' -- confirm against dota_ml.utils.make_submission.
make_submission(test_df, best_estimator, 'submissions/', 'rfc',
                submission_params, best_score)