In [1]:
import os

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', 100)

import matplotlib
matplotlib.use('Agg')
%matplotlib inline
from matplotlib import pylab as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
from dota_ml.data import data_url, transform_data
from dota_ml.utils import generate_grid, make_submission, plot_feature_ranking, plot_feature_ranking

In [None]:
if not os.path.exists('data/'):
    !wget $data_url -q --show-progress
    !tar -xvf data.tar.gz
    !rm data.tar.gz
else:
    print('Data already exists!')

Data already exists!


In [None]:
# Keyword switches forwarded to transform_data. Only the two item-based
# switches are enabled for this run; the other switch names that were tried
# are listed here, disabled, for reference:
#   scale
#   gold_features, lh_features, xp_features
#   heroes_by_player, heroes_by_team, vector_heroes, bigram_heroes
#   events_features
#   items_by_player, items_by_team
data_params = dict(
    vector_items=True,
    bigram_items=True,
)

# transform_data returns the train and test frames as a pair.
train_df, test_df = transform_data(**data_params)

In [None]:
# Separate the 'radiant_won' target column from the training features;
# the test frame is used as-is.
X_train = train_df.drop(['radiant_won'], axis='columns')
y_train = train_df['radiant_won']
X_test = test_df

# Report the shapes as a quick sanity check.
print('X_train.shape={}'.format(X_train.shape))
print('y_train.shape={}'.format(y_train.shape))
print('X_test.shape={}'.format(X_test.shape))

In [None]:
# Model-selection settings used by the grid search in the next cell.
scoring = 'roc_auc'  # metric name handed to GridSearchCV
k_folds = 3          # number of cross-validation folds

# Candidate hyper-parameters for LogisticRegression:
# two penalties x five values of C (10^-2 ... 10^2).
estimator_param_grid = {
    'max_iter': [1000],
    'penalty': ['l1', 'l2'],
    'C': [10 ** power for power in range(-2, 2 + 1)],
    # Pin a solver that supports both penalties. Since scikit-learn 0.22 the
    # default solver is 'lbfgs', which rejects penalty='l1', so every L1
    # candidate would fail to fit and be scored with the error score.
    # 'liblinear' was the default in older releases, so results there are
    # unchanged.
    'solver': ['liblinear'],
    'verbose': [1]
}

In [None]:
# Exhaustive search over estimator_param_grid with k_folds-fold CV.
# refit=True retrains the best candidate on the full training set, which is
# what makes gs.best_estimator_ available afterwards.
gs = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=estimator_param_grid,
    scoring=scoring,
    cv=k_folds,
    refit=True,
    n_jobs=-1,   # use all available cores
    verbose=5,
)
gs.fit(X_train, y_train)

In [None]:
# Print the mean and standard deviation of the CV score for every candidate
# in the grid, together with the parameters that produced it.
#
# The loop variables are deliberately NOT named `score`: the notebook-level
# `score` holds gs.best_score_ and is passed to make_submission, so reusing
# that name here would overwrite it with the last candidate's mean score
# whenever this cell is re-run after the best-score cell.
cv_results = gs.cv_results_
for mean_score, score_std, candidate_params in zip(cv_results['mean_test_score'],
                                                   cv_results['std_test_score'],
                                                   cv_results['params']):
    print('- score={:.5}, std={:.5} | params={}'.format(mean_score, score_std, candidate_params))

In [None]:
# Pull the winning candidate out of the finished grid search: its mean CV
# score, the refitted estimator, and the parameter combination.
score, estimator, estimator_params = (
    gs.best_score_,
    gs.best_estimator_,
    gs.best_params_,
)

print('best_score: {}'.format(score))
print('best params: {}'.format(estimator_params))

In [None]:
# Merge the data and estimator settings into one dict for make_submission;
# on a key collision the estimator value wins because it is unpacked last.
# NOTE(review): presumably these values and the score are used to label the
# output under 'submissions/' -- confirm in dota_ml.utils.make_submission.
submission_params = {**data_params, **estimator_params}
make_submission(pd.DataFrame(X_test, index=test_df.index), estimator,
                'submissions/', 'logreg', submission_params, score)

In [None]:
# Plot the fitted coefficients against the feature names. This needs column
# labels, so fall back to a message when X_test does not expose `.columns`.
if not hasattr(X_test, 'columns'):
    print('features\' column names are not avaliable')
else:
    # coef_ is squeezed to drop its singleton dimension before plotting.
    plot_feature_ranking(estimator.coef_.squeeze(), X_test.columns, max_n_importances=200)

---