In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.ensemble import  GradientBoostingClassifier

In [4]:
from sklearn.cross_validation import KFold

In [5]:
from sklearn.metrics import roc_auc_score

In [6]:
import time, datetime

In [7]:
from sklearn.grid_search import GridSearchCV

In [None]:
features = pd.read_csv('features.csv', index_col='match_id')

In [9]:
features.count()[features.count() != features.count().max()].index

Index(['first_blood_time', 'first_blood_team', 'first_blood_player1',
       'first_blood_player2', 'radiant_bottle_time', 'radiant_courier_time',
       'radiant_flying_courier_time', 'radiant_first_ward_time',
       'dire_bottle_time', 'dire_courier_time', 'dire_flying_courier_time',
       'dire_first_ward_time'],
      dtype='object')

In [11]:
for i in features.count()[features.count() != features.count().max()].index:
    features[i] = features[i].fillna(0)

In [12]:
# 'radiant_win' - столбец содержит целевую переменную

In [13]:
X = features.drop(['radiant_win', 'tower_status_radiant', 'tower_status_dire', 
                   'barracks_status_radiant', 'barracks_status_dire', 'duration'], 1)

In [14]:
y = features['radiant_win']

In [15]:
cv = KFold(n = len(X), n_folds = 5, shuffle = True)

In [16]:
start_time = datetime.datetime.now()

for train_index, test_index in cv:
    X_train, X_test = X.loc[X.index[train_index]], X.loc[X.index[test_index]]
    y_train, y_test = y.loc[y.index[train_index]], y.loc[y.index[test_index]]

clf = GradientBoostingClassifier(n_estimators = 30, max_depth = 2)
clf.fit(X_train, y_train)

pred = clf.predict_proba(X_test)[:, 1]

print ('Качество для градиентного бустинга с 30 деревьями:',roc_auc_score(y_test, pred))
print ('Время кросс-валидации для градиентного бустинга с 30 деревьями:', datetime.datetime.now() - start_time)

Качество для градиентного бустинга с 30 деревьями: 0.688088389991
Время кросс-валидации для градиентного бустинга с 30 деревьями: 0:00:30.210634


In [17]:
grid = {'n_estimators': np.arange(10, 110, 10)}
cv = KFold(y.size, n_folds=5, shuffle=True, random_state=241)
clf = GradientBoostingClassifier(max_depth = 2)
gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=cv)
gs.fit(X, y)

GridSearchCV(cv=sklearn.cross_validation.KFold(n=97230, n_folds=5, shuffle=True, random_state=241),
       error_score='raise',
       estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=2, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [18]:
gs.best_params_

{'n_estimators': 100}

In [19]:
gs.best_score_

0.7012817207271751

In [21]:
# Чтобы ускорить обучение при увеличении количества деревьев:
# features = features.sample(n = len(X)/2)
# clf = GradientBoostingClassifier(n_estimators = 30, max_depth = 2)

In [16]:
from sklearn.linear_model import  LogisticRegression

In [17]:
from sklearn.pipeline import Pipeline

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
from sklearn.cross_validation import cross_val_score

In [20]:
grid1 = GridSearchCV(LogisticRegression(penalty='l2'), param_grid = {'C' : 10.0 ** np.arange(-5, 5)}, scoring='roc_auc')

In [21]:
clf = Pipeline([('preproc', StandardScaler()), ('classifier', grid1)])

In [22]:
clf.fit(X, y)

Pipeline(steps=[('preproc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs...0e+03,   1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0))])

In [23]:
clf.steps[1][1].best_params_

{'C': 0.01}

In [24]:
clf.steps[1][1].best_score_

0.71499869718819142

In [25]:
X1 = features.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero'
                   , 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero',
                   'radiant_win', 'tower_status_radiant', 'tower_status_dire', 
                   'barracks_status_radiant', 'barracks_status_dire', 'duration'], 1)

In [26]:
clf.fit(X1, y)

Pipeline(steps=[('preproc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs...0e+03,   1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0))])

In [27]:
clf.steps[1][1].best_score_

0.71514633450694021

In [28]:
# В логистической регрессии без преобразования текстовые категориальные признаки не являются информативными 
# идентификаторы которые нельзя сравнивать между собой

In [29]:
H = features[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 
              'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]

In [30]:
np.unique(H)

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 109, 110, 112], dtype=int64)

In [31]:
X_pick = pd.DataFrame(np.zeros((H.shape[0], len(np.unique(H)))), columns = np.unique(H), index = H.index)

for match_id in H.index:
    for p in list(range(5)):
        X_pick[H.ix[match_id, 'r%d_hero' % (p+1)]][match_id] = 1
        X_pick[H.ix[match_id, 'd%d_hero' % (p+1)]][match_id] = -1

In [32]:
X_a = pd.concat([X1, X_pick], axis = 1)

In [101]:
clf.fit(X_a, y)

Pipeline(steps=[('preproc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs...0e+03,   1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0))])

In [34]:
clf.steps[1][1].best_score_

0.74656640392876317

In [35]:
# Предсказание вероятностей на тестовых матчах

In [36]:
features_test = pd.read_csv('features_test.csv', index_col='match_id')

In [37]:
for i in features_test.count()[features_test.count() != features_test.count().max()].index:
    features_test[i] = features_test[i].fillna(0)

In [38]:
X_test = features_test.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 
                        'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], 1)

In [39]:
X_test = pd.DataFrame(X_test, columns = list(X_test.columns.values), index = X_test.index)

In [40]:
H_test = features_test[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 
              'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]

In [41]:
X_pick_test = pd.DataFrame(np.zeros((H_test.shape[0], len(np.unique(H_test)))), 
                           columns = np.unique(H_test), index = H_test.index)

for match_id in H_test.index:
    for p in list(range(5)):
        X_pick_test[H_test.ix[match_id, 'r%d_hero' % (p+1)]][match_id] = 1
        X_pick_test[H_test.ix[match_id, 'd%d_hero' % (p+1)]][match_id] = -1

In [42]:
X_b = pd.concat([X_test, X_pick_test], axis = 1)

In [43]:
[print('Минимальное значение прогноза на тестовой выборке:', np.min(clf.predict_proba(X_b)[:, 1])),
print('Максимальное значение прогноза на тестовой выборке:', np.max(clf.predict_proba(X_b)[:, 1]))]

Минимальное значение прогноза на тестовой выборке: 0.00970911709794
Максимальное значение прогноза на тестовой выборке: 0.994768483689


[None, None]

In [63]:
X_final = pd.concat([pd.DataFrame(X_b.index), pd.DataFrame(clf.predict_proba(X_b)[:, 1])], axis = 1)

In [64]:
X_final.columns = ['match_id', 'radiant_win']

In [65]:
X_final.to_csv('My_submission.csv', sep = ',', encoding = 'utf-8', index = False)