# One more thing

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pylab as pl
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import KFold
import time
from functools import partial
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from scipy.optimize import minimize_scalar
%matplotlib inline

In [3]:
features = pd.read_csv('./features.csv', index_col='match_id')
features_test = pd.read_csv('./features_test.csv', index_col='match_id')
features_copy = features.copy()
features_test_copy = features_test.copy()
features_test.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1430287923,0,93,4,1103,1089,8,0,1,9,...,0,12,247,-86,272.0,3,4,2,0,118
7,1430293357,1,20,2,556,570,1,0,0,9,...,2,-29,168,-54,,3,2,2,1,16
10,1430301774,1,112,2,751,808,1,0,0,13,...,1,-22,46,-87,186.0,1,3,3,0,-34
13,1430323933,1,27,3,708,903,1,1,1,11,...,2,-49,30,-89,210.0,3,4,2,1,-26
16,1430331112,1,39,4,1259,661,4,0,0,9,...,0,36,180,-86,180.0,1,3,2,1,-33


## Подход 1: градиентный бустинг "в лоб"

### Удаляем столбцы, связанные с итогами матча

In [4]:
MATCH_RESULT_COLUMNS = ["tower_status_radiant", 
                        "tower_status_dire", 
                        "barracks_status_radiant", 
                        "barracks_status_dire",
                        "duration"]
features.drop(MATCH_RESULT_COLUMNS, axis=1, inplace=True)

### Проверяем наличие пропусков

In [5]:
def get_columns_with_na(df):
    missed = df.shape[0] - df.count()
    missed_labels = missed.index.values[missed.values > 0]
    return missed_labels

print(get_columns_with_na(features))

['first_blood_time' 'first_blood_team' 'first_blood_player1'
 'first_blood_player2' 'radiant_bottle_time' 'radiant_courier_time'
 'radiant_flying_courier_time' 'radiant_first_ward_time' 'dire_bottle_time'
 'dire_courier_time' 'dire_flying_courier_time' 'dire_first_ward_time']


>  Запишите названия признаков, имеющих пропуски, и попробуйте для любых двух из них дать обоснование, почему их значения могут быть пропущены

* `first_blood_player1` - возможны игроки, которые за первые пять минут не убили ни одного противника
* `dire_bottle_time` - dire не покупали bottle за первые пять минут

### Заполняем пропуски

In [6]:
def fill_na_calc_jumps(df, method, columns):
    jumps = pd.Series()
    for column in columns:
        jumps[column] = (method == "big")   * (df[column].max() + 5 * df[column].std()) +\
                        (method == "small") * (df[column].min() - 5 * df[column].std()) +\
                        (method == "zero")  * (0)
    return jumps

def fill_na(df, method, jumps=None):
    """
    method \in ["zero", "big", "small"]
    """
    columns = get_columns_with_na(df)
    if not isinstance(jumps, pd.Series):
        jumps = fill_na_calc_jumps(df, method, columns)
    for column in columns:
        df[column] = df[column].fillna(jumps[column])
    return df, jumps

In [7]:
print(type(features_test))
features, f_jumps = fill_na(features, "zero")
features_test, ft_jumps = fill_na(features_test, "zero", f_jumps)

<class 'pandas.core.frame.DataFrame'>


### Целевая переменная

In [8]:
target = "radiant_win"

def get_train_test(df_train, df_test, target_column):
    train_columns = list(df_train.columns.values)
    train_columns.remove(target_column)
    assert train_columns == list(df_test.columns.values)
    X_train = df_train[train_columns].as_matrix()
    X_test = df_test[train_columns].as_matrix()
    y_train = df_train[target_column].as_matrix()
    return X_train, y_train, X_test
    
X_train, y_train, X_test = get_train_test(features, features_test, target)

### Обучаем градиентный бустинг

In [8]:
def test_boosting(X_train, y_train, X_test, ntrees, bins=5, learning_rate=0.1):
    split = KFold(X_train.shape[0], bins, shuffle=True)
    clf = GradientBoostingClassifier(n_estimators=ntrees, verbose=False, learning_rate=learning_rate)
    scores = []
    for itrain, itest in split:
        clf.fit(X_train[itrain], y_train[itrain])
        pred = clf.predict_proba(X_train[itest])[:, 1]
        score = roc_auc_score(y_train[itest], pred)
        scores.append(score)
    return sum(scores)/len(scores)

for trees in range(10, 31, 5):
    %time score = test_boosting(X_train, y_train, X_test, trees)
    print("trees = %d, score = %f" %(trees, score))

CPU times: user 44 s, sys: 635 ms, total: 44.6 s
Wall time: 44.9 s
trees = 10, score = 0.665132
CPU times: user 1min 2s, sys: 673 ms, total: 1min 3s
Wall time: 1min 3s
trees = 15, score = 0.676325
CPU times: user 1min 20s, sys: 744 ms, total: 1min 21s
Wall time: 1min 22s
trees = 20, score = 0.681743
CPU times: user 1min 41s, sys: 925 ms, total: 1min 42s
Wall time: 1min 43s
trees = 25, score = 0.686081
CPU times: user 2min 11s, sys: 1.41 s, total: 2min 12s
Wall time: 2min 13s
trees = 30, score = 0.689488


### Отчет

1. [Пропуски и что они могут означать](#Проверяем-наличие-пропусков)
2. [Целевая переменная](#Целевая-переменная)
3. [Время кросс-валидации для 30-ти деревьев (внизу) и соответствующее качество](#Обучаем-градиентный-бустинг)
4. Согласно скорам из предыдущего пункта наращивание количества деревьев мало влияет на результат

## Логистическая регрессия

### Преобразуем признаки

In [9]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [10]:
X_test = scaler.transform(X_test)

In [11]:
split = KFold(X_train.shape[0], 5, shuffle=True)
def test_logregr(X_train, y_train, X_test, C=1.0, bins=5):
    clf = LogisticRegression(penalty='l2', C=C, n_jobs=-1)   
#     split = KFold(X_train.shape[0], bins, shuffle=True)
    scores = []
    for itrain, itest in split:
        clf.fit(X_train[itrain], y_train[itrain])
        pred = clf.predict_proba(X_train[itest])[:, 1]
        score = roc_auc_score(y_train[itest], pred)
        scores.append(score)
    return sum(scores)/len(scores)

def _test_logregr(C):
    res = %time -1.0 * test_logregr(X_train, y_train, X_test, C)
    print(C, -res)
    return res

solution = minimize_scalar(_test_logregr, bounds=(0.1, 10), method='bounded')

CPU times: user 14.6 s, sys: 561 ms, total: 15.2 s
Wall time: 15.3 s
3.88146351138 0.716273195058
CPU times: user 12.3 s, sys: 397 ms, total: 12.7 s
Wall time: 12.7 s
6.21853648862 0.716273022396
CPU times: user 12.8 s, sys: 455 ms, total: 13.2 s
Wall time: 13.3 s
2.43707297725 0.716273333969
CPU times: user 13.7 s, sys: 513 ms, total: 14.2 s
Wall time: 14.3 s
1.54439053413 0.716273366717
CPU times: user 15.1 s, sys: 602 ms, total: 15.7 s
Wall time: 15.8 s
1.27011729172 0.716273536204
CPU times: user 14.7 s, sys: 558 ms, total: 15.2 s
Wall time: 15.4 s
0.823172257107 0.716273632731
CPU times: user 15.5 s, sys: 625 ms, total: 16.1 s
Wall time: 16.3 s
0.852898553624 0.716273657141
CPU times: user 13.5 s, sys: 498 ms, total: 14 s
Wall time: 14.1 s
1.00320450346 0.716273621067
CPU times: user 13.2 s, sys: 492 ms, total: 13.7 s
Wall time: 13.9 s
0.910310317752 0.716273566047
CPU times: user 14.4 s, sys: 569 ms, total: 15 s
Wall time: 15.2 s
0.907692407353 0.716273580878
CPU times: user 12.4

In [12]:
C = solution.x

В результате двух вызовов оптимизации получили 0.1 и 3.9 с примерно одинаковыми значениями score. Поэтому параметр регуляризации не особо влияет на score. Логистическая регрессия обучается на порядок быстрее, чем градиентный бустинг. score выше примерно на 5 процентов. Попробуем улучшить результат регрессии, убрав категориальные признаки.

### Удаляем категориальные признаки

In [11]:
categorial = ["lobby_type", 
              "r1_hero", 
              "r2_hero", 
              "r3_hero", 
              "r4_hero", 
              "r5_hero", 
              "d1_hero", 
              "d2_hero", 
              "d3_hero", 
              "d4_hero", 
              "d5_hero"]
features_test = features_test.drop(categorial, axis=1)
features = features.drop(categorial, axis=1)
X_train, y_train, X_test = get_train_test(features, features_test, target)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
split = KFold(X_train.shape[0], 5, shuffle=True)
def test_logregr(X_train, y_train, X_test, C=1.0, bins=5):
    clf = LogisticRegression(penalty='l2', C=C, n_jobs=-1)   
#     split = KFold(X_train.shape[0], bins, shuffle=True)
    scores = []
    for itrain, itest in split:
        clf.fit(X_train[itrain], y_train[itrain])
        pred = clf.predict_proba(X_train[itest])[:, 1]
        score = roc_auc_score(y_train[itest], pred)
        scores.append(score)
    return sum(scores)/len(scores)

def _test_logregr(C):
    res = %time -1.0 * test_logregr(X_train, y_train, X_test, C)
    print(C, -res)
    return res

solution = minimize_scalar(_test_logregr, bounds=(0.1, 10), method='bounded')

При обучении номинальные признаки и так игнорируются, что наглядно показывается с помощью `clf.coef_`. Качество не изменилось.

### Идентификаторы героев

In [16]:
features = features_copy
features_test = features_test_copy
features.drop(MATCH_RESULT_COLUMNS, axis=1, inplace=True)
features, f_jumps = fill_na(features, "zero")
features_test, ft_jumps = fill_na(features_test, "zero", f_jumps)

In [17]:
X_train, y_train, X_test = get_train_test(features, features_test, target)

In [18]:
heroes      = ["r1_hero", 
              "r2_hero", 
              "r3_hero", 
              "r4_hero", 
              "r5_hero", 
              "d1_hero", 
              "d2_hero", 
              "d3_hero", 
              "d4_hero", 
              "d5_hero"]
ids = None
for num in heroes:
    ids_current = features[num].unique()
    if ids is None:
        ids = ids_current
    else:
        np.concatenate((ids, ids_current), axis=0)
ids = np.sort(ids)
ids_num = ids[-1]
print(ids_num)

112


In [19]:
X_pick = np.zeros((features.shape[0], ids_num))
X_pick_test = np.zeros((features_test.shape[0], ids_num))
for i, match_id in enumerate(features.index):
    for p in range(5):
        X_pick[i, features.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, features.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
for i, match_id in enumerate(features_test.index):
    for p in range(5):
        X_pick_test[i, features_test.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick_test[i, features_test.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [20]:
X_test = np.concatenate((X_test, X_pick_test), axis=1)
X_train = np.concatenate((X_train, X_pick), axis=1)

In [21]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

split = KFold(X_train.shape[0], 5, shuffle=True)
def test_logregr(X_train, y_train, X_test, C=1.0, bins=5):
    clf = LogisticRegression(penalty='l2', C=C, n_jobs=-1)   
    scores = []
    for itrain, itest in split:
        clf.fit(X_train[itrain], y_train[itrain])
        print(clf.coef_)
        pred = clf.predict_proba(X_train[itest])[:, 1]
        score = roc_auc_score(y_train[itest], pred)
        scores.append(score)
    return sum(scores)/len(scores)

def _test_logregr(C):
    res = %time -1.0 * test_logregr(X_train, y_train, X_test, C)
    print(C, -res)
    return res

solution = minimize_scalar(_test_logregr, bounds=(0.1, 10), method='bounded')

[[ 0.04589563 -0.00677798  0.0139267   0.01463089  0.07224463  0.196151
   0.22774778 -0.04761809 -0.05411918  0.04318361 -0.00910018  0.02027218
   0.07330509  0.21985558  0.20246897 -0.05564847 -0.06509896  0.03108239
   0.00615267  0.0118183   0.07900521  0.18353214  0.21454902 -0.05271359
  -0.06608509  0.04830959  0.01002317  0.01974143  0.08860139  0.17487593
   0.22690436 -0.04805885 -0.0712373   0.03555884 -0.00609838  0.04934023
   0.05944213  0.17214197  0.22453754 -0.05080782 -0.0716725   0.04455671
  -0.01217981 -0.03538618 -0.0811837  -0.19530937 -0.20633109  0.05181034
   0.06516805 -0.03012053 -0.00893486 -0.01792977 -0.09984168 -0.20177528
  -0.21645956  0.0542142   0.0644005  -0.04266554 -0.01682118  0.01158383
  -0.14222328 -0.14857558 -0.22073434  0.03236842  0.05736013 -0.03973225
  -0.00107517 -0.0208834  -0.07815516 -0.19697329 -0.21864848  0.04913313
   0.06553626 -0.03840884 -0.00485074 -0.00296552 -0.13433469 -0.17090173
  -0.21035697  0.05250273  0.07163716 -0

KeyboardInterrupt: 

In [24]:
clf = LogisticRegression(penalty='l2', C=1, n_jobs=-1)
clf.fit(X_train, y_train)
features.columns.values[np.argsort(clf.coef_)]

IndexError: index 140 is out of bounds for axis 1 with size 103

In [25]:
features.columns.values

array(['start_time', 'lobby_type', 'r1_hero', 'r1_level', 'r1_xp',
       'r1_gold', 'r1_lh', 'r1_kills', 'r1_deaths', 'r1_items', 'r2_hero',
       'r2_level', 'r2_xp', 'r2_gold', 'r2_lh', 'r2_kills', 'r2_deaths',
       'r2_items', 'r3_hero', 'r3_level', 'r3_xp', 'r3_gold', 'r3_lh',
       'r3_kills', 'r3_deaths', 'r3_items', 'r4_hero', 'r4_level', 'r4_xp',
       'r4_gold', 'r4_lh', 'r4_kills', 'r4_deaths', 'r4_items', 'r5_hero',
       'r5_level', 'r5_xp', 'r5_gold', 'r5_lh', 'r5_kills', 'r5_deaths',
       'r5_items', 'd1_hero', 'd1_level', 'd1_xp', 'd1_gold', 'd1_lh',
       'd1_kills', 'd1_deaths', 'd1_items', 'd2_hero', 'd2_level', 'd2_xp',
       'd2_gold', 'd2_lh', 'd2_kills', 'd2_deaths', 'd2_items', 'd3_hero',
       'd3_level', 'd3_xp', 'd3_gold', 'd3_lh', 'd3_kills', 'd3_deaths',
       'd3_items', 'd4_hero', 'd4_level', 'd4_xp', 'd4_gold', 'd4_lh',
       'd4_kills', 'd4_deaths', 'd4_items', 'd5_hero', 'd5_level', 'd5_xp',
       'd5_gold', 'd5_lh', 'd5_kills', 'd5_death

### Отчет

1. [Здесь](#Преобразуем-признаки)
2. [Здесь](#Удаляем-категориальные-признаки)
3. 112
4. Улучшилось примерно на 5 процентов (76%). Видимо, наличие определенных героев в матче увеличивает вероятность победы
5. Максимальное - 0.76, минимальное - 0.66

P.S. Иногда прекращал выполнение с помощью KeyboardInterrupt. Прошу не расценивать как ошибку :)

## Experiment

In [42]:
features = pd.read_csv('./features.csv', index_col='match_id')
features_test = pd.read_csv('./features_test.csv', index_col='match_id')
MATCH_RESULT_COLUMNS = ["tower_status_radiant", 
                        "tower_status_dire", 
                        "barracks_status_radiant", 
                        "barracks_status_dire",
                        "duration"]
features.drop(MATCH_RESULT_COLUMNS, axis=1, inplace=True)
features.drop(['start_time', 'lobby_type'], axis=1, inplace=True)
features_test.drop(['start_time', 'lobby_type'], axis=1, inplace=True)
features, f_jumps = fill_na(features, "small")
features_test, ft_jumps = fill_na(features_test, "small", f_jumps)
X_train, y_train, X_test = get_train_test(features, features_test, target)

heroes      = ["r1_hero", 
              "r2_hero", 
              "r3_hero", 
              "r4_hero", 
              "r5_hero", 
              "d1_hero", 
              "d2_hero", 
              "d3_hero", 
              "d4_hero", 
              "d5_hero"]
#
ids = None
for num in heroes:
    ids_current = features[num].unique()
    if ids is None:
        ids = ids_current
    else:
        np.concatenate((ids, ids_current), axis=0)
ids = np.sort(ids)
ids_num = ids[-1]
#
X_pick = np.zeros((features.shape[0], ids_num))
X_pick_test = np.zeros((features_test.shape[0], ids_num))
for i, match_id in enumerate(features.index):
    for p in range(5):
        X_pick[i, features.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, features.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
for i, match_id in enumerate(features_test.index):
    for p in range(5):
        X_pick_test[i, features_test.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick_test[i, features_test.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
#        
X_test = np.concatenate((X_test, X_pick_test), axis=1)
X_train = np.concatenate((X_train, X_pick), axis=1)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [40]:
def test_logregr(X_train, y_train, X_test, C=1.0, bins=5):
    clf = LogisticRegression(penalty='l2', C=C, n_jobs=-1)   
    scores = []
    coefs = None
    for itrain, itest in split:
        clf.fit(X_train[itrain], y_train[itrain])
        coefs = clf.coef_
        pred = clf.predict_proba(X_train[itest])[:, 1]
        score = roc_auc_score(y_train[itest], pred)
        scores.append(score)
    print(coefs)
    return sum(scores)/len(scores)

In [43]:
split = KFold(X_train.shape[0], 5, shuffle=True)
score = test_logregr(X_train, y_train, X_test)
print(score)
print(features.columns.values)

[[ 0.01329966 -0.00793547  0.09303231  0.1987788   0.22201328 -0.0347765
  -0.03979387  0.03794142 -0.00606299  0.02736437  0.06346255  0.1785931
   0.23565862 -0.02680597 -0.04953801  0.02204005  0.01537021 -0.00157834
   0.09686618  0.16348701  0.2334506  -0.03906773 -0.04426795  0.05240631
   0.00914949  0.00217465  0.10772234  0.15098372  0.24743649 -0.02960954
  -0.06314796  0.03051086  0.00380379  0.03477701  0.08013766  0.1712838
   0.21886705 -0.02861339 -0.05357012  0.05236125 -0.003635   -0.00428591
  -0.09041641 -0.19466413 -0.21008496  0.04141796  0.05511133 -0.02731309
  -0.00691507 -0.00268519 -0.09414893 -0.21515676 -0.21060682  0.03546988
   0.04764172 -0.03675381 -0.00507987  0.0100119  -0.13126327 -0.14132336
  -0.22083398  0.01724428  0.04039901 -0.0349438  -0.00255787 -0.02650752
  -0.07413077 -0.18026306 -0.21446939  0.03181551  0.04582582 -0.03931971
  -0.0025106   0.00264937 -0.12497711 -0.15337044 -0.22398505  0.02045266
   0.05635227 -0.04522728 -0.02547318  0.