In [179]:
import pandas as pd
import numpy as np

import time
import datetime

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold, cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import GridSearchCV
from scipy.sparse import hstack, coo_matrix
    
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def sigmoid(x):
    return 1.0/(1 + np.exp(-x))

def getLL(clf, X, y):
    return [log_loss(y, sigmoid(y_pred)) for i, y_pred in enumerate(clf.staged_decision_function(X))]

def getAucRoc(clf, X, y):
    return roc_auc_score(y, clf.predict_proba(X)[:, 1]) 

In [3]:
features_all = pd.read_csv('features.csv', index_col='match_id')
features_all.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16,2449,0,4,1974,3,63


In [4]:
target = features_all['radiant_win']

In [5]:
features_test = pd.read_csv('features_test.csv', index_col='match_id')
features_list = features_test.columns.tolist()

In [6]:
features_selected = features_all[features_list]
features_selected.head(10)

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,0,35,103.0,-84,221.0,3,4,2,2,-52
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,0,-20,149.0,-84,195.0,5,4,3,1,-5
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,1,-39,45.0,-77,221.0,3,4,3,1,13
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,0,-30,124.0,-80,184.0,0,4,2,0,27
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,0,46,182.0,-80,225.0,6,3,3,0,-16
5,1430284186,1,11,5,1961,1461,19,0,1,6,...,1,-27,2.0,-86,212.0,4,4,4,0,-43
8,1430293701,1,8,3,967,1136,7,1,0,8,...,2,-35,6.0,-86,182.0,3,6,3,0,10
9,1430299335,7,35,5,2117,1252,16,0,0,6,...,1,26,119.0,-61,,2,3,4,0,-15
11,1430308974,1,17,5,1527,906,10,0,1,7,...,1,9,,-84,226.0,1,3,4,0,26
12,1430316105,7,15,5,1651,1060,14,0,1,10,...,0,-24,186.0,-83,195.0,1,3,2,0,16


In [7]:
features_count = features_selected.count()
max_filled = features_count.max()
partial = features_count[features_count < max_filled]

In [10]:
features_selected.fillna(0, inplace=True)
features_test.fillna(0, inplace=True)

In [None]:
totalN = len(target)
RND = 42
def train(X):
    res = pd.DataFrame(columns=['C', 'score', 'time'])  
    for c in np.power(10.0, np.arange(-5, 6)):
        clf = LogisticRegression(C=c, penalty='l2', random_state=RND)
        kf = KFold(totalN, n_folds=5, shuffle=True, random_state=RND)
        start_time = datetime.datetime.now()
        score = cross_val_score(clf, X, target, cv=kf, scoring='roc_auc')
        end_time = datetime.datetime.now() - start_time
        res = res.append({'C': c, 'score': score.mean(), 'time':end_time}, ignore_index=True)
        print end_time, score.mean(), c  
    return res.ix[res.score.idxmax()]

def run(label, X):
    res = train(X)
    print label, "best params:\n", res
    lg = LogisticRegression(C=res.C, penalty='l2', random_state=RND)
    lg.fit(X, target)
    score = getAucRoc(lg, X, target)
    print label, "score:", score
    return (score, res, lg)

# 1

In [229]:
features_scaled = StandardScaler().fit_transform(features_selected)
res_scalled = run("scalled", features_scaled)

0:00:04.107615 0.695161681227 1e-05
0:00:05.916374 0.711357748203 0.0001
0:00:11.702627 0.71636353888 0.001
0:00:15.087472 0.716550270785 0.01
0:00:14.053326 0.716527152902 0.1
0:00:12.339035 0.716522605658 1.0
0:00:12.811199 0.716522287837 10.0
0:00:12.238492 0.716522295257 100.0
0:00:12.763530 0.716522322797 1000.0
0:00:14.505015 0.716522328092 10000.0
0:00:12.680819 0.716522328092 100000.0
scalled best params:
C                          0.01
score                   0.71655
time     0 days 00:00:15.087472
Name: 3, dtype: object
scalled score: 0.717981741817


# 2

In [231]:
hero_cols = sum([["r%i_hero" % i, "d%i_hero" % i] for i in range(1,6)], [])
banned_cols = hero_cols + ["lobby_type"]

In [232]:
features_dropped = features_selected.drop(banned_cols, axis=1)
features_dropped = StandardScaler().fit_transform(features_dropped)
res_dropped = run("dropped", features_dropped)

0:00:04.879200 0.695100187107 1e-05
0:00:05.876823 0.711339145863 0.0001
0:00:08.448575 0.716375799908 0.001
0:00:10.800741 0.716559386445 0.01
0:00:11.504361 0.716534239287 0.1
0:00:10.633027 0.71653036557 1.0
0:00:12.220399 0.716529971503 10.0
0:00:10.772182 0.716529889947 100.0
0:00:10.587814 0.716529888885 1000.0
0:00:10.652103 0.71652988359 10000.0
0:00:11.157362 0.71652988359 100000.0
dropped best params:
C                          0.01
score                  0.716559
time     0 days 00:00:10.800741
Name: 3, dtype: object
dropped score: 0.717840770686


In [273]:
print "dropped better thand scalled: %.10f" % (res_dropped[1].score - res_scalled[1].score)

dropped better thand scalled: 0.0000091157


# 3

In [233]:
heroes = features_selected[hero_cols]

In [234]:
hero_uniq = np.unique(heroes.values)
print "Uniq heroes count:", hero_uniq.size

Uniq heroes count: 108


# 4

In [235]:
hero2id = dict(zip(hero_uniq, range(hero_uniq.size)))
def convertHeroes(X):
    res = np.zeros((X.shape[0], hero_uniq.size))
    for i, match_id in enumerate(X.index):
        for p in xrange(5):
            res[i, hero2id[X.ix[match_id, 'r%d_hero' % (p+1)]]] = 1
            res[i, hero2id[X.ix[match_id, 'd%d_hero' % (p+1)]]] = -1
    return res

In [236]:
features_rich = hstack([coo_matrix(features_dropped), coo_matrix(convertHeroes(features_selected))])

# 5

In [237]:
res_rich = run("rich", features_rich)

0:00:04.608646 0.699216216817 1e-05
0:00:06.653633 0.725071498503 0.0001
0:00:13.988120 0.746334151101 0.001
0:00:17.247907 0.751737730574 0.01
0:00:23.443300 0.751947344966 0.1
0:00:24.701176 0.751927559964 1.0
0:00:24.772272 0.751925225665 10.0
0:00:24.935716 0.75192470544 100.0
0:00:24.288190 0.751924949111 1000.0
0:00:24.311128 0.751924759521 10000.0
0:00:24.211255 0.751924690612 100000.0
rich best params:
C                           0.1
score                  0.751947
time     0 days 00:00:23.443300
Name: 4, dtype: object
rich score: 0.754446947008


# 6

In [238]:
test_heroes = features_test[hero_cols]
test_heroes_uniq = np.unique(test_heroes.values)
assert((test_heroes_uniq == hero_uniq).all())

In [251]:
# над тестовыми данными нужны все те же махимнации по нормализации! и удалении! и добавлению признаков!
test_dropped = features_test.drop(banned_cols, axis=1)
test_scalled = StandardScaler().fit_transform(test_dropped)
test_rich = hstack([coo_matrix(test_scalled), coo_matrix(convertHeroes(features_test))])

In [260]:
test_answers = res_rich[2].predict_proba(test_rich)[:, 1]
assert(test_answers.min() > 0 and test_answers.max() < 1)
test_answers

array([ 0.82466035,  0.75717036,  0.18778809, ...,  0.23399709,
        0.62470369,  0.42710208])

In [261]:
print "Min:", test_answers.min(), "Max:", test_answers.max()

Min: 0.00858060489119 Max: 0.996459223437


# Отчет
# 1
#### Какое качество получилось у логистической регрессии над всеми исходными признаками? Как оно соотносится с качеством градиентного бустинга? Чем вы можете объяснить эту разницу? Быстрее ли работает логистическая регрессия по сравнению с градиентным бустингом?

In [276]:
print "Time:", res_scalled[1].time, "AUC_ROC:", res_scalled[1].score

Time: 0 days 00:00:15.087472 AUC_ROC: 0.716550270785


У бустинга было: Time: 0 days 00:02:41.873910 AUC_ROC: 0.700208881661, быстрее, но похуже. Лучшее качество линейной регрессии думаю связано с пропусками в данных, мы использовали 0, что позволяет игнорировать такие значения в предсказании, в то врямя как деревья все-таки настраиваются на них.

# 2
#### Как влияет на качество логистической регрессии удаление категориальных признаков (укажите новое значение метрики качества)? Чем вы можете объяснить это изменение?

In [278]:
print "AUC_ROC:", res_dropped[1].score
print "dropped better thand scalled: %.10f" % (res_dropped[1].score - res_scalled[1].score)

AUC_ROC: 0.716559386445
dropped better thand scalled: 0.0000091157


Качество чуть заметно улучшилось на кросс-валидации, эти данные являют шумовыми и не несут полезного сигнала для обучения.

# 3
##### Сколько различных идентификаторов героев существует в данной игре?

In [279]:
print "Uniq heroes count:", hero_uniq.size

Uniq heroes count: 108


# 4
#### Какое получилось качество при добавлении "мешка слов" по героям? Улучшилось ли оно по сравнению с предыдущим вариантом? Чем вы можете это объяснить?

In [284]:
print "AUC_ROC:", res_rich[1].score
print "riched better thand dropped: %.10f" % (res_rich[1].score - res_dropped[1].score)

AUC_ROC: 0.751947344966
riched better thand dropped: 0.0353879585


Качество улучшилось на 3%, мы правльно подготовили данные, получили понятный, а главное полезныей, для модели фактор

# 5
#### Какое минимальное и максимальное значение прогноза на тестовой выборке получилось у лучшего из алгоритмов?

In [280]:
print "Min:", test_answers.min(), "Max:", test_answers.max()

Min: 0.00858060489119 Max: 0.996459223437
