In [42]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import time
import datetime
%matplotlib inline

In [93]:
features = pd.read_csv('data/features.csv', index_col='match_id')
features.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13.0,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16.0,2449,0,4,1974,3,63


In [94]:
features.drop(['duration', 
               'tower_status_radiant',
               'tower_status_dire',
               'barracks_status_radiant',
               'barracks_status_dire',
              ], axis=1, inplace=True)

In [95]:
rows = len(features)
counts = features.describe().T['count']
counts_na = counts[counts < rows]
counts_na.sort_values().apply(lambda c: (rows - c) / rows)

first_blood_player2            0.452402
radiant_flying_courier_time    0.282619
dire_flying_courier_time       0.268415
first_blood_time               0.201100
first_blood_team               0.201100
first_blood_player1            0.201100
dire_bottle_time               0.166029
radiant_bottle_time            0.161380
radiant_first_ward_time        0.018883
dire_first_ward_time           0.018780
radiant_courier_time           0.007117
dire_courier_time              0.006953
Name: count, dtype: float64

In [96]:
features.fillna(0,inplace = True)

In [97]:
X_train = features.loc[:, 'start_time':]
y_train = features['radiant_win']
X_train.drop(['radiant_win'],axis = 1,inplace = True)

# Градиентный бустинг

In [98]:
kf = KFold(n_splits=5, shuffle=True)

In [99]:
for n in (10,20,30,50,100):
    start_time = datetime.datetime.now()
    clf = GradientBoostingClassifier(n_estimators=n)    
    model_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='roc_auc')
    print('N =', n)
    print('Time elapsed:', datetime.datetime.now() - start_time)
    print('Model scores:', model_scores)
    print('Mean score:', np.mean(model_scores))
    print()

Time elapsed: 0:00:25.528369
Model scores: [0.66463199 0.66591546 0.66170201 0.66319641 0.66818758]
Mean score: 0.6647266914184755
Time elapsed: 0:00:45.097697
Model scores: [0.68360642 0.67952008 0.68432868 0.68381987 0.67811664]
Mean score: 0.6818783391215122
Time elapsed: 0:01:07.783153
Model scores: [0.68907292 0.69429928 0.68915194 0.69018014 0.68629769]
Mean score: 0.6898003920954444
Time elapsed: 0:01:58.801226
Model scores: [0.69875938 0.70290594 0.69503338 0.68987513 0.70052292]
Mean score: 0.6974193500639957
Time elapsed: 0:03:55.720380
Model scores: [0.70668731 0.70754474 0.70708712 0.71026007 0.7028736 ]
Mean score: 0.7068905698369651


# Логистическая регрессия

In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [101]:
X_train = StandardScaler().fit_transform(X_train)

In [110]:
model_scores = []
for c in [10 ** i for i in range(-3, 5)]:
    start_time = datetime.datetime.now()
    
    model = LogisticRegression(C = c)
    score = cross_val_score(model, X_train, y_train, cv=kf, scoring='roc_auc')
    model_scores.append(np.mean(score))
    
    print('C =', c)
    print('Time elapsed:', datetime.datetime.now() - start_time)
    print('Model scores:', score)
    print('Mean score:', np.mean(score))
    print()

C = 0.001
Time elapsed: 0:00:09.052215
Model scores: [0.71706163 0.7196079  0.71702731 0.71312194 0.71417752]
Mean score: 0.7161992600412621

C = 0.01
Time elapsed: 0:00:11.106274
Model scores: [0.71046054 0.71850965 0.71418347 0.72184553 0.71776821]
Mean score: 0.7165534799351697

C = 0.1
Time elapsed: 0:00:12.996741
Model scores: [0.71938872 0.71669393 0.71465656 0.71843399 0.71237798]
Mean score: 0.7163102343066123

C = 1
Time elapsed: 0:00:12.964495
Model scores: [0.71933871 0.71190262 0.71631847 0.71234844 0.72120472]
Mean score: 0.7162225931407755

C = 10
Time elapsed: 0:00:12.509070
Model scores: [0.7187809  0.71563597 0.71774968 0.71504782 0.71429553]
Mean score: 0.7163019800507926

C = 100
Time elapsed: 0:00:12.360924
Model scores: [0.72112884 0.70979253 0.7139329  0.72009309 0.71750252]
Mean score: 0.7164899762384545

C = 1000
Time elapsed: 0:00:12.196207
Model scores: [0.7124675  0.72147459 0.71497858 0.71911883 0.71334609]
Mean score: 0.7162771168989988

C = 10000
Time elap

In [111]:
max(model_scores)

0.7165534799351697

In [112]:
features.drop([
    'lobby_type',
    'r1_hero',
    'r2_hero',
    'r3_hero',
    'r4_hero',
    'r5_hero',
    'd1_hero',
    'd2_hero',
    'd3_hero',
    'd4_hero',
    'd5_hero',
],axis = 1, inplace = True)

In [118]:
X_train = features.loc[:, 'start_time':]
y_train = features['radiant_win']
X_train.drop(['radiant_win'],axis = 1,inplace = True)

In [119]:
X_train = StandardScaler().fit_transform(X_train)

In [120]:
features.head()

Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,radiant_win
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,5,2098,1489,20,0,0,7,3,842,...,35.0,103.0,-84.0,221.0,3,4,2,2,-52.0,1
1,1430220345,4,1188,1033,9,0,1,12,4,1596,...,-20.0,149.0,-84.0,195.0,5,4,3,1,-5.0,1
2,1430227081,4,1319,1270,22,0,0,12,3,1314,...,-39.0,45.0,-77.0,221.0,3,4,3,1,13.0,0
3,1430263531,4,1779,1056,14,0,0,5,2,539,...,-30.0,124.0,-80.0,184.0,0,4,2,0,27.0,0
4,1430282290,4,1431,1090,8,1,0,8,2,629,...,46.0,182.0,-80.0,225.0,6,3,3,0,-16.0,0


In [121]:
model_scores = []
for c in [10 ** i for i in range(-3, 5)]:
    start_time = datetime.datetime.now()
    
    model = LogisticRegression(C = c)
    score = cross_val_score(model, X_train, y_train, cv=kf, scoring='roc_auc')
    model_scores.append(np.mean(score))
    
    print('C =', c)
    print('Time elapsed:', datetime.datetime.now() - start_time)
    print('Model scores:', score)
    print('Mean score:', np.mean(score))
    print()

C = 0.001
Time elapsed: 0:00:07.463903
Model scores: [0.72003255 0.7184591  0.7170054  0.7123042  0.71329452]
Mean score: 0.7162191561675485

C = 0.01
Time elapsed: 0:00:10.290243
Model scores: [0.71812605 0.71702226 0.7169131  0.71299422 0.71708068]
Mean score: 0.7164272622087816

C = 0.1
Time elapsed: 0:00:11.450542
Model scores: [0.71732072 0.718468   0.71113085 0.71682629 0.71904588]
Mean score: 0.716558346995639

C = 1
Time elapsed: 0:00:11.209806
Model scores: [0.71996418 0.71357278 0.7161764  0.71693519 0.71550075]
Mean score: 0.7164298617956545

C = 10
Time elapsed: 0:00:11.150503
Model scores: [0.71779844 0.71391179 0.71837767 0.71673988 0.71586529]
Mean score: 0.7165386151747422

C = 100
Time elapsed: 0:00:12.607081
Model scores: [0.7212731  0.71542356 0.71875249 0.71263928 0.71312533]
Mean score: 0.7162427518094967

C = 1000
Time elapsed: 0:00:11.016262
Model scores: [0.71809705 0.72224016 0.7114652  0.71693546 0.71367396]
Mean score: 0.7164823664547753

C = 10000
Time elaps

In [122]:
max(model_scores)

0.716558346995639

In [136]:
data = pd.read_csv('data/features.csv',index_col='match_id')

In [129]:
len(np.unique(data[['r' + str(i) + '_hero' for i in range(1,6)]]))

108

In [137]:
data.drop(['duration', 
         'tower_status_radiant', 
         'tower_status_dire', 
         'barracks_status_radiant', 
         'barracks_status_dire',
         'radiant_win'
        ], axis=1, inplace=True)

data.fillna(0,inplace = True)

In [138]:
X_pick = np.zeros((data.shape[0], 112))

for i, match_id in enumerate(data.index):
    for p in range(5):
        X_pick[i, data.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1
        
data.drop([
    'lobby_type',
    'r1_hero',
    'r2_hero',
    'r3_hero',
    'r4_hero',
    'r5_hero',
    'd1_hero',
    'd2_hero',
    'd3_hero',
    'd4_hero',
    'd5_hero',
],axis = 1, inplace = True)

In [139]:
X_train = np.hstack((data.values, X_pick))
X_train = StandardScaler().fit_transform(X_train)

In [140]:
model_scores = []
for c in [10 ** i for i in range(-3, 5)]:
    start_time = datetime.datetime.now()
    
    model = LogisticRegression(C = c)
    score = cross_val_score(model, X_train, y_train, cv=kf, scoring='roc_auc')
    model_scores.append(np.mean(score))
    
    print('C =', c)
    print('Time elapsed:', datetime.datetime.now() - start_time)
    print('Model scores:', score)
    print('Mean score:', np.mean(score))
    print()

C = 0.001
Time elapsed: 0:00:14.331268
Model scores: [0.74769655 0.75092885 0.75433288 0.75245388 0.75208383]
Mean score: 0.7514991992365945

C = 0.01
Time elapsed: 0:00:18.785557
Model scores: [0.75187532 0.75380626 0.75416834 0.7502044  0.74839105]
Mean score: 0.7516890745013457

C = 0.1
Time elapsed: 0:00:20.840639
Model scores: [0.75308016 0.75270443 0.7454229  0.75401945 0.75427592]
Mean score: 0.751900572709374

C = 1
Time elapsed: 0:00:20.529738
Model scores: [0.74898254 0.7569618  0.75273515 0.74591911 0.75471988]
Mean score: 0.7518636967097274

C = 10
Time elapsed: 0:00:20.714813
Model scores: [0.7495297  0.75445025 0.75048332 0.75244234 0.7524603 ]
Mean score: 0.7518731844629345

C = 100
Time elapsed: 0:00:20.137057
Model scores: [0.74802324 0.75391765 0.74983525 0.75345189 0.75359215]
Mean score: 0.7517640365040728

C = 1000
Time elapsed: 0:00:20.458097
Model scores: [0.74640179 0.75400801 0.75335737 0.75485056 0.751003  ]
Mean score: 0.7519241476260876

C = 10000
Time elaps

In [141]:
max(model_scores)

0.7519241476260876

In [142]:
model = LogisticRegression(C=0.01)
model.fit(X_train, y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [144]:
data = pd.read_csv('data/features_test.csv',index_col='match_id')
data.fillna(0,inplace = True)


X_pick = np.zeros((data.shape[0], 112))

for i, match_id in enumerate(data.index):
    for p in range(5):
        X_pick[i, data.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1
        
data.drop([
    'lobby_type',
    'r1_hero',
    'r2_hero',
    'r3_hero',
    'r4_hero',
    'r5_hero',
    'd1_hero',
    'd2_hero',
    'd3_hero',
    'd4_hero',
    'd5_hero',
],axis = 1, inplace = True)

X_test = np.hstack((data.values, X_pick))
X_test = StandardScaler().fit_transform(X_test)

In [145]:
res = model.predict_proba(X_test)

In [146]:
res = [res[i][1] for i in range(0,len(res))]

In [150]:
print(max(res))
print(min(res))

0.9963287159254294
0.00870590076923439
