# 1. GradientBoostingClassifier

In [1]:
import numpy as np
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

In [2]:
import pandas as pd
features = pd.read_csv('./features.csv', index_col='match_id')
features = features.drop(['duration', 'tower_status_radiant', 'barracks_status_radiant',
                        'tower_status_dire', 'barracks_status_dire'], axis=1)
features.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,radiant_win
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,35,103,-84,221,3,4,2,2,-52,1
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,-20,149,-84,195,5,4,3,1,-5,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,-39,45,-77,221,3,4,3,1,13,0
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,-30,124,-80,184,0,4,2,0,27,0
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,46,182,-80,225,6,3,3,0,-16,0


In [4]:
len(features)

97230

In [3]:
counts = features.count()
counts = counts[counts != len(features)]
print (counts)

first_blood_time               77677
first_blood_team               77677
first_blood_player1            77677
first_blood_player2            53243
radiant_bottle_time            81539
radiant_courier_time           96538
radiant_flying_courier_time    69751
radiant_first_ward_time        95394
dire_bottle_time               81087
dire_courier_time              96554
dire_flying_courier_time       71132
dire_first_ward_time           95404
dtype: int64


In [5]:
features = features.fillna(0)

In [6]:
X_train = features.drop('radiant_win', axis=1)
y_train = features.radiant_win

In [7]:
gen = KFold(len(X_train), shuffle=True, n_folds=5)

In [9]:
%%time
for n in range(10, 41, 5):
    clf = GradientBoostingClassifier(n_estimators=n)
    print (n, np.mean(cross_val_score(clf, X_train, y_train, cv = gen, scoring='roc_auc')))

(10, 0.6651813597351619)
(15, 0.67646118343815731)
(20, 0.68199390243421354)
(25, 0.6863547859107817)
(30, 0.6898100768424581)
(35, 0.6921669056696218)
(40, 0.69444215353129368)
CPU times: user 12min 53s, sys: 5.07 s, total: 12min 58s
Wall time: 12min 58s


In [10]:
clf = GradientBoostingClassifier(n_estimators=30)
%time cur = np.mean(cross_val_score(clf, X_train, y_train, cv = gen, scoring='roc_auc'))

CPU times: user 2min 12s, sys: 764 ms, total: 2min 13s
Wall time: 2min 13s


# 2. LogisticRegression

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [12]:
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=features.columns[:-1])

In [13]:
%%time
for c in [10 ** i for i in range(1,-5, -1)]:
    clf = LogisticRegression(C=c, solver='lbfgs', penalty='l2')
    print(1.0 / c, np.mean(cross_val_score(clf, X_train, y_train, cv = gen, scoring='roc_auc')))

(0.1, 0.71643651368314332)
(1.0, 0.71643794295532215)
(10.0, 0.71644080054974857)
(100.0, 0.71645795832815928)
(1000.0, 0.71624837581015333)
(10000.0, 0.71125445480251026)
CPU times: user 1min 12s, sys: 54.2 s, total: 2min 6s
Wall time: 35 s


In [18]:
X_train = X_train.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 
                       'd3_hero', 'd4_hero', 'd5_hero', ], axis=1)

ValueError: labels ['lobby_type' 'r1_hero' 'r2_hero' 'r3_hero' 'r4_hero' 'r5_hero' 'd1_hero'
 'd2_hero' 'd3_hero' 'd4_hero' 'd5_hero'] not contained in axis

In [15]:
%%time
for c in [10 ** i for i in range(1,-5, -1)]:
    clf = LogisticRegression(C=c, solver='lbfgs', penalty='l2')
    print(1.0 / c, np.mean(cross_val_score(clf, X_train, y_train, cv = gen, scoring='roc_auc')))

(0.1, 0.7164399883695991)
(1.0, 0.71644011017789588)
(10.0, 0.71644321651627052)
(100.0, 0.71646001317548846)
(1000.0, 0.71625157156418262)
(10000.0, 0.71121547858232703)
CPU times: user 1min 10s, sys: 48.1 s, total: 1min 58s
Wall time: 31.2 s


In [19]:
heroes = []
for hero in ['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
            'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']:
    heroes += list(features.get(hero))

In [31]:
len(set(heroes))

108

In [26]:
X_pick = np.zeros((features.shape[0], 112))

for i, match_id in enumerate(features.index):
    for p in range(5):
        X_pick[i, features.ix[match_id, 'r%d_hero'%(p+1)] - 1] = 1
        X_pick[i, features.ix[match_id, 'd%d_hero'%(p+1)] - 1] = -1
X_pick = pd.DataFrame(X_pick)

In [27]:
X_train_with_heroes = pd.concat([X_train, X_pick], axis=1)

In [28]:
clf = LogisticRegression(C=0.01, solver='lbfgs', penalty='l2')
cur = np.mean(cross_val_score(clf, X_train_with_heroes, y_train, cv = gen, scoring='roc_auc'))
print (cur)

0.751520308164
