In [28]:
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
import numpy as np
import sklearn.cross_validation as cv
import time
import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.svm import LinearSVC
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
import operator

def replace_col_withsum(data, name):
    cols = data.columns[data.columns.str.contains(name)]
    data[name] = data[cols[cols.str.contains('^r')]].sum(axis=1) - data[cols[cols.str.contains('^d')]].sum(axis=1)
    data = data.drop(cols,axis=1)
    return data

def compute_additional_features(df):
    df['total_level_r'] = 0
    df['total_level_d'] = 0
    df['total_xp_r'] = 0
    df['total_xp_d'] = 0
    df['total_gold_r'] = 0
    df['total_gold_d'] = 0
#     df['total_lh_r'] = 0
#     df['total_lh_d'] = 0
#     df['total_kills_r'] = 0
#     df['total_kills_d'] = 0
#     df['total_deaths_r'] = 0
#     df['total_deaths_d'] = 0
#     df['total_items_r'] = 0
#     df['total_items_d'] = 0
    for i in xrange(1, 6):
        df['total_level_r'] += df['r%d_level' % i]
        df['total_level_d'] += df['d%d_level' % i] 
        df['total_xp_r'] += df['r%d_xp' % i] 
        df['total_xp_d'] += df['d%d_xp' % i] 
        df['total_gold_r'] += df['r%d_gold' % i] 
        df['total_gold_d'] += df['d%d_gold' % i]
#         df['total_lh_r'] += df['r%d_lh' % i] 
#         df['total_lh_d'] += df['d%d_lh' % i]
#         df['total_kills_r'] += df['r%d_kills' % i] 
#         df['total_kills_d'] += df['d%d_kills' % i]
#         df['total_deaths_r'] += df['r%d_deaths' % i] 
#         df['total_deaths_d'] += df['d%d_deaths' % i]
#         df['total_items_r'] += df['r%d_items' % i] 
#         df['total_items_d'] += df['d%d_items' % i]
    return df

def preprocess(df):
    del df['duration']
    del df['tower_status_radiant']
    del df['tower_status_dire']
    del df['barracks_status_radiant']
    del df['barracks_status_dire']
    df.fillna(0, inplace=True)
    return df

def delete_redundant_features(df):
    for i in xrange(1, 6):
        del df['r%d_level' % i]
        del df['r%d_xp' % i]
        del df['r%d_gold' % i]
#         del df['r%d_lh' % i]
#         del df['r%d_kills' % i]
#         del df['r%d_deaths' % i]
#         del df['r%d_items' % i]
        del df['d%d_level' % i]
        del df['d%d_xp' % i]
        del df['d%d_gold' % i]
#         del df['d%d_lh' % i]
#         del df['d%d_kills' % i]
#         del df['d%d_deaths' % i]
#         del df['d%d_items' % i]
    return df

def categorize_lobbies(df):
    X_pick = np.zeros((df.shape[0], 8))
    for i, match_id in enumerate(df.index):
        X_pick[i, df.ix[match_id, 'lobby_type']-1] = 1
    
    for i in xrange(8):
        df['lobby_%d' % (i+1)] = X_pick[:, i]
    del df['lobby_type']
    return df
    

def categorize_heroes(df):
    X_pick = np.zeros((df.shape[0], 112))

    for i, match_id in enumerate(df.index):
        for p in xrange(5):
            X_pick[i, df.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
            X_pick[i, df.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

    for i in xrange(112):
        df['hero_%d' % (i+1)] = X_pick[:, i]

    for i in xrange(1, 6):
        del df['r%d_hero' % i]
        del df['d%d_hero' % i]
    return df


### Подготовка данных для NEAT


In [10]:
df = pd.read_csv('features.csv', index_col='match_id')
df = df.reindex(np.random.permutation(df.index))
target = df['radiant_win']
del df['radiant_win']
del df['start_time']

df = preprocess(df)
df = categorize_heroes(df)
df = categorize_lobbies(df)

print df.shape
X_train = df

(97230, 210)


In [None]:
df = pd.read_csv('features.csv', index_col='match_id')
df = df.reindex(np.random.permutation(df.index))
target = df['radiant_win']
del df['radiant_win']
del df['start_time']

df = preprocess(df)
df = categorize_heroes(df)
df = categorize_lobbies(df)

print df.shape

In [11]:
train = df[:df.shape[0]/3]
train_target = target[:df.shape[0]/3]
test = df.tail(df.shape[0]/10)
test_target = target.tail(df.shape[0]/10)

In [12]:
# dummies_train = pd.get_dummies(train_target)
ss = StandardScaler()
data_train = ss.fit_transform(X=train, y=train_target)
df_train = pd.DataFrame.from_records(data=data_train, columns=None)
df_train = df_train.applymap(lambda x: '%.4f' % x)
df_train['radiant_win'] = train_target.values
print df_train.shape
df_train.to_csv('clean_dota2_train', sep=',', encoding='utf-8', index=False)

data_test = ss.transform(X=test)
df_test = pd.DataFrame.from_records(data=data_test, columns=None)
df_test = df_test.applymap(lambda x: '%.4f' % x)
df_test['radiant_win'] = test_target.values
print df_test.shape
df_test.to_csv('clean_dota2_test', sep=',', encoding='utf-8', index=False)

(32410, 211)
(9723, 211)


In [9]:
df_test = pd.read_csv('features_test.csv', index_col='match_id')
df = pd.read_csv('features.csv', index_col='match_id')

target = df['radiant_win']
del df['radiant_win']
del df['start_time']

df = preprocess(df)
df = categorize_heroes(df)
df = categorize_lobbies(df)

df_test.fillna(0, inplace=True)
df_test = categorize_heroes(df_test)
df_test = categorize_lobbies(df_test)

del df_test['start_time']
ss = StandardScaler()
ss.fit_transform(X=df, y=target)
data_test = ss.transform(X=df_test)
df_test = pd.DataFrame.from_records(data=data_test, columns=None)
df_test = df_test.applymap(lambda x: '%.4f' % x)
df_test.to_csv('dota2_test', sep=',', encoding='utf-8', index=False)


### Градиентный бустинг


In [49]:
df = pd.read_csv('features.csv', index_col='match_id')
target = df['radiant_win']
del df['radiant_win']

df = preprocess(df)
df = categorize_heroes(df)
df = categorize_lobbies(df)

print df.shape
X_train = df

(97230, 211)


In [None]:
%matplotlib inline
rcParams['figure.figsize'] = 50, 10

kf = cv.KFold(len(df.index), n_folds=4, shuffle=True, random_state=241)
start_time = datetime.datetime.now()

clf = GradientBoostingClassifier(verbose=True, random_state=241)
clf.fit(X=df, y=target)
cv_res = cv.cross_val_score(clf, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)
print "CV Score : Mean - %.7g | Min - %.7g | Max - %.7g" % (np.mean(cv_res),np.min(cv_res),np.max(cv_res))

feat_imp = pd.Series(clf.feature_importances_, df.columns).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')

print 'Time elapsed:', datetime.datetime.now() - start_time
print cv_res

In [None]:
param_test1 = {'n_estimators':[50, 100, 200, 300], 'learning_rate':[0.1, 0.05, 0.01, 0.005]}
clf = GradientBoostingClassifier(verbose=True, random_state=241,
                                min_samples_split=500, min_samples_leaf=50, max_depth=8,
                                max_features=14, subsample=0.8)

gsearch1 = GridSearchCV(estimator = clf, param_grid = param_test1, scoring='roc_auc',n_jobs=-1,iid=False, cv=5)
gsearch1.fit(df, target)
print gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [48]:
print feat_imp[:30]

d2_gold                 0.097737
r2_gold                 0.095156
r1_gold                 0.091634
r4_gold                 0.090206
d1_gold                 0.087771
d5_gold                 0.087409
d4_gold                 0.084650
d3_gold                 0.079871
r5_gold                 0.079143
r3_gold                 0.070574
first_blood_player1     0.055376
radiant_boots_count     0.041018
dire_boots_count        0.030568
lobby_1                 0.004815
d3_deaths               0.001155
r4_lh                   0.000946
d1_xp                   0.000927
dire_first_ward_time    0.000643
d2_lh                   0.000398
d5_kills                0.000000
radiant_courier_time    0.000000
d5_deaths               0.000000
d5_level                0.000000
d5_items                0.000000
first_blood_time        0.000000
first_blood_team        0.000000
d4_lh                   0.000000
d5_lh                   0.000000
d4_kills                0.000000
d4_deaths               0.000000
dtype: flo

### Логистическая регрессия

In [24]:
df = pd.read_csv('features.csv', index_col=None)

target = df['radiant_win']
target = target.replace(0, -1)
del df['radiant_win']
del df['match_id']
# del df['start_time']

df = preprocess(df)
df = categorize_heroes(df)
df = categorize_lobbies(df)
# df = replace_col_withsum(df, 'gold')
# df = compute_additional_features(df)
# df = delete_redundant_features(df)

ss = StandardScaler()
X_train = ss.fit_transform(X=df, y=target)
X_new = X_train
print df.shape

(97230, 211)


In [74]:
print X_train.shape
# sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel = VarianceThreshold()
X_new = sel.fit_transform(X_train) 
# lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X=X_train, y=target)
# model = SelectFromModel(lsvc, prefit=True)
# X_new = model.transform(X_train)
print X_new.shape

(97230L, 211L)
0    1
1    1
2   -1
3   -1
4   -1
5   -1
6   -1
7    1
8   -1
9   -1
Name: radiant_win, dtype: int64
(97230L, 202L)


In [33]:
start_time = datetime.datetime.now()
kf = cv.KFold(X_new.shape[0], n_folds=4, shuffle=True, random_state=241)
# tuned_parameters = {'C': [0.1, 0.5, 1, 10, 100, 1000], 'penalty':['l2']}
# lr = GridSearchCV(LogisticRegression(random_state=241, n_jobs=-1), tuned_parameters, cv=5, scoring='roc_auc')
lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.01)
cs_result = cv.cross_val_score(lr, X=X_new, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)
print 'Time elapsed:', datetime.datetime.now() - start_time
print cs_result

Time elapsed: 0:00:18.772000
[ 0.75346484  0.75175914  0.74852422  0.75316268]


In [47]:
lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.01)
lr.fit(X=df, y=target) 
d ={}
i = 0
for col in df.columns.values:
    d[col] = abs(lr.coef_[0][i])
    i +=1
s = sorted(d.items(), key=operator.itemgetter(1))
print s[-30:]

[('r2_lh', 1.4758462330384746e-18), ('dire_courier_time', 2.1571626310668795e-18), ('dire_first_ward_time', 2.3388827903533294e-18), ('dire_flying_courier_time', 2.8281786987678808e-18), ('radiant_first_ward_time', 3.078165159798031e-18), ('first_blood_time', 3.3964424869753729e-18), ('radiant_courier_time', 3.6662138618507246e-18), ('dire_bottle_time', 7.6522023149950535e-18), ('d1_xp', 1.3321481381295317e-17), ('d4_xp', 1.391119504376969e-17), ('d3_xp', 1.5156955525314047e-17), ('radiant_flying_courier_time', 1.6177215259048604e-17), ('d2_xp', 2.2612837523347719e-17), ('d5_xp', 2.4290806984599594e-17), ('d3_gold', 3.7567762443577028e-17), ('d4_gold', 4.0181070679393203e-17), ('d1_gold', 4.3550106257177432e-17), ('d5_gold', 4.4601177311346692e-17), ('d2_gold', 4.7130477737395722e-17), ('r3_xp', 9.5297815586561879e-17), ('r1_xp', 9.7988439655303937e-17), ('r4_xp', 1.0422103855610577e-16), ('r5_xp', 1.0538083714420889e-16), ('r2_xp', 1.0703779621761113e-16), ('r3_gold', 1.15629541030965

In [50]:
df_test = pd.read_csv('features_test.csv', index_col=None)
df_test.fillna(0, inplace=True)
df_test = categorize_heroes(df_test)
df_test = categorize_lobbies(df_test)
match_ids = df_test['match_id']
del df_test['match_id']
del df_test['start_time']
X_test = ss.transform(X=df_test)
lr.fit(X=X_train, y=target)
predicted = lr.predict_proba(X_test)
print predicted[:10]
print 'max: ', max(predicted[:,1])
print 'min: ', min(predicted[:,1])

[[ 0.12600284  0.87399716]
 [ 0.26256754  0.73743246]
 [ 0.73789737  0.26210263]
 [ 0.12893915  0.87106085]
 [ 0.67727228  0.32272772]
 [ 0.57853999  0.42146001]
 [ 0.46453849  0.53546151]
 [ 0.44879355  0.55120645]
 [ 0.7306638   0.2693362 ]
 [ 0.33605433  0.66394567]]
max:  0.996185053309
min:  0.00899150488956


In [51]:
d = {'match_id': match_ids, 'radiant_win': predicted[:,1]}
df_result = pd.DataFrame(data=d, index=None)
print df_result.head()
df_result.to_csv('submission', sep=',', encoding='utf-8', index=False)

   match_id  radiant_win
0         6     0.873997
1         7     0.737432
2        10     0.262103
3        13     0.871061
4        16     0.322728



### Feature transformations with ensembles of trees


In [7]:
df = pd.read_csv('features.csv', index_col=None)
df = df.reindex(np.random.permutation(df.index))
target = df['radiant_win']
target = target.replace(0, -1)
del df['radiant_win']
del df['match_id']
# del df['start_time']

df = preprocess(df)
df = categorize_heroes(df)
df = categorize_lobbies(df)

train, test, train_target, test_target = train_test_split(df,
                                                            target,
                                                            test_size=0.1)

X_train, X_train_lr, y_train, y_train_lr = train_test_split(train,
                                                            train_target,
                                                            test_size=0.5)
grd = GradientBoostingClassifier(n_estimators=300, random_state=241, verbose=True)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.01)
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(test)[:, :, 0]))[:, 1]
print roc_auc_score(test_target, y_pred_grd_lm)

      Iter       Train Loss   Remaining Time 
         1           1.3781            4.98m
         2           1.3722            5.00m
         3           1.3671            5.00m
         4           1.3621            4.99m
         5           1.3569            5.09m
         6           1.3523            5.32m
         7           1.3481            5.35m
         8           1.3437            5.36m
         9           1.3395            5.35m
        10           1.3354            5.26m
        20           1.3045            4.84m
        30           1.2828            4.54m
        40           1.2664            4.32m
        50           1.2535            4.12m
        60           1.2427            4.02m
        70           1.2334            3.83m
        80           1.2252            3.63m
        90           1.2176            3.45m
       100           1.2110            3.26m
       200           1.1614            1.62m
       300           1.1268            0.00s
0.7329875


### "Активное обучение"


In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

ss = StandardScaler()

df_orig = pd.read_csv('features.csv', index_col=None)
df_test = pd.read_csv('features_test.csv', index_col=None)

df = df_orig.copy()
target = df['radiant_win']
del df['radiant_win']
del df['match_id']

df = preprocess(df)
df = categorize_heroes(df)
df = categorize_lobbies(df)

df_test.fillna(0, inplace=True)
df_test = categorize_heroes(df_test)
df_test = categorize_lobbies(df_test)
match_ids = df_test['match_id']
del df_test['match_id']
ss.fit_transform(X=df, y=target)

kf = cv.KFold(len(df.index), n_folds=4, shuffle=True, random_state=241)
lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.1)

train = df.values.tolist()
target = target.tolist()
test = df_test.values.tolist()

In [24]:
bound = 0.85
for i in xrange(2):
    start_time = datetime.datetime.now()
    print "train data length: ", len(train)
    print "target length: ", len(target)
    print "test length: ", len(test)
    X_train = ss.fit_transform(X=train, y=target)
    cs_result = cv.cross_val_score(lr, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)
    print cs_result
    X_test = ss.transform(X=test)
    lr.fit(X=X_train, y=target)
    predicted = lr.predict_proba(X_test)
    radian_win = predicted[:,1] > bound
    dire_win = predicted[:,0] > bound
    del_ix = []
    for i in xrange(len(predicted)):
        if radian_win[i]: 
            train.append(test[i])
            target.append(1)
            del_ix.append(i)
            # todo del from test
        if dire_win[i]:
            train.append(test[i])
            target.append(0)
            del_ix.append(i)
            # todo del from test
    for i in sorted(del_ix, reverse=True):
        del test[i]
    unique, counts = np.unique(predicted[:,1] > bound, return_counts=True)
    print np.asarray((unique, counts)).T
    unique, counts = np.unique(predicted[:,0] > bound, return_counts=True)
    print np.asarray((unique, counts)).T
    print 'Time elapsed:', datetime.datetime.now() - start_time

train data length:  97230
target length:  97230
test length:  17177
[ 0.75340299  0.7517499   0.74853067  0.75302096]
[[    0 16032]
 [    1  1145]]
[[    0 16338]
 [    1   839]]
Time elapsed: 0:00:35.299000
train data length:  99214
target length:  99214
test length:  15193
[ 0.75340303  0.75174967  0.74853069  0.75302077]
[[    0 15107]
 [    1    86]]
[[    0 15133]
 [    1    60]]
Time elapsed: 0:00:34.645000


In [28]:
print len(train)
print len(target)
lr.fit(X=train, y=target)
predicted = lr.predict_proba(df_test)
print predicted[:10]
d = {'match_id': match_ids, 'radiant_win': predicted[:,1]}
df_result = pd.DataFrame(data=d, index=None)
print df_result.head()
df_result.to_csv('submission', sep=',', encoding='utf-8', index=False)

99360
99360
[[ 0.48037768  0.51962232]
 [ 0.4803776   0.5196224 ]
 [ 0.48037749  0.51962251]
 [ 0.48037718  0.51962282]
 [ 0.48037708  0.51962292]
 [ 0.48037704  0.51962296]
 [ 0.48037703  0.51962297]
 [ 0.48037683  0.51962317]
 [ 0.4803766   0.5196234 ]
 [ 0.48037648  0.51962352]]
   match_id  radiant_win
0         6     0.519622
1         7     0.519622
2        10     0.519623
3        13     0.519623
4        16     0.519623


In [4]:
df = pd.read_csv('features.csv', index_col=None)

In [6]:
a = df.values.tolist()
print a[1]
# print df.head()
# todel = []
# todel.append(1)
# todel.append(3)
# df.drop(df.index[[todel]], inplace=True)
# print df.head()

[1.0, 1430220345.0, 0.0, 42.0, 4.0, 1188.0, 1033.0, 9.0, 0.0, 1.0, 12.0, 49.0, 4.0, 1596.0, 993.0, 10.0, 0.0, 1.0, 7.0, 67.0, 4.0, 1506.0, 1502.0, 18.0, 1.0, 0.0, 7.0, 37.0, 3.0, 669.0, 631.0, 7.0, 0.0, 0.0, 7.0, 26.0, 2.0, 415.0, 539.0, 1.0, 0.0, 0.0, 5.0, 39.0, 5.0, 1960.0, 1384.0, 16.0, 0.0, 0.0, 8.0, 88.0, 3.0, 640.0, 566.0, 1.0, 0.0, 1.0, 5.0, 79.0, 3.0, 720.0, 1350.0, 2.0, 2.0, 0.0, 12.0, 7.0, 2.0, 440.0, 583.0, 0.0, 0.0, 0.0, 7.0, 12.0, 4.0, 1470.0, 1622.0, 24.0, 0.0, 0.0, 9.0, 54.0, 1.0, 7.0, nan, 173.0, -80.0, nan, 2.0, 0.0, 2.0, 0.0, -20.0, 149.0, -84.0, 195.0, 5.0, 4.0, 3.0, 1.0, -5.0, 2463.0, 1.0, 1974.0, 0.0, 63.0, 1.0]



### Random forest


In [7]:
from sklearn.ensemble import ExtraTreesClassifier
df = pd.read_csv('features.csv', index_col=None)

target = df['radiant_win']
del df['radiant_win']
del df['match_id']

df = preprocess(df)
df = categorize_heroes(df)
df = categorize_lobbies(df)

In [13]:
forest = ExtraTreesClassifier(n_estimators=500,
                              n_jobs=-1,
                              random_state=241,
                              verbose=True)

start_time = datetime.datetime.now()

cs = cv.cross_val_score(forest, X=df, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)

print 'Time elapsed:', datetime.datetime.now() - start_time
print cs

Time elapsed: 0:08:57.187000
[ 0.71043932  0.71114338  0.70850037  0.71039338]


In [None]:
forest.fit_transform(X=df, y=target)

df_test = pd.read_csv('features_test.csv', index_col=None)
df_test.fillna(0, inplace=True)
df_test = categorize_heroes(df_test)
df_test = categorize_lobbies(df_test)
match_ids = df_test['match_id']
del df_test['match_id']

predicted = forest.predict_proba(X=df_test)
print predicted[:10]