In [1]:
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
import numpy as np
import sklearn.cross_validation as cv
import time
import datetime

def compute_additional_features(df):
    """Append squared per-team totals of the per-player statistics to ``df``.

    For each statistic in (level, xp, gold, lh, kills, deaths, items) and each
    side prefix ('r', 'd'), the five player columns ``<side><1..5>_<stat>``
    are added up row-wise and the square of that sum is stored in a new
    column ``total_<stat>_<side>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the 70 per-player columns named above.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with 14 ``total_*`` columns appended in the
        order level_r, level_d, xp_r, xp_d, ..., items_r, items_d.

    Raises
    ------
    KeyError
        If one of the per-player columns is missing.
    """
    stats = ('level', 'xp', 'gold', 'lh', 'kills', 'deaths', 'items')
    for stat in stats:
        for side in ('r', 'd'):
            # Start from the scalar 0 so the result keeps the dtype of the
            # source columns and NaN values propagate into the total.
            team_total = 0
            # range() instead of xrange() keeps this runnable on Python 2 and 3.
            for player in range(1, 6):
                team_total = team_total + df['%s%d_%s' % (side, player, stat)]
            df['total_%s_%s' % (stat, side)] = team_total * team_total
    return df

def preprocess(df):
    """Drop five fixed columns from ``df`` and replace missing values with 0.

    The columns ``duration``, ``tower_status_radiant``, ``tower_status_dire``,
    ``barracks_status_radiant`` and ``barracks_status_dire`` are deleted
    (presumably because they describe the end of the match - confirm), then
    every remaining NaN is filled with 0.

    The frame is modified in place and the same object is returned.
    A ``KeyError`` is raised if one of the five columns is absent.
    """
    dropped_columns = (
        'duration',
        'tower_status_radiant',
        'tower_status_dire',
        'barracks_status_radiant',
        'barracks_status_dire',
    )
    for column in dropped_columns:
        del df[column]
    df.fillna(0, inplace=True)
    return df

def delete_redundant_features(df):
    """Remove the 70 per-player statistic columns from ``df``.

    The columns ``<side><1..5>_<stat>`` for side in ('r', 'd') and stat in
    (level, xp, gold, lh, kills, deaths, items) are dropped in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all 70 per-player columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object without the per-player columns.

    Raises
    ------
    KeyError
        If any of the columns is missing. The drop is done in one call, so
        in that case ``df`` is left untouched instead of half-cleaned.
    """
    stats = ('level', 'xp', 'gold', 'lh', 'kills', 'deaths', 'items')
    # range() instead of xrange() keeps this runnable on Python 2 and 3.
    per_player_columns = [
        '%s%d_%s' % (side, player, stat)
        for player in range(1, 6)
        for side in ('r', 'd')
        for stat in stats
    ]
    df.drop(per_player_columns, axis=1, inplace=True)
    return df

def categorize_lobbies(df):
    """One-hot encode ``lobby_type`` into columns ``lobby_1`` .. ``lobby_8``.

    A row with lobby type ``k`` gets a 1.0 in column index ``k - 1`` of an
    8-column indicator matrix. For ``k == 0`` that index is -1, which
    addresses the last column, so lobby type 0 ends up in ``lobby_8``.
    The original ``lobby_type`` column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an integer-valued ``lobby_type`` column (no NaN).
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the eight indicator columns appended.

    Raises
    ------
    KeyError
        If ``lobby_type`` is missing.
    IndexError
        If a lobby type does not fit the 8-column matrix.
    """
    n_rows = df.shape[0]
    # Positional, vectorised indexing: no per-row label lookup, so it does not
    # depend on the removed ``DataFrame.ix`` accessor and also works when the
    # index contains duplicate labels.
    lobby_codes = df['lobby_type'].values.astype(int)
    X_pick = np.zeros((n_rows, 8))
    X_pick[np.arange(n_rows), lobby_codes - 1] = 1

    for i in range(8):
        df['lobby_%d' % (i + 1)] = X_pick[:, i]
    del df['lobby_type']
    return df
    

def categorize_heroes(df):
    """Encode the ten hero picks as signed indicator columns ``hero_1`` .. ``hero_112``.

    For every row, column ``hero_<id>`` is set to 1.0 when hero ``id`` is in
    one of ``r1_hero`` .. ``r5_hero`` and to -1.0 when it is in one of
    ``d1_hero`` .. ``d5_hero``; all other hero columns are 0.0. The ten
    ``*_hero`` source columns are removed afterwards.

    The scratch matrix has 113 columns but only the first 112 are written
    back, so a pick stored in the last slot (hero id 113, or hero id 0 via
    index -1) does not appear in the output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with integer-valued ``r1_hero`` .. ``r5_hero`` and
        ``d1_hero`` .. ``d5_hero`` columns (no NaN). It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the 112 hero columns appended.

    Raises
    ------
    KeyError
        If one of the ``*_hero`` columns is missing.
    IndexError
        If a hero id does not fit the 113-column scratch matrix.
    """
    n_rows = df.shape[0]
    row_positions = np.arange(n_rows)
    X_pick = np.zeros((n_rows, 113))

    # One vectorised assignment per pick slot instead of a Python loop over
    # every row with ``DataFrame.ix`` (removed from pandas). The r/d write
    # order per slot is the same as before, so a later write still wins.
    for p in range(1, 6):
        X_pick[row_positions, df['r%d_hero' % p].values.astype(int) - 1] = 1
        X_pick[row_positions, df['d%d_hero' % p].values.astype(int) - 1] = -1

    for i in range(112):
        df['hero_%d' % (i + 1)] = X_pick[:, i]

    for p in range(1, 6):
        del df['r%d_hero' % p]
        del df['d%d_hero' % p]
    return df

In [10]:
df = pd.read_csv('features.csv', index_col='match_id')
df = df.reindex(np.random.permutation(df.index))
target = df['radiant_win']
del df['radiant_win']
del df['start_time']

df = preprocess(df)
df = categorize_heroes(df)
df = categorize_lobbies(df)

print df.shape
X_train = df

(97230, 210)


In [11]:
# Positional split of the (already shuffled) frame: the first third of the
# rows becomes the training part, the last tenth the test part.
# `//` keeps the bounds integral on Python 3 as well; on Python 2 it gives the
# same value as the integer `/` used before.
n_rows = df.shape[0]
train = df[:n_rows // 3]
train_target = target[:n_rows // 3]
test = df.tail(n_rows // 10)
test_target = target.tail(n_rows // 10)

In [12]:
# Standardise the train/test split and export both parts as CSV files.
# `train`, `train_target`, `test` and `test_target` are not defined in this
# cell; they must already exist in the kernel.
# dummies_train = pd.get_dummies(train_target)
ss = StandardScaler()
# The scaler statistics are estimated on the training part only.
data_train = ss.fit_transform(X=train, y=train_target)
# columns=None: the scaled array gets default integer column labels.
df_train = pd.DataFrame.from_records(data=data_train, columns=None)
# Every value becomes a string with four decimals before it is written out.
df_train = df_train.applymap(lambda x: '%.4f' % x)
# The label is appended as the last column, unformatted.
df_train['radiant_win'] = train_target.values
print df_train.shape
df_train.to_csv('clean_dota2_train', sep=',', encoding='utf-8', index=False)

# The test part reuses the scaler fitted on the training part.
data_test = ss.transform(X=test)
df_test = pd.DataFrame.from_records(data=data_test, columns=None)
df_test = df_test.applymap(lambda x: '%.4f' % x)
df_test['radiant_win'] = test_target.values
print df_test.shape
df_test.to_csv('clean_dota2_test', sep=',', encoding='utf-8', index=False)

(32410, 211)
(9723, 211)


In [9]:
# Standardise the unlabelled matches from features_test.csv with statistics
# estimated on the full labelled set and export them as CSV.
df_test = pd.read_csv('features_test.csv', index_col='match_id')
df = pd.read_csv('features.csv', index_col='match_id')

target = df['radiant_win']
del df['radiant_win']
del df['start_time']

df = preprocess(df)
df = categorize_heroes(df)
df = categorize_lobbies(df)

# preprocess() is not applied to df_test (it would also delete five columns,
# which features_test.csv presumably does not contain - confirm); only its
# fillna step is repeated here.
df_test.fillna(0, inplace=True)
df_test = categorize_heroes(df_test)
df_test = categorize_lobbies(df_test)

del df_test['start_time']
ss = StandardScaler()
# Only the fitted statistics are needed; fit() avoids building and discarding
# a transformed copy of the whole training frame.
ss.fit(X=df, y=target)
data_test = ss.transform(X=df_test)
# columns=None: the scaled array gets default integer column labels.
df_test = pd.DataFrame.from_records(data=data_test, columns=None)
# Every value becomes a string with four decimals before it is written out.
df_test = df_test.applymap(lambda x: '%.4f' % x)
df_test.to_csv('dota2_test', sep=',', encoding='utf-8', index=False)

In [7]:
# Cross-validate a gradient boosting classifier and report ROC AUC per fold.
# `df`, `X_train` and `target` are not defined in this cell; they must already
# exist in the kernel.
# 4 shuffled folds over as many rows as df has, with a fixed seed.
kf = cv.KFold(len(df.index), n_folds=4, shuffle=True, random_state=241)
start_time = datetime.datetime.now()

# 30 boosting stages; verbose=True prints training progress.
clf = GradientBoostingClassifier(n_estimators=30,verbose=True, random_state=241)
# n_jobs=-1: folds are evaluated in parallel on all available cores.
cs_result = cv.cross_val_score(clf, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)

print 'Time elapsed:', datetime.datetime.now() - start_time
print cs_result

Time elapsed: 0:01:32.496000
[ 0.72106467  0.71598025  0.71619558  0.7212618 ]


In [5]:
# Number of heroes in the dataset (total number of heroes in the game: 112).
# NOTE: that count is produced by the commented-out lines at the bottom of
# this cell; the only active output is the frequency of each lobby_type value.
import numpy as np
df = pd.read_csv('features.csv', index_col=None)
# df_test = pd.read_csv('features_test.csv', index_col=None)
print df['lobby_type'].value_counts()
# Disabled: number of distinct hero ids over the ten pick columns (train / test).
# print len(np.unique(df[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]))
# print len(np.unique(df_test[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]))

1    55962
7    28550
0    12718
Name: lobby_type, dtype: int64


In [6]:
# Build the standardised training matrix used by the logistic regression cells.
# index_col=None: match_id is read as an ordinary column and removed below.
df = pd.read_csv('features.csv', index_col=None)

target = df['radiant_win']
del df['radiant_win']
del df['match_id']
# Unlike the cells that index by match_id, start_time is NOT deleted here,
# so it stays in the feature set.

df = preprocess(df)
df = categorize_heroes(df)
df = categorize_lobbies(df)

# `ss` keeps the fitted statistics; X_train is the scaled array.
ss = StandardScaler()
X_train = ss.fit_transform(X=df, y=target)

In [7]:
# Cross-validate an L2-regularised logistic regression on the scaled features
# and report ROC AUC per fold. `df`, `X_train` and `target` are not defined in
# this cell; they must already exist in the kernel.
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

print df.shape
start_time = datetime.datetime.now()

# 4 shuffled folds over as many rows as df has, with a fixed seed.
kf = cv.KFold(len(df.index), n_folds=4, shuffle=True, random_state=241)
# Only referenced by the commented-out grid search below; unused otherwise.
tuned_parameters = {'C': [0.1, 0.5, 1, 10, 100, 1000], 'penalty':['l2']}
# lr = GridSearchCV(LogisticRegression(random_state=241, n_jobs=-1), tuned_parameters, cv=5, scoring='roc_auc')
# The value of C actually used (0.01) is not one of the grid values above.
lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.01)
cs_result = cv.cross_val_score(lr, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)
print 'Time elapsed:', datetime.datetime.now() - start_time
print cs_result

(97230, 211)
Time elapsed: 0:00:19.574000
[ 0.75339035  0.75174887  0.74853068  0.7529954 ]


In [8]:
# Fit the logistic regression on the full scaled training matrix and predict
# class probabilities for features_test.csv.
# `ss`, `lr`, `X_train` and `target` are not defined in this cell; they must
# already exist in the kernel.
df_test = pd.read_csv('features_test.csv', index_col=None)
# preprocess() is not applied to the test frame; only the NaN fill is repeated.
df_test.fillna(0, inplace=True)
df_test = categorize_heroes(df_test)
df_test = categorize_lobbies(df_test)
# Keep the ids aside before dropping the column from the feature frame.
match_ids = df_test['match_id']
del df_test['match_id']
# Reuse the already fitted scaler; it is not refitted on the test data.
X_test = ss.transform(X=df_test)
lr.fit(X=X_train, y=target)
predicted = lr.predict_proba(X_test)
print predicted[:10]
# Range of the second probability column
# (presumably P(radiant_win == 1) - confirm against lr.classes_).
print 'max: ', max(predicted[:,1])
print 'min: ', min(predicted[:,1])

[[ 0.18440831  0.81559169]
 [ 0.24028864  0.75971136]
 [ 0.80818866  0.19181134]
 [ 0.13678844  0.86321156]
 [ 0.74625801  0.25374199]
 [ 0.63804943  0.36195057]
 [ 0.46154706  0.53845294]
 [ 0.42502168  0.57497832]
 [ 0.77052818  0.22947182]
 [ 0.32036602  0.67963398]]
max:  0.996650299848
min:  0.0087276102848


In [23]:
# Set-up for the pseudo-labelling experiment: load both data sets, build the
# feature frames and convert them to plain Python lists that later cells grow
# and shrink.
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

ss = StandardScaler()

df_orig = pd.read_csv('features.csv', index_col=None)
df_test = pd.read_csv('features_test.csv', index_col=None)

# Work on a copy so df_orig keeps the unmodified file contents.
df = df_orig.copy()
target = df['radiant_win']
del df['radiant_win']
del df['match_id']

df = preprocess(df)
df = categorize_heroes(df)
df = categorize_lobbies(df)

# preprocess() is not applied to the test frame; only the NaN fill is repeated.
df_test.fillna(0, inplace=True)
df_test = categorize_heroes(df_test)
df_test = categorize_lobbies(df_test)
# Keep the ids aside before dropping the column from the feature frame.
match_ids = df_test['match_id']
del df_test['match_id']
# The transformed array returned here is discarded; only the fitted state of
# `ss` is kept.
ss.fit_transform(X=df, y=target)

# The folds are sized for the number of rows df has at this point.
kf = cv.KFold(len(df.index), n_folds=4, shuffle=True, random_state=241)
lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.1)

# Unscaled values as lists of rows. From here on `target` is a list,
# no longer a Series.
train = df.values.tolist()
target = target.tolist()
test = df_test.values.tolist()

In [24]:
bound = 0.85
for i in xrange(2):
    start_time = datetime.datetime.now()
    print "train data length: ", len(train)
    print "target length: ", len(target)
    print "test length: ", len(test)
    X_train = ss.fit_transform(X=train, y=target)
    cs_result = cv.cross_val_score(lr, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)
    print cs_result
    X_test = ss.transform(X=test)
    lr.fit(X=X_train, y=target)
    predicted = lr.predict_proba(X_test)
    radian_win = predicted[:,1] > bound
    dire_win = predicted[:,0] > bound
    del_ix = []
    for i in xrange(len(predicted)):
        if radian_win[i]: 
            train.append(test[i])
            target.append(1)
            del_ix.append(i)
            # todo del from test
        if dire_win[i]:
            train.append(test[i])
            target.append(0)
            del_ix.append(i)
            # todo del from test
    for i in sorted(del_ix, reverse=True):
        del test[i]
    unique, counts = np.unique(predicted[:,1] > bound, return_counts=True)
    print np.asarray((unique, counts)).T
    unique, counts = np.unique(predicted[:,0] > bound, return_counts=True)
    print np.asarray((unique, counts)).T
    print 'Time elapsed:', datetime.datetime.now() - start_time

train data length:  97230
target length:  97230
test length:  17177
[ 0.75340299  0.7517499   0.74853067  0.75302096]
[[    0 16032]
 [    1  1145]]
[[    0 16338]
 [    1   839]]
Time elapsed: 0:00:35.299000
train data length:  99214
target length:  99214
test length:  15193
[ 0.75340303  0.75174967  0.74853069  0.75302077]
[[    0 15107]
 [    1    86]]
[[    0 15133]
 [    1    60]]
Time elapsed: 0:00:34.645000


In [28]:
print len(train)
print len(target)
lr.fit(X=train, y=target)
predicted = lr.predict_proba(df_test)
print predicted[:10]
d = {'match_id': match_ids, 'radiant_win': predicted[:,1]}
df_result = pd.DataFrame(data=d, index=None)
print df_result.head()
df_result.to_csv('submission', sep=',', encoding='utf-8', index=False)

99360
99360
[[ 0.48037768  0.51962232]
 [ 0.4803776   0.5196224 ]
 [ 0.48037749  0.51962251]
 [ 0.48037718  0.51962282]
 [ 0.48037708  0.51962292]
 [ 0.48037704  0.51962296]
 [ 0.48037703  0.51962297]
 [ 0.48037683  0.51962317]
 [ 0.4803766   0.5196234 ]
 [ 0.48037648  0.51962352]]
   match_id  radiant_win
0         6     0.519622
1         7     0.519622
2        10     0.519623
3        13     0.519623
4        16     0.519623


In [4]:
# Reload the raw labelled data; this overwrites any processed `df` in the kernel.
df = pd.read_csv('features.csv', index_col=None)

In [6]:
# Scratch cell: convert the frame to a list of rows and show the second row.
a = df.values.tolist()
print a[1]
# Disabled experiment: dropping rows by position with DataFrame.drop.
# print df.head()
# todel = []
# todel.append(1)
# todel.append(3)
# df.drop(df.index[[todel]], inplace=True)
# print df.head()

[1.0, 1430220345.0, 0.0, 42.0, 4.0, 1188.0, 1033.0, 9.0, 0.0, 1.0, 12.0, 49.0, 4.0, 1596.0, 993.0, 10.0, 0.0, 1.0, 7.0, 67.0, 4.0, 1506.0, 1502.0, 18.0, 1.0, 0.0, 7.0, 37.0, 3.0, 669.0, 631.0, 7.0, 0.0, 0.0, 7.0, 26.0, 2.0, 415.0, 539.0, 1.0, 0.0, 0.0, 5.0, 39.0, 5.0, 1960.0, 1384.0, 16.0, 0.0, 0.0, 8.0, 88.0, 3.0, 640.0, 566.0, 1.0, 0.0, 1.0, 5.0, 79.0, 3.0, 720.0, 1350.0, 2.0, 2.0, 0.0, 12.0, 7.0, 2.0, 440.0, 583.0, 0.0, 0.0, 0.0, 7.0, 12.0, 4.0, 1470.0, 1622.0, 24.0, 0.0, 0.0, 9.0, 54.0, 1.0, 7.0, nan, 173.0, -80.0, nan, 2.0, 0.0, 2.0, 0.0, -20.0, 149.0, -84.0, 195.0, 5.0, 4.0, 3.0, 1.0, -5.0, 2463.0, 1.0, 1974.0, 0.0, 63.0, 1.0]


In [26]:
# Write the submission file from whatever `predicted` and `match_ids`
# currently hold in the kernel (neither is defined in this cell).
# Second probability column
# (presumably P(radiant_win == 1) - confirm against the model's classes_).
d = {'match_id': match_ids, 'radiant_win': predicted[:,1]}
df_result = pd.DataFrame(data=d, index=None)
print df_result.head()
# Same output path as the other submission cell; the file is overwritten.
df_result.to_csv('submission', sep=',', encoding='utf-8', index=False)

   match_id  radiant_win
0         6     1.000000
1         7     1.000000
2        10     0.190810
3        13     0.897760
4        16     0.151311


In [7]:
# Rebuild the unscaled feature frame for the extra-trees experiment.
from sklearn.ensemble import ExtraTreesClassifier
# index_col=None: match_id is read as an ordinary column and removed below.
df = pd.read_csv('features.csv', index_col=None)

target = df['radiant_win']
del df['radiant_win']
del df['match_id']

# No StandardScaler step here: the frame is passed on as-is.
df = preprocess(df)
df = categorize_heroes(df)
df = categorize_lobbies(df)

In [13]:
# Cross-validate an extra-trees classifier (500 trees) and report ROC AUC per
# fold. `df`, `target` and `kf` are not defined in this cell; they must
# already exist in the kernel.
forest = ExtraTreesClassifier(n_estimators=500,
                              n_jobs=-1,
                              random_state=241,
                              verbose=True)

start_time = datetime.datetime.now()

# NOTE(review): n_jobs=-1 is requested both on the forest and on
# cross_val_score - confirm that the nested parallelism is intended.
cs = cv.cross_val_score(forest, X=df, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)

print 'Time elapsed:', datetime.datetime.now() - start_time
print cs

Time elapsed: 0:08:57.187000
[ 0.71043932  0.71114338  0.70850037  0.71039338]


In [None]:
forest.fit_transform(X=df, y=target)

df_test = pd.read_csv('features_test.csv', index_col=None)
df_test.fillna(0, inplace=True)
df_test = categorize_heroes(df_test)
df_test = categorize_lobbies(df_test)
match_ids = df_test['match_id']
del df_test['match_id']

predicted = forest.predict_proba(X=df_test)
print predicted[:10]