In [5]:
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn.preprocessing import scale
import numpy as np
import sklearn.cross_validation as cv
import time
import datetime

def compute_additional_features(df):
    """Add squared per-team totals of the per-player stats to ``df``.

    For each stat in level, xp, gold, lh, kills, deaths and items, and for each
    team prefix (``r`` then ``d``), the five player columns
    ``<team><1..5>_<stat>`` are added together row-wise, the sum is squared,
    and the result is stored in the column ``total_<stat>_<team>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 70 per-player columns described above.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the 14 ``total_*`` columns added
        (or overwritten if they already existed).

    Raises
    ------
    KeyError
        If any of the per-player columns is missing.
    """
    stats = ('level', 'xp', 'gold', 'lh', 'kills', 'deaths', 'items')
    for stat in stats:
        for team in ('r', 'd'):
            player_cols = ['%s%d_%s' % (team, slot, stat) for slot in range(1, 6)]
            # Built-in sum() starts from 0 and adds the Series one by one, so
            # NaN propagation and dtypes match plain `+` accumulation.
            team_total = sum(df[col] for col in player_cols)
            # The stored feature is the square of the team total, not the total.
            df['total_%s_%s' % (stat, team)] = team_total * team_total
    return df

def preprocess(df):
    """Remove five fixed columns from ``df`` and fill missing values with 0.

    The columns removed are ``duration``, ``tower_status_radiant``,
    ``tower_status_dire``, ``barracks_status_radiant`` and
    ``barracks_status_dire`` (presumably information that is only known once
    a match is over -- TODO confirm against the data description).

    The frame is modified in place and also returned. A ``KeyError`` is raised
    if one of the five columns is absent.
    """
    columns_to_remove = (
        'duration',
        'tower_status_radiant',
        'tower_status_dire',
        'barracks_status_radiant',
        'barracks_status_dire',
    )
    for column in columns_to_remove:
        del df[column]
    # Every remaining NaN becomes 0, in place.
    df.fillna(0, inplace=True)
    return df

def delete_redundant_features(df):
    """Delete the per-player stat columns from ``df``.

    Removes ``<team><slot>_<stat>`` for team in ``r``/``d``, slot in 1..5 and
    stat in level, xp, gold, lh, kills, deaths, items (70 columns in total).
    All other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the per-player columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with those columns removed.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    stats = ('level', 'xp', 'gold', 'lh', 'kills', 'deaths', 'items')
    for slot in range(1, 6):
        for team in ('r', 'd'):
            for stat in stats:
                del df['%s%d_%s' % (team, slot, stat)]
    return df

def categorize_lobbies(df, n_lobbies=8):
    """Replace ``lobby_type`` with indicator columns ``lobby_1`` .. ``lobby_<n_lobbies>``.

    A row whose ``lobby_type`` is ``v`` gets a 1 in the indicator at 0-based
    position ``v - 1`` (i.e. column ``lobby_<v>``) and 0 elsewhere. Because the
    position is computed as ``v - 1``, a value of 0 yields position -1, which
    numpy resolves to the last indicator column (``lobby_<n_lobbies>``).

    Rows are handled by position, so the frame's index may hold arbitrary or
    duplicate labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an integer-valued ``lobby_type`` column. Modified in place.
    n_lobbies : int, optional
        Number of indicator columns to create (default 8).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the indicator columns appended and
        ``lobby_type`` removed.

    Raises
    ------
    KeyError
        If ``lobby_type`` is missing.
    IndexError
        If a lobby value falls outside the range addressable by ``n_lobbies``.
    """
    row_count = df.shape[0]
    # 0-based indicator position for every row, computed in one vectorised step.
    positions = df['lobby_type'].values.astype(int) - 1
    indicators = np.zeros((row_count, n_lobbies))
    indicators[np.arange(row_count), positions] = 1

    for k in range(n_lobbies):
        df['lobby_%d' % (k + 1)] = indicators[:, k]
    del df['lobby_type']
    return df
    

def categorize_heroes(df, n_heroes=112):
    """Replace the ten ``*_hero`` columns with signed pick columns ``hero_1`` .. ``hero_<n_heroes>``.

    For every row, ``hero_<h>`` is set to 1 when hero id ``h`` appears in one
    of ``r1_hero`` .. ``r5_hero``, to -1 when it appears in one of
    ``d1_hero`` .. ``d5_hero``, and to 0 otherwise. Writes happen in the order
    r1, d1, r2, d2, ... so if the same id occurs several times in a row the
    last write wins.

    The working matrix has ``n_heroes + 1`` columns but only the first
    ``n_heroes`` are exported. The spare last column absorbs id
    ``n_heroes + 1`` and id 0 (position -1 wraps to the last column), so those
    ids leave no trace in the output.

    Rows are handled by position, so the frame's index may hold arbitrary or
    duplicate labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain integer-valued ``r1_hero`` .. ``r5_hero`` and
        ``d1_hero`` .. ``d5_hero`` columns. Modified in place.
    n_heroes : int, optional
        Number of ``hero_*`` columns to create (default 112).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``hero_*`` columns appended and the ten
        ``*_hero`` columns removed.

    Raises
    ------
    KeyError
        If one of the ``*_hero`` columns is missing.
    IndexError
        If a hero id is larger than ``n_heroes + 1``.
    """
    row_count = df.shape[0]
    row_positions = np.arange(row_count)
    picks = np.zeros((row_count, n_heroes + 1))

    for slot in range(1, 6):
        radiant_ids = df['r%d_hero' % slot].values.astype(int)
        dire_ids = df['d%d_hero' % slot].values.astype(int)
        # One vectorised write per player slot; radiant before dire keeps the
        # per-row write order r1, d1, r2, d2, ...
        picks[row_positions, radiant_ids - 1] = 1
        picks[row_positions, dire_ids - 1] = -1

    for k in range(n_heroes):
        df['hero_%d' % (k + 1)] = picks[:, k]

    for slot in range(1, 6):
        del df['r%d_hero' % slot]
        del df['d%d_hero' % slot]
    return df

# Shuffled 4-fold splitter with a fixed seed, sized by the number of rows in `df`.
# NOTE(review): `df` is not assigned anywhere above this line in the visible
# notebook, so on a fresh kernel this statement depends on a later data-loading
# cell having been run first -- confirm the intended execution order.
kf = cv.KFold(len(df.index), n_folds=4, shuffle=True, random_state=241)

In [None]:
# Read the training table, using match_id as the row index.
df = pd.read_csv('features.csv', index_col='match_id')

# pop() hands back the label column and removes it from the feature frame.
target = df.pop('radiant_win')
df.fillna(0, inplace=True)

# Feature pipeline: preprocess -> team totals -> hero picks -> lobby indicators.
# (delete_redundant_features is currently switched off:)
# df = delete_redundant_features(df)
df = categorize_lobbies(
    categorize_heroes(
        compute_additional_features(
            preprocess(df))))

print(df.shape)
print(df.head())
X_train = df

In [7]:
# Wall-clock start; the elapsed time is printed after cross-validation finishes.
start_time = datetime.datetime.now()

# presumably an alternative configuration that was tried: learning_rate=0.3, max_depth=2
clf = GradientBoostingClassifier(n_estimators=30,verbose=True, random_state=241)
# One ROC-AUC score per fold of `kf`, evaluated on X_train / target.
cs_result = cv.cross_val_score(clf, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)

print 'Time elapsed:', datetime.datetime.now() - start_time
print cs_result

Time elapsed: 0:01:32.496000
[ 0.72106467  0.71598025  0.71619558  0.7212618 ]


In [31]:
import numpy as np
# Reload the raw training and test tables with the default integer index
# (index_col=None), so match_id stays an ordinary column in both frames.
df = pd.read_csv('features.csv', index_col=None)
df_test = pd.read_csv('features_test.csv', index_col=None)
# Ad-hoc inspection snippets, currently disabled: distinct lobby types in each
# table, distinct hero ids across the ten hero columns, and null counts per column.
# print np.unique(df['lobby_type'])
# print np.unique(df_test['lobby_type'])
# print np.unique(df[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']])
# print df.isnull().sum()

In [36]:
# df = pd.read_csv('features.csv', index_col='match_id')
from sklearn.preprocessing import StandardScaler

# Reload the training table with match_id kept as a regular column.
df = pd.read_csv('features.csv', index_col=None)

# Split off the label, then discard the identifier column.
target = df.pop('radiant_win')
del df['match_id']

# Active pipeline: preprocess -> hero picks -> lobby indicators.
# Currently disabled steps:
# df = compute_additional_features(df)
# df = delete_redundant_features(df)
# del df['lobby_type']
df = categorize_lobbies(categorize_heroes(preprocess(df)))

# Fit the scaler on the training features and keep it in `ss` for later reuse.
ss = StandardScaler()
X_train = ss.fit_transform(X=df, y=target)

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

print df.shape
# Wall-clock start; the elapsed time is printed after cross-validation finishes.
start_time = datetime.datetime.now()

# Rebuild the shuffled 4-fold splitter for the frame loaded in this session.
kf = cv.KFold(len(df.index), n_folds=4, shuffle=True, random_state=241)
# Candidate grid for C; within this cell it is referenced only by the
# commented-out GridSearchCV line below.
tuned_parameters = {'C': [0.1, 0.5, 1, 10, 100, 1000], 'penalty':['l2']}
# lr = GridSearchCV(LogisticRegression(random_state=241, n_jobs=-1), tuned_parameters, cv=5, scoring='roc_auc')
# L2-regularised logistic regression with a fixed C=10.
lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=10)
# One ROC-AUC score per fold of `kf`, evaluated on X_train / target.
cs_result = cv.cross_val_score(lr, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)
print 'Time elapsed:', datetime.datetime.now() - start_time
print cs_result

(97230, 212)
Time elapsed: 0:00:18.559000
[ 0.75339022  0.75174889  0.74853075  0.75299571]


AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [29]:
# Load the test table and build the hero/lobby features on it.
df_test = pd.read_csv('features_test.csv', index_col=None)
df_test.fillna(0, inplace=True)
df_test = categorize_lobbies(categorize_heroes(df_test))

# Keep the ids for the submission file; pop() also removes them from the features.
match_ids = df_test.pop('match_id')

# Scale with the already-fitted scaler, then fit on the training data and predict.
X_test = ss.transform(X=df_test)
lr.fit(X=X_train, y=target)
predicted = lr.predict_proba(X_test)
print(predicted[:10])

[[ 0.17418349  0.82581651]
 [ 0.24523082  0.75476918]
 [ 0.8124345   0.1875655 ]
 [ 0.13954365  0.86045635]
 [ 0.75960663  0.24039337]
 [ 0.62564041  0.37435959]
 [ 0.47055555  0.52944445]
 [ 0.43409641  0.56590359]
 [ 0.78598391  0.21401609]
 [ 0.32784959  0.67215041]]


In [26]:
# Pair every match id with the second column of the predict_proba output.
d = dict(match_id=match_ids, radiant_win=predicted[:, 1])
df_result = pd.DataFrame(data=d, index=None)
print(df_result.head())
# Written as comma-separated UTF-8 text without the row index.
df_result.to_csv('submission', sep=',', encoding='utf-8', index=False)

   match_id  radiant_win
0         6     1.000000
1         7     1.000000
2        10     0.190810
3        13     0.897760
4        16     0.151311


In [7]:
from sklearn.ensemble import ExtraTreesClassifier

# Reload the training table with match_id kept as a regular column.
df = pd.read_csv('features.csv', index_col=None)

# Split off the label, then discard the identifier column.
target = df.pop('radiant_win')
del df['match_id']

# Pipeline: preprocess -> hero picks -> lobby indicators (no scaling here).
df = categorize_lobbies(categorize_heroes(preprocess(df)))

In [13]:
# Extra-trees ensemble: 500 trees, all cores, fixed seed, progress output on.
forest = ExtraTreesClassifier(n_estimators=500,
                              n_jobs=-1,
                              random_state=241,
                              verbose=True)

# Wall-clock start; the elapsed time is printed after cross-validation finishes.
start_time = datetime.datetime.now()

# One ROC-AUC score per fold of `kf`, evaluated on the unscaled frame `df`.
cs = cv.cross_val_score(forest, X=df, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)

print 'Time elapsed:', datetime.datetime.now() - start_time
print cs

Time elapsed: 0:08:57.187000
[ 0.71043932  0.71114338  0.70850037  0.71039338]


In [None]:
# Fit the forest on the full training set. Only the fitted estimator is needed
# here, so call fit(): the previous fit_transform() call also built a reduced
# feature matrix that was thrown away, and that method is deprecated on estimators.
forest.fit(X=df, y=target)

# Load the test table and build the hero/lobby features on it.
df_test = pd.read_csv('features_test.csv', index_col=None)
df_test.fillna(0, inplace=True)
df_test = categorize_heroes(df_test)
df_test = categorize_lobbies(df_test)
# Keep the ids for a submission file; pop() also removes them from the features.
match_ids = df_test.pop('match_id')

# Class probabilities for the test rows; show the first ten.
predicted = forest.predict_proba(X=df_test)
print(predicted[:10])