In [49]:
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
import numpy as np
import sklearn.cross_validation as cv
import time
import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.svm import LinearSVC
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
import operator
import math
import itertools
from sklearn.cross_validation import train_test_split

def add_heroes_bags(data):
    hero_cols = data.columns[data.columns.str.contains('hero')]
    X_pick = np.zeros((data.shape[0], 112))
    for i, match_id in enumerate(data.index):
        for p in xrange(5):
            X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
            X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
    data = data.drop(hero_cols,axis=1)
    return np.hstack((data.values, X_pick))

def fill_nans(data):
    data['first_blood_team'].replace(0,-1)
    data['first_blood_team'] = data['first_blood_team'].fillna(0)
    nullcols = list(data.columns[data.isnull().any(axis=0)].values)
    for nil in nullcols:
        data[nil] = data[nil].fillna(data[nil].median())
    return data

def replace_col_withsum(data, name):
    cols = data.columns[data.columns.str.contains(name)]
    data[name] = data[cols[cols.str.contains('^r')]].sum(axis=1) - data[cols[cols.str.contains('^d')]].sum(axis=1)
    data = data.drop(cols,axis=1)
    return data

def iqr_clean(data, col):
    q75, q25 = np.percentile(data[col], [75 ,25])
    iqr = q75 - q25
    d1 = q25 - 1.5 * iqr
    d2 = q75 + 1.5 * iqr
    data = data[data[col] > d1]
    data = data[data[col] < d2]
    return data
    
def get_test_data(filename):
    data = pd.read_csv(filename, index_col=None)
    data = fill_nans(data)
    data = replace_col_withsum(data,'level')
    data = replace_col_withsum(data,'gold')
    data = replace_col_withsum(data,'xp')
    data = replace_col_withsum(data,'kills')
    data = replace_col_withsum(data,'lh')
    data = replace_col_withsum(data,'items')
    data = replace_col_withsum(data,'deaths')
    return data

def get_train_data(filename):
    data = pd.read_csv(filename, index_col='match_id')
    data = iqr_clean(data,'duration')
    data['radiant_win'] = data['radiant_win'].replace(0, -1)
    data = data.drop(['duration',
                              'tower_status_radiant',
                              'tower_status_dire',
                              'barracks_status_radiant',
                              'barracks_status_dire'],
                     axis=1)

    data = fill_nans(data)
    data = replace_col_withsum(data,'level')
    data = replace_col_withsum(data,'gold')
    data = replace_col_withsum(data,'xp')
    data = replace_col_withsum(data,'kills')
    data = replace_col_withsum(data,'lh')
    data = replace_col_withsum(data,'items')
    data = replace_col_withsum(data,'deaths')  
    return data

def get_lobby_games(data, lobby):
    data = data[data['lobby_type'] == lobby]
    del data['lobby_type']
    target = data['radiant_win']
    del data['radiant_win']
    return data, target

def get_lobby_games_no_target_split(data, lobby):
    data = data[data['lobby_type'] == lobby]
    del data['lobby_type']
    return data

In [10]:
data = get_train_data('features.csv')
data, target = get_lobby_games(data, 0)
data = add_heroes_bags(data) 
ss = StandardScaler()
X_train = ss.fit_transform(X=data, y=target)
print X_train.shape

(12482L, 140L)


In [11]:
kf = cv.KFold(X_train.shape[0], n_folds=5, shuffle=True, random_state=241)
lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.005)
cs_result = cv.cross_val_score(lr, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)
print '241 random state:'
print cs_result.mean()
print 'random state 5 folds:'
cs_results = []
randoms = np.random.randint(1, 255, 5)
for i in randoms:
    kf = cv.KFold(X_train.shape[0], n_folds=5, shuffle=True, random_state=i)
    lr = LogisticRegression(penalty='l2', random_state=i, n_jobs=-1, C=0.005)
    cs_result = cv.cross_val_score(lr, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)
    cs_results.append(cs_result)
print np.mean(cs_results)

241 random state:
0.757028688332
random state 5 folds:
0.756648988077


In [None]:
lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.005)
test = get_test_data('features_test.csv')
test = test[test['lobby_type'] == 0]
del test['lobby_type']
match_ids = test['match_id']
del test['match_id']
test = add_heroes_bags(test)
print test.shape
test = ss.transform(X=test)
lr.fit(X=X_train, y=target)
predicted = lr.predict_proba(test)
d = {'match_id': match_ids, 'radiant_win': predicted[:,1]}
df_result = pd.DataFrame(data=d, index=None)
print df_result[:10]
df_result.to_csv('submission', sep=',', encoding='utf-8', index=False)

In [3]:
train, target = get_train_data_target('features_team_items.csv')
print train.shape
ss = StandardScaler()
X_train = ss.fit_transform(X=train, y=target)

(96111L, 1308L)


In [23]:
for i in xrange(1, 6):
    print i
    lr = LogisticRegression(penalty='l2', random_state=i, n_jobs=-1)
    # params = {'C': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]}
    # params = {'C': [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009]}
    # params = {'C': [0.001, 0.0052, 0.0053, 0.0054, 0.0055, 0.0056, 0.0057, 0.0058, 0.0059]}
    # params = {'C': [0.00521, 0.00522, 0.00523, 0.00524, 0.00525, 0.00526, 0.00527, 0.00528, 0.00529]}
    # params = {'C': [0.005241, 0.005242, 0.005243, 0.005244, 0.005245, 0.005246, 0.005247, 0.005248, 0.005249]}
    gs = GridSearchCV(estimator=lr, param_grid=params, scoring='roc_auc', n_jobs=-1,iid=False, cv=5)
    gs.fit(X_train, target)
    gs.grid_scores_, gs.best_params_, gs.best_score_

1
2
3
4
5


In [37]:
cs_res_hp = []
cs_res_np = []
for j in xrange(5):
    cs_results = []
    randoms = np.random.randint(1, 255, 5)
    for i in randoms:
        print i
        kf = cv.KFold(X_train.shape[0], n_folds=5, shuffle=True, random_state=i)
        lr = LogisticRegression(penalty='l2', random_state=i, n_jobs=-1, C=0.005247)
        cs_result = cv.cross_val_score(lr, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)
        print cs_result
        print cs_result.mean()
        cs_results.append(cs_result.mean())
    cs_res_hp.append(np.mean(cs_results))

    cs_results = []
    for i in randoms:
        print i
        kf = cv.KFold(X_train.shape[0], n_folds=5, shuffle=True, random_state=i)
        lr = LogisticRegression(penalty='l2', random_state=i, n_jobs=-1, C=0.005)
        cs_result = cv.cross_val_score(lr, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)
        print cs_result
        print cs_result.mean()
        cs_results.append(cs_result.mean())
    print np.mean(cs_results)
    cs_res_np.append(np.mean(cs_results))
print np.mean(cs_res_hp)
print np.mean(cs_res_np)

175
[ 0.73758645  0.75912912  0.77769864  0.74363151  0.76923072]
0.757455289563
1
[ 0.7559426   0.75718461  0.74806617  0.75143     0.76403421]
0.755331516061
184
[ 0.74992538  0.75690029  0.76959159  0.75907689  0.74365474]
0.755829778112
106
[ 0.75915239  0.75634467  0.75933077  0.75315566  0.75569874]
0.756736447943
169
[ 0.7635336   0.748579    0.75987087  0.75913927  0.75642317]
0.757509182819
175
[ 0.73757037  0.75913169  0.77769478  0.74361413  0.76925001]
0.7574521975
1
[ 0.75595418  0.75717305  0.74809635  0.75146541  0.7640239 ]
0.755342579078
184
[ 0.75002509  0.75692404  0.76961344  0.75908332  0.74365474]
0.755860125546
106
[ 0.7591073   0.75641098  0.75938667  0.75318332  0.75568846]
0.756755345029
169
[ 0.76357096  0.748579    0.7598227   0.75919532  0.75641352]
0.757516299909
0.756585309412
210
[ 0.75940746  0.75293619  0.75579599  0.7499992   0.76517771]
0.756663309948
218
[ 0.75677058  0.77247477  0.75344868  0.75296129  0.75115405]
0.757361874823
8
[ 0.76078888  0.7

In [16]:
df_test = pd.read_csv('features_test.csv', index_col='match_id')
print df_test['lobby_type'].value_counts()
print df_test.shape

1    9857
7    5029
0    2291
Name: lobby_type, dtype: int64
(17177, 102)


In [None]:
df_ab = pd.read_csv('features_test_team_items_abilities.csv', index_col=None)
df_items = pd.read_csv('features_test_team_items.csv', index_col=None)
ab = df_ab[df_ab.columns[df_ab.columns.str.contains('ability')]]
print ab.shape
print df_items.shape
joined = df_items.join(ab)
print joined.shape
joined.to_csv('features_test_team_items_abilities.csv', sep=',', encoding='utf-8', index=False)
# ab = df_ab.columns[[df_ab.columns.str.contains('ability')]]
# print ab.shape

### Подготовка данных для NEAT

In [54]:
data = get_train_data('features.csv')
data, target = get_lobby_games(data, 0)
data = add_heroes_bags(data) 
ss = StandardScaler()
X_train = ss.fit_transform(X=data, y=target)
print X_train.shape

(12482L, 140L)


In [55]:
X_train, X_test, y_train, y_test = train_test_split(X_train, target, test_size=0.2, random_state=241)
X_train = np.concatenate((X_train,y_train))
X_test = np.concatenate((X_test,y_test))
df_train = pd.DataFrame.from_records(data=X_train, columns=None)
# df_train = df_train.applymap(lambda x: '%.4f' % x)
# df_target = pd.DataFrame.from_records(data=y_train, columns=None)
# df_train['radiant_win'] = y_train
# print df_train['radiant_win']
print df_train.shape
df_train.to_csv('clean_dota2_train_lobby0.csv', sep=',', encoding='utf-8', index=False)

df_test = pd.DataFrame.from_records(data=X_test, columns=None)
# df_test = df_test.applymap(lambda x: '%.4f' % x)
# df_test['radiant_win'] = y_test
print df_test.shape
df_test.to_csv('clean_dota2_test_lobby0.csv', sep=',', encoding='utf-8', index=False)

ValueError: all the input arrays must have same number of dimensions

In [52]:
lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.005)
lr.fit(X_train, y_train)
# predicted = lr.predict_proba(X_train)
# df_pred = pd.DataFrame.from_records(data=predicted, columns=None)
# df_pred.to_csv('predictions.csv', sep=',', encoding='utf-8', index=False)
print roc_auc_score(y_train, lr.predict_proba(X_train)[:,1])
print roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])
print accuracy_score(y_train, lr.predict(X_train))
print accuracy_score(y_test, lr.predict(X_test))
# print ["{0:0.10f}".format(i) for i in lr.coef_[0]]

1.0
1.0
1.0
1.0


In [41]:
df_neat_pred = pd.read_csv('neat\predicted_proba.txt', index_col=None, header=None, sep=' ')
# print y_train.shape
# print df_neat_pred.shape
lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.005)
lr.fit(X_train, y_train)
print roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])
print roc_auc_score(y_test, df_neat_pred)
print accuracy_score(y_test, lr.predict(X_test))
print accuracy_score(y_test, np.around(df_neat_pred))

0.999951156687


ValueError: Found arrays with inconsistent numbers of samples: [2497 9985]

In [42]:
df_neat_pred = pd.read_csv('neat\predicted_proba.txt', index_col=None, header=None, sep=' ')
# print y_train.shape
# print df_neat_pred.shape
lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.005)
lr.fit(X_train, y_train)
print roc_auc_score(y_train, lr.predict_proba(X_train)[:,1])
print roc_auc_score(y_train, df_neat_pred)
print accuracy_score(y_train, lr.predict(X_train))
print accuracy_score(y_train, np.around(df_neat_pred))

0.999992002909
0.77131094014
0.99849774662
0.704456685028


In [22]:
from collections import Counter
print Counter(y_train)

Counter({0: 5195, 1: 4790})


In [9]:
id=1
with open('neat\lr2.gnm.xml', 'w') as out:
    for c in lr.coef_[0]:      
        out.write('<Con id="%d" src="%d" tgt="141" wght="%.12f" />\n' % (id,id,c))
        id += 1
# with open("neat\genom2.gnm.xml") as myfile:
#     text = myfile.read()
#     with open('neat\lr2.gnm.xml', 'a') as out:
#         out.write(text.format(*lr.coef_[0]))