In [1]:
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
import numpy as np
import sklearn.cross_validation as cv
import time
import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.svm import LinearSVC
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
import operator
import math
import itertools
from sklearn.cross_validation import train_test_split
%matplotlib inline

def add_heroes_bags(data):
    hero_cols = data.columns[data.columns.str.contains('hero')]
    X_pick = np.zeros((data.shape[0], 112))
    for i, match_id in enumerate(data.index):
        for p in xrange(5):
            X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
            X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
    data = data.drop(hero_cols,axis=1)
    return np.hstack((data.values, X_pick))

def get_heroes_data(filename):
    data = get_train_data(filename, replace=False)
    hero_cols = data.columns[data.columns.str.contains('hero')]
    data = data[hero_cols]
#     return add_heroes_bags(data)
    return data

def fill_nans(data):
    data['first_blood_team'].replace(0,-1)
    data['first_blood_team'] = data['first_blood_team'].fillna(0)
    nullcols = list(data.columns[data.isnull().any(axis=0)].values)
    for nil in nullcols:
        data[nil] = data[nil].fillna(data[nil].median())
    return data

def replace_col_withsum(data, name):
    cols = data.columns[data.columns.str.contains(name)]
    data[name] = data[cols[cols.str.contains('^r')]].sum(axis=1) - data[cols[cols.str.contains('^d')]].sum(axis=1)
    data = data.drop(cols,axis=1)
    return data

def iqr_clean(data, col):
    q75, q25 = np.percentile(data[col], [75 ,25])
    iqr = q75 - q25
    d1 = q25 - 1.5 * iqr
    d2 = q75 + 1.5 * iqr
    data = data[data[col] > d1]
    data = data[data[col] < d2]
    return data
    
def get_test_data(filename):
    data = pd.read_csv(filename, index_col=None)
    data = fill_nans(data)
    data = replace_col_withsum(data,'level')
    data = replace_col_withsum(data,'gold')
    data = replace_col_withsum(data,'xp')
    data = replace_col_withsum(data,'kills')
    data = replace_col_withsum(data,'lh')
    data = replace_col_withsum(data,'items')
    data = replace_col_withsum(data,'deaths')
    return data

def get_train_data(filename, replace=True):
    data = pd.read_csv(filename, index_col='match_id')
    data = iqr_clean(data,'duration')
    data['radiant_win'] = data['radiant_win'].replace(0, -1)
    data = data.drop(['duration',
                              'tower_status_radiant',
                              'tower_status_dire',
                              'barracks_status_radiant',
                              'barracks_status_dire'],
                     axis=1)

    data = fill_nans(data)
    if replace:
        data = replace_col_withsum(data,'level')
        data = replace_col_withsum(data,'gold')
        data = replace_col_withsum(data,'xp')
        data = replace_col_withsum(data,'kills')
        data = replace_col_withsum(data,'lh')
        data = replace_col_withsum(data,'items')
        data = replace_col_withsum(data,'deaths')  
    return data

def get_lobby_games(data, lobby):
    data = data[data['lobby_type'] == lobby]
    del data['lobby_type']
    target = data['radiant_win']
    target = target.replace(0, -1)
    del data['radiant_win']
    return data, target

def get_lobby_games_no_target_split(data, lobby):
    data = data[data['lobby_type'] == lobby]
    del data['lobby_type']
    return data

In [10]:
data = get_train_data('features.csv')
data, target = get_lobby_games(data, 0)
data = add_heroes_bags(data) 
ss = StandardScaler()
X_train = ss.fit_transform(X=data, y=target)
print X_train.shape

(12482L, 140L)


In [11]:
kf = cv.KFold(X_train.shape[0], n_folds=5, shuffle=True, random_state=241)
lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.005)
cs_result = cv.cross_val_score(lr, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)
print '241 random state:'
print cs_result.mean()
print 'random state 5 folds:'
cs_results = []
randoms = np.random.randint(1, 255, 5)
for i in randoms:
    kf = cv.KFold(X_train.shape[0], n_folds=5, shuffle=True, random_state=i)
    lr = LogisticRegression(penalty='l2', random_state=i, n_jobs=-1, C=0.005)
    cs_result = cv.cross_val_score(lr, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)
    cs_results.append(cs_result)
print np.mean(cs_results)

241 random state:
0.756873474962
random state 5 folds:
0.755890595278


In [None]:
lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.005)
test = get_test_data('features_test.csv')
test = test[test['lobby_type'] == 0]
del test['lobby_type']
match_ids = test['match_id']
del test['match_id']
test = add_heroes_bags(test)
print test.shape
test = ss.transform(X=test)
lr.fit(X=X_train, y=target)
predicted = lr.predict_proba(test)
d = {'match_id': match_ids, 'radiant_win': predicted[:,1]}
df_result = pd.DataFrame(data=d, index=None)
print df_result[:10]
df_result.to_csv('submission', sep=',', encoding='utf-8', index=False)

In [None]:
train, target = get_train_data_target('features_team_items.csv')
print train.shape
ss = StandardScaler()
X_train = ss.fit_transform(X=train, y=target)

In [None]:
for i in xrange(1, 6):
    print i
    lr = LogisticRegression(penalty='l2', random_state=i, n_jobs=-1)
    # params = {'C': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]}
    # params = {'C': [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009]}
    # params = {'C': [0.001, 0.0052, 0.0053, 0.0054, 0.0055, 0.0056, 0.0057, 0.0058, 0.0059]}
    # params = {'C': [0.00521, 0.00522, 0.00523, 0.00524, 0.00525, 0.00526, 0.00527, 0.00528, 0.00529]}
    # params = {'C': [0.005241, 0.005242, 0.005243, 0.005244, 0.005245, 0.005246, 0.005247, 0.005248, 0.005249]}
    gs = GridSearchCV(estimator=lr, param_grid=params, scoring='roc_auc', n_jobs=-1,iid=False, cv=5)
    gs.fit(X_train, target)
    gs.grid_scores_, gs.best_params_, gs.best_score_

In [None]:
cs_res_hp = []
cs_res_np = []
for j in xrange(5):
    cs_results = []
    randoms = np.random.randint(1, 255, 5)
    for i in randoms:
        print i
        kf = cv.KFold(X_train.shape[0], n_folds=5, shuffle=True, random_state=i)
        lr = LogisticRegression(penalty='l2', random_state=i, n_jobs=-1, C=0.005247)
        cs_result = cv.cross_val_score(lr, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)
        print cs_result
        print cs_result.mean()
        cs_results.append(cs_result.mean())
    cs_res_hp.append(np.mean(cs_results))

    cs_results = []
    for i in randoms:
        print i
        kf = cv.KFold(X_train.shape[0], n_folds=5, shuffle=True, random_state=i)
        lr = LogisticRegression(penalty='l2', random_state=i, n_jobs=-1, C=0.005)
        cs_result = cv.cross_val_score(lr, X=X_train, y=target, cv=kf, scoring='roc_auc', n_jobs=-1)
        print cs_result
        print cs_result.mean()
        cs_results.append(cs_result.mean())
    print np.mean(cs_results)
    cs_res_np.append(np.mean(cs_results))
print np.mean(cs_res_hp)
print np.mean(cs_res_np)

In [None]:
df_test = pd.read_csv('features_test.csv', index_col='match_id')
print df_test['lobby_type'].value_counts()
print df_test.shape

In [None]:
df_ab = pd.read_csv('features_test_team_items_abilities.csv', index_col=None)
df_items = pd.read_csv('features_test_team_items.csv', index_col=None)
ab = df_ab[df_ab.columns[df_ab.columns.str.contains('ability')]]
print ab.shape
print df_items.shape
joined = df_items.join(ab)
print joined.shape
joined.to_csv('features_test_team_items_abilities.csv', sep=',', encoding='utf-8', index=False)
# ab = df_ab.columns[[df_ab.columns.str.contains('ability')]]
# print ab.shape

### Подготовка данных для NEAT

In [3]:
data = get_train_data('features.csv', replace=False)
data, target = get_lobby_games(data, 0)
data = add_heroes_bags(data) 
print data.shape

(12482L, 203L)


In [4]:
target.replace(-1, 0, inplace=True)
stacked_data = np.hstack((data, np.mat(target.values).T))
train, test = train_test_split(stacked_data, test_size=0.2, random_state=241)
y_train = train[:,-1]
y_test = test[:,-1]
ss = StandardScaler()
X_train = ss.fit_transform(X=train[:,:-1], y=y_train)
stacked_train = np.hstack((X_train, np.mat(y_train)))
np.savetxt('dota_train.csv', stacked_train, delimiter=',', fmt='%.12f')
print stacked_train.shape

X_test = ss.transform(X=test[:,:-1])
stacked_test = np.hstack((X_test, np.mat(y_test)))
np.savetxt('dota_test.csv', stacked_test, delimiter=',', fmt='%.12f')
print stacked_test.shape

# df_test = pd.DataFrame.from_records(data=test, columns=None)
# # df_test = df_test.applymap(lambda x: '%.4f' % x)
# print df_test.shape
# df_test.to_csv('clean_dota2_test_lobby0.csv', sep=',', encoding='utf-8', index=False)

(9985L, 204L)
(2497L, 204L)


In [2]:
data = get_train_data('features.csv')
data, target = get_lobby_games(data, 0)
target.replace(-1, 0, inplace=True)
data = add_heroes_bags(data) 
print data.shape
ss = StandardScaler()
X_train = ss.fit_transform(X=data, y=target)
print X_train.shape
X_train, X_test, y_train, y_test = train_test_split(X_train, target, test_size=0.2, random_state=241)
print X_train.shape
print X_test.shape
print y_train.shape
print y_test.shape

(12482L, 140L)
(12482L, 140L)
(9985L, 140L)
(2497L, 140L)
(9985L,)
(2497L,)


In [3]:
df = pd.DataFrame.from_records(X_train, index=None)
df['radiant_win'] = pd.Series(np.array(y_train), index=None)
df.to_csv('dota_train.csv', delimeter=',', index=False, header=None, encoding='utf-8')

df = pd.DataFrame.from_records(X_test, index=None)
df['radiant_win'] = pd.Series(np.array(y_test), index=None)
df.to_csv('dota_test.csv', delimeter=',', index=False, header=None, encoding='utf-8')

In [7]:
df_train = pd.read_csv('dota_train.csv', index_col=None, header=None)
df_test = pd.read_csv('dota_test.csv', index_col=None, header=None)

target_train = df_train.ix[:, len(df_train.columns)-1]
target_test = df_test.ix[:, len(df_test.columns)-1]

df_train.drop([df_train.columns[len(df_train.columns)-1]],inplace=True,axis=1)
df_test.drop([df_test.columns[len(df_test.columns)-1]],inplace=True,axis=1)

df_neat_pred = pd.read_csv('neat\predicted_proba.txt', index_col=None, header=None, sep=' ')
df_neat_pred_train = pd.read_csv('neat\predicted_proba_train.txt', index_col=None, header=None, sep=' ')

lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.005)
lr.fit(df_train, target_train)

print 'auc train lr:        ', roc_auc_score(target_train, lr.predict_proba(df_train)[:,1])
print 'auc train neat:      ', roc_auc_score(target_train, df_neat_pred_train) 
print 'auc test lr:         ', roc_auc_score(target_test, lr.predict_proba(df_test)[:,1])
print 'auc test neat:       ', roc_auc_score(target_test, df_neat_pred)
print 'accuracy train lr:   ', accuracy_score(target_train, lr.predict(df_train))
print 'accuracy train neat: ', accuracy_score(target_train, np.rint(df_neat_pred_train))
print 'accuracy test lr:    ', accuracy_score(target_test, lr.predict(df_test))
print 'accuracy test neat:  ', accuracy_score(target_test, np.rint(df_neat_pred))

auc train lr:         0.77283695781
auc train neat:       0.747937252979
auc test lr:          0.748110534989
auc test neat:        0.729887159093
accuracy train lr:    0.697145718578
accuracy train neat:  0.648072108162
accuracy test lr:     0.683219863837
accuracy test neat:   0.621545855026


In [None]:
df_neat_pred = pd.read_csv('neat\predicted_proba.txt', index_col=None, header=None, sep=' ')
df_neat_pred_train = pd.read_csv('neat\predicted_proba_train.txt', index_col=None, header=None, sep=' ')
lr = LogisticRegression(penalty='l2', random_state=241, n_jobs=-1, C=0.005)
lr.fit(X_train, y_train)
print 'auc train lr:        ', roc_auc_score(y_train, lr.predict_proba(X_train)[:,1])
print 'auc train neat:      ', roc_auc_score(y_train, df_neat_pred_train) 
print 'auc test lr:         ', roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])
print 'auc test neat:       ', roc_auc_score(y_test, df_neat_pred)
print 'accuracy train lr:   ', accuracy_score(y_train, lr.predict(X_train))
print 'accuracy train neat: ', accuracy_score(y_train, np.rint(df_neat_pred_train))
print 'accuracy test lr:    ', accuracy_score(y_test, lr.predict(X_test))
print 'accuracy test neat:  ', accuracy_score(y_test, np.rint(df_neat_pred))

In [None]:
print X_train[:1]

In [None]:
print lr.predict_proba(X_train)[:,1][:10]

In [None]:
print lr.coef_

In [None]:
from collections import Counter
print Counter(y_train)

In [None]:
id=1
with open('neat\lr2.gnm.xml', 'w') as out:
    for c in lr.coef_[0]:      
        out.write('<Con id="%d" src="%d" tgt="141" wght="%f" />\n' % (id,id,c))
        id += 1
# with open("neat\genom2.gnm.xml") as myfile:
#     text = myfile.read()
#     with open('neat\lr2.gnm.xml', 'a') as out:
#         out.write(text.format(*lr.coef_[0]))