In [1]:
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
import numpy as np
import sklearn.cross_validation as cv
import time
import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.svm import LinearSVC
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
import operator
import math
import itertools
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics

def add_heroes_bags(data):
    hero_cols = data.columns[data.columns.str.contains('hero')]
    X_pick = np.zeros((data.shape[0], 113))
    for i, match_id in enumerate(data.index):
        for p in xrange(5):
            X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
            X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
    data = data.drop(hero_cols,axis=1)
    for i in xrange(112):
        data['hero_%d' % (i+1)] = X_pick[:, i]
    return data

def fill_nans(data):
    data['first_blood_team'].replace(0,-1)
    data['first_blood_team'] = data['first_blood_team'].fillna(0)
    nullcols = list(data.columns[data.isnull().any(axis=0)].values)
    for nil in nullcols:
        data[nil] = data[nil].fillna(data[nil].median())
    return data

def replace_col_withsum(data, name):
    cols = data.columns[data.columns.str.contains(name)]
    data[name] = data[cols[cols.str.contains('^r')]].sum(axis=1) - data[cols[cols.str.contains('^d')]].sum(axis=1)
    data = data.drop(cols,axis=1)
    return data

def iqr_clean(data, col):
    q75, q25 = np.percentile(data[col], [75 ,25])
    iqr = q75 - q25
    d1 = q25 - 1.5 * iqr
    d2 = q75 + 1.5 * iqr
    data = data[data[col] > d1]
    data = data[data[col] < d2]
    return data
    
def get_test_data(filename):
    data = pd.read_csv(filename, index_col=None)
    data = fill_nans(data)
    data = replace_col_withsum(data,'level')
    data = replace_col_withsum(data,'gold')
    data = replace_col_withsum(data,'xp')
    data = replace_col_withsum(data,'kills')
    data = replace_col_withsum(data,'lh')
    data = replace_col_withsum(data,'items')
    data = replace_col_withsum(data,'deaths')
    return data

def get_train_data(filename):
    data = pd.read_csv(filename, index_col='match_id')
    data = iqr_clean(data,'duration')
#     data['radiant_win'] = data['radiant_win'].replace(0, -1)
    data = data.drop(['duration',
                              'tower_status_radiant',
                              'tower_status_dire',
                              'barracks_status_radiant',
                              'barracks_status_dire'],
                     axis=1)

    data = fill_nans(data)
    data = replace_col_withsum(data,'level')
    data = replace_col_withsum(data,'gold')
    data = replace_col_withsum(data,'xp')
    data = replace_col_withsum(data,'kills')
    data = replace_col_withsum(data,'lh')
    data = replace_col_withsum(data,'items')
    data = replace_col_withsum(data,'deaths')  
    return data

def get_lobby_games(data, lobby):
    data = data[data['lobby_type'] == lobby]
    del data['lobby_type']
    target = data['radiant_win']
    del data['radiant_win']
    return data, target

def modelfit(alg, dtrain, target, feature_names=None, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain, label=target)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=0, seed=241)
#         alg.set_params(n_estimators=cvresult.shape[0])
        print 'cv_result: '
        print cvresult.mean()
    
    #Fit the algorithm on the data
    alg.fit(dtrain, target, eval_metric='auc')
        
#     #Predict training set:
    dtrain_predictions = alg.predict(dtrain)
    dummies = pd.get_dummies(dtrain_predictions, prefix='gb_pred')
    dummies['match_id'] = dtrain.index.values
    dummies.to_csv('gb_prediction.csv', encoding='utf-8', index=False)
#     dtrain_predprob = alg.predict_proba(dtrain)[:,1]
        
#     #Print model report:
#     print "\nModel Report"
#     print "Accuracy : %.4g" % metrics.accuracy_score(target, dtrain_predictions)
#     print "AUC Score (Train): %f" % metrics.roc_auc_score(target, dtrain_predprob) 
#     matplotlib.use('TkAgg')
#     plt.switch_backend('TkAgg')
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    print 'feature importance: '
    print feat_imp
#     feat_imp.plot(kind='bar', title='Feature Importances')
#     plt.ylabel('Feature Importance Score')
#     xgb.plot_importance(alg)
#     xgb.to_graphviz(alg, num_trees=2)
#     xgb.plot_tree(alg, num_trees=2)

%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 100, 50

In [3]:
data = get_train_data('features.csv')
data, target = get_lobby_games(data, 0)
# lr_pred = pd.read_csv('lr_prediction.csv', index_col='match_id', header=0)
# data = data.join(lr_pred)
data = add_heroes_bags(data)
# ss = StandardScaler()
# X_train = ss.fit_transform(X=data, y=target)
X_train = data
print X_train.shape

(12482, 140)


In [105]:
data = get_train_data('features.csv')
data, target = get_lobby_games(data, 0)
data = add_heroes_bags(data)
X_train = data
print X_train.shape

(12482, 140)


In [4]:
xgb1 = XGBClassifier(
 learning_rate = 0.1,
 n_estimators=1000,
 max_depth=1,
 min_child_weight=2,
 gamma=0,
 subsample=0.9,
 colsample_bytree=0.7,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=241,
#  max_delta_step=1,
 reg_alpha=1e-06)
modelfit(xgb1, X_train, target)

cv_result: 
test-auc-mean     0.738016
test-auc-std      0.004853
train-auc-mean    0.758027
train-auc-std     0.001741
dtype: float64
feature importance: 
gold                        55
lh                          41
hero_67                     30
hero_57                     26
hero_2                      24
hero_102                    23
hero_71                     23
hero_26                     20
hero_25                     20
items                       20
hero_22                     20
start_time                  19
hero_39                     19
hero_52                     19
dire_first_ward_time        19
hero_13                     18
first_blood_time            18
radiant_bottle_time         18
hero_110                    18
hero_86                     17
hero_3                      17
xp                          17
hero_42                     16
hero_70                     16
hero_7                      16
hero_53                     15
hero_11                     14
dire_bo

In [63]:
param_test1 = {
 'max_depth':range(1,4,1),
 'min_child_weight':range(1,3,1)
}
gs1 = GridSearchCV(estimator = XGBClassifier(
         learning_rate = 0.1,
         n_estimators=1000,
         gamma=0,
         subsample=0.8,
         colsample_bytree=0.8,
         objective= 'binary:logistic',
         nthread=4,
         scale_pos_weight=1,
         seed=241), param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gs1.fit(X_train,target)
gs1.grid_scores_, gs1.best_params_, gs1.best_score_

([mean: 0.74561, std: 0.00483, params: {'max_depth': 1, 'min_child_weight': 1},
  mean: 0.74588, std: 0.00472, params: {'max_depth': 1, 'min_child_weight': 2},
  mean: 0.73986, std: 0.00597, params: {'max_depth': 2, 'min_child_weight': 1},
  mean: 0.73934, std: 0.00539, params: {'max_depth': 2, 'min_child_weight': 2},
  mean: 0.72937, std: 0.00667, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.72948, std: 0.00500, params: {'max_depth': 3, 'min_child_weight': 2}],
 {'max_depth': 1, 'min_child_weight': 2},
 0.74587663453230435)

In [67]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gs2 = GridSearchCV(estimator = XGBClassifier(
         learning_rate = 0.1,
         n_estimators=1000,
         max_depth=1,
         min_child_weight=2,
         subsample=0.8,
         colsample_bytree=0.8,
         objective= 'binary:logistic',
         nthread=4,
         scale_pos_weight=1,
         seed=241), param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gs2.fit(X_train,target)
gs2.grid_scores_, gs2.best_params_, gs2.best_score_

([mean: 0.74588, std: 0.00472, params: {'gamma': 0.0},
  mean: 0.74588, std: 0.00472, params: {'gamma': 0.1},
  mean: 0.74588, std: 0.00472, params: {'gamma': 0.2},
  mean: 0.74588, std: 0.00472, params: {'gamma': 0.3},
  mean: 0.74588, std: 0.00472, params: {'gamma': 0.4}],
 {'gamma': 0.0},
 0.74587663453230435)

In [69]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gs4 = GridSearchCV(estimator = XGBClassifier(
         learning_rate = 0.1,
         n_estimators=1000,
         gamma=0,
         subsample=0.8,
         max_depth=1,
         min_child_weight=2,
         colsample_bytree=0.8,
         objective= 'binary:logistic',
         nthread=4,
         scale_pos_weight=1,
         seed=241), param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gs4.fit(X_train,target)
gs4.grid_scores_, gs4.best_params_, gs4.best_score_

([mean: 0.74511, std: 0.00532, params: {'subsample': 0.6, 'colsample_bytree': 0.6},
  mean: 0.74556, std: 0.00597, params: {'subsample': 0.7, 'colsample_bytree': 0.6},
  mean: 0.74601, std: 0.00533, params: {'subsample': 0.8, 'colsample_bytree': 0.6},
  mean: 0.74602, std: 0.00595, params: {'subsample': 0.9, 'colsample_bytree': 0.6},
  mean: 0.74504, std: 0.00553, params: {'subsample': 0.6, 'colsample_bytree': 0.7},
  mean: 0.74600, std: 0.00594, params: {'subsample': 0.7, 'colsample_bytree': 0.7},
  mean: 0.74569, std: 0.00509, params: {'subsample': 0.8, 'colsample_bytree': 0.7},
  mean: 0.74613, std: 0.00576, params: {'subsample': 0.9, 'colsample_bytree': 0.7},
  mean: 0.74528, std: 0.00539, params: {'subsample': 0.6, 'colsample_bytree': 0.8},
  mean: 0.74591, std: 0.00612, params: {'subsample': 0.7, 'colsample_bytree': 0.8},
  mean: 0.74588, std: 0.00472, params: {'subsample': 0.8, 'colsample_bytree': 0.8},
  mean: 0.74583, std: 0.00644, params: {'subsample': 0.9, 'colsample_bytree'

In [8]:
param_test6 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gs6 = GridSearchCV(estimator = XGBClassifier(
         learning_rate = 0.1,
         n_estimators=1000,
         gamma=0,
         max_depth=1,
         min_child_weight=2,
         subsample=0.9,
         colsample_bytree=0.7,
         objective= 'binary:logistic',
         nthread=4,
         scale_pos_weight=1,
         seed=241), param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gs6.fit(X_train,target)
gs6.grid_scores_, gs6.best_params_, gs6.best_score_

([mean: 0.74613, std: 0.00576, params: {'reg_alpha': 1e-05},
  mean: 0.74602, std: 0.00589, params: {'reg_alpha': 0.01},
  mean: 0.74604, std: 0.00583, params: {'reg_alpha': 0.1},
  mean: 0.74563, std: 0.00586, params: {'reg_alpha': 1},
  mean: 0.71021, std: 0.00981, params: {'reg_alpha': 100}],
 {'reg_alpha': 1e-05},
 0.7461339648194969)

In [10]:
param_test7 = {
 'reg_alpha':[0.000001, 0.0000005, 0.0000001, 0.00000005, 0.00000001]
}
gs7 = GridSearchCV(estimator = XGBClassifier(
         learning_rate = 0.1,
         n_estimators=1000,
         gamma=0,
         max_depth=1,
         min_child_weight=2,
         subsample=0.9,
         colsample_bytree=0.7,
         objective= 'binary:logistic',
         nthread=4,
         scale_pos_weight=1,
         seed=241), param_grid = param_test7, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gs7.fit(X_train,target)
gs7.grid_scores_, gs7.best_params_, gs7.best_score_

([mean: 0.74613, std: 0.00576, params: {'reg_alpha': 1e-06},
  mean: 0.74613, std: 0.00576, params: {'reg_alpha': 5e-07},
  mean: 0.74613, std: 0.00576, params: {'reg_alpha': 1e-07},
  mean: 0.74613, std: 0.00576, params: {'reg_alpha': 5e-08},
  mean: 0.74613, std: 0.00576, params: {'reg_alpha': 1e-08}],
 {'reg_alpha': 1e-06},
 0.7461341577575773)