In [None]:
import pandas
import numpy as np
import numpy.random as rnd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
import datetime

from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import scale

# ML final task, step 1: gradient boosting.

features = pandas.read_csv('data/features.csv', index_col='match_id')

# Report the columns that contain missing values. DataFrame.count() gives the
# number of non-missing entries per column, so a column whose count is below
# the number of rows has gaps.
all_count = len(features)
missed_cnt = features.count()
print(missed_cnt[missed_cnt < all_count])

features = features.fillna(0)

# Drop the target and the other columns that describe the match outcome.
# StandardScaler().fit_transform(...) standardizes the same way scale(...)
# does, but keeps the fitted mean/std so the test set can later be transformed
# with the *training* statistics instead of its own.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(features.drop(
    ['radiant_win', 'start_time', 'duration', 'tower_status_radiant', 'tower_status_dire',
     'barracks_status_dire', 'barracks_status_radiant'], axis=1))
y = np.ravel(features[['radiant_win']])

# Grid search over the number of trees, scored by ROC AUC on 5 shuffled folds.
grid = {'n_estimators': [10, 20, 30, 40]}
kf = KFold(y.size, n_folds=5, shuffle=True)
gbc = GradientBoostingClassifier(verbose=True)
gs = GridSearchCV(gbc, grid, scoring='roc_auc', cv=kf)
gs.fit(X, y)

for r in gs.grid_scores_:
    print('%0.5f' % r.mean_validation_score)

# Recorded output (10, 20, 30, 40 trees):
# 0.66491
# 0.68250
# 0.68991
# 0.69465

# Measure the time needed to cross-validate 30 trees.
gbc = GradientBoostingClassifier(n_estimators=30, verbose=False)
kf = KFold(y.size, n_folds=5, shuffle=True)
start_time = datetime.datetime.now()
r = cross_val_score(gbc, X, y, scoring='roc_auc', cv=kf)
print(datetime.datetime.now() - start_time)
print('%0.5f' % r.mean())

# Recorded output:
# 0:03:37.785168
# 0.68950

# cross_val_score does not leave `gbc` fitted, so train it on the full data
# before predicting.
gbc.fit(X, y)

# Prediction for the test data. Only 'start_time' is dropped here; the test
# matrix is transformed with the scaler fitted on the training data so both
# sets share the same feature scale.
test = pandas.read_csv('data/features_test.csv', index_col='match_id')
test = test.fillna(0)
X_test = scaler.transform(test.drop(['start_time'], axis=1))
pred = gbc.predict_proba(X_test)[:, 1]

# Emit "match_id,probability" lines.
for match_id, proba in zip(test.index.values, pred):
    print('%d,%0.9f' % (match_id, proba))

# ML final task, step 2: logistic regression. ----------------------------------------

features = pandas.read_csv('data/features.csv', index_col='match_id')
features = features.fillna(0)

# Feature frame without the target, the outcome-related columns, and
# 'lobby_type'. `axis=1` is passed by keyword because recent pandas releases
# no longer accept it positionally.
Xfeatures = features.drop(
    ['start_time', 'duration', 'lobby_type', 'radiant_win', 'tower_status_radiant', 'tower_status_dire',
     'barracks_status_dire', 'barracks_status_radiant'], axis=1)

# Hero columns: r1_hero..r5_hero followed by d1_hero..d5_hero.
hfeatures = ['%s%d_hero' % (team, slot) for team in ('r', 'd') for slot in range(1, 6)]

heroes = features[hfeatures]

# Count the distinct hero ids that occur in the training data.
hcols = [s for s in heroes.columns if "hero" in s]
hunique = np.unique(heroes[hcols])
print(len(hunique))
# 108

#Generate Bag of words
def BagOfWords(f, n_heroes=112):
    """Encode the hero picks of every match as a signed indicator matrix.

    Args:
        f: DataFrame with one row per match and the integer columns
            ``r1_hero``..``r5_hero`` and ``d1_hero``..``d5_hero``.
            Hero ids are treated as 1-based (id ``k`` maps to column ``k - 1``).
        n_heroes: Number of columns in the result, i.e. the largest hero id
            that can be encoded. Defaults to 112, the value that was
            previously hard-coded.

    Returns:
        A float array of shape ``(len(f), n_heroes)`` holding ``1`` where the
        hero was picked by the radiant team, ``-1`` where it was picked by the
        dire team and ``0`` elsewhere. Rows follow the row order of ``f``.

    Raises:
        KeyError: if one of the hero columns is missing from ``f``.
        IndexError: if a hero id is larger than ``n_heroes``.
    """
    n_matches = f.shape[0]
    Xp = np.zeros((n_matches, n_heroes))
    rows = np.arange(n_matches)
    # Work column by column on positional arrays. This replaces the per-cell
    # label lookup through DataFrame.ix (removed from pandas) and stays correct
    # when the index contains duplicate labels. Within a row the assignments
    # still happen in the order r1, d1, r2, d2, ...
    for slot in range(1, 6):
        radiant_ids = f['r%d_hero' % slot].values.astype(int)
        dire_ids = f['d%d_hero' % slot].values.astype(int)
        Xp[rows, radiant_ids - 1] = 1
        Xp[rows, dire_ids - 1] = -1
    return Xp

# Signed hero-pick indicator matrix for the training matches.
X_pick = BagOfWords(features)
# Numeric features only: the hero id columns are removed. `axis=1` is passed
# by keyword because recent pandas releases no longer accept it positionally.
Xnoheroes = Xfeatures.drop(hfeatures, axis=1)

# Baseline design matrix: every remaining feature (hero ids included), standardized.
X = scale(Xfeatures)
y = np.ravel(features[['radiant_win']])

# LogisticRegressionCV (rather than LogisticRegression) picks the
# regularization strength C by cross-validation; Cs is left at its default.
lgr = LogisticRegressionCV(cv=5, penalty='l2', scoring='roc_auc', max_iter=1000, verbose=False)

def ROC_AUC(lgr, X, y):
    """Fit ``lgr`` on all of ``X``/``y``, then time and print its 5-fold ROC AUC.

    Prints two lines: the wall-clock time spent in ``cross_val_score`` and the
    mean of the per-fold ROC AUC scores formatted with five decimals.
    Returns nothing.
    """
    # Train on the complete data first so the estimator passed in is fitted
    # when this function returns and can be used for prediction afterwards.
    lgr.fit(X, y)

    folds = KFold(y.size, n_folds=5, shuffle=True)
    began = datetime.datetime.now()
    fold_scores = cross_val_score(lgr, X, y, scoring='roc_auc', cv=folds)
    elapsed = datetime.datetime.now() - began
    print(elapsed)
    print('%0.5f' % fold_scores.mean())

# 1. Baseline: all features, hero ids used as plain numeric columns.
ROC_AUC(lgr, X, y)
# Recorded output:
# 0:01:03
# 0.71608

# 2. Categorical hero columns removed.
# The scaler is fitted once on the training features and reused below, so the
# test set is standardized with the training mean/std rather than its own.
# StandardScaler().fit_transform(...) yields the same values as scale(...).
from sklearn.preprocessing import StandardScaler
noheroes_scaler = StandardScaler()
X_noheroes_scaled = noheroes_scaler.fit_transform(Xnoheroes)
X = X_noheroes_scaled
ROC_AUC(lgr, X, y)
# Recorded output:
# 0:00:59
# 0.71624

# 3. Numeric features plus the bag-of-words hero encoding.
X = np.hstack((X_noheroes_scaled, X_pick))
ROC_AUC(lgr, X, y)
# Recorded output:
# 0:03:54
# 0.75166

# Prediction for the test data, using the model fitted in experiment 3.
test = pandas.read_csv('data/features_test.csv', index_col='match_id')
test = test.fillna(0)

X_pick = BagOfWords(test)
X_test = test.drop(hfeatures, axis=1)
X_test = X_test.drop(['start_time', 'lobby_type'], axis=1)
# Apply the training-set standardization to the test features.
X = np.hstack((noheroes_scaler.transform(X_test), X_pick))
pred = lgr.predict_proba(X)[:, 1]

# Sanity check: the extreme predicted probabilities.
print(np.min(pred))
print(np.max(pred))

# Output recorded when the test set was still standardized with its own
# statistics; the exact values may differ now:
# 0.00847697236435
# 0.996555969889

first_blood_time               77677
first_blood_team               77677
first_blood_player1            77677
first_blood_player2            53243
radiant_bottle_time            81539
radiant_courier_time           96538
radiant_flying_courier_time    69751
radiant_first_ward_time        95394
dire_bottle_time               81087
dire_courier_time              96554
dire_flying_courier_time       71132
dire_first_ward_time           95404
dtype: int64
      Iter       Train Loss   Remaining Time 
         1           1.3785           13.53s
         2           1.3732           12.46s
         3           1.3681           10.96s
         4           1.3638            9.43s
         5           1.3591            7.79s
         6           1.3546            6.14s
         7           1.3505            4.59s
         8           1.3463            3.04s
         9           1.3425            1.52s
        10           1.3388            0.00s
      Iter       Train Loss   Remaining Time 
 