In [36]:
import pandas as pd
import numpy as np
import math as mt

In [49]:
##Reading data
features = pd.read_csv('features.csv', index_col='match_id')
features_test = pd.read_csv('features_test.csv', index_col='match_id')

In [50]:
##Deleting features we don't have in test dataset
features_answr_data = features[['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire']]
y = features_answr_data['radiant_win']
del features['duration'] 
del features['radiant_win']
del features['tower_status_radiant'] 
del features['tower_status_dire'] 
del features['barracks_status_radiant'] 
del features['barracks_status_dire']

In [51]:
##Searching for features with Nun meanings and filling with 0
cnt = features.count()
features = features.fillna(0)

In [44]:
##Gradient Boosting

from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
import time
import datetime

clf = KFold(n_splits = 5, shuffle = True, random_state = 42)  ##Partition generator for cross-validation
for k in [10, 20, 30, 40, 50]:
    start_time = datetime.datetime.now()  ##Time-counter
    time.sleep(3)
    gbc = GradientBoostingClassifier(n_estimators = k, random_state = 42) 
    cvs = cross_val_score(estimator = gbc, X = features, y = y, cv = clf, scoring='roc_auc')  
    answ = cvs.mean()
    print ('Number of trees:', k)
    print ('Precision:', answ)
    print ('Time elapsed:', datetime.datetime.now() - start_time, '\n')

Number of trees: 10
Precision: 0.6648506879750012
Time elapsed: 0:00:29.145862 

Number of trees: 20
Precision: 0.6824618768044435
Time elapsed: 0:00:52.375024 

Number of trees: 30
Precision: 0.6900064710388155
Time elapsed: 0:01:14.606087 

Number of trees: 40
Precision: 0.6940387245121103
Time elapsed: 0:01:35.284865 

Number of trees: 50
Precision: 0.6974943609466162
Time elapsed: 0:01:58.886339 



In [52]:
##Logistic regression

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

##Preprocessing
scaler = StandardScaler()
scaler.fit(features)
X_train = scaler.transform(features)
print(X_train)

[[-2.54436416  1.54068827 -1.24422828 ... -0.55115386  1.84600409
  -1.12149424]
 [-2.54045236 -0.92779756 -0.29225805 ...  0.67817009  0.43778816
   0.04394713]
 [-2.53923104  1.54068827 -0.5686365  ...  0.67817009  0.43778816
   0.49028637]
 ...
 [ 1.09874571 -0.57515673  1.42743012 ...  0.67817009  0.43778816
  -0.20401912]
 [ 1.09895204 -0.57515673  1.48884755 ...  0.67817009  0.43778816
  -0.87352799]
 [ 1.1026479   1.54068827 -0.04658831 ... -0.55115386 -0.97042777
  -0.79913812]]


In [46]:
for C in [0.001, 0.01, 0.1, 1, 10, 100]:  ##Trying different values of parametr C to find optimal 
    start_time = datetime.datetime.now()  ##Time-counter
    time.sleep(3)
    lg = LogisticRegression(penalty = 'l2', random_state = 42, C = C)
    cvs = cross_val_score(estimator = lg, X = X_train, y = y, cv = clf, scoring='roc_auc')  
    answ = cvs.mean()
    print ('C:', C)
    print ('Precision:', answ)
    print ('Time elapsed:', datetime.datetime.now() - start_time, '\n')
    
##Conclusion: we will use C = 0.01

C: 0.001
Precision: 0.7163635378209221
Time elapsed: 0:03:09.906313 

C: 0.01
Precision: 0.7165502697259141
Time elapsed: 0:00:15.957636 

C: 0.1
Precision: 0.7165271486657933
Time elapsed: 0:00:17.508568 

C: 1
Precision: 0.7165226003626438
Time elapsed: 0:00:17.524102 

C: 10
Precision: 0.7165222888959446
Time elapsed: 0:00:16.799537 

C: 100
Precision: 0.7165222952575674
Time elapsed: 0:00:16.703701 



In [53]:
##Deleting categorical features from dataset
y_categorical = features[['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]
columns = ['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']
features.drop(columns, inplace=True, axis=1)

In [54]:
##Scaling for new set of featuresscaler = StandardScaler()
scaler.fit(features)
X_train_new = scaler.transform(features) 

In [55]:
##Logistic regression without categorical features
for C in [0.001, 0.01, 0.1, 1, 10, 100]:  ##Trying different values of parametr C to find optimal 
    start_time = datetime.datetime.now()  ##Time-counter
    time.sleep(3)
    lg = LogisticRegression(penalty = 'l2', random_state = 42, C = C)
    cvs = cross_val_score(estimator = lg, X = X_train_new, y = y, cv = clf, scoring='roc_auc')  
    answ = cvs.mean()
    print ('C:', C)
    print ('Precision:', answ)
    print ('Time elapsed:', datetime.datetime.now() - start_time, '\n')

C: 0.001
Precision: 0.7163757999081172
Time elapsed: 0:00:11.416046 

C: 0.01
Precision: 0.7165593885630225
Time elapsed: 0:00:13.945163 

C: 0.1
Precision: 0.7165342403465319
Time elapsed: 0:00:15.469703 

C: 1
Precision: 0.7165303634514961
Time elapsed: 0:00:18.624751 

C: 10
Precision: 0.7165299715025929
Time elapsed: 0:00:15.159210 

C: 100
Precision: 0.7165298910060073
Time elapsed: 0:00:15.876479 



In [56]:
##Counting number of different heroes
hero_count = y_categorical['r1_hero'] + y_categorical['r2_hero'] + y_categorical['r3_hero']
+ y_categorical['r4_hero'] + y_categorical['r5_hero'] + y_categorical['d1_hero']
+ y_categorical['d2_hero'] + y_categorical['d3_hero'] + y_categorical['d4_hero'] + y_categorical['d5_hero'] ##all heroes
hero_count = hero_count.unique()
hero_count.__len__()

##numv=ber of heroes = 317

317

In [57]:
X_pick = np.zeros((y_categorical.shape[0], 317)) ## 317 - number of heroes
for i, match_id in enumerate(y_categorical.index):
    for p in range(5):
        X_pick[i, y_categorical.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, y_categorical.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


In [58]:
data = np.concatenate((X_train_new, X_pick), axis=1)
##Logistic regression with categorical features coded as bag of features
for C in [0.001, 0.01, 0.1, 1, 10, 100]:  ##Trying different values of parametr C to find optimal 
    start_time = datetime.datetime.now()  ##Time-counter
    time.sleep(3)
    lg = LogisticRegression(penalty = 'l2', random_state = 42, C = C)
    cvs = cross_val_score(estimator = lg, X = data, y = y, cv = clf, scoring='roc_auc')  
    answ = cvs.mean()
    print ('C:', C)
    print ('Precision:', answ)
    print ('Time elapsed:', datetime.datetime.now() - start_time, '\n')

C: 0.001
Precision: 0.7463341511009906
Time elapsed: 0:00:14.489664 

C: 0.01
Precision: 0.7517377305756991
Time elapsed: 0:00:22.072357 

C: 0.1
Precision: 0.751947432864572
Time elapsed: 0:00:32.974364 

C: 1
Precision: 0.7519273152918454
Time elapsed: 0:00:33.147550 

C: 10
Precision: 0.7519250751874296
Time elapsed: 0:00:31.108165 

C: 100
Precision: 0.7519248717899478
Time elapsed: 0:00:34.551234 



In [59]:
##Predicting a winning team for test dataset
##Training the model
lg = LogisticRegression(penalty = 'l2', random_state = 42, C = 0.1)
lg.fit(data, y)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [60]:
##Preprocessing test dataset
features_test = features_test.fillna(0)

y_categorical_test = features_test[['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]
columns = ['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']
features_test.drop(columns, inplace=True, axis=1)

X_pick_test = np.zeros((y_categorical_test.shape[0], 317)) ## 317 - number of heroes
for i, match_id in enumerate(y_categorical_test.index):
    for p in range(5):
        X_pick[i, y_categorical_test.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, y_categorical_test.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1
scaler = StandardScaler()
scaler.fit(features_test)
X_test = scaler.transform(features_test)
data_test = np.concatenate((X_test, X_pick_test), axis=1)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # This is added back by InteractiveShellApp.init_path()
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if sys.path[0] == '':


In [61]:
lg.predict(data_test)

array([1, 1, 0, ..., 0, 0, 1])

In [64]:
pred = lg.predict_proba(data_test)[:, 1]
pred

array([0.51587803, 0.66356237, 0.31798575, ..., 0.22524244, 0.35760494,
       0.52572676])

In [65]:
min(pred)

0.005789361732874468

In [66]:
max(pred)

0.9942010945149464