In [5]:
import time
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.ensemble import GradientBoostingClassifier as GB
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

In [6]:
# Load the train and test feature tables; both use 'match_id' as the index.
dota_train, dota_test = (
    pd.read_csv(csv_path, index_col='match_id')
    for csv_path in ('features.csv', 'features_test.csv')
)

In [28]:
# Preview the first five rows of the training table.
dota_train.head(n=5)

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13.0,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16.0,2449,0,4,1974,3,63


### Removing columns that could cause target leakage

In [7]:
# Columns that the notebook treats as leakage-prone; they are excluded from
# the feature matrix (the target 'radiant_win' is among them).
leakage_columns = [
    'radiant_win',
    'duration',
    'tower_status_radiant',
    'tower_status_dire',
    'barracks_status_radiant',
    'barracks_status_dire',
]
X = dota_train.drop(columns=leakage_columns)

### Finding cols with missing values

In [52]:
# Report the share of missing values for every column that has at least one.
# isnull().mean() is the per-column fraction of NaN cells; the '.2%' format
# renders that fraction as a percentage (0.20 -> 20.00%).
missing_share = dota_train.isnull().mean()
for column_name, share in missing_share[missing_share > 0].items():
    print(f'{column_name}: {share:.2%} of values are missing')

In [8]:
# Names of the columns recorded as containing missing values.
cols_with_missing = [
    'first_blood_time',
    'first_blood_team',
    'first_blood_player1',
    'first_blood_player2',
    'radiant_bottle_time',
    'radiant_courier_time',
    'radiant_flying_courier_time',
    'radiant_first_ward_time',
    'dire_bottle_time',
    'dire_courier_time',
    'dire_flying_courier_time',
    'dire_first_ward_time',
]

Если событие "первая кровь" не успело произойти за первые 5 минут, то соответствующие признаки содержат пропуски.
Поэтому у признаков 'first_blood_time', 'first_blood_team', 'first_blood_player1' пропущено около 20% значений (доля 0.20).
У признака 'first_blood_player2' пропущено около 45% значений (доля 0.45) — скорее всего потому, что "первую кровь" пустил только один игрок, без помощи других.

### Replace missing values

According to the task we will fill the missing values with zeros. Later we can do some hyperparameter optimization

In [9]:
# Replace every missing feature value with zero.
X = X.fillna(value=0)

### Target column

We predict who will win, so we need to label 'radiant_win'

### Prediction with Gradient Boosting

In [20]:
# Target label: the 'radiant_win' column of the training table.
y = dota_train['radiant_win']

# 5-fold shuffled cross-validation splitter shared by the experiments below.
# With shuffle=True and no random_state, every split() call draws a new
# partition, so different models would be scored on different folds and a
# rerun would not reproduce the numbers. A fixed seed pins the folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [59]:
# Cross-validated ROC AUC of gradient boosting for several ensemble sizes,
# together with the wall-clock time of each cross-validation run.
for n_trees in (10, 20, 30):
    booster = GB(n_estimators=n_trees)
    started_at = datetime.datetime.now()
    fold_scores = cross_val_score(booster, X, y, cv=kfold, scoring="roc_auc")
    mean_score = fold_scores.mean()
    print('Time elapsed:', datetime.datetime.now() - started_at)
    print(mean_score)

Time elapsed: 0:01:09.984544
0.665306650864717
Time elapsed: 0:02:26.949287
0.6814140908787021
Time elapsed: 0:03:41.682899
0.6896463600745194


The quality metric is ROC AUC; the next step is to decrease the processing time.<br>
Results of the 5-fold cross-validation:<br>
10 trees: time elapsed 0:01:09.984544, ROC AUC 0.665306650864717<br>
20 trees: time elapsed 0:02:26.949287, ROC AUC 0.6814140908787021<br>
30 trees: time elapsed 0:03:41.682899, ROC AUC 0.6896463600745194

# Logistic Regression prediction

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [14]:
# Standardize all features; the result is wrapped in a DataFrame with default
# integer row and column labels.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X))

### Measuring quality with ROC AUC

In [17]:
from sklearn.linear_model import RidgeClassifier

In [25]:
# Cross-validated ROC AUC of a ridge classifier on the standardized features
# for several regularization strengths, with the time of each run.
for ridge_alpha in (1e-5, 1e-4, 5e-5, 6e-5):
    started_at = datetime.datetime.now()
    ridge = RidgeClassifier(alpha=ridge_alpha)
    fold_scores = cross_val_score(ridge, X_scaled, y, cv=kfold, scoring="roc_auc")
    mean_score = fold_scores.mean()
    print(ridge_alpha)
    print('Time elapsed:', datetime.datetime.now() - started_at)
    print(mean_score)

1e-05
Time elapsed: 0:00:01.977317
0.7164579771021937
0.0001
Time elapsed: 0:00:02.102919
0.7163518677161613
5e-05
Time elapsed: 0:00:01.997777
0.7161498063256865
6e-05
Time elapsed: 0:00:02.353167
0.7163755453923766


In [28]:
# Relative improvement (in percent) of the best ridge score over the best
# gradient boosting score; both values are copied from the outputs above.
ridge_auc = 0.7164579771021937
boosting_auc = 0.6896463600745194
100 * (ridge_auc - boosting_auc) / boosting_auc

3.8877341460596124

In [30]:
# Show the (rows, columns) shape of the standardized feature matrix.
X_scaled.shape

(97230, 102)

### Dropping nominal features

In [41]:
# Columns to drop: the lobby type plus the ten hero-id columns
# (r1..r5 followed by d1..d5).
drop_columns = ['lobby_type'] + [
    '%s%d_hero' % (side, slot) for side in ('r', 'd') for slot in range(1, 6)
]

In [42]:
# Feature matrix without the columns listed in drop_columns.
X_drop = X.drop(columns=drop_columns)

In [43]:
# Standardize the reduced feature set; the result has default integer row and
# column labels.
scaler = StandardScaler()
scaler.fit(X_drop)
X_scaled_drop = pd.DataFrame(scaler.transform(X_drop))

In [44]:
# Same ridge experiment as before, now on the standardized features with the
# lobby type and hero-id columns removed.
for ridge_alpha in (1e-5, 1e-4, 5e-5, 6e-5):
    started_at = datetime.datetime.now()
    ridge = RidgeClassifier(alpha=ridge_alpha)
    fold_scores = cross_val_score(ridge, X_scaled_drop, y, cv=kfold, scoring="roc_auc")
    mean_score = fold_scores.mean()
    print(ridge_alpha)
    print('Time elapsed:', datetime.datetime.now() - started_at)
    print(mean_score)

1e-05
Time elapsed: 0:00:02.440072
0.716586382786688
0.0001
Time elapsed: 0:00:01.838245
0.7164035458744885
5e-05
Time elapsed: 0:00:02.605598
0.7164924852707244
6e-05
Time elapsed: 0:00:02.210787
0.7165119301661695


### How many different heroes do we have?

In [51]:
# The ten hero-id columns: r1..r5 followed by d1..d5.
heroes = ['%s%d_hero' % (side, slot) for side in ('r', 'd') for slot in range(1, 6)]

# Sorted array of the distinct hero ids that occur in any of those columns.
hero_id_values = X[heroes].values
np.unique(hero_id_values)

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 109, 110, 112])

108 distinct heroes; the largest hero id is 112 (ids 24, 107, 108 and 111 do not occur)

### Create bag of words

In [56]:
# "Bag of heroes" encoding: for every match, column (hero_id - 1) is set to +1
# when the hero occupies one of the r1..r5 slots and to -1 when it occupies one
# of the d1..d5 slots; all other columns stay 0.
# 112 is the largest hero id in the data, not the number of distinct heroes
# (the ids have gaps), so some columns are never set.
N_HERO_IDS = 112

radiant_ids = X[['r%d_hero' % (p + 1) for p in range(5)]].values.astype(int)
dire_ids = X[['d%d_hero' % (p + 1) for p in range(5)]].values.astype(int)

# Guard the id range: id 0 would silently wrap around to the last column, and
# an id above N_HERO_IDS would index out of bounds.
all_ids = np.concatenate([radiant_ids, dire_ids], axis=1)
assert all_ids.size == 0 or (all_ids.min() >= 1 and all_ids.max() <= N_HERO_IDS), \
    'hero id outside 1..N_HERO_IDS'

# Vectorized fill: one integer-array assignment per side replaces ten
# label-based .loc lookups per match. The (n, 1) row index broadcasts against
# the (n, 5) id arrays. NOTE: this equals the original loop as long as a hero
# does not appear in both an r-slot and a d-slot of the same match.
match_rows = np.arange(X.shape[0])[:, None]
X_pick = np.zeros((X.shape[0], N_HERO_IDS))
X_pick[match_rows, radiant_ids - 1] = 1
X_pick[match_rows, dire_ids - 1] = -1

In [93]:
# Wrap the hero matrix in a DataFrame (default integer row/column labels).
X_pick = pd.DataFrame(data=X_pick)

In [77]:
# NOTE(review): disabled step — it would shift the X_pick column labels by +1
# (0-based -> 1-based). Kept for reference only; it is not executed.
#X_pick.rename(columns=lambda x: x+ 1, inplace=True)

In [94]:
# Start from a copy of the standardized features and give both frames a fresh
# 0..n-1 row index so they line up row by row.
X_for_bag = X_scaled_drop.copy().reset_index(drop=True)
X_pick = X_pick.reset_index(drop=True)

In [95]:
# Combine the standardized numeric features with the hero matrix, side by side.
# The frame is rebuilt from its two sources so the cell is idempotent:
# re-running it no longer appends the hero columns a second time.
# ignore_index=True relabels the columns 0..n-1; both inputs carry default
# integer column labels, which would otherwise be duplicated in the result.
X_for_bag = pd.concat(
    [
        X_scaled_drop.reset_index(drop=True),
        pd.DataFrame(X_pick).reset_index(drop=True),
    ],
    axis=1,
    ignore_index=True,
)

In [96]:
# Preview the first five rows of the combined feature matrix.
X_for_bag.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,103,104,105,106,107,108,109,110,111,112
0,-2.544364,1.400808,1.525972,0.734957,0.969743,-0.537757,-0.578083,-0.509023,-0.332256,-0.625222,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-2.540452,0.501314,-0.080139,-0.24757,-0.246859,-0.537757,1.017574,1.49293,0.578881,0.732454,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-2.539231,0.501314,0.15107,0.263085,1.190944,-0.537757,-0.578083,1.49293,-0.332256,0.224676,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-2.532622,0.501314,0.96295,-0.198013,0.306142,-0.537757,-0.578083,-1.309804,-1.243393,-1.170813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-2.529221,0.501314,0.348745,-0.124754,-0.357459,0.968527,-0.578083,-0.108632,-1.243393,-1.008757,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Below I check whether the rows of X_for_bag correspond to the same matches as the rows of X. <br>
I probably also need to name the columns of X_for_bag after the column names we have in X.

In [104]:
# Preview the first five rows of the original (unscaled) feature matrix.
X.head(n=5)

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,0,35.0,103.0,-84.0,221.0,3,4,2,2,-52.0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,0,-20.0,149.0,-84.0,195.0,5,4,3,1,-5.0
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,1,-39.0,45.0,-77.0,221.0,3,4,3,1,13.0
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,0,-30.0,124.0,-80.0,184.0,0,4,2,0,27.0
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,0,46.0,182.0,-80.0,225.0,6,3,3,0,-16.0


In [105]:
# Print each hero-id column for the match whose match_id is 0.
first_match = X.index == 0
for hero_col in heroes:
    print(X.loc[first_match, hero_col])

match_id
0    11
Name: r1_hero, dtype: int64
match_id
0    67
Name: r2_hero, dtype: int64
match_id
0    29
Name: r3_hero, dtype: int64
match_id
0    20
Name: r4_hero, dtype: int64
match_id
0    105
Name: r5_hero, dtype: int64
match_id
0    4
Name: d1_hero, dtype: int64
match_id
0    42
Name: d2_hero, dtype: int64
match_id
0    21
Name: d3_hero, dtype: int64
match_id
0    37
Name: d4_hero, dtype: int64
match_id
0    84
Name: d5_hero, dtype: int64
