In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import time
import datetime
import warnings
warnings.filterwarnings('ignore')

## Подход 1: градиентный бустинг "в лоб"

#### Загрузка таблицы с признаками из файла features.csv.

In [35]:
features = pd.read_csv('C:\\Users\\kvanc\\coursera\\files\\features.csv', index_col = 'match_id')
features.shape

(97230, 108)

In [36]:
features.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13.0,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16.0,2449,0,4,1974,3,63


#### Проверка выборки на наличие пропусков с помощью функции count(). Ниже названия признаков, имеющих пропуски и полное количество пропусков.

In [37]:
skips = {}
for i in range(108):
    if features.iloc[:, i].count() != 97230:
        s = 97230 - features.iloc[:, i].count()
        skips[i] = s
all_skips = 0
for key, value in skips.items():
    all_skips += value
    print(features.columns.values[key])
print('all_skips = ', all_skips)

first_blood_time
first_blood_team
first_blood_player1
first_blood_player2
radiant_bottle_time
radiant_courier_time
radiant_flying_courier_time
radiant_first_ward_time
dire_bottle_time
dire_courier_time
dire_flying_courier_time
dire_first_ward_time
all_skips =  193087


#### Замена пропусков на нули с помощью функции fillna(). 

In [38]:
features = features.fillna(0)

#### Разделение данных таблицы на признаки и целевую переменную.

In [42]:
data = features.iloc[:, :102]
target = features['radiant_win']

#### Функция, вычисляющая пплощадь под ROC-кривой.

In [43]:
def my_score(clf, data, target):
    return roc_auc_score(target, np.round(clf.predict_proba(data))[:, 1])

In [44]:
start_time = datetime.datetime.now()

#### Реализация градиентного бустинга с количеством деревьев равным 10, 20 и 30.

In [45]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 1)
scores = {}
for i in [10, 20, 30]:
    clf = GradientBoostingClassifier(n_estimators = i, random_state = 241)
    score = cross_val_score(clf, data, target, scoring = my_score, cv = kf)
    scores[i] = score.mean()
scores

{10: 0.6073191978932565, 20: 0.6240009687172192, 30: 0.6312914169540803}

In [46]:
print('time elapsed:', datetime.datetime.now() - start_time)

time elapsed: 0:02:19.565503


## Подход 2: логистическая регрессия

In [47]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

In [48]:
start_time = datetime.datetime.now()

#### Обучение модели логистической регрессии и поиск наилучшего параметра С.

In [49]:
C_list = list(np.power(10.0, np.arange(-5, 6)))
kf = KFold(n_splits = 5, shuffle = True, random_state = 1)
scores = {}
for C in C_list:
    clf = LogisticRegression(penalty = 'l2', C = C)
    score = cross_val_score(clf, data_scaled, target, scoring = my_score, cv = kf)
    scores[C] = score.mean()
scores

{1e-05: 0.638202710013997,
 0.0001: 0.6507590559936953,
 0.001: 0.6537160725196971,
 0.01: 0.6537102318130077,
 0.1: 0.65400441142375,
 1.0: 0.6540046338282889,
 10.0: 0.6539747315867996,
 100.0: 0.6539747315867996,
 1000.0: 0.6539747315867996,
 10000.0: 0.6539747315867996,
 100000.0: 0.6539747315867996}

In [50]:
print('time elapsed:', datetime.datetime.now() - start_time)

time elapsed: 0:02:03.586637


In [51]:
best_score = 0
for key, value in scores.items():
    if value > best_score:
        best_score = value
        best_C = key
print(best_C, best_score)

1.0 0.6540046338282889


In [55]:
features_new = pd.read_csv('C:\\Users\\kvanc\\coursera\\files\\features.csv', index_col = 'match_id')
features_new.drop(['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant',
                   'barracks_status_dire', 'lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero',
                   'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis = 1, inplace = True)
features_new = features_new.fillna(0)

In [56]:
data_scaled = scaler.fit_transform(features_new)

In [57]:
scores_new = {}
for C in C_list:
    clf = LogisticRegression(penalty = 'l2', C = C)
    score = cross_val_score(clf, data_scaled, target, scoring = my_score, cv = kf)
    scores_new[C] = score.mean()
scores_new

{1e-05: 0.6383703031168049,
 0.0001: 0.6507739854115643,
 0.001: 0.6535026437629672,
 0.01: 0.6541187132970514,
 0.1: 0.654230377830497,
 1.0: 0.6541563817707394,
 10.0: 0.6541662987933495,
 100.0: 0.6541769484206126,
 1000.0: 0.6541769484206126,
 10000.0: 0.6541769484206126,
 100000.0: 0.6541769484206126}

In [63]:
best_score_new = 0
for key, value in scores_new.items():
    if value > best_score_new:
        best_score_new = value
        best_C_new = key
print(best_C_new, best_score_new)

0.1 0.654230377830497


In [70]:
features_count = pd.read_csv('C:\\Users\\kvanc\\coursera\\files\\features.csv', index_col = 'match_id')
heros = ['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']
features_count = features_count[heros]
features_count

Unnamed: 0_level_0,r1_hero,r2_hero,r3_hero,r4_hero,r5_hero,d1_hero,d2_hero,d3_hero,d4_hero,d5_hero
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,11,67,29,20,105,4,42,21,37,84
1,42,49,67,37,26,39,88,79,7,12
2,33,98,20,27,4,22,66,86,29,80
3,29,30,75,37,41,96,48,15,102,20
4,13,27,30,72,93,26,69,22,25,8
...,...,...,...,...,...,...,...,...,...,...
114402,47,7,1,21,71,26,19,93,3,28
114403,43,26,4,29,110,72,75,5,20,98
114404,98,11,112,81,50,28,39,55,59,31
114405,100,72,79,20,39,59,9,50,28,106


In [76]:
pd.Series(features_count.values.flatten()).drop_duplicates().shape

(108,)

In [106]:
counts = pd.Series(features_count.values.flatten()).drop_duplicates()
counts.index

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            359, 369, 409, 436, 446, 513, 634, 672, 763, 800],
           dtype='int64', length=108)

In [111]:
x_pick_d = pd.get_dummies(features_count[heros[5:]].astype('str'))
x_pick_r = pd.get_dummies(features_count[heros[:5]].astype('str'))
x_pick_r *= -1
x_pick_d.columns=[col[1:] for col in list(x_pick_d.columns)]#убираем в колонках принадлежность к классам
x_pick_r.columns=[col[1:] for col in list(x_pick_r.columns)]#убираем в колонках принадлежность к классам

x_pick=x_pick_d+x_pick_r

del x_pick_d,x_pick_r

total=features_new.join(x_pick,rsuffix='_',how='inner')#pd.DataFrame(data=np.concatenate([x_pick,data_full_norm],axis=1))

del x_pick,features_new

In [114]:
total

Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,5_hero_90,5_hero_91,5_hero_92,5_hero_93,5_hero_94,5_hero_95,5_hero_96,5_hero_97,5_hero_98,5_hero_99
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,5,2098,1489,20,0,0,7,3,842,...,0,0,0,0,0,0,0,0,0,0
1,1430220345,4,1188,1033,9,0,1,12,4,1596,...,0,0,0,0,0,0,0,0,0,0
2,1430227081,4,1319,1270,22,0,0,12,3,1314,...,0,0,0,0,0,0,0,0,0,0
3,1430263531,4,1779,1056,14,0,0,5,2,539,...,0,0,0,0,0,0,0,0,0,0
4,1430282290,4,1431,1090,8,1,0,8,2,629,...,0,0,0,-1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114402,1450265551,4,1706,1198,17,0,1,8,2,616,...,0,0,0,0,0,0,0,0,0,0
114403,1450277704,4,1793,1416,17,0,1,5,3,764,...,0,0,0,0,0,0,0,0,1,0
114404,1450291848,4,1399,540,1,0,0,5,4,1448,...,0,0,0,0,0,0,0,0,0,0
114405,1450292986,3,1135,766,6,0,2,6,5,1954,...,0,0,0,0,0,0,0,0,0,0


In [94]:
#features_new.ix[11, 'r2_hero']
features_new

Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,5,2098,1489,20,0,0,7,3,842,...,0,35.0,103.0,-84.0,221.0,3,4,2,2,-52.0
1,1430220345,4,1188,1033,9,0,1,12,4,1596,...,0,-20.0,149.0,-84.0,195.0,5,4,3,1,-5.0
2,1430227081,4,1319,1270,22,0,0,12,3,1314,...,1,-39.0,45.0,-77.0,221.0,3,4,3,1,13.0
3,1430263531,4,1779,1056,14,0,0,5,2,539,...,0,-30.0,124.0,-80.0,184.0,0,4,2,0,27.0
4,1430282290,4,1431,1090,8,1,0,8,2,629,...,0,46.0,182.0,-80.0,225.0,6,3,3,0,-16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114402,1450265551,4,1706,1198,17,0,1,8,2,616,...,0,-29.0,180.0,-76.0,180.0,3,4,3,0,-24.0
114403,1450277704,4,1793,1416,17,0,1,5,3,764,...,0,-5.0,0.0,-82.0,0.0,4,3,2,0,-17.0
114404,1450291848,4,1399,540,1,0,0,5,4,1448,...,2,-32.0,249.0,-70.0,0.0,1,1,3,1,-15.0
114405,1450292986,3,1135,766,6,0,2,6,5,1954,...,0,-21.0,254.0,-85.0,183.0,5,3,3,1,-42.0
