In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import time
import datetime

In [2]:
features = pd.read_csv('features.csv', index_col='match_id')
features.drop(['tower_status_radiant',
               'tower_status_dire',
               'barracks_status_radiant',
               'barracks_status_dire',
               'duration'], axis=1, inplace=True)

# Подход 1: градиентный бустинг "в лоб"

### Пропуски в таблице признаков

Следующие поля имеют пропуски:

In [3]:
missing_values = []
missing_values_name = []
for name in list(features.columns.values):
    if len(features[name]) - features[name].count():
        missing_values_name.append(name)
        print(f'{name} have {len(features[name]) - features[name].count()} missing values\n')
    missing_values.append(len(features[name]) - features[name].count())
sorted_missing_values = sorted(missing_values)
max_1 = sorted_missing_values[len(missing_values) - 1]
max_2 = sorted_missing_values[len(missing_values) - 2]
feature_with_max_missing_values = list(features.columns.values)[missing_values.index(max_1)]
second_feature_with_max_missing_values = list(features.columns.values)[missing_values.index(max_2)]
print(f"""Two features with max values of missing values are {feature_with_max_missing_values.upper()} with {max_1} 
          missing values and {second_feature_with_max_missing_values.upper()} with {max_2}""")

first_blood_time have 19553 missing values

first_blood_team have 19553 missing values

first_blood_player1 have 19553 missing values

first_blood_player2 have 43987 missing values

radiant_bottle_time have 15691 missing values

radiant_courier_time have 692 missing values

radiant_flying_courier_time have 27479 missing values

radiant_first_ward_time have 1836 missing values

dire_bottle_time have 16143 missing values

dire_courier_time have 676 missing values

dire_flying_courier_time have 26098 missing values

dire_first_ward_time have 1826 missing values

Two features with max values of missing values are FIRST_BLOOD_PLAYER2 with 43987 
          missing values and RADIANT_FLYING_COURIER_TIME with 27479


Всего полей с пропусками 12. Поля с наибольшим количеством пропусков `FIRST_BLOOD_PLAYER2` и `RADIANT_FLYING_COURIER_TIME`.
`FIRST_BLOOD_PLAYER2` - второй игрок, причастный к совершению первого убийства в игре. 
`RADIANT_FLYING_COURIER_TIME` - время приобретения предмета "courier". 
Такое большое количество пропусков в данных признаках можно объяснить тем, что:
- `FIRST_BLOOD_PLAYER2` : Не всегда в первом убийстве участвует второй игрок
- `RADIANT_FLYING_COURIER_TIME` : Игроки не в каждой игре берут предмет "courier"

#### Заполнение таблицы признаков нулями

In [4]:
features_0 = features.copy(deep = True)
for name in list(features_0.columns.values):
    features_0[name].fillna(value=0, inplace=True)

#### Заполнение таблицы признаков большими значениями 

In [5]:
features_BigNumbers = features.copy(deep = True)
for name in list(features_BigNumbers.columns.values):
    features_BigNumbers[name].fillna(value=1e8, inplace=True)

#### Заполнение таблицы признаков средними значениями

In [6]:
features_Mean = features.copy(deep = True)
for name in list(features_Mean.columns.values):
    features_Mean[name].fillna(value=features_Mean[name].mean(), inplace=True)

### Целевая переменная

In [7]:
y = pd.DataFrame(features['radiant_win'])

- `Целевая переменная имеет название radiant_win`

### Градиентый бустинг над решающими деревьями

#### Обучение на таблице признаков, заполненной нулями

In [8]:
X_0 = features_0.loc[:,features_0.columns != 'radiant_win']

In [9]:
kf = KFold(n_splits=5, random_state=1 ,shuffle=True)

In [24]:
result_0 = pd.DataFrame(columns=['10', '20', '30', '40', '50', '100'], index=['1','0.5', '0.2', '0.1'])
result_time_0 = pd.DataFrame(columns=['10', '20', '30', '40', '50', '100'], index=['1','0.5', '0.2', '0.1'])
for n_estimators in [10, 20, 30, 40, 50, 100]:
    for learning_rate in [1, 0.5, 0.2, 0.1]: 
        roc_auc_score_list = []
        start_time = datetime.datetime.now()
        clf = GradientBoostingClassifier(n_estimators=n_estimators, verbose=True, 
                                                random_state=241, learning_rate = learning_rate)
        for train, test in kf.split(X_0, y):
            X_0_train = X_0.iloc[train, :].values.tolist()
            X_0_test = X_0.iloc[test, :].values.tolist()
            y_train = list(map(lambda x: x.pop(), y.iloc[train, :].values.tolist()))
            y_test = list(map(lambda x: x.pop(),y.iloc[test, :].values.tolist()))
            probs = clf.fit(X_0_train, y_train).predict_proba(X_0_test)[:, 1]
            roc_auc_value = roc_auc_score(y_test, probs)
            roc_auc_score_list.append(roc_auc_value)
        delta_time = datetime.datetime.now() - start_time
        print(f'Classifier with number of trees {n_estimators} and learn_rate {learning_rate} has roc_auc value equal {np.mean(roc_auc_score_list)}')
        print(f'Classifier with number of trees {n_estimators} and learn_rate {learning_rate} has elapsed time {delta_time}')
        result_0[str(n_estimators)][str(learning_rate)] = np.mean(roc_auc_score_list)
        result_time_0[str(n_estimators)][str(learning_rate)] = delta_time

      Iter       Train Loss   Remaining Time 
         1           1.3504            6.76s
         2           1.3265            5.64s
         3           1.3074            4.78s
         4           1.2932            4.06s
         5           1.2838            3.40s
         6           1.2759            2.81s
         7           1.2687            2.14s
         8           1.2638            1.45s
         9           1.2602            0.71s
        10           1.2557            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3507            5.65s
         2           1.3279            4.83s
         3           1.3100            4.17s
         4           1.2948            3.57s
         5           1.2842            2.98s
         6           1.2770            2.39s
         7           1.2711            1.80s
         8           1.2662            1.20s
         9           1.2618            0.60s
        10           1.2582            0.00s
      It

         5           1.3584            3.01s
         6           1.3544            2.41s
         7           1.3499            1.80s
         8           1.3456            1.20s
         9           1.3417            0.60s
        10           1.3379            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3784            5.54s
         2           1.3730            4.85s
         3           1.3678            4.26s
         4           1.3634            3.63s
         5           1.3590            3.04s
         6           1.3544            2.42s
         7           1.3502            1.80s
         8           1.3457            1.20s
         9           1.3417            0.59s
        10           1.3381            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3787            5.72s
         2           1.3732            5.02s
         3           1.3680            4.32s
         4           1.3635            3.69s
        

      Iter       Train Loss   Remaining Time 
         1           1.3725           12.22s
         2           1.3629           12.99s
         3           1.3539           12.24s
         4           1.3456           11.27s
         5           1.3374           10.27s
         6           1.3304            9.39s
         7           1.3237            9.05s
         8           1.3176            8.60s
         9           1.3123            8.02s
        10           1.3071            7.22s
        20           1.2724            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3726           12.21s
         2           1.3633           11.31s
         3           1.3538           10.51s
         4           1.3454            9.79s
         5           1.3377            9.15s
         6           1.3307            8.68s
         7           1.3247            8.18s
         8           1.3188            7.50s
         9           1.3135            6.92s
        

         2           1.3282           17.71s
         3           1.3081           16.90s
         4           1.2936           16.30s
         5           1.2811           15.85s
         6           1.2739           15.16s
         7           1.2684           14.44s
         8           1.2637           14.11s
         9           1.2601           13.42s
        10           1.2566           12.78s
        20           1.2325            6.49s
        30           1.2148            0.00s
Classifier with number of trees 30 and learn_rate 1 has roc_auc value equal 0.6974148022907791
Classifier with number of trees 30 and learn_rate 1 has elapsed time 0:01:50.718263
      Iter       Train Loss   Remaining Time 
         1           1.3592           20.92s
         2           1.3404           19.51s
         3           1.3240           18.17s
         4           1.3093           17.44s
         5           1.2987           16.72s
         6           1.2897           17.08s
         7

         3           1.3680           28.31s
         4           1.3635           25.41s
         5           1.3592           23.00s
         6           1.3546           20.91s
         7           1.3500           19.26s
         8           1.3459           17.81s
         9           1.3418           16.53s
        10           1.3379           15.43s
        20           1.3077            7.83s
        30           1.2876            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3784           25.61s
         2           1.3728           21.17s
         3           1.3677           19.33s
         4           1.3631           18.65s
         5           1.3586           17.60s
         6           1.3538           16.52s
         7           1.3494           15.52s
         8           1.3453           14.63s
         9           1.3412           13.80s
        10           1.3372           13.05s
        20           1.3071            6.24s
        3

        30           1.2203            6.14s
        40           1.2081            0.00s
Classifier with number of trees 40 and learn_rate 0.5 has roc_auc value equal 0.706125886099495
Classifier with number of trees 40 and learn_rate 0.5 has elapsed time 0:02:12.646806
      Iter       Train Loss   Remaining Time 
         1           1.3725           24.66s
         2           1.3629           24.22s
         3           1.3539           23.50s
         4           1.3456           22.57s
         5           1.3374           22.57s
         6           1.3304           21.72s
         7           1.3237           20.94s
         8           1.3176           20.87s
         9           1.3123           20.09s
        10           1.3071           19.43s
        20           1.2724           13.50s
        30           1.2523            6.87s
        40           1.2390            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3726           24.72s
      

        40           1.2030            6.96s
        50           1.1896            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3512           30.07s
         2           1.3285           28.69s
         3           1.3068           27.84s
         4           1.2909           26.89s
         5           1.2820           26.65s
         6           1.2737           26.20s
         7           1.2678           25.66s
         8           1.2626           25.09s
         9           1.2587           24.41s
        10           1.2548           23.85s
        20           1.2326           17.54s
        30           1.2156           12.24s
        40           1.2012            6.45s
        50           1.1891            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3503           37.31s
         2           1.3273           36.34s
         3           1.3061           35.14s
         4           1.2923           34.06s
        

         7           1.3235           33.23s
         8           1.3173           32.34s
         9           1.3120           31.79s
        10           1.3071           32.14s
        20           1.2722           24.36s
        30           1.2520           16.03s
        40           1.2388            7.85s
        50           1.2296            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3728           33.70s
         2           1.3632           35.24s
         3           1.3543           35.51s
         4           1.3461           35.23s
         5           1.3376           34.29s
         6           1.3300           33.65s
         7           1.3237           32.70s
         8           1.3176           31.96s
         9           1.3124           30.95s
        10           1.3073           29.72s
        20           1.2726           22.05s
        30           1.2527           14.42s
        40           1.2394            7.11s
        5

        70           1.1671           22.84s
        80           1.1581           14.77s
        90           1.1491            7.21s
       100           1.1414            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3512            1.02m
         2           1.3282           59.39s
         3           1.3081           57.78s
         4           1.2936           56.99s
         5           1.2811           56.64s
         6           1.2739           56.18s
         7           1.2684           55.38s
         8           1.2637           55.00s
         9           1.2601           54.89s
        10           1.2566           54.82s
        20           1.2325           47.72s
        30           1.2148           41.82s
        40           1.2008           35.63s
        50           1.1880           29.63s
        60           1.1776           23.81s
        70           1.1673           18.82s
        80           1.1557           13.01s
        9

        10           1.3071            1.17m
        20           1.2729           58.08s
        30           1.2531           50.31s
        40           1.2398           44.77s
        50           1.2302           36.71s
        60           1.2225           29.77s
        70           1.2159           22.18s
        80           1.2103           14.57s
        90           1.2054            7.32s
       100           1.2005            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3725            2.91m
         2           1.3631            2.10m
         3           1.3537            1.74m
         4           1.3447            1.56m
         5           1.3368            1.43m
         6           1.3294            1.34m
         7           1.3235            1.28m
         8           1.3173            1.23m
         9           1.3120            1.19m
        10           1.3071            1.15m
        20           1.2722            1.01m
        3

In [25]:
print('AUC-ROC results')
result_0

AUC-ROC results


Unnamed: 0,10,20,30,40,50,100
1.0,0.684291,0.693152,0.697415,0.699814,0.700226,0.700775
0.5,0.68529,0.697754,0.702799,0.706126,0.708608,0.712554
0.2,0.677809,0.691567,0.697992,0.702197,0.704708,0.711951
0.1,0.664833,0.682114,0.689695,0.693934,0.697114,0.706211


In [26]:
print('Time results')
result_time_0

Time results


Unnamed: 0,10,20,30,40,50,100
1.0,0:00:41.509367,0:01:13.236914,0:01:50.718263,0:02:08.280604,0:03:06.484069,0:06:25.443827
0.5,0:00:38.173546,0:01:14.769823,0:02:06.120102,0:02:12.646806,0:03:37.585390,0:06:02.514269
0.2,0:00:38.582550,0:01:16.851784,0:02:28.429832,0:02:33.790065,0:03:23.839025,0:07:22.032017
0.1,0:00:38.406783,0:01:44.590004,0:02:26.494815,0:02:35.003708,0:03:50.090809,0:07:25.582876


#### Обучение на таблице признаков, заполненной большими значениями

In [14]:
X_BigNumber = features_BigNumbers.loc[:,features_BigNumbers.columns != 'radiant_win']

In [15]:
result_BN = pd.DataFrame(columns=['10', '20', '30', '40', '50', '100'], index=['1','0.5', '0.2', '0.1'])
result_time_BN = pd.DataFrame(columns=['10', '20', '30', '40', '50', '100'], index=['1','0.5', '0.2', '0.1'])
for n_estimators in [10, 20, 30, 40, 50, 100]:
    for learning_rate in [1, 0.5, 0.2, 0.1]: 
        roc_auc_score_list = []
        start_time = datetime.datetime.now()
        clf = GradientBoostingClassifier(n_estimators=n_estimators, verbose=True, 
                                                random_state=241, learning_rate = learning_rate)
        for train, test in kf.split(X_BigNumber, y):
            X_BigNumber_train = X_BigNumber.iloc[train, :].values.tolist()
            X_BigNumber_test = X_BigNumber.iloc[test, :].values.tolist()
            y_train = list(map(lambda x: x.pop(), y.iloc[train, :].values.tolist()))
            y_test = list(map(lambda x: x.pop(),y.iloc[test, :].values.tolist()))
            probs = clf.fit(X_BigNumber_train, y_train).predict_proba(X_BigNumber_test)[:, 1]
            roc_auc_value = roc_auc_score(y_test, probs)
            roc_auc_score_list.append(roc_auc_value)
        delta_time = datetime.datetime.now() - start_time
        print(f'Classifier with number of trees {n_estimators} and learn_rate {learning_rate} has roc_auc value equal {np.mean(roc_auc_score_list)}')
        print(f'Classifier with number of trees {n_estimators} and learn_rate {learning_rate} has elapsed time {delta_time}')
        result_BN[str(n_estimators)][str(learning_rate)] = np.mean(roc_auc_score_list)
        result_time_BN[str(n_estimators)][str(learning_rate)] = delta_time

      Iter       Train Loss   Remaining Time 
         1           1.3511            5.71s
         2           1.3272            4.88s
         3           1.3092            4.21s
         4           1.2956            3.61s
         5           1.2853            3.01s
         6           1.2773            2.41s
         7           1.2717            1.80s
         8           1.2676            1.19s
         9           1.2631            0.60s
        10           1.2587            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3521            5.53s
         2           1.3276            4.74s
         3           1.3092            4.13s
         4           1.2955            3.52s
         5           1.2866            2.94s
         6           1.2795            2.37s
         7           1.2733            1.78s
         8           1.2678            1.19s
         9           1.2630            0.60s
        10           1.2590            0.00s
      It

         5           1.3583            3.07s
         6           1.3541            2.47s
         7           1.3497            1.84s
         8           1.3456            1.22s
         9           1.3417            0.61s
        10           1.3379            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3787            5.71s
         2           1.3734            4.98s
         3           1.3687            4.33s
         4           1.3638            3.65s
         5           1.3593            3.03s
         6           1.3545            2.41s
         7           1.3502            1.80s
         8           1.3466            1.20s
         9           1.3425            0.60s
        10           1.3386            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3790            5.68s
         2           1.3736            4.98s
         3           1.3688            4.34s
         4           1.3641            3.71s
        

      Iter       Train Loss   Remaining Time 
         1           1.3728           11.95s
         2           1.3637           11.09s
         3           1.3549           10.47s
         4           1.3461            9.73s
         5           1.3383            9.04s
         6           1.3310            8.39s
         7           1.3243            7.77s
         8           1.3182            7.18s
         9           1.3129            6.54s
        10           1.3077            5.93s
        20           1.2729            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3732           11.63s
         2           1.3642           11.04s
         3           1.3552           10.23s
         4           1.3465            9.54s
         5           1.3385            8.89s
         6           1.3312            8.27s
         7           1.3249            7.69s
         8           1.3194            7.11s
         9           1.3135            6.53s
        

         2           1.3275           17.32s
         3           1.3066           16.41s
         4           1.2895           15.79s
         5           1.2784           15.25s
         6           1.2719           14.63s
         7           1.2659           14.04s
         8           1.2609           13.42s
         9           1.2568           12.81s
        10           1.2537           12.21s
        20           1.2300            6.04s
        30           1.2152            0.00s
Classifier with number of trees 30 and learn_rate 1 has roc_auc value equal 0.6964804304950603
Classifier with number of trees 30 and learn_rate 1 has elapsed time 0:01:37.306657
      Iter       Train Loss   Remaining Time 
         1           1.3598           18.18s
         2           1.3406           16.84s
         3           1.3235           16.21s
         4           1.3098           15.50s
         5           1.2989           14.89s
         6           1.2899           14.26s
         7

         3           1.3688           16.31s
         4           1.3641           15.72s
         5           1.3591           15.14s
         6           1.3544           14.42s
         7           1.3505           13.83s
         8           1.3462           13.26s
         9           1.3421           12.63s
        10           1.3381           11.99s
        20           1.3081            5.86s
        30           1.2880            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3786           17.86s
         2           1.3733           17.15s
         3           1.3684           16.35s
         4           1.3638           15.77s
         5           1.3590           15.02s
         6           1.3542           14.32s
         7           1.3496           13.68s
         8           1.3453           13.05s
         9           1.3412           12.43s
        10           1.3375           11.79s
        20           1.3073            5.85s
        3

        30           1.2202            5.98s
        40           1.2081            0.00s
Classifier with number of trees 40 and learn_rate 0.5 has roc_auc value equal 0.7054884649069383
Classifier with number of trees 40 and learn_rate 0.5 has elapsed time 0:02:07.259370
      Iter       Train Loss   Remaining Time 
         1           1.3728           24.01s
         2           1.3637           23.14s
         3           1.3549           22.48s
         4           1.3461           21.63s
         5           1.3383           20.86s
         6           1.3310           20.14s
         7           1.3243           19.53s
         8           1.3182           18.93s
         9           1.3129           18.24s
        10           1.3077           17.63s
        20           1.2729           11.67s
        30           1.2523            5.87s
        40           1.2387            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3732           24.09s
     

        40           1.2054            5.87s
        50           1.1925            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3527           30.32s
         2           1.3292           29.11s
         3           1.3098           27.95s
         4           1.2895           27.27s
         5           1.2786           26.84s
         6           1.2720           26.25s
         7           1.2664           25.76s
         8           1.2622           25.46s
         9           1.2588           24.76s
        10           1.2551           24.20s
        20           1.2305           17.88s
        30           1.2148           11.87s
        40           1.2012            5.91s
        50           1.1889            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3514           29.96s
         2           1.3273           28.55s
         3           1.3063           27.93s
         4           1.2913           27.15s
        

         7           1.3240           25.47s
         8           1.3180           24.86s
         9           1.3121           24.20s
        10           1.3071           23.51s
        20           1.2720           17.57s
        30           1.2518           11.71s
        40           1.2384            5.87s
        50           1.2287            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3733           30.34s
         2           1.3641           29.24s
         3           1.3554           28.80s
         4           1.3467           28.42s
         5           1.3385           27.64s
         6           1.3314           26.70s
         7           1.3251           25.84s
         8           1.3188           25.07s
         9           1.3135           24.31s
        10           1.3081           23.75s
        20           1.2732           17.77s
        30           1.2529           11.88s
        40           1.2394            5.96s
        5

        70           1.1684           17.11s
        80           1.1612           11.32s
        90           1.1525            5.65s
       100           1.1436            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3525            1.00m
         2           1.3275           58.69s
         3           1.3066           57.35s
         4           1.2895           56.59s
         5           1.2784           56.49s
         6           1.2719           55.84s
         7           1.2659           55.48s
         8           1.2609           54.91s
         9           1.2568           54.26s
        10           1.2537           53.75s
        20           1.2300           47.18s
        30           1.2152           40.53s
        40           1.2014           34.62s
        50           1.1893           28.95s
        60           1.1782           23.08s
        70           1.1673           17.26s
        80           1.1588           11.45s
        9

        10           1.3079           53.12s
        20           1.2734           46.67s
        30           1.2530           40.72s
        40           1.2397           34.95s
        50           1.2302           29.08s
        60           1.2223           23.45s
        70           1.2158           17.58s
        80           1.2101           11.69s
        90           1.2049            5.83s
       100           1.2002            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3730           59.42s
         2           1.3640           58.59s
         3           1.3543           57.32s
         4           1.3464           56.62s
         5           1.3382           55.56s
         6           1.3310           54.72s
         7           1.3240           53.95s
         8           1.3180           53.42s
         9           1.3121           52.79s
        10           1.3071           52.12s
        20           1.2720           46.27s
        3

In [16]:
print('AUC-ROC results')
result_BN

AUC-ROC results


Unnamed: 0,10,20,30,40,50,100
1.0,0.683625,0.693236,0.69648,0.698713,0.700209,0.700218
0.5,0.6855,0.697404,0.702367,0.705488,0.707895,0.71198
0.2,0.676771,0.690825,0.697691,0.701796,0.704352,0.712388
0.1,0.666886,0.682805,0.689896,0.694669,0.698003,0.706592


In [17]:
print('Time results')
result_time_BN

Time results


Unnamed: 0,10,20,30,40,50,100
1.0,0:00:38.672577,0:01:07.874161,0:01:37.306657,0:02:05.680663,0:02:35.629929,0:04:52.942210
0.5,0:00:38.621031,0:01:08.231983,0:01:37.070148,0:02:07.259370,0:02:35.721414,0:04:56.510218
0.2,0:00:38.442980,0:01:07.556256,0:01:37.411754,0:02:06.840491,0:02:35.381533,0:05:01.012190
0.1,0:00:38.599032,0:01:07.378220,0:01:36.070203,0:02:05.248511,0:02:33.254981,0:04:58.560014


#### Обучение на таблице признаков, заполненной средними значениями

In [18]:
X_Mean =  features_Mean.loc[:,features_Mean.columns != 'radiant_win']

In [19]:
result = pd.DataFrame(columns=['10', '20', '30', '40', '50', '100'], index=['1','0.5', '0.2', '0.1'])
result_time = pd.DataFrame(columns=['10', '20', '30', '40', '50', '100'], index=['1','0.5', '0.2', '0.1'])
for n_estimators in [10, 20, 30, 40, 50, 100]:
    for learning_rate in [1, 0.5, 0.2, 0.1]: 
        roc_auc_score_list = []
        start_time = datetime.datetime.now()
        clf = GradientBoostingClassifier(n_estimators=n_estimators, verbose=True, 
                                                random_state=241, learning_rate = learning_rate)
        for train, test in kf.split(X_Mean, y):
            X_Mean_train = X_Mean.iloc[train, :].values.tolist()
            X_Mean_test = X_Mean.iloc[test, :].values.tolist()
            y_train = list(map(lambda x: x.pop(), y.iloc[train, :].values.tolist()))
            y_test = list(map(lambda x: x.pop(),y.iloc[test, :].values.tolist()))
            probs = clf.fit(X_Mean_train, y_train).predict_proba(X_Mean_test)[:, 1]
            roc_auc_value = roc_auc_score(y_test, probs)
            roc_auc_score_list.append(roc_auc_value)
        delta_time = datetime.datetime.now() - start_time
        print(f'Classifier with number of trees {n_estimators} and learn_rate {learning_rate} has roc_auc value equal {np.mean(roc_auc_score_list)}')
        print(f'Classifier with number of trees {n_estimators} and learn_rate {learning_rate} has elapsed time {delta_time}')
        result[str(n_estimators)][str(learning_rate)] = np.mean(roc_auc_score_list)
        result_time[str(n_estimators)][str(learning_rate)] = delta_time

      Iter       Train Loss   Remaining Time 
         1           1.3504            5.71s
         2           1.3265            4.88s
         3           1.3074            4.20s
         4           1.2932            3.62s
         5           1.2839            3.00s
         6           1.2758            2.42s
         7           1.2689            1.82s
         8           1.2638            1.22s
         9           1.2596            0.61s
        10           1.2563            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3507            5.51s
         2           1.3279            4.74s
         3           1.3100            4.10s
         4           1.2948            3.49s
         5           1.2842            2.92s
         6           1.2770            2.33s
         7           1.2706            1.75s
         8           1.2666            1.18s
         9           1.2626            0.59s
        10           1.2590            0.00s
      It

         5           1.3586            3.07s
         6           1.3541            2.43s
         7           1.3499            1.82s
         8           1.3456            1.21s
         9           1.3416            0.60s
        10           1.3378            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3784            5.60s
         2           1.3730            4.88s
         3           1.3679            4.24s
         4           1.3635            3.61s
         5           1.3590            3.05s
         6           1.3542            2.42s
         7           1.3500            1.81s
         8           1.3458            1.20s
         9           1.3418            0.60s
        10           1.3379            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3787            5.60s
         2           1.3732            4.85s
         3           1.3681            4.22s
         4           1.3636            3.60s
        

      Iter       Train Loss   Remaining Time 
         1           1.3725           11.62s
         2           1.3626           10.90s
         3           1.3532           10.13s
         4           1.3448            9.49s
         5           1.3369            8.83s
         6           1.3298            8.18s
         7           1.3232            7.57s
         8           1.3174            6.98s
         9           1.3117            6.42s
        10           1.3067            5.84s
        20           1.2721            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3726           11.41s
         2           1.3631           10.68s
         3           1.3538            9.97s
         4           1.3454            9.24s
         5           1.3376            8.61s
         6           1.3312            8.10s
         7           1.3247            7.52s
         8           1.3186            6.95s
         9           1.3131            6.34s
        

         2           1.3282           16.91s
         3           1.3081           16.17s
         4           1.2936           15.46s
         5           1.2811           14.95s
         6           1.2739           14.37s
         7           1.2689           13.59s
         8           1.2643           13.08s
         9           1.2596           12.52s
        10           1.2559           11.94s
        20           1.2327            5.96s
        30           1.2161            0.00s
Classifier with number of trees 30 and learn_rate 1 has roc_auc value equal 0.6958802672296451
Classifier with number of trees 30 and learn_rate 1 has elapsed time 0:01:36.446779
      Iter       Train Loss   Remaining Time 
         1           1.3592           17.36s
         2           1.3404           16.42s
         3           1.3240           15.66s
         4           1.3093           14.94s
         5           1.2987           14.31s
         6           1.2897           13.88s
         7

         3           1.3681           16.38s
         4           1.3636           15.66s
         5           1.3594           15.02s
         6           1.3549           14.28s
         7           1.3505           13.63s
         8           1.3461           12.99s
         9           1.3420           12.48s
        10           1.3384           11.92s
        20           1.3081            5.89s
        30           1.2879            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3784           17.70s
         2           1.3728           16.93s
         3           1.3676           16.16s
         4           1.3630           15.57s
         5           1.3585           14.95s
         6           1.3539           14.25s
         7           1.3493           13.58s
         8           1.3449           12.95s
         9           1.3408           12.32s
        10           1.3370           11.70s
        20           1.3069            5.84s
        3

        30           1.2203            5.86s
        40           1.2082            0.00s
Classifier with number of trees 40 and learn_rate 0.5 has roc_auc value equal 0.7046840900964207
Classifier with number of trees 40 and learn_rate 0.5 has elapsed time 0:02:05.613724
      Iter       Train Loss   Remaining Time 
         1           1.3725           25.20s
         2           1.3626           23.72s
         3           1.3532           22.59s
         4           1.3448           21.61s
         5           1.3369           20.86s
         6           1.3298           20.15s
         7           1.3232           19.45s
         8           1.3174           18.85s
         9           1.3117           18.25s
        10           1.3067           17.65s
        20           1.2721           11.72s
        30           1.2522            5.82s
        40           1.2384            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3726           24.18s
     

        40           1.2054            5.89s
        50           1.1932            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3512           31.02s
         2           1.3285           29.34s
         3           1.3068           28.26s
         4           1.2909           27.38s
         5           1.2820           26.93s
         6           1.2737           26.49s
         7           1.2679           26.00s
         8           1.2627           25.42s
         9           1.2590           24.85s
        10           1.2553           24.25s
        20           1.2325           17.82s
        30           1.2151           11.81s
        40           1.2006            5.91s
        50           1.1893            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3503           30.43s
         2           1.3273           28.72s
         3           1.3061           27.89s
         4           1.2923           27.32s
        

         7           1.3231           25.55s
         8           1.3169           24.90s
         9           1.3116           24.25s
        10           1.3066           23.61s
        20           1.2717           17.65s
        30           1.2522           11.75s
        40           1.2390            5.92s
        50           1.2293            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3728           31.03s
         2           1.3632           29.67s
         3           1.3535           28.68s
         4           1.3450           27.82s
         5           1.3372           27.08s
         6           1.3299           26.36s
         7           1.3230           25.82s
         8           1.3174           25.39s
         9           1.3119           24.74s
        10           1.3069           24.05s
        20           1.2725           17.89s
        30           1.2529           11.98s
        40           1.2396            6.00s
        5

        70           1.1718           17.36s
        80           1.1607           11.56s
        90           1.1504            5.78s
       100           1.1413            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3512            1.02m
         2           1.3282           59.11s
         3           1.3081           58.20s
         4           1.2936           57.21s
         5           1.2811           56.96s
         6           1.2739           56.35s
         7           1.2689           55.11s
         8           1.2643           54.79s
         9           1.2596           54.32s
        10           1.2559           53.70s
        20           1.2327           47.45s
        30           1.2161           41.40s
        40           1.2027           35.32s
        50           1.1901           29.39s
        60           1.1797           23.45s
        70           1.1693           17.56s
        80           1.1586           11.70s
        9

        10           1.3072           53.60s
        20           1.2730           47.69s
        30           1.2534           41.68s
        40           1.2402           35.63s
        50           1.2304           29.66s
        60           1.2228           23.72s
        70           1.2162           17.76s
        80           1.2105           11.82s
        90           1.2054            5.91s
       100           1.2004            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3725            1.03m
         2           1.3628           59.87s
         3           1.3533           57.95s
         4           1.3456           57.78s
         5           1.3372           57.25s
         6           1.3301           56.18s
         7           1.3231           55.43s
         8           1.3169           54.76s
         9           1.3116           54.17s
        10           1.3066           53.53s
        20           1.2717           47.30s
        3

In [21]:
print('AUC-ROC results')
result

AUC-ROC results


Unnamed: 0,10,20,30,40,50,100
1.0,0.684801,0.692275,0.69588,0.698346,0.69806,0.697786
0.5,0.685604,0.697528,0.702277,0.704684,0.707135,0.711857
0.2,0.677429,0.690816,0.697447,0.701272,0.704072,0.711717
0.1,0.663027,0.68157,0.689139,0.693607,0.696953,0.705901


In [22]:
print('Time results')
result_time

Time results


Unnamed: 0,10,20,30,40,50,100
1.0,0:00:38.884072,0:01:07.668267,0:01:36.446779,0:02:05.689019,0:02:35.236219,0:04:59.584411
0.5,0:00:38.548270,0:01:08.236508,0:01:37.007466,0:02:05.613724,0:02:35.290523,0:05:00.336017
0.2,0:00:38.230483,0:01:07.169480,0:01:36.526868,0:02:06.574971,0:02:37.563227,0:05:05.029424
0.1,0:00:38.465220,0:01:07.622859,0:01:36.701187,0:02:05.190580,0:02:36.363736,0:05:03.692192


## Резюме по градиентному бустингу "в лоб"

1. Какие признаки имеют пропуски среди своих значений? Что могут означать пропуски в этих признаках (ответьте на этот вопрос для двух любых признаков)?

In [23]:
print('Признаки, которые имеют пропуски:\n')
print(* missing_values_name)

Признаки, которые имеют пропуски:

first_blood_time first_blood_team first_blood_player1 first_blood_player2 radiant_bottle_time radiant_courier_time radiant_flying_courier_time radiant_first_ward_time dire_bottle_time dire_courier_time dire_flying_courier_time dire_first_ward_time


Всего полей с пропусками 12. Поля с наибольшим количеством пропусков `FIRST_BLOOD_PLAYER2` и `RADIANT_FLYING_COURIER_TIME`.
`FIRST_BLOOD_PLAYER2` - второй игрок, причастный к совершению первого убийства в игре. 
`RADIANT_FLYING_COURIER_TIME` - время приобретения предмета "courier". 
Такое большое количество пропусков в данных признаках можно объяснить тем, что:
- `FIRST_BLOOD_PLAYER2` : Не всегда в первом убийстве участвует второй игрок
- `RADIANT_FLYING_COURIER_TIME` : Игроки не в каждой игре берут предмет "courier"

2. Как называется столбец, содержащий целевую переменную?

- `Целевая переменная имеет название radiant_win`

3. Как долго проводилась кросс-валидация для градиентного бустинга с 30 деревьями? Инструкцию по измерению времени можно найти ниже по тексту. Какое качество при этом получилось? Напомним, что в данном задании мы используем метрику качества AUC-ROC.

`Кросс-валидация для градиентного бустинга с 30 деревьями в среднем 1 минуту 37 секунд. Подробнее,для различных значений learning rate и для различных вариантов заполнения пропусков в таблице значений можно посмотреть в таблицах выше.
Значение параметра AUC-ROC для градиентного бустинга с 30 деревьями в среднем равен 0.69. Подробнее для различных значений learning rate и для различных вариантов заполнения пропусков в таблице значений можно посмотреть в таблицах выше.`

4. Имеет ли смысл использовать больше 30 деревьев в градиентном бустинге? Что бы вы предложили делать, чтобы ускорить его обучение при увеличении количества деревьев?

`Больше 30 деревьев в градиентом бустинге использовать не имеет смысла, так как значение метрики практически не увеличиается, а времени затрачивается намного больше (см. таблицы). Для ускорение времени на обучение можно рассмотреть вариант уменьшение глубины деревьев в градиентном бустинге.`