In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import time
import datetime

In [3]:
features = pd.read_csv('features.csv', index_col='match_id')
features.drop(['tower_status_radiant',
               'tower_status_dire',
               'barracks_status_radiant',
               'barracks_status_dire',
               'duration'], axis=1, inplace=True)

# Подход 1: градиентный бустинг "в лоб"

### Пропуски в таблице признаков

Следующие поля имеют пропуски:

In [4]:
missing_values = []
missing_values_name = []
for name in list(features.columns.values):
    if len(features[name]) - features[name].count():
        missing_values_name.append(name)
        print(f'{name} have {len(features[name]) - features[name].count()} missing values\n')
    missing_values.append(len(features[name]) - features[name].count())
sorted_missing_values = sorted(missing_values)
max_1 = sorted_missing_values[len(missing_values) - 1]
max_2 = sorted_missing_values[len(missing_values) - 2]
feature_with_max_missing_values = list(features.columns.values)[missing_values.index(max_1)]
second_feature_with_max_missing_values = list(features.columns.values)[missing_values.index(max_2)]
print(f"""Two features with max values of missing values are {feature_with_max_missing_values.upper()} with {max_1} 
          missing values and {second_feature_with_max_missing_values.upper()} with {max_2}""")

first_blood_time have 19553 missing values

first_blood_team have 19553 missing values

first_blood_player1 have 19553 missing values

first_blood_player2 have 43987 missing values

radiant_bottle_time have 15691 missing values

radiant_courier_time have 692 missing values

radiant_flying_courier_time have 27479 missing values

radiant_first_ward_time have 1836 missing values

dire_bottle_time have 16143 missing values

dire_courier_time have 676 missing values

dire_flying_courier_time have 26098 missing values

dire_first_ward_time have 1826 missing values

Two features with max values of missing values are FIRST_BLOOD_PLAYER2 with 43987 
          missing values and RADIANT_FLYING_COURIER_TIME with 27479


Всего полей с пропусками 12. Поля с наибольшим количеством пропусков `FIRST_BLOOD_PLAYER2` и `RADIANT_FLYING_COURIER_TIME`.
`FIRST_BLOOD_PLAYER2` - второй игрок, причастный к совершению первого убийства в игре. 
`RADIANT_FLYING_COURIER_TIME` - время приобретения предмета "courier". 
Такое большое количество пропусков в данных признаках можно объяснить тем, что:
- `FIRST_BLOOD_PLAYER2` : Не всегда в первом убийстве участвует второй игрок
- `RADIANT_FLYING_COURIER_TIME` : Игроки не в каждой игре берут предмет "courier"

#### Заполнение таблицы признаков нулями

In [20]:
features_0 = features.copy(deep = True)
for name in list(features_0.columns.values):
    features_0[name].fillna(value=0, inplace=True)

#### Заполнение таблицы признаков большими значениями 

In [5]:
features_BigNumbers = features.copy(deep = True)
for name in list(features_BigNumbers.columns.values):
    features_BigNumbers[name].fillna(value=1e8, inplace=True)

#### Заполнение таблицы признаков средними значениями

In [6]:
features_Mean = features.copy(deep = True)
for name in list(features_Mean.columns.values):
    features_Mean[name].fillna(value=features_Mean[name].mean(), inplace=True)

### Целевая переменная

In [7]:
y = pd.DataFrame(features['radiant_win'])

- `Целевая переменная имеет название radiant_win`

### Градиентый бустинг на решающими деревьями

#### Обучение на таблице признаков, заполненной нулями

In [21]:
X_0 = features_0.loc[:,features_0.columns != 'radiant_win']

In [10]:
kf = KFold(n_splits=5, random_state=1 ,shuffle=True)

In [22]:
result_0 = pd.DataFrame(columns=['10', '20', '30', '40', '50', '100'], index=['1','0.5','0.3', '0.2', '0.1'])
result_time_0 = pd.DataFrame(columns=['10', '20', '30', '40', '50', '100'], index=['1','0.5','0.3', '0.2', '0.1'])
for n_estimators in [10, 20, 30, 40, 50, 100]:
    for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]: 
        roc_auc_score_list = []
        start_time = datetime.datetime.now()
        for train, test in kf.split(X_0, y):
            X_0_train = X_0.iloc[train, :].values.tolist()
            X_0_test = X_0.iloc[test, :].values.tolist()
            y_train = list(map(lambda x: x.pop(), y.iloc[train, :].values.tolist()))
            y_test = list(map(lambda x: x.pop(),y.iloc[test, :].values.tolist()))
            clf = GradientBoostingClassifier(n_estimators=n_estimators, verbose=True, 
                                                random_state=241, learning_rate = learning_rate)
            probs = clf.fit(X_0_train, y_train).predict_proba(X_0_test)[:, 1]
            roc_auc_value = roc_auc_score(y_test, probs)
            roc_auc_score_list.append(roc_auc_value)
        delta_time = datetime.datetime.now() - start_time
        print(f'Classifier with number of trees {n_estimators} and learn_rate {learning_rate} has roc_auc value equal {np.mean(roc_auc_score_list)}')
        print(f'Classifier with number of trees {n_estimators} and learn_rate {learning_rate} has elapsed time {delta_time}')
        result_0[str(n_estimators)][str(learning_rate)] = np.mean(roc_auc_score_list)
        result_time_0[str(n_estimators)][str(learning_rate)] = delta_time

      Iter       Train Loss   Remaining Time 
         1           1.3504            6.22s
         2           1.3265            5.53s
         3           1.3074            5.06s
         4           1.2932            4.31s
         5           1.2838            3.63s
         6           1.2759            2.88s
         7           1.2687            2.14s
         8           1.2638            1.45s
         9           1.2602            0.71s
        10           1.2557            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3507            6.10s
         2           1.3279            5.19s
         3           1.3100            4.71s
         4           1.2948            3.98s
         5           1.2842            3.35s
         6           1.2770            2.75s
         7           1.2711            2.07s
         8           1.2662            1.37s
         9           1.2618            0.71s
        10           1.2582            0.00s
      It

         5           1.3374            3.19s
         6           1.3304            2.54s
         7           1.3237            1.91s
         8           1.3176            1.27s
         9           1.3123            0.63s
        10           1.3071            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3726            5.90s
         2           1.3633            5.17s
         3           1.3538            4.47s
         4           1.3454            3.82s
         5           1.3377            3.18s
         6           1.3307            2.53s
         7           1.3247            1.90s
         8           1.3188            1.27s
         9           1.3135            0.63s
        10           1.3085            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3730            5.88s
         2           1.3634            5.10s
         3           1.3537            4.39s
         4           1.3450            3.74s
        

         5           1.2987           10.68s
         6           1.2897            9.82s
         7           1.2820            9.21s
         8           1.2761            8.45s
         9           1.2700            7.66s
        10           1.2655            6.90s
        20           1.2351            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3594           12.13s
         2           1.3407           11.02s
         3           1.3239           10.27s
         4           1.3096            9.71s
         5           1.2990            9.01s
         6           1.2910            8.40s
         7           1.2839            7.81s
         8           1.2779            7.20s
         9           1.2722            6.62s
        10           1.2675            6.05s
        20           1.2378            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3599           12.66s
         2           1.3398           11.63s
        

        20           1.2726            0.00s
Classifier with number of trees 20 and learn_rate 0.2 has roc_auc value equal 0.6915670128485805
Classifier with number of trees 20 and learn_rate 0.2 has elapsed time 0:01:17.489181
      Iter       Train Loss   Remaining Time 
         1           1.3783           12.24s
         2           1.3727           11.40s
         3           1.3678           10.68s
         4           1.3631           10.08s
         5           1.3584            9.36s
         6           1.3544            8.74s
         7           1.3499            8.10s
         8           1.3456            7.44s
         9           1.3417            6.78s
        10           1.3379            6.13s
        20           1.3081            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3784           12.60s
         2           1.3730           11.92s
         3           1.3678           11.23s
         4           1.3634           10.45s
     

         5           1.2966           17.39s
         6           1.2875           17.66s
         7           1.2799           18.38s
         8           1.2733           18.16s
         9           1.2682           17.69s
        10           1.2639           17.15s
        20           1.2353            8.01s
        30           1.2195            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3598           19.63s
         2           1.3408           22.97s
         3           1.3234           21.98s
         4           1.3100           19.85s
         5           1.2976           20.02s
         6           1.2894           18.90s
         7           1.2820           17.70s
         8           1.2753           16.70s
         9           1.2699           15.73s
        10           1.2654           14.76s
        20           1.2360            7.36s
        30           1.2203            0.00s
Classifier with number of trees 30 and learn_rate 0.5 

         6           1.3544           16.76s
         7           1.3502           15.89s
         8           1.3457           15.21s
         9           1.3417           14.37s
        10           1.3381           13.60s
        20           1.3086            7.26s
        30           1.2886            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3787           21.56s
         2           1.3732           22.32s
         3           1.3680           20.82s
         4           1.3635           19.40s
         5           1.3592           19.28s
         6           1.3546           18.50s
         7           1.3500           18.26s
         8           1.3459           17.19s
         9           1.3418           16.15s
        10           1.3379           15.18s
        20           1.3077            7.25s
        30           1.2876            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3784           24.67s
        

         2           1.3408           25.62s
         3           1.3234           24.93s
         4           1.3100           23.87s
         5           1.2976           22.98s
         6           1.2894           22.07s
         7           1.2820           21.35s
         8           1.2753           20.81s
         9           1.2699           20.18s
        10           1.2654           19.47s
        20           1.2360           13.26s
        30           1.2203            6.57s
        40           1.2081            0.00s
Classifier with number of trees 40 and learn_rate 0.5 has roc_auc value equal 0.706125886099495
Classifier with number of trees 40 and learn_rate 0.5 has elapsed time 0:02:21.634853
      Iter       Train Loss   Remaining Time 
         1           1.3674           25.04s
         2           1.3540           24.45s
         3           1.3418           23.20s
         4           1.3304           22.35s
         5           1.3209           21.97s
       

         4           1.3634           24.60s
         5           1.3590           23.72s
         6           1.3544           23.14s
         7           1.3502           22.33s
         8           1.3457           21.57s
         9           1.3417           21.75s
        10           1.3381           20.93s
        20           1.3086           13.96s
        30           1.2886            6.78s
        40           1.2743            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3787           26.31s
         2           1.3732           26.22s
         3           1.3680           25.49s
         4           1.3635           24.57s
         5           1.3592           24.08s
         6           1.3546           23.17s
         7           1.3500           22.32s
         8           1.3459           21.53s
         9           1.3418           20.87s
        10           1.3379           20.16s
        20           1.3077           13.17s
        3

         2           1.3396           29.65s
         3           1.3223           28.63s
         4           1.3068           27.95s
         5           1.2966           27.42s
         6           1.2875           26.96s
         7           1.2799           26.50s
         8           1.2733           26.00s
         9           1.2682           25.46s
        10           1.2639           24.85s
        20           1.2353           18.33s
        30           1.2195           12.30s
        40           1.2074            6.13s
        50           1.1970            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3598           31.14s
         2           1.3408           30.03s
         3           1.3234           29.49s
         4           1.3100           28.52s
         5           1.2976           27.69s
         6           1.2894           26.85s
         7           1.2820           26.17s
         8           1.2753           25.74s
         

        20           1.2726           28.86s
        30           1.2527           20.24s
        40           1.2394           10.40s
        50           1.2297            0.00s
Classifier with number of trees 50 and learn_rate 0.2 has roc_auc value equal 0.7047084323919888
Classifier with number of trees 50 and learn_rate 0.2 has elapsed time 0:03:49.796921
      Iter       Train Loss   Remaining Time 
         1           1.3783           42.30s
         2           1.3727           40.76s
         3           1.3678           39.94s
         4           1.3631           39.62s
         5           1.3584           39.65s
         6           1.3544           38.55s
         7           1.3499           37.52s
         8           1.3456           36.37s
         9           1.3417           37.12s
        10           1.3379           37.56s
        20           1.3081           31.28s
        30           1.2876           19.18s
        40           1.2727            9.13s
      

        60           1.1776           33.41s
        70           1.1673           24.20s
        80           1.1557           15.74s
        90           1.1467            7.65s
       100           1.1389            0.00s
Classifier with number of trees 100 and learn_rate 1 has roc_auc value equal 0.7007752069368333
Classifier with number of trees 100 and learn_rate 1 has elapsed time 0:06:59.263468
      Iter       Train Loss   Remaining Time 
         1           1.3592            1.03m
         2           1.3404            1.00m
         3           1.3240           58.70s
         4           1.3093           58.04s
         5           1.2987           57.11s
         6           1.2897           56.75s
         7           1.2820           56.46s
         8           1.2761           55.62s
         9           1.2700           54.85s
        10           1.2655           54.41s
        20           1.2351           48.63s
        30           1.2182           42.48s
        

         9           1.2913           58.15s
        10           1.2861           57.37s
        20           1.2522           53.02s
        30           1.2346           47.49s
        40           1.2226           40.18s
        50           1.2136           32.82s
        60           1.2058           26.08s
        70           1.1989           19.98s
        80           1.1925           13.80s
        90           1.1868            6.95s
       100           1.1817            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3678            1.04m
         2           1.3539           59.94s
         3           1.3413           58.68s
         4           1.3300           57.90s
         5           1.3203           56.85s
         6           1.3127           57.08s
         7           1.3053           56.89s
         8           1.2990           59.31s
         9           1.2926           59.93s
        10           1.2871           59.53s
        2

         3           1.3680            1.06m
         4           1.3635            1.04m
         5           1.3592            1.02m
         6           1.3546           59.95s
         7           1.3500           58.76s
         8           1.3459           57.80s
         9           1.3418           56.83s
        10           1.3379           56.00s
        20           1.3077           49.19s
        30           1.2876           42.74s
        40           1.2731           36.62s
        50           1.2617           30.45s
        60           1.2528           24.44s
        70           1.2453           18.80s
        80           1.2392           12.80s
        90           1.2339            6.49s
       100           1.2293            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3784            2.35m
         2           1.3728            1.67m
         3           1.3677            1.44m
         4           1.3631            1.32m
         

In [23]:
print('ROC-AUC results')
result_0

ROC-AUC results


Unnamed: 0,10,20,30,40,50,100
1.0,0.684291,0.693152,0.697415,0.699814,0.700226,0.700775
0.5,0.68529,0.697754,0.702799,0.706126,0.708608,0.712554
0.3,0.683911,0.695696,0.701111,0.704532,0.706804,0.713539
0.2,0.677809,0.691567,0.697992,0.702197,0.704708,0.711951
0.1,0.664833,0.682114,0.689695,0.693934,0.697114,0.706211


In [24]:
print('Time results')
result_time_0

Time results


Unnamed: 0,10,20,30,40,50,100
1.0,0:00:45.849378,0:01:15.626197,0:01:44.071127,0:02:27.703044,0:03:04.983553,0:06:59.263468
0.5,0:00:45.896983,0:01:19.191175,0:01:53.795846,0:02:21.634853,0:02:43.848597,0:05:32.652043
0.3,0:00:41.712911,0:01:18.869184,0:01:46.149307,0:02:22.141157,0:02:48.390154,0:05:24.812534
0.2,0:00:40.526469,0:01:17.489181,0:01:41.248251,0:02:25.421200,0:03:49.796921,0:05:13.887897
0.1,0:00:45.055303,0:01:10.163601,0:01:56.713391,0:02:22.779087,0:03:45.514012,0:05:43.127329


#### Обучение на таблице признаков, заполненной большими значениями

In [25]:
X_BigNumber = features_BigNumbers.loc[:,features_BigNumbers.columns != 'radiant_win']

In [26]:
result_BN = pd.DataFrame(columns=['10', '20', '30', '40', '50', '100'], index=['1','0.5','0.3', '0.2', '0.1'])
result_time_BN = pd.DataFrame(columns=['10', '20', '30', '40', '50', '100'], index=['1','0.5','0.3', '0.2', '0.1'])
for n_estimators in [10, 20, 30, 40, 50, 100]:
    for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]: 
        roc_auc_score_list = []
        start_time = datetime.datetime.now()
        for train, test in kf.split(X_BigNumber, y):
            X_BigNumber_train = X_BigNumber.iloc[train, :].values.tolist()
            X_BigNumber_test = X_BigNumber.iloc[test, :].values.tolist()
            y_train = list(map(lambda x: x.pop(), y.iloc[train, :].values.tolist()))
            y_test = list(map(lambda x: x.pop(),y.iloc[test, :].values.tolist()))
            clf = GradientBoostingClassifier(n_estimators=n_estimators, verbose=True, 
                                                random_state=241, learning_rate = learning_rate)
            probs = clf.fit(X_BigNumber_train, y_train).predict_proba(X_BigNumber_test)[:, 1]
            roc_auc_value = roc_auc_score(y_test, probs)
            roc_auc_score_list.append(roc_auc_value)
        delta_time = datetime.datetime.now() - start_time
        print(f'Classifier with number of trees {n_estimators} and learn_rate {learning_rate} has roc_auc value equal {np.mean(roc_auc_score_list)}')
        print(f'Classifier with number of trees {n_estimators} and learn_rate {learning_rate} has elapsed time {delta_time}')
        result_BN[str(n_estimators)][str(learning_rate)] = np.mean(roc_auc_score_list)
        result_time_BN[str(n_estimators)][str(learning_rate)] = delta_time

      Iter       Train Loss   Remaining Time 
         1           1.3511            5.92s
         2           1.3272            5.12s
         3           1.3092            4.42s
         4           1.2956            3.84s
         5           1.2853            3.22s
         6           1.2773            2.58s
         7           1.2717            1.93s
         8           1.2676            1.28s
         9           1.2631            0.64s
        10           1.2587            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3521            5.92s
         2           1.3276            5.15s
         3           1.3092            4.51s
         4           1.2955            3.86s
         5           1.2866            3.23s
         6           1.2795            2.59s
         7           1.2733            1.95s
         8           1.2678            1.30s
         9           1.2630            0.65s
        10           1.2590            0.00s
      It

         5           1.3383            3.09s
         6           1.3310            2.46s
         7           1.3243            1.85s
         8           1.3182            1.24s
         9           1.3129            0.61s
        10           1.3077            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3732            5.66s
         2           1.3642            5.15s
         3           1.3552            4.40s
         4           1.3465            3.72s
         5           1.3385            3.10s
         6           1.3312            2.46s
         7           1.3249            1.85s
         8           1.3194            1.24s
         9           1.3135            0.62s
        10           1.3087            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3735            5.80s
         2           1.3645            5.09s
         3           1.3550            4.36s
         4           1.3459            3.73s
        

         5           1.2989            9.26s
         6           1.2899            8.64s
         7           1.2827            7.99s
         8           1.2758            7.42s
         9           1.2700            6.80s
        10           1.2657            6.19s
        20           1.2362            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3605           11.77s
         2           1.3408           10.94s
         3           1.3246           10.53s
         4           1.3110            9.81s
         5           1.2997            9.18s
         6           1.2915            8.63s
         7           1.2842            8.19s
         8           1.2786            7.56s
         9           1.2730            6.91s
        10           1.2683            6.29s
        20           1.2392            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3610           12.57s
         2           1.3417           11.29s
        

        20           1.2732            0.00s
Classifier with number of trees 20 and learn_rate 0.2 has roc_auc value equal 0.6908246192610712
Classifier with number of trees 20 and learn_rate 0.2 has elapsed time 0:01:09.051428
      Iter       Train Loss   Remaining Time 
         1           1.3785           11.92s
         2           1.3731           11.10s
         3           1.3683           10.56s
         4           1.3631            9.73s
         5           1.3583            9.06s
         6           1.3541            8.50s
         7           1.3497            7.84s
         8           1.3456            7.25s
         9           1.3417            6.61s
        10           1.3379            6.00s
        20           1.3084            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3787           12.24s
         2           1.3734           11.47s
         3           1.3687           10.82s
         4           1.3638           10.04s
     

         5           1.2973           15.13s
         6           1.2884           14.60s
         7           1.2812           13.95s
         8           1.2749           13.31s
         9           1.2701           12.66s
        10           1.2655           12.04s
        20           1.2363            6.04s
        30           1.2203            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3608           18.39s
         2           1.3406           16.94s
         3           1.3234           16.35s
         4           1.3089           15.76s
         5           1.2984           15.21s
         6           1.2893           14.90s
         7           1.2821           14.18s
         8           1.2754           13.56s
         9           1.2701           13.00s
        10           1.2655           12.38s
        20           1.2362            6.14s
        30           1.2202            0.00s
Classifier with number of trees 30 and learn_rate 0.5 

         6           1.3545           14.64s
         7           1.3502           13.93s
         8           1.3466           13.37s
         9           1.3425           12.67s
        10           1.3386           12.06s
        20           1.3090            5.95s
        30           1.2887            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3790           18.23s
         2           1.3736           17.48s
         3           1.3688           16.74s
         4           1.3641           16.07s
         5           1.3591           15.36s
         6           1.3544           14.61s
         7           1.3505           14.14s
         8           1.3462           13.48s
         9           1.3421           12.83s
        10           1.3381           12.22s
        20           1.3081            6.01s
        30           1.2880            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3786           18.74s
        

         2           1.3406           23.13s
         3           1.3234           22.26s
         4           1.3089           21.70s
         5           1.2984           21.31s
         6           1.2893           20.67s
         7           1.2821           19.94s
         8           1.2754           19.35s
         9           1.2701           18.84s
        10           1.2655           18.26s
        20           1.2362           12.13s
        30           1.2202            6.09s
        40           1.2081            0.00s
Classifier with number of trees 40 and learn_rate 0.5 has roc_auc value equal 0.7054884649069383
Classifier with number of trees 40 and learn_rate 0.5 has elapsed time 0:02:09.357716
      Iter       Train Loss   Remaining Time 
         1           1.3678           24.29s
         2           1.3550           24.88s
         3           1.3425           23.87s
         4           1.3315           22.73s
         5           1.3217           21.94s
      

         4           1.3638           22.13s
         5           1.3593           21.52s
         6           1.3545           20.70s
         7           1.3502           20.04s
         8           1.3466           19.48s
         9           1.3425           18.78s
        10           1.3386           18.18s
        20           1.3090           12.14s
        30           1.2887            6.06s
        40           1.2743            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3790           24.39s
         2           1.3736           23.57s
         3           1.3688           22.65s
         4           1.3641           22.52s
         5           1.3591           21.66s
         6           1.3544           20.79s
         7           1.3505           20.25s
         8           1.3462           19.52s
         9           1.3421           18.85s
        10           1.3381           18.25s
        20           1.3081           11.91s
        3

         2           1.3408           30.07s
         3           1.3229           28.89s
         4           1.3094           27.94s
         5           1.2973           27.68s
         6           1.2884           27.20s
         7           1.2812           26.43s
         8           1.2749           25.77s
         9           1.2701           25.00s
        10           1.2655           24.24s
        20           1.2363           18.10s
        30           1.2203           12.01s
        40           1.2085            6.01s
        50           1.1987            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3608           31.35s
         2           1.3406           29.34s
         3           1.3234           30.28s
         4           1.3089           29.16s
         5           1.2984           28.54s
         6           1.2893           28.38s
         7           1.2821           27.40s
         8           1.2754           26.67s
         

        20           1.2732           17.67s
        30           1.2529           11.77s
        40           1.2394            5.89s
        50           1.2295            0.00s
Classifier with number of trees 50 and learn_rate 0.2 has roc_auc value equal 0.7043518055944455
Classifier with number of trees 50 and learn_rate 0.2 has elapsed time 0:02:37.626415
      Iter       Train Loss   Remaining Time 
         1           1.3785           31.14s
         2           1.3731           29.69s
         3           1.3683           29.12s
         4           1.3631           28.22s
         5           1.3583           27.37s
         6           1.3541           26.85s
         7           1.3497           26.04s
         8           1.3456           25.38s
         9           1.3417           24.68s
        10           1.3379           24.06s
        20           1.3084           17.77s
        30           1.2880           11.81s
        40           1.2728            5.91s
      

        60           1.1782           24.02s
        70           1.1673           17.91s
        80           1.1588           11.90s
        90           1.1491            5.96s
       100           1.1399            0.00s
Classifier with number of trees 100 and learn_rate 1 has roc_auc value equal 0.7002175311707874
Classifier with number of trees 100 and learn_rate 1 has elapsed time 0:05:05.467991
      Iter       Train Loss   Remaining Time 
         1           1.3598            1.05m
         2           1.3406            1.01m
         3           1.3235            1.01m
         4           1.3098           59.70s
         5           1.2989           59.52s
         6           1.2899           58.73s
         7           1.2827           57.81s
         8           1.2758           57.31s
         9           1.2700           56.63s
        10           1.2657           56.03s
        20           1.2362           49.24s
        30           1.2195           42.98s
        

         9           1.2919           56.24s
        10           1.2864           55.56s
        20           1.2535           48.65s
        30           1.2358           42.24s
        40           1.2238           36.22s
        50           1.2146           30.26s
        60           1.2065           24.16s
        70           1.1998           18.07s
        80           1.1934           12.01s
        90           1.1873            6.01s
       100           1.1818            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3685            1.03m
         2           1.3557            1.02m
         3           1.3428           59.33s
         4           1.3311           58.21s
         5           1.3213           57.32s
         6           1.3130           57.12s
         7           1.3052           56.68s
         8           1.2989           55.82s
         9           1.2925           55.40s
        10           1.2876           54.65s
        2

         3           1.3688            1.06m
         4           1.3641            1.03m
         5           1.3591            1.01m
         6           1.3544           58.81s
         7           1.3505           58.21s
         8           1.3462           57.19s
         9           1.3421           56.16s
        10           1.3381           55.37s
        20           1.3081           47.70s
        30           1.2880           41.47s
        40           1.2732           35.54s
        50           1.2617           29.58s
        60           1.2528           23.76s
        70           1.2454           17.86s
        80           1.2390           11.96s
        90           1.2338            5.99s
       100           1.2290            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3786            1.15m
         2           1.3733            1.08m
         3           1.3684            1.05m
         4           1.3638            1.03m
         

In [27]:
print('ROC-AUC results')
result_BN

ROC-AUC results


Unnamed: 0,10,20,30,40,50,100
1.0,0.683625,0.693236,0.69648,0.698713,0.700209,0.700218
0.5,0.6855,0.697404,0.702367,0.705488,0.707895,0.71198
0.3,0.681391,0.695126,0.700788,0.704223,0.706899,0.714178
0.2,0.676771,0.690825,0.697691,0.701796,0.704352,0.712388
0.1,0.666886,0.682805,0.689896,0.694669,0.698003,0.706592


In [28]:
print('Time results')
result_time_BN

Time results


Unnamed: 0,10,20,30,40,50,100
1.0,0:00:41.600645,0:01:09.370484,0:01:37.655783,0:02:08.517848,0:02:38.973709,0:05:05.467991
0.5,0:00:41.715472,0:01:10.428336,0:01:40.075126,0:02:09.357716,0:02:38.432789,0:05:08.919669
0.3,0:00:42.159615,0:01:09.000392,0:01:39.627105,0:02:09.578474,0:02:39.024227,0:05:08.088912
0.2,0:00:39.462461,0:01:09.051428,0:01:39.018168,0:02:09.625098,0:02:37.626415,0:05:08.432524
0.1,0:00:41.013071,0:01:08.612914,0:01:38.574667,0:02:09.269748,0:02:37.170207,0:05:15.149075


#### Обучение на таблице признаков, заполненной средними значениями

In [14]:
X_Mean =  features_Mean.loc[:,features_Mean.columns != 'radiant_win']

In [15]:
result = pd.DataFrame(columns=['10', '20', '30', '40', '50', '100'], index=['1','0.5','0.3', '0.2', '0.1'])
result_time = pd.DataFrame(columns=['10', '20', '30', '40', '50', '100'], index=['1','0.5','0.3', '0.2', '0.1'])
for n_estimators in [10, 20, 30, 40, 50, 100]:
    for learning_rate in [1, 0.5, 0.3, 0.2, 0.1]: 
        roc_auc_score_list = []
        start_time = datetime.datetime.now()
        for train, test in kf.split(X_Mean, y):
            X_Mean_train = X_Mean.iloc[train, :].values.tolist()
            X_Mean_test = X_Mean.iloc[test, :].values.tolist()
            y_train = list(map(lambda x: x.pop(), y.iloc[train, :].values.tolist()))
            y_test = list(map(lambda x: x.pop(),y.iloc[test, :].values.tolist()))
            clf = GradientBoostingClassifier(n_estimators=n_estimators, verbose=True, 
                                                random_state=241, learning_rate = learning_rate)
            probs = clf.fit(X_Mean_train, y_train).predict_proba(X_Mean_test)[:, 1]
            roc_auc_value = roc_auc_score(y_test, probs)
            roc_auc_score_list.append(roc_auc_value)
        delta_time = datetime.datetime.now() - start_time
        print(f'Classifier with number of trees {n_estimators} and learn_rate {learning_rate} has roc_auc value equal {np.mean(roc_auc_score_list)}')
        print(f'Classifier with number of trees {n_estimators} and learn_rate {learning_rate} has elapsed time {delta_time}')
        result[str(n_estimators)][str(learning_rate)] = np.mean(roc_auc_score_list)
        result_time[str(n_estimators)][str(learning_rate)] = delta_time

      Iter       Train Loss   Remaining Time 
         1           1.3504            5.75s
         2           1.3265            4.91s
         3           1.3074            4.20s
         4           1.2932            3.61s
         5           1.2839            3.01s
         6           1.2758            2.42s
         7           1.2689            1.82s
         8           1.2638            1.21s
         9           1.2596            0.61s
        10           1.2563            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3507            7.66s
         2           1.3279            5.86s
         3           1.3100            5.00s
         4           1.2948            4.27s
         5           1.2842            3.65s
         6           1.2770            2.98s
         7           1.2706            2.31s
         8           1.2666            1.51s
         9           1.2626            0.74s
        10           1.2590            0.00s
      It

         5           1.3369            2.97s
         6           1.3298            2.38s
         7           1.3232            1.78s
         8           1.3174            1.18s
         9           1.3117            0.59s
        10           1.3067            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3726            5.67s
         2           1.3631            4.93s
         3           1.3538            4.26s
         4           1.3454            3.59s
         5           1.3376            2.97s
         6           1.3312            2.39s
         7           1.3247            1.79s
         8           1.3186            1.19s
         9           1.3131            0.59s
        10           1.3081            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3730            5.58s
         2           1.3635            4.89s
         3           1.3541            4.22s
         4           1.3451            3.59s
        

         5           1.2987            8.95s
         6           1.2897            8.40s
         7           1.2820            7.81s
         8           1.2761            7.18s
         9           1.2700            6.57s
        10           1.2656            5.95s
        20           1.2356            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3594           11.94s
         2           1.3407           10.93s
         3           1.3239           10.21s
         4           1.3096            9.59s
         5           1.2990            8.96s
         6           1.2910            8.38s
         7           1.2839            7.78s
         8           1.2778            7.23s
         9           1.2721            6.63s
        10           1.2675            6.04s
        20           1.2379            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3599           11.86s
         2           1.3398           11.00s
        

        20           1.2725            0.00s
Classifier with number of trees 20 and learn_rate 0.2 has roc_auc value equal 0.6908159672504474
Classifier with number of trees 20 and learn_rate 0.2 has elapsed time 0:01:08.560299
      Iter       Train Loss   Remaining Time 
         1           1.3783           11.83s
         2           1.3727           11.04s
         3           1.3675           10.41s
         4           1.3629            9.76s
         5           1.3586            9.11s
         6           1.3541            8.48s
         7           1.3499            7.88s
         8           1.3456            7.25s
         9           1.3416            6.60s
        10           1.3378            5.99s
        20           1.3080            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3784           11.81s
         2           1.3730           11.03s
         3           1.3679           10.40s
         4           1.3635            9.75s
     

         5           1.2966           14.95s
         6           1.2875           14.47s
         7           1.2799           13.89s
         8           1.2734           13.42s
         9           1.2680           12.84s
        10           1.2640           12.12s
        20           1.2345            6.07s
        30           1.2194            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3598           18.25s
         2           1.3408           17.02s
         3           1.3234           16.53s
         4           1.3100           15.68s
         5           1.2976           14.97s
         6           1.2894           14.26s
         7           1.2820           13.59s
         8           1.2753           13.05s
         9           1.2705           12.48s
        10           1.2655           11.90s
        20           1.2361            6.01s
        30           1.2203            0.00s
Classifier with number of trees 30 and learn_rate 0.5 

         6           1.3542           14.54s
         7           1.3500           13.84s
         8           1.3458           13.36s
         9           1.3418           12.73s
        10           1.3379           12.12s
        20           1.3086            5.96s
        30           1.2887            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3787           17.99s
         2           1.3732           17.04s
         3           1.3681           16.32s
         4           1.3636           15.68s
         5           1.3594           15.03s
         6           1.3549           14.39s
         7           1.3505           13.69s
         8           1.3461           13.03s
         9           1.3420           12.38s
        10           1.3384           11.83s
        20           1.3081            5.86s
        30           1.2879            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3784           17.82s
        

         2           1.3408           22.55s
         3           1.3234           21.88s
         4           1.3100           21.06s
         5           1.2976           20.45s
         6           1.2894           19.70s
         7           1.2820           19.07s
         8           1.2753           18.65s
         9           1.2705           18.12s
        10           1.2655           17.58s
        20           1.2361           11.80s
        30           1.2203            5.88s
        40           1.2082            0.00s
Classifier with number of trees 40 and learn_rate 0.5 has roc_auc value equal 0.7046840900964207
Classifier with number of trees 40 and learn_rate 0.5 has elapsed time 0:02:07.622082
      Iter       Train Loss   Remaining Time 
         1           1.3674           23.72s
         2           1.3543           23.10s
         3           1.3421           22.12s
         4           1.3310           21.17s
         5           1.3215           20.51s
      

         4           1.3635           21.79s
         5           1.3590           21.13s
         6           1.3542           20.34s
         7           1.3500           19.59s
         8           1.3458           18.89s
         9           1.3418           18.25s
        10           1.3379           17.63s
        20           1.3086           11.64s
        30           1.2887            5.81s
        40           1.2742            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3787           24.00s
         2           1.3732           23.08s
         3           1.3681           22.33s
         4           1.3636           21.76s
         5           1.3594           21.07s
         6           1.3549           20.30s
         7           1.3505           19.58s
         8           1.3461           18.90s
         9           1.3420           18.24s
        10           1.3384           17.71s
        20           1.3081           11.70s
        3

         2           1.3396           35.70s
         3           1.3223           36.46s
         4           1.3068           37.19s
         5           1.2966           34.64s
         6           1.2875           32.85s
         7           1.2799           31.49s
         8           1.2734           30.37s
         9           1.2680           29.14s
        10           1.2640           28.05s
        20           1.2345           20.02s
        30           1.2194           13.15s
        40           1.2075            6.61s
        50           1.1973            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3598           33.08s
         2           1.3408           30.99s
         3           1.3234           30.14s
         4           1.3100           29.00s
         5           1.2976           27.93s
         6           1.2894           26.96s
         7           1.2820           26.27s
         8           1.2753           25.71s
         

        20           1.2725           19.22s
        30           1.2529           13.33s
        40           1.2396            6.51s
        50           1.2296            0.00s
Classifier with number of trees 50 and learn_rate 0.2 has roc_auc value equal 0.7040721427444647
Classifier with number of trees 50 and learn_rate 0.2 has elapsed time 0:02:41.733838
      Iter       Train Loss   Remaining Time 
         1           1.3783           30.73s
         2           1.3727           29.82s
         3           1.3675           29.11s
         4           1.3629           28.46s
         5           1.3586           27.82s
         6           1.3541           27.02s
         7           1.3499           26.48s
         8           1.3456           25.79s
         9           1.3416           25.02s
        10           1.3378           24.35s
        20           1.3080           17.98s
        30           1.2877           11.91s
        40           1.2730            5.95s
      

        60           1.1797           24.27s
        70           1.1693           18.07s
        80           1.1586           11.99s
        90           1.1484            5.96s
       100           1.1408            0.00s
Classifier with number of trees 100 and learn_rate 1 has roc_auc value equal 0.6977858100816298
Classifier with number of trees 100 and learn_rate 1 has elapsed time 0:05:05.036364
      Iter       Train Loss   Remaining Time 
         1           1.3592            1.01m
         2           1.3404            1.00m
         3           1.3240           58.60s
         4           1.3093           57.33s
         5           1.2987           56.69s
         6           1.2897           57.11s
         7           1.2820           56.64s
         8           1.2761           55.78s
         9           1.2700           54.99s
        10           1.2656           54.25s
        20           1.2356           47.60s
        30           1.2192           41.88s
        

         9           1.2922            1.08m
        10           1.2869            1.12m
        20           1.2540            1.01m
        30           1.2363           52.18s
        40           1.2246           43.74s
        50           1.2153           36.67s
        60           1.2070           29.56s
        70           1.1999           21.61s
        80           1.1934           14.31s
        90           1.1876            7.05s
       100           1.1823            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3678            1.29m
         2           1.3550            1.17m
         3           1.3422            1.16m
         4           1.3315            1.11m
         5           1.3218            1.09m
         6           1.3137            1.06m
         7           1.3063            1.05m
         8           1.2993            1.03m
         9           1.2937            1.02m
        10           1.2881            1.00m
        2

         3           1.3681            1.27m
         4           1.3636            1.24m
         5           1.3594            1.22m
         6           1.3549            1.20m
         7           1.3505            1.19m
         8           1.3461            1.18m
         9           1.3420            1.16m
        10           1.3384            1.16m
        20           1.3081           58.39s
        30           1.2879           50.24s
        40           1.2733           43.57s
        50           1.2620           37.36s
        60           1.2531           30.09s
        70           1.2458           22.76s
        80           1.2396           15.39s
        90           1.2343            7.68s
       100           1.2297            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.3784            1.42m
         2           1.3728            1.34m
         3           1.3676            1.34m
         4           1.3630            1.25m
         

In [16]:
print('ROC-AUC results')
result

ROC-AUC results


Unnamed: 0,10,20,30,40,50,100
1.0,0.684801,0.692275,0.69588,0.698346,0.69806,0.697786
0.5,0.685604,0.697528,0.702277,0.704684,0.707135,0.711857
0.3,0.680761,0.693968,0.699857,0.703655,0.706416,0.713544
0.2,0.677429,0.690816,0.697447,0.701272,0.704072,0.711717
0.1,0.663027,0.68157,0.689139,0.693607,0.696953,0.705901


In [17]:
print('Time results')
result_time

Time results


Unnamed: 0,10,20,30,40,50,100
1.0,0:00:41.069582,0:01:15.176800,0:01:38.112976,0:02:07.497396,0:02:38.046812,0:05:05.036364
0.5,0:00:38.274420,0:01:08.905859,0:01:37.904447,0:02:07.622082,0:02:47.327061,0:05:08.832299
0.3,0:00:38.118459,0:01:08.665940,0:01:38.836331,0:02:06.163074,0:02:38.907557,0:06:06.007967
0.2,0:00:38.194378,0:01:08.560299,0:01:38.222163,0:02:07.803828,0:02:41.733838,0:05:51.300087
0.1,0:00:39.797493,0:01:08.796024,0:01:38.662917,0:02:11.546958,0:02:38.406793,0:06:19.186214


## Резюме по градиентному бустингу "в лоб"

1. Какие признаки имеют пропуски среди своих значений? Что могут означать пропуски в этих признаках (ответьте на этот вопрос для двух любых признаков)?

In [8]:
print('Признаки, которые имеют пропуски:\n')
print(* missing_values_name)

Признаки, которые имеют пропуски:

first_blood_time first_blood_team first_blood_player1 first_blood_player2 radiant_bottle_time radiant_courier_time radiant_flying_courier_time radiant_first_ward_time dire_bottle_time dire_courier_time dire_flying_courier_time dire_first_ward_time


Всего полей с пропусками 12. Поля с наибольшим количеством пропусков `FIRST_BLOOD_PLAYER2` и `RADIANT_FLYING_COURIER_TIME`.
`FIRST_BLOOD_PLAYER2` - второй игрок, причастный к совершению первого убийства в игре. 
`RADIANT_FLYING_COURIER_TIME` - время приобретения предмета "courier". 
Такое большое количество пропусков в данных признаках можно объяснить тем, что:
- `FIRST_BLOOD_PLAYER2` : Не всегда в первом убийстве участвует второй игрок
- `RADIANT_FLYING_COURIER_TIME` : Игроки не в каждой игре берут предмет "courier"

2. Как называется столбец, содержащий целевую переменную?

- `Целевая переменная имеет название radiant_win`

3. Как долго проводилась кросс-валидация для градиентного бустинга с 30 деревьями? Инструкцию по измерению времени можно найти ниже по тексту. Какое качество при этом получилось? Напомним, что в данном задании мы используем метрику качества AUC-ROC.

`Кросс-валидация для градиентного бустинга с 30 деревьями в среднем 1 минуту 8 секунд. Подробнее,для различных значений learning rate и для различных вариантов заполнения пропусков в таблице значений можно посмотреть в таблицах выше.
Значение параметра AUC-ROC для градиентного бустинга с 30 деревьями в среднем равен 0.69. Подробнее для различных значений learning rate и для различных вариантов заполнения пропусков в таблице значений можно посмотреть в таблицах выше.`

4. Имеет ли смысл использовать больше 30 деревьев в градиентном бустинге? Что бы вы предложили делать, чтобы ускорить его обучение при увеличении количества деревьев?

`Больше 30 деревьев в градиентом бустинге использовать не имеет смысла, так как значение метрики практически не увеличиается, а времени затрачивается намного больше (см. таблицы). Для ускорение времени на обучение можно рассмотреть вариант уменьшение глубины деревьев в градиентном бустинге.`