# Step 0.1. Import necessary libraries 

In [1]:
# Standard python libraries
import logging
import os
import time
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler

# Step 0.2. Parameters 

In [2]:
N_THREADS = 8 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TIMEOUT = 300 # Time in seconds for automl run
TARGET_NAME = 'TARGET' # Target column name

# Step 0.3. Fix torch number of threads and numpy seed 

In [3]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

# Step 0.4. Change profiling decorators settings 

By default, profiling decorators are turned off for speed and memory reduction. If you want to see profiling report after using LAMA, you need to turn on the decorators using command below: 

In [4]:
p = Profiler()
p.change_deco_settings({'enabled': True})

# Step 0.5. Example data load 

In [5]:
%%time

data = pd.read_csv('./example_data/test_data_files/sampled_app_train.csv')
data.head()

Wall time: 130 ms


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,313802,0,Cash loans,M,N,Y,0,270000.0,327024.0,15372.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,319656,0,Cash loans,F,N,N,0,108000.0,675000.0,19737.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,207678,0,Revolving loans,F,Y,Y,2,112500.0,270000.0,13500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,381593,0,Cash loans,F,N,N,1,67500.0,142200.0,9630.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
4,258153,0,Cash loans,F,Y,Y,0,337500.0,1483231.5,46570.5,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0


# Step 0.6. (Optional) Some user feature preparation 

Cell below shows some user feature preparations to create task more difficult (this block can be omitted if you don't want to change the initial data):

In [6]:
%%time

data['BIRTH_DATE'] = (np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
                    ).astype(str)

data['constant'] = 1
data['allnan'] = np.nan

data['report_dt'] = np.datetime64('2018-01-01')

data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

[2020-12-29 19:52:56,930] (INFO): NumExpr defaulting to 8 threads.


Wall time: 202 ms


# Step 0.7. (Optional) Data splitting for train-test 

Block below can be omitted if you are going to train model only or you have specific train and test files:

In [7]:
%%time

train_data, test_data = train_test_split(data, 
                                         test_size=TEST_SIZE, 
                                         stratify=data[TARGET_NAME], 
                                         random_state=RANDOM_STATE)
logging.info('Data splitted. Parts sizes: train_data = {}, test_data = {}'
              .format(train_data.shape, test_data.shape))

[2020-12-29 19:52:57,072] (INFO): Data splitted. Parts sizes: train_data = (8000, 125), test_data = (2000, 125)


Wall time: 18 ms


In [8]:
train_data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,BIRTH_DATE,EMP_DATE,constant,allnan,report_dt
6444,112261,0,Cash loans,F,N,N,1,90000.0,640080.0,31261.5,...,0.0,0.0,0.0,1.0,0.0,1985-06-28,2012-06-21,1,,2018-01-01
3586,115058,0,Cash loans,F,N,Y,0,180000.0,239850.0,23850.0,...,0.0,0.0,0.0,0.0,3.0,1953-12-27,2018-01-01,1,,2018-01-01
9349,326623,0,Cash loans,F,N,Y,0,112500.0,337500.0,31086.0,...,0.0,0.0,0.0,0.0,2.0,1975-06-21,2016-06-17,1,,2018-01-01
7734,191976,0,Cash loans,M,Y,Y,1,67500.0,135000.0,9018.0,...,,,,,,1988-04-27,2009-06-05,1,,2018-01-01
2174,281519,0,Revolving loans,F,N,Y,0,67500.0,202500.0,10125.0,...,0.0,0.0,0.0,0.0,2.0,1975-06-13,1997-01-22,1,,2018-01-01


# ========= AutoML preset usage =========


## Step 1. Create Task

In [9]:
%%time

task = Task('binary', )

Wall time: 4.97 ms


## Step 2. Setup columns roles

Roles setup here set target column and base date, which is used to calculate date differences:

In [10]:
%%time

roles = {'target': TARGET_NAME,
         DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt',
         }

Wall time: 0 ns


## Step 3. Create AutoML from preset

To create AutoML model here we use `TabularAutoML` preset, which looks like:

![TabularAutoML preset pipeline](imgs/tutorial_2_pipeline.png)

All params we set above can be send inside preset to change its configuration:

In [11]:
%%time 

automl = TabularAutoML(task = task, 
                       timeout = TIMEOUT,
                       general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 30},
                       lgb_params = {'default_params': {'num_threads': N_THREADS}})
oof_pred = automl.fit_predict(train_data, roles = roles)
logging.info('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


Start automl preset with listed constraints:
- time: 300 seconds
- cpus: 4 cores
- memory: 16 gb


Train data shape: (8000, 125)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 296.85083985328674 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...
Linear model: C = 1e-05 score = 0.6871134489044266
Linear model: C = 5e-05 score = 0.7049890148660134
Linear model: C = 0.0001 score = 0.7231478957187378
Linear model: C = 0.0005 score = 0.7541949313362306
Linear model: C = 0.001 score = 0.7613472959464588
Linear model: C = 0.005 score = 0.7633411913123894
Linear model: C = 0.01 score = 0.7602514553297945
Linear model: C = 0.05 score = 0.7480582238829107
Linear model: C = 1e-05 score = 0.7097380264945652
Linear model: C = 5e-05 score = 0.7196522588315217
Linear model: C = 0.0001 score = 0.726806640625
Linear model: C = 0.0005 score = 0.7401706861413043
Linear model: C = 0.001 score = 0.7423785665760869
Linear model: C = 0.005 score = 0.742102581

[2020-12-29 19:53:23,782] (INFO): A new study created in memory with name: no-name-69a7bc6b-3082-4e59-b76d-1b804aae157a


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.742162
[200]	valid's auc: 0.746348
[300]	valid's auc: 0.752356
[400]	valid's auc: 0.755585
[500]	valid's auc: 0.756317
[600]	valid's auc: 0.756884
[700]	valid's auc: 0.75582
[800]	valid's auc: 0.75528
Early stopping, best iteration is:
[626]	valid's auc: 0.757253
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:53:30,917] (INFO): Trial 0 finished with value: 0.7572525939349232 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.746203
[200]	valid's auc: 0.75234
[300]	valid's auc: 0.753404
[400]	valid's auc: 0.755376
[500]	valid's auc: 0.756627
[600]	valid's auc: 0.755125
[700]	valid's auc: 0.754617
Early stopping, best iteration is:
[529]	valid's auc: 0.757215
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:53:36,571] (INFO): Trial 1 finished with value: 0.7572151749870369 and parameters: {'feature_fraction': 0.5917173949330818, 'num_leaves': 87}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.739944
[200]	valid's auc: 0.743691
Early stopping, best iteration is:
[16]	valid's auc: 0.747016
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:53:38,669] (INFO): Trial 2 finished with value: 0.7470158389060837 and parameters: {'feature_fraction': 0.7993292420985183, 'num_leaves': 118}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.734742
[200]	valid's auc: 0.739169
Early stopping, best iteration is:
[16]	valid's auc: 0.741964
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:53:40,666] (INFO): Trial 3 finished with value: 0.7419642809414607 and parameters: {'feature_fraction': 0.7229163764267956, 'num_leaves': 230}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.750041
[200]	valid's auc: 0.746668
[300]	valid's auc: 0.745145
Early stopping, best iteration is:
[100]	valid's auc: 0.750041
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:53:43,097] (INFO): Trial 4 finished with value: 0.7500414281208739 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 103}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.737666
[200]	valid's auc: 0.743386
Early stopping, best iteration is:
[29]	valid's auc: 0.747989
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:53:45,101] (INFO): Trial 5 finished with value: 0.7479887315511223 and parameters: {'feature_fraction': 0.6668543055695109, 'num_leaves': 119}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.740505
[200]	valid's auc: 0.749534
[300]	valid's auc: 0.750175
[400]	valid's auc: 0.749908
Early stopping, best iteration is:
[270]	valid's auc: 0.752313
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:53:49,848] (INFO): Trial 6 finished with value: 0.7523132928139582 and parameters: {'feature_fraction': 0.8540362888980227, 'num_leaves': 165}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.750041
[200]	valid's auc: 0.746668
[300]	valid's auc: 0.745145
Early stopping, best iteration is:
[100]	valid's auc: 0.750041
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:53:52,135] (INFO): Trial 7 finished with value: 0.7500414281208739 and parameters: {'feature_fraction': 0.5282057895135501, 'num_leaves': 103}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.754366
[200]	valid's auc: 0.765089
[300]	valid's auc: 0.767933
[400]	valid's auc: 0.768852
[500]	valid's auc: 0.7699
[600]	valid's auc: 0.767789
[700]	valid's auc: 0.766794
Early stopping, best iteration is:
[527]	valid's auc: 0.770579
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:53:58,512] (INFO): Trial 8 finished with value: 0.7705790849463573 and parameters: {'feature_fraction': 0.9162213204002109, 'num_leaves': 53}. Best is trial 8 with value: 0.7705790849463573.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.745669
Early stopping, best iteration is:
[51]	valid's auc: 0.757568
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.719153
[200]	valid's auc: 0.724758
[300]	valid's auc: 0.726382
Early stopping, best iteration is:
[295]	valid's auc: 0.727438
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.717248
Early stopping, best iteration is:
[18]	valid's auc: 0.72208
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.727475
Early stopping, best iteration is:
[81]	valid's auc: 0.733467
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.745154
Early stopping, best iteration is:
[84]	valid's auc: 0.747516
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Time left 230.7673819065094
Blending: Optimization starts with equal weights 

[2020-12-29 19:54:06,752] (INFO): oof_pred:
array([[0.03324839],
       [0.03636894],
       [0.0360034 ],
       ...,
       [0.03097693],
       [0.19479665],
       [0.08057104]], dtype=float32)
Shape = (8000, 1)


Wall time: 1min 9s


## Step 4. Predict to test data and check scores

In [12]:
%%time

test_pred = automl.predict(test_data)
logging.info('Prediction for test data:\n{}\nShape = {}'
              .format(test_pred, test_pred.shape))

logging.info('Check scores...')
logging.info('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])))
logging.info('TEST score: {}'.format(roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])))

[2020-12-29 19:54:07,410] (INFO): Prediction for test data:
array([[0.06259418],
       [0.07673503],
       [0.0326997 ],
       ...,
       [0.05875177],
       [0.04640286],
       [0.20436436]], dtype=float32)
Shape = (2000, 1)
[2020-12-29 19:54:07,411] (INFO): Check scores...
[2020-12-29 19:54:07,415] (INFO): OOF score: 0.7504589492607807
[2020-12-29 19:54:07,418] (INFO): TEST score: 0.7319565217391304


Wall time: 660 ms


## Step 5. Profiling AutoML 

To build report here, we **must** turn on decorators on step 0.4. Report is interactive and you can go as deep into functions call stack as you want:

In [13]:
%%time
p.profile('my_report_profile.html')
assert os.path.exists('my_report_profile.html'), 'Profile report failed to build'

AssertionError: Profiler calls graph has more than 1 connected component but it must be a tree...

## Step 6. Create AutoML with time utilization 

Below we are going to create specific AutoML preset for TIMEOUT utilization (try to spend it as much as possible):

In [14]:
%%time 

automl = TabularUtilizedAutoML(task = task, 
                       timeout = TIMEOUT,
                       general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 30},
                       lgb_params = {'default_params': {'num_threads': N_THREADS}})
oof_pred = automl.fit_predict(train_data, roles = roles)
logging.info('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

CUR SETUP FOR RANDOM STATE: {'reader_params': {'random_state': 42}}
FOUND reader_params in kwargs, need to combine
MERGED VARIANT FOR reader_params = {'cv': 5, 'random_state': 42}
Start automl preset with listed constraints:
- time: 299.9980001449585 seconds
- cpus: 4 cores
- memory: 16 gb


Train data shape: (8000, 125)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 296.94696831703186 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...
Linear model: C = 1e-05 score = 0.6871107761224348
Linear model: C = 5e-05 score = 0.7049836693020297
Linear model: C = 0.0001 score = 0.7231478957187378
Linear model: C = 0.0005 score = 0.7541949313362306
Linear model: C = 0.001 score = 0.7612083112828818
Linear model: C = 0.005 score = 0.7631915155208451
Linear model: C = 0.01 score = 0.7601979996899572
Linear model: C = 0.05 score = 0.7480421871909595
Linear model: C = 1e-05 score = 0.7097380264945652
Linear model: C = 5e-05 score = 0.7196522588315217


[2020-12-29 19:54:37,909] (INFO): A new study created in memory with name: no-name-5731bd21-d12b-4008-9e4e-59db6e5a1485


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.755023
[200]	valid's auc: 0.757825
[300]	valid's auc: 0.755419
[400]	valid's auc: 0.755114
Early stopping, best iteration is:
[231]	valid's auc: 0.758846
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:54:41,995] (INFO): Trial 0 finished with value: 0.7588455720020741 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: 0.7588455720020741.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.754708
[200]	valid's auc: 0.760583
[300]	valid's auc: 0.760134
[400]	valid's auc: 0.759915
[500]	valid's auc: 0.761011
Early stopping, best iteration is:
[323]	valid's auc: 0.761593
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:54:46,370] (INFO): Trial 1 finished with value: 0.7615931918897103 and parameters: {'feature_fraction': 0.5917173949330818, 'num_leaves': 87}. Best is trial 1 with value: 0.7615931918897103.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.75312
[200]	valid's auc: 0.758209
[300]	valid's auc: 0.759963
[400]	valid's auc: 0.758375
[500]	valid's auc: 0.760476
[600]	valid's auc: 0.759225
Early stopping, best iteration is:
[489]	valid's auc: 0.760898
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:54:53,357] (INFO): Trial 2 finished with value: 0.7608982685718257 and parameters: {'feature_fraction': 0.7993292420985183, 'num_leaves': 118}. Best is trial 1 with value: 0.7615931918897103.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.762587
[200]	valid's auc: 0.760946
Early stopping, best iteration is:
[48]	valid's auc: 0.764587
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:54:56,202] (INFO): Trial 3 finished with value: 0.7645867077205981 and parameters: {'feature_fraction': 0.7229163764267956, 'num_leaves': 230}. Best is trial 3 with value: 0.7645867077205981.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.756167
[200]	valid's auc: 0.761133
[300]	valid's auc: 0.758792
Early stopping, best iteration is:
[155]	valid's auc: 0.762539
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:54:59,428] (INFO): Trial 4 finished with value: 0.7625393567148303 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 103}. Best is trial 3 with value: 0.7645867077205981.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.746011
[200]	valid's auc: 0.74956
Early stopping, best iteration is:
[32]	valid's auc: 0.755922
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:55:01,559] (INFO): Trial 5 finished with value: 0.7559215485029749 and parameters: {'feature_fraction': 0.6668543055695109, 'num_leaves': 119}. Best is trial 3 with value: 0.7645867077205981.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.752885
[200]	valid's auc: 0.757087
[300]	valid's auc: 0.759172
[400]	valid's auc: 0.757445
Early stopping, best iteration is:
[245]	valid's auc: 0.759947
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:55:06,335] (INFO): Trial 6 finished with value: 0.7599467581827221 and parameters: {'feature_fraction': 0.8540362888980227, 'num_leaves': 165}. Best is trial 3 with value: 0.7645867077205981.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.756167
[200]	valid's auc: 0.761133
[300]	valid's auc: 0.758792
Early stopping, best iteration is:
[155]	valid's auc: 0.762539
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:55:09,207] (INFO): Trial 7 finished with value: 0.7625393567148303 and parameters: {'feature_fraction': 0.5282057895135501, 'num_leaves': 103}. Best is trial 3 with value: 0.7645867077205981.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.744482
Early stopping, best iteration is:
[17]	valid's auc: 0.753527
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.723707
[200]	valid's auc: 0.726005
Early stopping, best iteration is:
[142]	valid's auc: 0.728675
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.720772
[200]	valid's auc: 0.720751
Early stopping, best iteration is:
[160]	valid's auc: 0.724386
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.738409
[200]	valid's auc: 0.737623
Early stopping, best iteration is:
[116]	valid's auc: 0.742708
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.742235
Early stopping, best iteration is:
[29]	valid's auc: 0.750547
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Time left 233.20300555229187
Blending: Optimiz

[2020-12-29 19:55:44,237] (INFO): A new study created in memory with name: no-name-6954bac7-2acc-40c2-a579-37eae98cd103


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.722891
[200]	valid's auc: 0.732075
[300]	valid's auc: 0.737672
[400]	valid's auc: 0.741788
[500]	valid's auc: 0.739949
[600]	valid's auc: 0.740163
Early stopping, best iteration is:
[409]	valid's auc: 0.742232
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:55:50,304] (INFO): Trial 0 finished with value: 0.7422315591406471 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: 0.7422315591406471.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.731524
[200]	valid's auc: 0.739447
[300]	valid's auc: 0.74561
[400]	valid's auc: 0.745086
[500]	valid's auc: 0.747315
[600]	valid's auc: 0.747449
[700]	valid's auc: 0.7453
Early stopping, best iteration is:
[559]	valid's auc: 0.749026
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:56:00,780] (INFO): Trial 1 finished with value: 0.7490257709639655 and parameters: {'feature_fraction': 0.5917173949330818, 'num_leaves': 87}. Best is trial 1 with value: 0.7490257709639655.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.719347
[200]	valid's auc: 0.727857
[300]	valid's auc: 0.734507
[400]	valid's auc: 0.736779
[500]	valid's auc: 0.737586
Early stopping, best iteration is:
[338]	valid's auc: 0.738383
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:56:06,476] (INFO): Trial 2 finished with value: 0.7383827530723629 and parameters: {'feature_fraction': 0.7993292420985183, 'num_leaves': 118}. Best is trial 1 with value: 0.7490257709639655.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.728579
[200]	valid's auc: 0.730984
[300]	valid's auc: 0.736052
[400]	valid's auc: 0.739815
[500]	valid's auc: 0.739896
[600]	valid's auc: 0.740836
Early stopping, best iteration is:
[427]	valid's auc: 0.7412
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:56:16,474] (INFO): Trial 3 finished with value: 0.7411998652917877 and parameters: {'feature_fraction': 0.7229163764267956, 'num_leaves': 230}. Best is trial 1 with value: 0.7490257709639655.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.734497
[200]	valid's auc: 0.733465
Early stopping, best iteration is:
[116]	valid's auc: 0.738126
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.731037
[200]	valid's auc: 0.733988
[300]	valid's auc: 0.731498
Early stopping, best iteration is:
[232]	valid's auc: 0.73644
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.709244
Early stopping, best iteration is:
[74]	valid's auc: 0.71634
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.706994
Early stopping, best iteration is:
[51]	valid's auc: 0.715783
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.747813
[200]	valid's auc: 0.748402
Early stopping, best iteration is:
[156]	valid's auc: 0.753184
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Time left 163.741035

[2020-12-29 19:57:07,544] (INFO): A new study created in memory with name: no-name-d62dfdf7-8071-4df5-8feb-d3ffa3abf7d8


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.716156
[200]	valid's auc: 0.72008
[300]	valid's auc: 0.721865
[400]	valid's auc: 0.724094
[500]	valid's auc: 0.72411
[600]	valid's auc: 0.725115
[700]	valid's auc: 0.727082
[800]	valid's auc: 0.7272
[900]	valid's auc: 0.727168
[1000]	valid's auc: 0.728258
[1100]	valid's auc: 0.728745
[1200]	valid's auc: 0.729285
Early stopping, best iteration is:
[1062]	valid's auc: 0.729595
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:57:24,055] (INFO): Trial 0 finished with value: 0.7295946458831138 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: 0.7295946458831138.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.71808
[200]	valid's auc: 0.726943
[300]	valid's auc: 0.729039
[400]	valid's auc: 0.731583
[500]	valid's auc: 0.733294
[600]	valid's auc: 0.732706
Early stopping, best iteration is:
[488]	valid's auc: 0.733839
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:57:31,521] (INFO): Trial 1 finished with value: 0.733839023686194 and parameters: {'feature_fraction': 0.5917173949330818, 'num_leaves': 87}. Best is trial 1 with value: 0.733839023686194.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.717508
[200]	valid's auc: 0.724067
[300]	valid's auc: 0.729883
[400]	valid's auc: 0.729985
[500]	valid's auc: 0.729338
Early stopping, best iteration is:
[373]	valid's auc: 0.730899
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:57:39,524] (INFO): Trial 2 finished with value: 0.7308989634951435 and parameters: {'feature_fraction': 0.7993292420985183, 'num_leaves': 118}. Best is trial 1 with value: 0.733839023686194.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.718748
[200]	valid's auc: 0.728991
[300]	valid's auc: 0.731305
Early stopping, best iteration is:
[279]	valid's auc: 0.732529
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.729869
[200]	valid's auc: 0.729768
[300]	valid's auc: 0.733122
[400]	valid's auc: 0.731689
Early stopping, best iteration is:
[364]	valid's auc: 0.735012
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.736647
[200]	valid's auc: 0.737119
Early stopping, best iteration is:
[129]	valid's auc: 0.739088
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.732831
Early stopping, best iteration is:
[34]	valid's auc: 0.740192
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.74162
[200]	valid's auc: 0.7418
Early stopping, best iteration is:
[126]	valid's auc: 0.7445

Time limit exceeded after calculating fold 2


[300]	valid's auc: 0.737889
Early stopping, best iteration is:
[111]	valid's auc: 0.743122
Lvl_0_Pipe_1_Mod_0_LightGBM fitting and predicting completed
Optuna may run 4.23353991508484 secs


[2020-12-29 19:58:23,410] (INFO): A new study created in memory with name: no-name-1ff4c00a-9867-408d-946a-31f8cbf4299a


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.720652
[200]	valid's auc: 0.722319
[300]	valid's auc: 0.727868
[400]	valid's auc: 0.728296
[500]	valid's auc: 0.727162
Early stopping, best iteration is:
[381]	valid's auc: 0.729611
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-29 19:58:32,559] (INFO): Trial 0 finished with value: 0.729610682575065 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: 0.729610682575065.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.713141
[200]	valid's auc: 0.71622
Early stopping, best iteration is:
[170]	valid's auc: 0.719727
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.763704
[200]	valid's auc: 0.766368
Early stopping, best iteration is:
[140]	valid's auc: 0.769128
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.718623
[200]	valid's auc: 0.723957
[300]	valid's auc: 0.727969
Early stopping, best iteration is:
[279]	valid's auc: 0.728542
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.726196
[200]	valid's auc: 0.71953
Early stopping, best iteration is:
[119]	valid's auc: 0.731185
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.7001
[200]	valid's auc: 0.701888
Early stopping, best iteration is:
[174]	valid's auc: 0.703608
Lvl_0_Pipe_1_Mod_1_LightG

Time limit exceeded in one of the tasks. AutoML will blend level 1 models.                                         
Try to set higher time limits or use Profiler to find bottleneck and optimize Pipelines settings


Blending: Optimization starts with equal weights and score 0.7413122791755135
Blending, iter 0: score = 0.7424197101885567, weights = [0.32361817 0.14539269 0.5309891 ]
Blending, iter 1: score = 0.7424095054105521, weights = [0.3451426  0.14076588 0.5140915 ]
Blending, iter 2: score = 0.7424095054105521, weights = [0.3451426  0.14076588 0.5140915 ]
No score update. Terminated


Automl preset training completed in 53.93 seconds.
Blending: Optimization starts with equal weights and score 0.7552575335179122
Blending, iter 0: score = 0.7577173102161094, weights = [0.37913635 0.08142336 0.5394403  0.        ]
Blending, iter 1: score = 0.757830838371411, weights = [0.48130795 0.         0.518692   0.        ]
Blending, iter 2: score = 0.757830838371411, weights = [0.48130795 0.         0.518692   0.        ]
No score update. Terminated


[2020-12-29 19:58:46,686] (INFO): oof_pred:
array([[0.03258627],
       [0.03032798],
       [0.0332821 ],
       ...,
       [0.02629477],
       [0.1690769 ],
       [0.09159414]], dtype=float32)
Shape = (8000, 1)


Wall time: 4min 36s


## Step 7. Predict to test data and check scores for utilized automl

In [15]:
%%time

test_pred = automl.predict(test_data)
logging.info('Prediction for test data:\n{}\nShape = {}'
              .format(test_pred, test_pred.shape))

logging.info('Check scores...')
logging.info('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])))
logging.info('TEST score: {}'.format(roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])))

[2020-12-29 19:58:48,383] (INFO): Prediction for test data:
array([[0.05990201],
       [0.07784364],
       [0.02795133],
       ...,
       [0.04760466],
       [0.03866495],
       [0.21747506]], dtype=float32)
Shape = (2000, 1)
[2020-12-29 19:58:48,384] (INFO): Check scores...
[2020-12-29 19:58:48,388] (INFO): OOF score: 0.757830838371411
[2020-12-29 19:58:48,392] (INFO): TEST score: 0.7345788043478262


Wall time: 1.69 s


## Step 8. Profiling utilized AutoML 

To build report here, we **must** turn on decorators on step 0.4. Report is interactive and you can go as deep into functions call stack as you want:

In [16]:
%%time
p.profile('my_report_profile.html')
assert os.path.exists('my_report_profile.html'), 'Profile report failed to build'

AssertionError: Profiler calls graph has more than 1 connected component but it must be a tree...

# Appendix. Profiling report screenshots 

After loading HTML with profiling report, you can see fully folded report (please wait for green LOAD OK text for full load finish). If you click on triangle on the left, it unfolds and look like this:  

<img src="imgs/tutorial_2_initial_report.png" alt="Initial profiling report" style="width: 500px;"/>

If we go even deeper we will receive situation like this:

<img src="imgs/tutorial_2_unfolded_report.png" alt="Profiling report after several unfoldings on different levels" style="width: 600px;"/>
