#### Please upvote if you find the notebook useful (and don't forget about github star :) )

# Step 0.0. Install [LightAutoML](https://github.com/sberbank-ai-lab/LightAutoML)

In [1]:
!pip install -U lightautoml

Collecting lightautoml
  Downloading LightAutoML-0.2.14-py3-none-any.whl (250 kB)
[K     |████████████████████████████████| 250 kB 4.6 MB/s eta 0:00:01
Collecting torch>=1.6
  Downloading torch-1.8.1-cp38-none-macosx_10_9_x86_64.whl (119.6 MB)
[K     |████████████████████████████████| 119.6 MB 31.3 MB/s eta 0:00:01
[?25hCollecting typing-extensions
  Using cached typing_extensions-3.10.0.0-py3-none-any.whl (26 kB)
Installing collected packages: typing-extensions, torch, lightautoml
  Attempting uninstall: torch
    Found existing installation: torch 1.6.0
    Uninstalling torch-1.6.0:
      Successfully uninstalled torch-1.6.0
  Attempting uninstall: lightautoml
    Found existing installation: lightautoml 0.2.13
    Uninstalling lightautoml-0.2.13:
      Successfully uninstalled lightautoml-0.2.13
Successfully installed lightautoml-0.2.14 torch-1.8.1 typing-extensions-3.10.0.0


# Step 0.1. Import necessary libraries 

In [2]:
# Standard python libraries
import os
import time
import re

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

# Step 0.2. Parameters 

In [3]:
N_THREADS = 6 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TIMEOUT = 3 * 3600 # Time in seconds for automl run
TARGET_NAME = 'target'

# Step 0.3. Data load 

In [10]:
%%time

train_data = pd.read_csv('./input/train.csv')
train_data[TARGET_NAME] = train_data[TARGET_NAME].str.slice(start=6).astype(int) - 1
train_data.head()

CPU times: user 556 ms, sys: 79.2 ms, total: 636 ms
Wall time: 638 ms


Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
0,0,0,0,6,1,0,0,0,0,7,...,0,0,0,0,0,0,2,0,0,5
1,1,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,1,0,5
2,2,0,0,0,0,0,1,0,3,0,...,0,0,0,0,1,0,0,0,0,1
3,3,0,0,7,0,1,5,2,2,0,...,0,4,0,2,2,0,4,3,0,7
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [11]:
test_data = pd.read_csv('./input/test.csv')
test_data.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74
0,200000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,200001,1,2,0,0,0,0,0,0,0,...,3,1,3,0,0,0,0,3,0,0
2,200002,0,1,7,1,0,0,0,0,6,...,3,0,0,0,0,3,0,2,0,0
3,200003,0,0,0,4,3,1,0,0,0,...,0,0,0,1,0,0,0,4,0,0
4,200004,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [12]:
submission = pd.read_csv('./input/sample_submission.csv')
submission.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,200000,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
1,200001,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
2,200002,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
3,200003,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111
4,200004,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111,0.1111


# Step 0.5. Add new features

In [13]:
def create_gr_feats(data):
    pass
    

all_df = pd.concat([train_data, test_data]).reset_index(drop = True)
create_gr_feats(all_df)
train_data, test_data = all_df[:len(train_data)], all_df[len(train_data):]
print(train_data.shape, test_data.shape)

(200000, 77) (100000, 77)


In [14]:
train_data.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
0,0,0,0,6,1,0,0,0,0,7,...,0,0,0,0,0,0,2,0,0,5.0
1,1,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,1,0,5.0
2,2,0,0,0,0,0,1,0,3,0,...,0,0,0,0,1,0,0,0,0,1.0
3,3,0,0,7,0,1,5,2,2,0,...,0,4,0,2,2,0,4,3,0,7.0
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0


# ========= AutoML preset usage =========


## Step 1. Create Task

In [15]:
%%time

task = Task('multiclass',)

CPU times: user 3.02 ms, sys: 1.33 ms, total: 4.35 ms
Wall time: 3.16 ms


## Step 2. Setup columns roles

In [16]:
%%time

roles = {
    'target': TARGET_NAME,
    'drop': ['id'],
}

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.72 µs


## Step 3. Train on full data 

In [17]:
%%time 

automl = TabularUtilizedAutoML(task = task, 
                               timeout = TIMEOUT,
                               cpu_limit = N_THREADS,
                               general_params = {'use_algos': [['lgb', 'cb', 'cb_tuned'], ['linear_l2', 'cb']]},
                               reader_params = {'n_jobs': N_THREADS},
                               )
oof_pred = automl.fit_predict(train_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

Current random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
Found reader_params in kwargs, need to combine
Merged variant for reader_params = {'n_jobs': 6, 'random_state': 42}
Found general_params in kwargs, need to combine
Merged variant for general_params = {'use_algos': [['lgb', 'cb', 'cb_tuned'], ['linear_l2', 'cb']], 'return_all_predictions': False}
Start automl preset with listed constraints:
- time: 10799.995699167252 seconds
- cpus: 6 cores
- memory: 16 gb

Train data shape: (200000, 77)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 10781.880586147308 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's multi_logloss: 1.75966
[200]	valid's multi_logloss: 1.75508
[300]	valid's multi_logloss: 1.75654
Early stopping, best iteration i

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


Start fitting Lvl_0_Pipe_0_Mod_2_CatBoost ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_2_CatBoost =====

0:	learn: 2.1729409	test: 2.1731485	best: 2.1731485 (0)	total: 84.3ms	remaining: 5m 37s
100:	learn: 1.7669339	test: 1.7726229	best: 1.7726229 (100)	total: 9.79s	remaining: 6m 17s
200:	learn: 1.7511661	test: 1.7597123	best: 1.7597123 (200)	total: 19.1s	remaining: 6m 1s
300:	learn: 1.7457115	test: 1.7565433	best: 1.7565433 (300)	total: 27.7s	remaining: 5m 40s
400:	learn: 1.7420037	test: 1.7548603	best: 1.7548603 (400)	total: 36.2s	remaining: 5m 24s
500:	learn: 1.7382577	test: 1.7536908	best: 1.7536908 (500)	total: 44.7s	remaining: 5m 12s
600:	learn: 1.7352083	test: 1.7530468	best: 1.7530455 (598)	total: 52.5s	remaining: 4m 56s
700:	learn: 1.7325366	test: 1.7525887	best: 1.7525887 (700)	total: 1m	remaining: 4m 42s
800:	learn: 1.7301623	test: 1.7522229	best: 1.7522229 (800)	total: 1m 7s	remaining: 4m 29s
900:	learn: 1.7279965	test: 1.7520129	best: 1.7520129 (900)	total: 1m 

KeyboardInterrupt: 

In [12]:
%%time

# Fast feature importances calculation
#fast_fi = automl.get_feature_scores('fast', silent = False)
#fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (20, 10), grid = True)

CPU times: user 16 µs, sys: 0 ns, total: 16 µs
Wall time: 35 µs


## Step 4. Predict for test data and check OOF score

In [13]:
%%time

test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

print('Check scores...')
print('OOF score: {}'.format(log_loss(train_data[TARGET_NAME].values, oof_pred.data)))

Prediction for test data:
array([[0.06476909, 0.3916093 , 0.15487702, 0.02635641, 0.01283768,
        0.15494484, 0.02237028, 0.04978392, 0.12245146],
       [0.04236097, 0.07071404, 0.05685499, 0.02075427, 0.01503311,
        0.2744168 , 0.08466147, 0.30401257, 0.13119176],
       [0.02001443, 0.02370084, 0.01564323, 0.00997829, 0.0063512 ,
        0.73568875, 0.02834253, 0.11780702, 0.04247368],
       [0.04652037, 0.11051005, 0.0837672 , 0.03297291, 0.01738988,
        0.25059748, 0.07831381, 0.22150673, 0.15842158],
       [0.04172738, 0.11030082, 0.08049527, 0.02490849, 0.01534015,
        0.28902256, 0.06611353, 0.22361517, 0.14847662],
       [0.04552011, 0.19149403, 0.10645209, 0.02604306, 0.01334615,
        0.29393595, 0.04544733, 0.1297297 , 0.14803158],
       [0.04401483, 0.11259232, 0.08304422, 0.03093427, 0.01896748,
        0.2223151 , 0.08736981, 0.24247345, 0.15828852],
       [0.04221279, 0.42807233, 0.19504562, 0.02217503, 0.01443897,
        0.03345913, 0.03714093,

## Step 5. Prepare submission

In [14]:
submission.iloc[:, 1:] = test_pred.data
submission.to_csv('lightautoml_2lvl_3hours.csv', index = False)

In [15]:
submission

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,200000,0.064769,0.391609,0.154877,0.026356,0.012838,0.154945,0.022370,0.049784,0.122451
1,200001,0.042361,0.070714,0.056855,0.020754,0.015033,0.274417,0.084661,0.304013,0.131192
2,200002,0.020014,0.023701,0.015643,0.009978,0.006351,0.735689,0.028343,0.117807,0.042474
3,200003,0.046520,0.110510,0.083767,0.032973,0.017390,0.250597,0.078314,0.221507,0.158422
4,200004,0.041727,0.110301,0.080495,0.024908,0.015340,0.289023,0.066114,0.223615,0.148477
...,...,...,...,...,...,...,...,...,...,...
99995,299995,0.076573,0.373476,0.150132,0.030341,0.015206,0.102688,0.033028,0.075069,0.143487
99996,299996,0.050449,0.240075,0.129792,0.028192,0.014861,0.189044,0.050416,0.138321,0.158850
99997,299997,0.069616,0.285622,0.127926,0.029599,0.015611,0.172455,0.041709,0.104603,0.152860
99998,299998,0.033462,0.023512,0.020290,0.012150,0.010273,0.369281,0.074179,0.375746,0.081107
