In [1]:
!pip install -U lightautoml

Collecting lightautoml
  Downloading LightAutoML-0.2.14-py3-none-any.whl (250 kB)
[K     |████████████████████████████████| 250 kB 6.8 MB/s 
Collecting lightgbm<3.0,>=2.3
  Downloading lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 19.7 MB/s 
[?25hCollecting json2html
  Downloading json2html-1.3.0.tar.gz (7.0 kB)
Collecting poetry-core<2.0.0,>=1.0.0
  Downloading poetry_core-1.0.3-py2.py3-none-any.whl (424 kB)
[K     |████████████████████████████████| 424 kB 32.8 MB/s 
Collecting log-calls
  Downloading log_calls-0.3.2.tar.gz (232 kB)
[K     |████████████████████████████████| 232 kB 41.5 MB/s 
Collecting autowoe>=1.2
  Downloading AutoWoE-1.2.5-py3-none-any.whl (204 kB)
[K     |████████████████████████████████| 204 kB 39.3 MB/s 
Collecting efficientnet-pytorch
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
Collecting importlib-metadata<2.0,>=1.0
  Downloading importlib_metadata-1.7.0-py2.py3-none-any.whl (31 kB

In [8]:
# Standard python libraries
import os
import time
import re

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings(action="ignore")  # suppress warning output

# Plot appearance: matplotlib style sheet, followed by the seaborn style
plt.style.use("fivethirtyeight")
sns.set_style(style="darkgrid")

# Allow pandas to display up to 999 columns
pd.options.display.max_columns = 999

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

# Switch to the Kaggle working directory; the relative '../input/...' paths
# used in later cells are resolved against it.
os.chdir('/kaggle/working/')

In [3]:
# --- Run configuration ---
TARGET_NAME = 'target'   # label column name
RANDOM_STATE = 42        # fixed seed value
N_THREADS = 4            # thread count for LightGBM and linear models
N_FOLDS = 5              # fold count for AutoML
TEST_SIZE = 0.2          # hold-out share for a metric check
TIMEOUT = 3 * 60 * 60    # AutoML time budget in seconds (3 hours)
# NOTE(review): N_FOLDS, RANDOM_STATE and TEST_SIZE are not referenced by the
# cells visible in this notebook - confirm whether they should be wired in.

In [10]:
%%time

train_data = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
train_data[TARGET_NAME] = train_data[TARGET_NAME].str.slice(start=6).astype(int) - 1
test_data = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')
submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')

CPU times: user 425 ms, sys: 52 ms, total: 477 ms
Wall time: 475 ms


# TODO Add New Features

In [11]:
def create_gr_feats(data):
    """Placeholder for feature generation on ``data``.

    Currently does nothing: ``data`` is left untouched and ``None`` is returned.
    """
    return None
    
# Remember where the train rows end before the names are rebound below
n_train = len(train_data)

# Stack train and test into a single frame with a fresh positional index
all_df = pd.concat([train_data, test_data]).reset_index(drop = True)
create_gr_feats(all_df)  # currently a stub, so all_df is unchanged here

# Cut the combined frame back into its two parts.
# NOTE(review): both parts now share all_df's columns; if test.csv lacks the
# target column it will be NaN-filled in test_data - confirm this is intended.
train_data = all_df[:n_train]
test_data = all_df[n_train:]
print(train_data.shape, test_data.shape)


(100000, 52) (50000, 52)


# AutoML preset usage

In [None]:
# Task definition: multiclass
task = Task('multiclass')

# Column roles: which column is the target and which columns to drop
roles = {'target': TARGET_NAME, 'drop': ['id']}

# Build the preset with two config files and fit it on the full train frame
automl = TabularUtilizedAutoML(
    task = task,
    timeout = TIMEOUT,
    cpu_limit = N_THREADS,
    reader_params = {'n_jobs': N_THREADS},
    configs_list = [
        '../input/lightautoml-configs/conf_0_sel_type_0.yml',
        '../input/lightautoml-configs/conf_1_sel_type_1.yml',
    ],
)
oof_pred = automl.fit_predict(train_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

# Fast feature importances, drawn as a bar chart
fast_fi = automl.get_feature_scores('fast', silent = False)
fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (20, 10), grid = True)

AutoML:

1. [first tutorial](https://github.com/sberbank-ai-lab/LightAutoML/blob/master/Tutorial_1.%20Create%20your%20own%20pipeline.ipynb)

2. [custom feature generator](https://www.kaggle.com/simakov/lama-custom-automl-pipeline-example)

Pycaret:

1. [Tabular Playground Series May 2021 using Pycaret](https://www.kaggle.com/siddharthpchauhan/tabular-playground-series-may-2021-using-pycaret) Newer.

2. [Tabular Playground May 2021 using Catboost](https://www.kaggle.com/siddharthpchauhan/tabular-playground-may-2021-using-catboost) Older.