# Step 0.0. Install LightAutoML

In [None]:
!pip install -U lightautoml==0.3.8b1
!pip install -U pandas==1.5.3

# Step 0.1. Import necessary libraries 

In [None]:
# Standard python libraries
import os
import time

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task

# Step 0.2. Parameters 

In [None]:
# --- Compute resources ---
# Passed to torch.set_num_threads, AutoML `cpu_limit` and the reader's `n_jobs`.
N_THREADS = 4
# Overall AutoML time budget in seconds (11 hours), passed as `timeout`.
TIMEOUT = 11 * 60 * 60

# --- Validation scheme ---
# Number of folds handed to the AutoML reader as `cv`.
N_FOLDS = 13
# Hold-out share; only referenced by the (currently commented-out) train/test split.
TEST_SIZE = 0.2

# --- Reproducibility / data ---
# Seed used for numpy's global random generator.
RANDOM_STATE = 42
# Name of the target column.
TARGET = 'smoking'

# Step 0.3. Fix torch number of threads and numpy seed 

In [None]:
# Cap the number of threads torch may use for CPU work.
torch.set_num_threads(N_THREADS)
# Seed numpy's global random generator.
np.random.seed(RANDOM_STATE)

# Step 0.4. Data load 

In [None]:
%%time
pd.set_option('display.max_columns', 100)
train_data = pd.read_csv('../input/playground-series-s3e24/train.csv').astype(np.float32).drop('id', axis = 1)
train_data

In [None]:
# Original (non-competition) dataset, cast to float32 like the competition data.
train_data_orig = pd.read_csv(
    '../input/smoker-status-prediction-using-biosignals/train_dataset.csv'
).astype(np.float32)

# Append the original rows to the competition rows and renumber the index from 0.
# NOTE: `train_data` is rebuilt from itself, so re-running this cell without
# re-running the previous one appends the original rows again.
train_data = pd.concat([train_data, train_data_orig], ignore_index=True)
print(train_data.shape)
train_data_orig

In [None]:
# Competition test set as float32. Unlike the competition train file, no column is dropped here.
test_data = (
    pd.read_csv('../input/playground-series-s3e24/test.csv')
    .astype(np.float32)
)
test_data.head()

In [None]:
# Sample submission file; its TARGET column is overwritten with predictions in Step 9.
submission = pd.read_csv(
    '../input/playground-series-s3e24/sample_submission.csv'
)
submission.head(n=5)

# Step 0.5. Add new features

In [None]:
def create_extra_features(data):
    """Placeholder hook for feature engineering; it currently does nothing.

    Args:
        data: Dataset that would receive extra features. It is neither read
            nor modified by the current implementation.

    Returns:
        None.
    """
    return None
    

# Run the feature hook on the train frame first, then on the test frame.
for dataset in (train_data, test_data):
    create_extra_features(dataset)

# Step 0.6. Data splitting for train-test 

In [None]:
# NOTE: the hold-out split below is disabled, so tr_data / te_data are never created.
# The cells in Steps 3-6 that use them are commented out as well; re-enable this
# cell first if those steps are needed.
# tr_data, te_data = train_test_split(train_data, 
#                                      test_size=TEST_SIZE, 
#                                      stratify=train_data[TARGET], 
#                                      random_state=RANDOM_STATE)
# print('Data splitted. Parts sizes: tr_data = {}, te_data = {}'.format(tr_data.shape, te_data.shape))

# ========= AutoML preset usage =========


## Step 1. Create Task

In [None]:
%%time
task = Task('binary',)

## Step 2. Setup columns roles

In [None]:
%%time

roles = {
    'target': TARGET,
    #'drop': ['id']
}

## Step 3. Create AutoML from preset and train on 80% of data

In [None]:
# Disabled: this cell fits on tr_data, which only exists if the (commented-out)
# hold-out split in Step 0.6 is re-enabled.
# %%time 

# automl = TabularAutoML(task = task, 
#                        timeout = TIMEOUT,
#                        cpu_limit = N_THREADS,
#                        reader_params = {'n_jobs': N_THREADS})
# oof_pred = automl.fit_predict(tr_data, roles = roles, verbose = 3)
# print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

## Step 4. Predict on validation data and check scores

In [None]:
# Disabled: needs tr_data / te_data from the hold-out split and the automl /
# oof_pred objects produced by the (commented-out) Step 3 cell.
# %%time

# test_pred = automl.predict(te_data)
# print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

# print('Check scores...')
# print('OOF score: {}'.format(roc_auc_score(tr_data[TARGET].values, oof_pred.data[:, 0])))
# print('VALID score: {}'.format(roc_auc_score(te_data[TARGET].values, test_pred.data[:, 0])))

## Step 5. Create AutoML with time utilization 

Below we create an AutoML preset designed to utilize the TIMEOUT budget (it tries to spend as much of the allotted time as possible):

In [None]:
# Disabled: this cell fits on tr_data, which only exists if the (commented-out)
# hold-out split in Step 0.6 is re-enabled.
# %%time 

# automl = TabularUtilizedAutoML(task = task, 
#                        timeout = TIMEOUT,
#                        cpu_limit = N_THREADS,
#                        general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
#                        reader_params = {'n_jobs': N_THREADS})
# oof_pred = automl.fit_predict(tr_data, roles = roles)
# print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

## Step 6. Predict on validation data and check scores for utilized automl

In [None]:
# Disabled: needs tr_data / te_data from the hold-out split and the automl /
# oof_pred objects produced by the (commented-out) Step 5 cell.
# NOTE(review): `acc_score` is neither imported nor defined anywhere in the visible
# notebook, so uncommenting this cell as-is would raise NameError. The analogous
# Step 4 cell uses roc_auc_score — confirm which metric is intended here.
# %%time

# test_pred = automl.predict(te_data)
# print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

# print('Check scores...')
# print('OOF score: {}'.format(acc_score(tr_data[TARGET].values, oof_pred.data[:, 0])))
# print('VALID score: {}'.format(acc_score(te_data[TARGET].values, test_pred.data[:, 0])))

## Step 7. Train on full data 

In [None]:
%%time 

automl = TabularUtilizedAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS},
                       tuning_params = {'max_tuning_time': 1200},
                       configs_list = [
                           '../input/lightautoml-configs/conf_0_sel_type_0.yml',
                           '../input/lightautoml-configs/conf_2_select_mode_1_no_typ.yml',
                           '../input/lightautoml-configs/conf_4_sel_type_0_no_int.yml',
                           '../input/lightautoml-configs/conf_5_sel_type_1_tuning_full.yml',
                           '../input/lightautoml-configs/conf_6_sel_type_1_tuning_full_no_int_lgbm.yml'
                       ])
oof_pred = automl.fit_predict(train_data, roles = roles, verbose = 1)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

## Step 8. Predict for test data and check OOF score

In [None]:
%%time

test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

print('Check scores...')
print('OOF score: {}'.format(roc_auc_score(train_data[TARGET].values, oof_pred.data[:, 0])))

## Step 9. Prepare submission

In [None]:
# Overwrite the target column with the first prediction column for the test set.
submission[TARGET] = test_pred.data[:, 0]
# Write the submission file without the DataFrame index.
submission.to_csv(path_or_buf='automl_submission.csv', index=False)

In [None]:
# Display the submission frame (its TARGET column is filled in the previous cell).
submission

## Step 10. Feature importances 

In [None]:
%%time

# Fast feature importances calculation
fast_fi = automl.get_feature_scores('fast')
fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (20, 10), grid = True)