## 0. Prerequisites

### 0.0. install LightAutoML

In [1]:
%%capture
!pip3 install -U lightautoml

# QUICK WORKAROUND FOR PROBLEM WITH PANDAS
!pip3 install -U pandas

### 0.1. Import libraries

Here we will import the libraries we use in this kernel:
- Standard python libraries for timing, working with OS etc.
- Essential python DS libraries like numpy, pandas, scikit-learn and torch (torch is used below, in section 0.3, to limit the number of threads)
- LightAutoML modules: `TabularAutoML` preset for AutoML model creation and Task class to setup what kind of ML problem we solve (binary/multiclass classification or regression)

In [2]:
# Standard python libraries
import gc
import os
import time

# Essential DS libraries
import joblib
import numpy as np
import pandas as pd
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

### 0.2. Constants

Here we set up the constants used in the kernel:
- `N_THREADS` - number of vCPUs for LightAutoML model creation
- `N_FOLDS` - number of folds in LightAutoML inner CV
- `RANDOM_STATE` - random seed for better reproducibility
- `TEST_SIZE` - holdout data part size
- `TIMEOUT` - limit in seconds for model to train
- `TARGET_NAME` - target column name in dataset

In [3]:
# --- Compute budget ---
N_THREADS = 4            # vCPUs LightAutoML (and Torch) are allowed to use
TIMEOUT = 10 * 60 * 60   # training time limit in seconds (10 hours)

# --- Validation / reproducibility ---
N_FOLDS = 5              # folds of the inner cross-validation
TEST_SIZE = 0.2          # fraction of data reserved for a holdout split
RANDOM_STATE = 42        # seed shared by everything that accepts one

# --- Dataset ---
TARGET_NAME = 'target'   # name of the label column

### 0.3. Imported models setup

For better reproducibility, fix the numpy random seed and cap the number of threads Torch may use (by default it usually tries to use all the threads on the server):

In [4]:
# Cap Torch at the same number of threads that LightAutoML is given.
torch.set_num_threads(N_THREADS)
# Seed numpy's global random generator.
np.random.seed(RANDOM_STATE)

### 0.4. Data loading

Now it's time to load the data:

In [5]:
# Read the feature-engineered training table, then show its size and a preview.
train_data = pd.read_feather(
    '../input/amexfeatureengineering/770_FE_train.feather'
)
print(train_data.shape)
train_data.head()

(458913, 772)


Unnamed: 0,customer_ID,B_30_nunique,B_38_last,B_38_nunique,D_114_last,D_117_last,D_120_last,D_120_nunique,D_66_last,D_66_nunique,...,D_53_last_mean_diff,B_28_last_mean_diff,S_22_last_mean_diff,B_3_last_mean_diff,D_56_last_mean_diff,D_130_last_mean_diff,S_7_last_mean_diff,total_data_count,total_data_last,target
0,-9223358381327749917,2,7,3,2,1,1,2,0,1,...,-0.003294,0.029327,0.02887,-0.266846,0.006329,-0.004444,-0.176514,2316.0,179.0,1
1,-9223193039457028513,1,1,1,2,6,1,1,0,1,...,0.0,-0.000346,0.088196,-0.001068,-0.029099,0.0,0.0,2239.0,172.0,0
2,-9223189665817919541,1,1,1,1,5,1,1,0,1,...,0.0,-0.011818,-5.8e-05,-0.000363,0.003073,0.0,0.0,2215.0,169.0,0
3,-9223188534444851899,1,1,1,1,3,1,1,0,1,...,0.0,-0.00198,0.003124,0.003376,0.0,0.0,-0.000879,2130.0,163.0,0
4,-9223173911659837606,1,7,2,2,1,1,1,0,1,...,-0.017929,-0.002434,0.028061,0.018967,-0.000242,0.0,0.047119,2284.0,176.0,1


# 1. Task definition

### 1.1. Task type

In the cell below we create the Task object - the class that sets up which task the LightAutoML model should solve, with a specific loss and metric if necessary (more info can be found [here](https://lightautoml.readthedocs.io/en/latest/pages/modules/generated/lightautoml.tasks.base.Task.html#lightautoml.tasks.base.Task) in our documentation):

In [6]:
# COMPETITION METRIC FROM Konstantin Yakovlev
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Return the AMEX competition metric for binary labels and predicted scores.

    The result is the average of two terms:

    * a normalized weighted Gini: the Gini of the ordering induced by
      ``y_pred`` divided by the Gini of the ordering induced by ``y_true``;
    * the share of all positives found among the highest-scored rows whose
      cumulative weight stays within 4% of the total weight.

    In both terms a row with label 0 has weight 20 and any other row weight 1.

    Args:
        y_true: 1-D sequence of labels (0 marks a negative).
        y_pred: 1-D sequence of scores, same length as ``y_true``.

    Returns:
        The metric value as a float.
    """
    # Column 0 holds the labels, column 1 the scores.
    pairs = np.transpose(np.array([y_true, y_pred]))

    def _ranked(sort_col):
        """Rows sorted descending by ``sort_col`` plus their per-row weights."""
        ordered = pairs[pairs[:, sort_col].argsort()[::-1]]
        return ordered, np.where(ordered[:, 0] == 0, 20, 1)

    def _weighted_gini(sort_col):
        """Weighted area between the Lorentz curve and the random baseline."""
        ordered, w = _ranked(sort_col)
        cum_weight_share = np.cumsum(w / np.sum(w))
        weighted_pos = ordered[:, 0] * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - cum_weight_share) * w)

    # Capture rate inside the top 4% (by cumulative weight) of the score ranking.
    by_score, w_score = _ranked(1)
    in_top = np.cumsum(w_score) <= int(0.04 * np.sum(w_score))
    top_four = np.sum(by_score[in_top][:, 0]) / np.sum(by_score[:, 0])

    # Gini of the model ordering, normalized by that of the perfect ordering.
    gini_ratio = _weighted_gini(1) / _weighted_gini(0)

    return 0.5 * (gini_ratio + top_four)

In [7]:
# Binary classification; no loss or metric is passed, so the Task defaults apply.
# `amex_metric_mod` is only used later to score the out-of-fold predictions.
task = Task('binary')

### 1.2. Feature roles setup

To solve the task, we need to set up column roles. The **only role you must set up is the target role**; everything else (drop, numeric, categorical, group, weights etc.) is up to the user - LightAutoML models detect column types automatically:

In [8]:
# Column roles: the label to predict and the identifier column to exclude.
roles = dict(
    target=TARGET_NAME,
    drop=['customer_ID'],
)

### 1.3. LightAutoML model creation - TabularAutoML preset

In [9]:
# Build the preset with the time/CPU budget from the constants cell.
# - general_params: restrict the model zoo to linear, LightGBM and CatBoost
#   (a single inner list, i.e. presumably one level of models - confirm in docs).
# - reader_params: single-job data reader with N_FOLDS-fold CV and a fixed seed.
# - selection_params: mode 0 (presumably disables feature selection - confirm in docs).
automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    general_params=dict(use_algos=[['linear_l2', 'lgb', 'cb']]),
    reader_params=dict(n_jobs=1, cv=N_FOLDS, random_state=RANDOM_STATE),
    selection_params=dict(mode=0),
)

# 2. AutoML training

To run autoML training use fit_predict method:
- `train_data` - Dataset to train.
- `roles` - Roles dict.
- `verbose` - Controls the verbosity: the higher, the more messages.
        <1  : messages are not displayed;
        >=1 : the computation process for layers is displayed;
        >=2 : the information about folds processing is also displayed;
        >=3 : the hyperparameters optimization process is also displayed;
        >=4 : the training process for every algorithm is displayed;

Note: out-of-fold prediction is calculated during training and returned from the fit_predict method

In [10]:
%%time 
oof_pred = automl.fit_predict(train_data, roles = roles, verbose = 3)

[16:41:38] Stdout logging level is INFO3.
[16:41:38] Task: binary

[16:41:38] Start automl preset with listed constraints:
[16:41:38] - time: 36000.00 seconds
[16:41:38] - CPU: 4 cores
[16:41:38] - memory: 16 GB

[16:41:38] [1mTrain data shape: (458913, 772)[0m

[16:44:35] Feats was rejected during automatic roles guess: []
[16:44:39] Layer [1m1[0m train process start. Time left 35819.22 secs
[16:46:01] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[16:46:02] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m =====
[16:46:22] Linear model: C = 1e-05 score = 0.9571133271575156
[16:46:37] Linear model: C = 5e-05 score = 0.9585801494292806
[16:46:49] Linear model: C = 0.0001 score = 0.9588950606648894
[16:47:08] Linear model: C = 0.0005 score = 0.9592434948002484
[16:47:08] Linear model: C = 0.001 score = 0.9592436333719189
[16:47:37] Linear model: C = 0.005 score = 0.9593279966091908
[16:47:37] Linear model: C = 0.01 score = 0.9593279966091908
[16

In [11]:
# Print the text description of the fitted model (the blend of models and their weights).
print(automl.create_model_str_desc())

Final prediction for new objects (level 0) = 
	 0.08880 * (5 averaged models Lvl_0_Pipe_0_Mod_0_LinearL2) +
	 0.34011 * (5 averaged models Lvl_0_Pipe_1_Mod_0_LightGBM) +
	 0.57110 * (5 averaged models Lvl_0_Pipe_1_Mod_1_CatBoost) 


In [12]:
# Score the out-of-fold predictions (column 0 of `oof_pred.data`) against the
# training labels with the competition metric.
print(f'OOF score: {amex_metric_mod(train_data[TARGET_NAME].values, oof_pred.data[:, 0])}')

OOF score: 0.7957557390739294


In [13]:
# Persist the fitted AutoML model so it can be reused without retraining.
# (`joblib` is imported in the import cell at the top of the notebook.)
joblib.dump(automl, 'automl.pkl')

['automl.pkl']

In [14]:
# Load the pickled model back from disk and display it.
# NOTE(review): `automl_model` is not referenced again in the visible cells -
# the test predictions further down are made with `automl`; confirm which is intended.
automl_model = joblib.load('./automl.pkl')
automl_model

<lightautoml.automl.presets.tabular_presets.TabularAutoML at 0x7fa53af98490>

In [15]:
# Collect customer id, true label and out-of-fold prediction in one frame.
oof = pd.DataFrame(
    {
        'customer_ID': train_data['customer_ID'],
        'target': train_data[TARGET_NAME],
        'oof_pred': oof_pred.data[:, 0],
    }
)

In [16]:
# Preview the first rows of the out-of-fold frame.
oof.head()

Unnamed: 0,customer_ID,target,oof_pred
0,-9223358381327749917,1,0.76069
1,-9223193039457028513,0,0.000672
2,-9223189665817919541,0,0.001375
3,-9223188534444851899,0,0.019096
4,-9223173911659837606,1,0.916336


In [17]:
# Write the out-of-fold predictions to CSV without the index column.
oof.to_csv('AutoML_oof.csv',index=False)

In [18]:
# Drop the training frame and trigger garbage collection before the test data is loaded.
# (`gc` is imported in the import cell at the top of the notebook.)
del train_data
gc.collect()

84

# Score the test set part by part, collecting the predictions in file order.
#
# NOTE(review): this cell used to blend the AutoML output with what appears to be
# an external XGBoost submission via `test_mapper` and `best_w`. Neither name is
# defined anywhere in this notebook, so the cell raised NameError on a fresh
# kernel. The blend is removed here and the submission holds pure LightAutoML
# predictions; bring it back only together with cells that load the external
# predictions and define the blend weight.
#
# NOTE(review): the training frame was read from a different dataset
# ('amexfeatureengineering'); confirm these pickles carry the same feature columns.
test_predictions = []
for i in range(10):
    data = pd.read_pickle('../input/amexaggdatapicklef32/test_agg_f32_part_{}.pkl'.format(i), compression="gzip")
    # Round every float16 column to two decimals; the rounding itself is done in
    # float32 and the result is cast back to float16.
    for col in data.columns:
        if data[col].dtype=='float16':
            data[col] = data[col].astype('float32').round(decimals=2).astype('float16')
    print(i, data.shape)
    test_pred = automl.predict(data)
    # Column 0 of the prediction array is the predicted score.
    test_predictions += list(test_pred.data[:, 0])

# Load the sample submission as a template, then show its size and a preview.
submission = pd.read_csv('../input/amex-default-prediction/sample_submission.csv')
print(submission.shape)
submission.head()

# Fill in the predictions positionally and write the submission file.
# NOTE(review): this assumes the concatenated test parts list customers in the
# same order as sample_submission.csv - TODO confirm, since no join on
# customer_ID is performed here.
submission['prediction'] = test_predictions
submission.to_csv('lightautoml_tabularautoml.csv', index = False)
submission