In [49]:
# Configuration cell: display options, file locations and training knobs.
# pandas is imported here because this cell sits above the imports cell;
# without it a fresh-kernel "Restart & Run All" fails with a NameError on `pd`.
import pandas as pd

# Show wide/long frames in full when inspecting them in the notebook.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Input / output data files.
TRAIN_PATH = './datasets/train.csv'
TEST_PATH = './datasets/test.csv'
SUBMISSION_PATH = './datasets/submission.csv'

# Where the fitted AutoML object trained on the raw features is pickled.
MODEL_PATH = './models/lightautoml_model.sav'

# Training parameters passed to the AutoML preset in later cells.
N_THREADS = 10
N_FOLDS = 5
RANDOM_STATE = 42
TEST_SIZE = 0.2  # not referenced by any cell visible in this notebook section
TIMEOUT = 3 * 3600  # seconds, i.e. three hours
TARGET_NAME = 'target'

In [48]:
import logging
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

import pickle

import pandas as pd
from sklearn.impute import SimpleImputer
from optbinning import BinningProcess
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

from kaggle.api.kaggle_api_extended import KaggleApi

In [4]:
# Load the training and test CSVs with identical reader settings.
# low_memory=False is forwarded unchanged to both reads.
dataset, testset = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (TRAIN_PATH, TEST_PATH)
)

## Train

In [None]:
# Fit the LightAutoML tabular preset on the raw training frame and persist it.
task = Task('binary')
# 'id' is a row identifier, so it is dropped from the feature set.
roles = {'target': TARGET_NAME, 'drop': ['id']}

automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params={
        'n_jobs': N_THREADS,
        'cv': N_FOLDS,
        'random_state': RANDOM_STATE,
        'verbose': 1,
    },
    # Single level: linear model, LightGBM, and tuned LightGBM.
    general_params={'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
)

# NOTE(review): `model` holds the return value of fit_predict (presumably the
# out-of-fold predictions, not a model object) -- confirm against LightAutoML docs.
model = automl.fit_predict(dataset, roles=roles, verbose=0)

# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(automl, model_file)

## Submission

In [12]:
# Score the test set with the fitted AutoML object and write the submission file.
logging.info("Predict")
predict = automl.predict(testset)

# One row per test id; the first prediction column is used as the target score.
submissionset = pd.DataFrame({
    'id': testset['id'],
    TARGET_NAME: predict.data[:, 0],
})
submissionset.to_csv(SUBMISSION_PATH, index=False)

# Upload the CSV through the Kaggle API (credentials are resolved by authenticate()).
logging.info("Submission")
api = KaggleApi()
api.authenticate()
api.competition_submit(
    SUBMISSION_PATH,
    'API Submission',
    'tabular-playground-series-nov-2021',
    quiet=True,
)

2021-11-02 21:56:22,603 - Load testset
2021-11-02 21:56:28,979 - Predict
2021-11-02 21:57:06,255 - Submission


Successfully submitted to Tabular Playground Series - Nov 2021

## Transform Data - FillNA

In [50]:
# Every column except the row id and the target is a model feature.
# TARGET_NAME is used instead of a literal so this cell stays in sync with the
# config cell and the AutoML `roles` mapping.
features = [column for column in dataset.columns if column not in ('id', TARGET_NAME)]

# Replace missing values with each column's most frequent value. The fitted
# imputer is kept in `filling_na` so the same statistics can be reused later.
filling_na = SimpleImputer(strategy='most_frequent')

# The imputer returns a plain array, so column names are restored explicitly.
x = pd.DataFrame(filling_na.fit_transform(dataset[features]), columns=features)
y = dataset[TARGET_NAME].values

## Transform Data - Binning

In [51]:
# Fit the supervised binning on the imputed features against the target,
# then encode those same features with the fitted bins.
optbin = BinningProcess(variable_names=features)
optbin.fit(x, y)

# transform() output is wrapped back into a frame with the feature names.
binned_values = optbin.transform(x)
x_binned = pd.DataFrame(binned_values, columns=features)

## Training With Binning

In [52]:
# Rebuild a training frame from the binned features plus the target column,
# then fit and persist a second AutoML model on it.
dataset_binned = x_binned.copy()
dataset_binned[TARGET_NAME] = y  # same column name the `roles` mapping points at

task = Task('binary')
# No 'drop' role here: the binned frame contains only features and the target.
roles = {'target': TARGET_NAME}

# NOTE: this rebinds `automl`, `task` and `roles` from the earlier training
# cell; cells executed after this one see the binned-feature model.
automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params={
        'n_jobs': N_THREADS,
        'cv': N_FOLDS,
        'random_state': RANDOM_STATE,
        'verbose': 1,
    },
    general_params={'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
)

model = automl.fit_predict(dataset_binned, roles=roles, verbose=0)

# Context manager guarantees the pickle file is flushed and closed.
with open('./models/binned_lightautoml_model.sav', 'wb') as model_file:
    pickle.dump(automl, model_file)

2021-11-02 23:27:24,470 - Blending: iteration 1: score = 0.7448895368493509, weights = [0.5239563  0.11808646 0.3579572 ]
2021-11-02 23:27:27,901 - Blending: iteration 2: score = 0.7448895496845387, weights = [0.5244529  0.11725058 0.3582965 ]
2021-11-02 23:27:31,284 - Blending: iteration 3: score = 0.7448895496845387, weights = [0.5244529  0.11725058 0.3582965 ]
2021-11-02 23:27:31,284 - Blending: no score update. Terminated

2021-11-02 23:27:31,299 - Automl preset training completed in 663.25 seconds

2021-11-02 23:27:31,300 - Model description:
Final prediction for new objects (level 0) = 
	 0.52445 * (5 averaged models Lvl_0_Pipe_0_Mod_0_LinearL2) +
	 0.11725 * (5 averaged models Lvl_0_Pipe_1_Mod_0_LightGBM) +
	 0.35830 * (5 averaged models Lvl_0_Pipe_1_Mod_1_Tuned_LightGBM) 



## Score with Binning

In [56]:
# Score the test set with the binned-feature model and submit the result.
logging.info("Predict")

# Apply the same preprocessing chain the training data went through:
# most-frequent imputation first, then binning. Binning the raw test features
# directly would treat missing values differently from how the model was trained.
testset_filled = pd.DataFrame(filling_na.transform(testset[features]), columns=features)
binned_testset = pd.DataFrame(optbin.transform(testset_filled), columns=features)

predict = automl.predict(binned_testset)

# First prediction column is written as the target score, keyed by test id.
submissionset = pd.DataFrame({'id': testset['id'], TARGET_NAME: predict.data[:, 0]})
submissionset.to_csv(SUBMISSION_PATH, index=False)

logging.info("Submission")
api = KaggleApi()
api.authenticate()
api.competition_submit(
    SUBMISSION_PATH,
    'Binned lightautoml Submission',
    'tabular-playground-series-nov-2021',
    quiet=True,
)

Successfully submitted to Tabular Playground Series - Nov 2021