In [1]:
# --- Model artefact location -------------------------------------------------
# NOTE(review): the training cells visible in this notebook save their
# pipelines to other literal paths under ./models/; confirm whether this
# constant is still needed.
MODEL_PATH = './models/evalml_model.sav'

# --- Competition data files --------------------------------------------------
SUBMISSION_PATH = './datasets/submission.csv'  # predictions CSV written before upload
TEST_PATH = './datasets/test.csv'              # rows to score
TRAIN_PATH = './datasets/train.csv'            # rows used for fitting

In [3]:
# Configure the root logger first: timestamped messages at INFO level and above.
# This is kept ahead of the third-party imports below; basicConfig does nothing
# if the root logger already has handlers, so the ordering is preserved on purpose.
import logging
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

# NOTE(review): pickle is not referenced in the cells visible here — confirm it is still needed.
import pickle

import pandas as pd
# Let pandas render up to 500 rows / 500 columns before truncating displayed frames.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# NOTE(review): SimpleImputer is not referenced in the cells visible here — confirm it is still needed.
from sklearn.impute import SimpleImputer
# BinningProcess is fitted on the training features in the "Binning" section.
from optbinning import BinningProcess

# Kaggle client used by the submission cells to authenticate and upload the CSV.
from kaggle.api.kaggle_api_extended import KaggleApi

## Load datasets

In [4]:
# Load the training and test CSVs into memory. low_memory=False turns off
# pandas' chunked parsing, so each column's dtype is inferred from the whole file.
dataset = pd.read_csv(filepath_or_buffer=TRAIN_PATH, low_memory=False)
testset = pd.read_csv(filepath_or_buffer=TEST_PATH, low_memory=False)

## Train baseline

In [5]:
# evalml entry points used by the baseline section.
import evalml
from evalml.objectives import AUC
from evalml.preprocessing import split_data
from evalml.utils import infer_feature_types

# AUC objective instance.
# NOTE(review): not referenced again in the visible cells, which pass
# objective='AUC' by name — confirm whether it is still needed.
objective = AUC()

# Model inputs are every column except the row identifier and the label.
features = [column for column in dataset.columns if column not in ('id', 'target')]
x = infer_feature_types(dataset[features])
y = dataset['target']

# Split off 30% of the rows as a holdout set for a binary problem.
# NOTE(review): the AutoML search cell visible in this notebook is given the
# full x / y rather than X_train / y_train, so this split is currently unused.
X_train, X_holdout, y_train, y_holdout = split_data(
    x,
    y,
    problem_type='binary',
    test_size=.3,
)

In [None]:
# Baseline AutoML search on the raw features: binary problem, ranked by AUC,
# capped at 30 iterations with ensembling enabled, seeded for repeatability.
# NOTE(review): fitted on the full x / y, not on the X_train / y_train split.
automl = evalml.automl.AutoMLSearch(
    X_train=x,
    y_train=y,
    problem_type='binary',
    objective='AUC',
    random_seed=1,
    ensembling=True,
    max_iterations=30,
    n_jobs=12,
)
automl.search()

# Persist the top-ranked pipeline so it can be reloaded without re-running the search.
automl.best_pipeline.save('./models/evalml_bestpipeline.sav')

## Submit new score

In [34]:
# Score the test set with the best pipeline from the baseline search and
# upload the predictions to Kaggle.
model = automl.best_pipeline

# Class probabilities for the raw (un-binned) test features.
predictions_proba = model.predict_proba(testset[features])

# Write one row per test id with the probability taken from the column labelled 1.
# NOTE(review): `.to_dataframe()[1]` assumes predict_proba returns a table object
# exposing to_dataframe() with a column named 1 for the positive class — confirm
# against the installed evalml version.
pd.DataFrame({'id': testset['id'], 'target': predictions_proba.to_dataframe()[1]}).to_csv(SUBMISSION_PATH, index=False)

api = KaggleApi()
api.authenticate()
# This is the baseline (un-binned) model, so the submission message says so;
# the previous text labelled it as the binned submission.
api.competition_submit(SUBMISSION_PATH, 'Baseline evalml Submission', 'tabular-playground-series-nov-2021', quiet=True)

Successfully submitted to Tabular Playground Series - Nov 2021

## Binning

In [35]:
# Feature columns: everything except the row identifier and the label.
features = [name for name in dataset.columns if name not in ('id', 'target')]

# Rebind x / y for the binning experiment: x becomes a plain pandas frame of the
# feature columns and y the label values as a NumPy array.
x = pd.DataFrame(data=dataset[features], columns=features)
y = dataset['target'].to_numpy()

# Fit the binning on the training features against the label, then transform
# the same features and label the result with the original column names.
optbin = BinningProcess(variable_names=features)
optbin.fit(x, y)
x_binned = pd.DataFrame(data=optbin.transform(x), columns=features)

In [None]:
# AutoML search on the binned features: binary problem, ranked by AUC, capped at
# 20 iterations with ensembling enabled, seeded for repeatability.
# This rebinds `automl`, replacing the search object from the baseline section.
automl = evalml.automl.AutoMLSearch(
    X_train=infer_feature_types(x_binned),
    y_train=y,
    problem_type='binary',
    objective='AUC',
    random_seed=1,
    ensembling=True,
    max_iterations=20,
    n_jobs=-1,
)
automl.search()

# Persist the top-ranked pipeline of the binned run under its own file name.
automl.best_pipeline.save('./models/binned_evalml_bestpipeline.sav')

In [38]:
# Apply the binning fitted on the training data to the test features. The result
# is wrapped exactly like the binned training frame (x_binned) so the pipeline
# sees the same column labels at prediction time as it did during the search,
# whichever container type transform() returns.
binned_x_test = pd.DataFrame(optbin.transform(testset[features]), columns=features)

# Score the binned test features with the best pipeline from the binned search.
model = automl.best_pipeline
predictions_proba = model.predict_proba(binned_x_test)

# Write one row per test id with the probability taken from the column labelled 1.
# NOTE(review): `.to_dataframe()[1]` assumes predict_proba returns a table object
# exposing to_dataframe() with a column named 1 for the positive class — confirm
# against the installed evalml version.
pd.DataFrame({'id': testset['id'], 'target': predictions_proba.to_dataframe()[1]}).to_csv(SUBMISSION_PATH, index=False)

api = KaggleApi()
api.authenticate()
api.competition_submit(SUBMISSION_PATH,'Binned evalml Submission','tabular-playground-series-nov-2021', quiet=True)

Successfully submitted to Tabular Playground Series - Nov 2021