In [1]:
TRAIN_PATH = './datasets/train.csv'
TEST_PATH = './datasets/test.csv'
SUBMISSION_PATH = './datasets/submission.csv'

In [2]:
import logging
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

import pickle

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from sklearn.impute import SimpleImputer
from optbinning import BinningProcess

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from kaggle.api.kaggle_api_extended import KaggleApi

## Load datasets

In [3]:
dataset = pd.read_csv(TRAIN_PATH, low_memory=False)
testset = pd.read_csv(TEST_PATH, low_memory=False)

## Bin numerical data

In [4]:
features = list(filter(lambda x: x not in ('id','target'), list(dataset.columns)))

x = pd.DataFrame(dataset[features], columns=features)
y = dataset['target'].values

optbin = BinningProcess(variable_names=features)
optbin.fit(x, y)
x_binned = pd.DataFrame(optbin.transform(x), columns=features)

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

## Train LogisticRegression

In [8]:
log = LogisticRegression(penalty='none', solver='newton-cg', max_iter=100, fit_intercept=True, n_jobs=-1)
log.fit(x_train, y_train)

predicts = log.predict_proba(x_test)
roc_auc_score(y_test, predicts[:,1])

0.7491885608375819

## Score and Submit LogisticRegression

In [14]:
x_eval_binned = pd.DataFrame(optbin.transform(testset[features]), columns=features)
predictions_proba_original = log.predict_proba(x_eval_binned)
pd.DataFrame({'id': testset['id'], 'target': predictions_proba_original[:,1]}).to_csv(SUBMISSION_PATH, index=False)

api = KaggleApi()
api.authenticate()
api.competition_submit(SUBMISSION_PATH,'Binned LogisticRegression Submission','tabular-playground-series-nov-2021', quiet=True)

Successfully submitted to Tabular Playground Series - Nov 2021

## Train RandomForest

In [7]:
rfc = RandomForestClassifier(n_jobs=-1)
rfc.fit(x_train, y_train)

predicts = rfc.predict_proba(x_test)
roc_auc_score(y_test, predicts[:,1])

0.7068294149682788

## Score and Submite RandomForest

In [11]:
predictions_proba = rfc.predict_proba(optbin.transform(testset[features]))
pd.DataFrame({'id': testset['id'], 'target': predictions_proba[:,1]}).to_csv(SUBMISSION_PATH, index=False)

api = KaggleApi()
api.authenticate()
api.competition_submit(SUBMISSION_PATH,'Binned RandomForest Submission','tabular-playground-series-nov-2021', quiet=True)

Successfully submitted to Tabular Playground Series - Nov 2021

## Train SVC

In [9]:
from sklearn.linear_model import SGDClassifier

svc = SGDClassifier(loss='log', fit_intercept=True, n_jobs=-1)
svc.fit(x_train, y_train)

predicts = svc.predict_proba(x_test)
roc_auc_score(y_test, predicts[:,1])

0.6181580710537582

## Train Kneighbors

In [12]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_jobs=-1)
knn.fit(x_train, y_train)

predicts = knn.predict_proba(x_test)
roc_auc_score(y_test, predicts[:,1])

KeyboardInterrupt: 

## Ensemble Models

In [10]:
from sklearn.ensemble import StackingClassifier

StackingClassifier([log, rfc, svc, knn], stack_method='predict_proba', n_jobs=-1)

StackingClassifier(estimators=[LogisticRegression(n_jobs=-1,
                                                  solver='newton-cg'),
                               RandomForestClassifier(n_jobs=-1),
                               SGDClassifier(loss='log', n_jobs=-1)],
                   n_jobs=-1, stack_method='predict_proba')