In [1]:
import os
import joblib

import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler, OneHotEncoder

import shap
import optuna
import category_encoders as ce
import optuna.visualization as vis

import xgboost as xgb
import lightgbm as lgb
import catboost as cat

from pycaret.classification import *

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('../data/clean_train4.csv')
test = pd.read_csv('../data/clean_test4.csv')

In [3]:
print(train.info())
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2802 entries, 0 to 2801
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  2802 non-null   int64  
 1   job_type             2802 non-null   int64  
 2   marital_status       2802 non-null   int64  
 3   education_level      2802 non-null   int64  
 4   balance_amt          2802 non-null   float64
 5   household_insurance  2802 non-null   int64  
 6   car_loan             2802 non-null   int64  
 7   communication        2802 non-null   int64  
 8   last_contact_day     2802 non-null   int64  
 9   last_contact_month   2802 non-null   int64  
 10  no_of_contacts       2802 non-null   int64  
 11  prev_attempts        2802 non-null   int64  
 12  Outcome              2802 non-null   int64  
 13  car_insurance        2802 non-null   int64  
 14  call_duration        2802 non-null   float64
 15  zero_neg_balance     2802 non-null   i

Unnamed: 0,age,job_type,marital_status,education_level,balance_amt,household_insurance,car_loan,communication,last_contact_day,last_contact_month,no_of_contacts,prev_attempts,Outcome,car_insurance,call_duration,zero_neg_balance
0,47,1,1,0,1112.0,1,0,0,12,8,1,0,3,0,2.15,0
1,46,4,1,2,1.312372,0,1,0,14,8,3,1,1,1,293.3,0
2,36,7,2,1,281.0,1,0,0,15,8,1,0,3,1,8.583333,0
3,32,9,2,1,540.0,0,1,0,28,4,1,0,3,1,12.5,0
4,30,0,2,1,315.0,1,0,0,13,8,3,4,0,1,14.733333,0


In [4]:
num_feats = test.columns.tolist()

data_setup = setup(
    data=train,
    target='car_insurance',
    train_size=0.75,
    numeric_features=num_feats,
    fold_shuffle=True,
)

Unnamed: 0,Description,Value
0,session_id,1451
1,Target,car_insurance
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(2802, 16)"
5,Missing Values,False
6,Numeric Features,15
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


AttributeError: 'Simple_Imputer' object has no attribute 'fill_value_categorical'

In [13]:
best_model = compare_models(n_select=2)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8653,0.9418,0.8903,0.8753,0.8824,0.7248,0.7258,1.806
lightgbm,Light Gradient Boosting Machine,0.8629,0.9373,0.8819,0.878,0.8797,0.7204,0.7209,0.97
xgboost,Extreme Gradient Boosting,0.8596,0.9365,0.8777,0.8763,0.8765,0.7137,0.7148,0.712
rf,Random Forest Classifier,0.8572,0.9341,0.8803,0.8707,0.8751,0.7084,0.7091,0.263
gbc,Gradient Boosting Classifier,0.8563,0.9338,0.8819,0.8681,0.8744,0.7063,0.7076,0.151
ada,Ada Boost Classifier,0.8453,0.9236,0.8534,0.8728,0.8625,0.6857,0.6869,0.086
et,Extra Trees Classifier,0.822,0.9062,0.8325,0.8513,0.8416,0.6385,0.6392,0.229
lr,Logistic Regression,0.812,0.9009,0.8124,0.8512,0.8308,0.6195,0.6213,0.224
dt,Decision Tree Classifier,0.7834,0.7792,0.8099,0.8097,0.8095,0.5585,0.559,0.016
nb,Naive Bayes,0.712,0.8655,0.5226,0.9471,0.6733,0.4521,0.5169,0.013


In [18]:
cat = create_model('catboost')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8578,0.9359,0.925,0.8409,0.881,0.7055,0.7104
1,0.8667,0.9473,0.9,0.871,0.8852,0.7263,0.7268
2,0.8524,0.9442,0.8417,0.8938,0.867,0.7015,0.7031
3,0.8762,0.9501,0.9083,0.879,0.8934,0.7458,0.7464
4,0.8762,0.9364,0.8992,0.8843,0.8917,0.7472,0.7474
5,0.8619,0.9321,0.8824,0.875,0.8787,0.7184,0.7185
6,0.8857,0.9566,0.8908,0.906,0.8983,0.7679,0.768
7,0.9095,0.9597,0.9496,0.8968,0.9224,0.8141,0.816
8,0.819,0.9189,0.8487,0.8347,0.8417,0.6306,0.6307
9,0.8476,0.9372,0.8571,0.8718,0.8644,0.6905,0.6907


In [19]:
lgb = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8626,0.9399,0.9167,0.8527,0.8835,0.7164,0.7192
1,0.8667,0.9437,0.875,0.8898,0.8824,0.7285,0.7287
2,0.8476,0.9378,0.85,0.8793,0.8644,0.6906,0.6911
3,0.8905,0.9519,0.9083,0.9008,0.9046,0.7761,0.7761
4,0.8857,0.9362,0.8992,0.8992,0.8992,0.7673,0.7673
5,0.8476,0.9266,0.8571,0.8718,0.8644,0.6905,0.6907
6,0.9048,0.9557,0.8992,0.9304,0.9145,0.8071,0.8077
7,0.8667,0.9466,0.9076,0.864,0.8852,0.7264,0.7276
8,0.8095,0.9043,0.8403,0.8264,0.8333,0.6111,0.6113
9,0.8476,0.9299,0.8655,0.8655,0.8655,0.6897,0.6897


In [20]:
evaluate_model(cat)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [16]:
best_model

[<catboost.core.CatBoostClassifier at 0x7f702a7d63a0>,
 LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                importance_type='split', learning_rate=0.1, max_depth=-1,
                min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
                random_state=1451, reg_alpha=0.0, reg_lambda=0.0, silent=True,
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0)]

In [21]:
ensembled_model = ensemble_model([cat, lgb], fold=5)

ValueError: Estimator [<catboost.core.CatBoostClassifier object at 0x7f704ac18250>, LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=1451, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)] does not have the required fit() method.

In [7]:
final_model = finalize_model(best_model)

In [9]:
preds = predict_model(final_model, data=test)
preds.head()

Unnamed: 0,age,job_type,marital_status,education_level,balance_amt,household_insurance,car_loan,communication,last_contact_day,last_contact_month,no_of_contacts,prev_attempts,Outcome,call_duration,zero_neg_balance,Label,Score
0,65,5,1,1,20806.0,0,0,1,7,0,2,0,3,8.716667,0,1,0.7714
1,36,4,1,2,900.0,1,0,0,14,8,2,0,3,7.916667,0,0,0.6538
2,37,4,1,2,6771.0,0,0,0,6,11,1,1,2,5.616667,0,1,0.977
3,30,4,2,2,0.0,0,0,0,13,8,2,0,3,974.45,1,1,0.9938
4,41,4,1,2,328.0,1,0,0,28,5,12,0,3,1.083333,0,0,0.9978


In [11]:
sub = pd.DataFrame({
    'prediction': preds['Label'].astype(int)
})
print(sub.shape)
print(sub.value_counts())
sub.head()

(935, 1)
prediction
1             549
0             386
dtype: int64


Unnamed: 0,prediction
0,1
1,0
2,1
3,1
4,0


In [12]:
sub.to_csv('../submissions/pycaret1.csv', index=False)