In [1]:
import os
import joblib

import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler, OneHotEncoder
from sklearn.metrics import accuracy_score

import shap
import optuna
import category_encoders as ce
import optuna.visualization as vis

from pycaret.classification import *

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the three input tables in one pass: the preprocessed train and test
# feature frames, plus the raw test file (in the cells visible here only its
# `id` column is used, when building the submission).
train, test, raw_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/preprocessed_train1.csv',
        '../data/preprocessed_test1.csv',
        '../data/test.csv',
    )
)

In [3]:
# Initialise the PyCaret classification experiment on the training frame with
# `Response` as the target and shuffled cross-validation folds.
#
# session_id pins PyCaret's random seed. Without it a new seed is drawn on
# every run, so the split, the shuffled folds and every downstream metric
# change between executions. 7598 is the seed reported in this cell's recorded
# output, so re-running should reproduce the results shown in the notebook.
#
# NOTE(review): the recorded output of this cell also contains
# "AttributeError: 'Simple_Imputer' object has no attribute
# 'fill_value_categorical'". That looks like a PyCaret / dependency version
# mismatch rather than a problem with these arguments — confirm the environment.
data_setup = setup(
    data=train,
    target='Response',
    fold_shuffle=True,
    session_id=7598,
)

Unnamed: 0,Description,Value
0,session_id,7598
1,Target,Response
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(381109, 11)"
5,Missing Values,False
6,Numeric Features,5
7,Categorical Features,5
8,Ordinal Features,False
9,High Cardinality Features,False


AttributeError: 'Simple_Imputer' object has no attribute 'fill_value_categorical'

In [4]:
# Train a LightGBM classifier through PyCaret and keep the result in `lgb`.
# NOTE(review): the recorded 10-fold table shows accuracy ~0.877 and AUC ~0.86
# but recall below 0.01, so accuracy alone says little about this model —
# presumably the target classes are heavily imbalanced; confirm.
lgb = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8769,0.8579,0.0046,0.5556,0.0091,0.0071,0.0419
1,0.8769,0.8532,0.0049,0.5517,0.0097,0.0075,0.043
2,0.8769,0.855,0.0046,0.5357,0.009,0.007,0.0407
3,0.8768,0.8601,0.0052,0.5152,0.0102,0.0078,0.042
4,0.8766,0.8564,0.0049,0.4444,0.0096,0.007,0.0359
5,0.8765,0.8575,0.0046,0.3947,0.009,0.0062,0.0312
6,0.8763,0.8585,0.0021,0.2593,0.0042,0.0022,0.0132
7,0.877,0.8608,0.0058,0.5938,0.0114,0.0091,0.0496
8,0.8773,0.8574,0.0049,0.8421,0.0097,0.0083,0.0584
9,0.8768,0.8535,0.0037,0.5217,0.0073,0.0055,0.0356


In [None]:
# Compare PyCaret's candidate classifiers with results sorted by AUC; the
# returned model is stored in `best_model`.
# NOTE(review): this cell has no execution count (In [None]) and `best_model`
# is not referenced by any later cell visible here — the rest of the notebook
# continues with `lgb`. Confirm whether this cell is still needed.
best_model = compare_models(sort='AUC')

In [5]:
# Open PyCaret's interactive evaluation view for the LightGBM model. The
# recorded output is an ipywidgets control with a "Plot Type" toggle; the call
# assigns nothing, so no later cell depends on it.
evaluate_model(lgb)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [6]:
# Produce the model used for test-set predictions from the tuned-as-is `lgb`.
# presumably finalize_model refits on all data given to setup(), including the
# hold-out split — verify against the installed PyCaret version.
final_model = finalize_model(lgb)

In [11]:
# Score the preprocessed test set with the finalized model. Per the recorded
# output, the result is the test columns plus `Label` (predicted class) and
# `Score` (shown with up to 6 decimals, matching round=6).
# NOTE(review): Score appears to be the probability of the *predicted* class,
# not of class 1 — rows labelled 0 carry scores close to 1.0 — which is why a
# later cell converts it before building the submission.
preds = predict_model(final_model, data=test, round=6)

In [12]:
# Sanity check on the prediction frame: dimensions first, then the first five
# rows rendered as the cell's rich output.
print(preds.shape)
preds.head(n=5)

(127037, 12)


Unnamed: 0,Gender,Age,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,age < 46,Label,Score
0,1,25,11,1,0,0,0.364613,152,53,1,0,0.999964
1,1,40,28,0,1,1,0.254636,7,111,1,0,0.681055
2,1,47,28,0,1,1,0.591809,124,199,0,0,0.705187
3,1,24,27,1,0,1,0.448947,152,187,1,0,0.990075
4,1,27,28,1,0,0,1.548154,152,297,1,0,0.9999


In [16]:
# Convert PyCaret's (Label, Score) pair into P(Response = 1) for every row.
# The conversion treats Score as the confidence of the predicted class: rows
# predicted as 0 contribute 1 - Score, all other rows keep Score unchanged.
#
# Vectorised with np.where instead of looping over preds.iterrows(): the
# values are the same, but there is no Python-level pass over every row
# (~127k here), which is the slow path pandas advises against.
probs = np.where(
    preds['Label'] == 0,
    1 - preds['Score'],
    preds['Score'],
).tolist()  # stays a plain list, as the loop version produced

  0%|          | 0/127037 [00:00<?, ?it/s]

In [17]:
# Build the submission frame: the raw test ids with the predicted
# probabilities attached as `Response`. `probs` is matched to the ids by
# position, so it has to be in the same row order as raw_test.
sub = raw_test[['id']].assign(Response=probs)

# Quick look at the result: dimensions, then the first five rows.
print(sub.shape)
sub.head(n=5)

(127037, 2)


Unnamed: 0,id,Response
0,381110,3.6e-05
1,381111,0.318945
2,381112,0.294813
3,381113,0.009925
4,381114,0.0001


In [18]:
# Write the submission to disk without the DataFrame index, leaving only the
# `id` and `Response` columns in the file.
submission_path = '../submissions/pycaret_lgb1.csv'
sub.to_csv(submission_path, index=False)