In [1]:
!pip install pycaret[full]

Collecting pycaret[full]
  Downloading pycaret-3.2.0-py3-none-any.whl.metadata (17 kB)
Collecting category-encoders>=2.4.0 (from pycaret[full])
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting deprecation>=2.1.0 (from pycaret[full])
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting kaleido>=0.2.1 (from pycaret[full])
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting lightgbm>=3.0.0 (from pycaret[full])
  Downloading lightgbm-4.3.0-py3-none-manylinux_2_28_x86_64.whl.metadata (19 kB)
Collecting matplotlib<=3.6,>=3.3.0 (from pycaret[full])
  Downloading matplotlib-3.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8

In [28]:
import numpy as np
import pandas as pd

from pycaret.classification import *
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

import os

This is the continuation of the notebook `customer_lead_generator_eda.ipynb`.

__Helper function__

Helper that plots the ROC curve of our model against a no-skill base model and computes the AUC of both.

In [3]:
def plot_roc(y_test, proba_preds, model_name):
    """Plot a model's ROC curve against a no-skill baseline and print both AUCs.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    proba_preds : array-like
        Predicted scores/probabilities for the positive class, aligned with ``y_test``.
    model_name : str
        Name used for the model in the printed summary, the legend and the title.

    Returns
    -------
    None
        The AUC values are printed and the figure is shown with ``plt.show()``.
    """
    # No-skill baseline: the same score (0, the majority class) for every sample.
    # A constant score cannot rank samples, so its ROC curve is the diagonal.
    base_probs = [0] * len(y_test)
    base_auc = roc_auc_score(y_test, base_probs)

    # AUC of the model under evaluation
    model_auc = roc_auc_score(y_test, proba_preds)

    # summarize scores: report the baseline too, and label the model with the
    # name passed by the caller instead of a fixed 'Logistic'
    print('Base: ROC AUC=%.3f' % base_auc)
    print('%s: ROC AUC=%.3f' % (model_name, model_auc))

    # roc_curve returns (fpr, tpr, thresholds); thresholds are not needed for plotting
    base_fpr, base_tpr, _ = roc_curve(y_test, base_probs)
    model_fpr, model_tpr, _ = roc_curve(y_test, proba_preds)

    # plot both curves on the same axes
    plt.plot(base_fpr, base_tpr, linestyle='--', label='Base')
    plt.plot(model_fpr, model_tpr, marker='.', label=model_name)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve - %s' % (model_name,))
    plt.legend()
    plt.show()

## 3. Model Development

### Import the processed data

In [4]:
# Folder and base file name shared by the processed data sets
data_folder = 'data'
filename = 'CustomerData_LeadGenerator.csv'

# File names of the two processed variants, derived from the base file name
clean_filename = filename.replace('.csv', '_clean.csv')
clean_reduced_filename = filename.replace('.csv', '_clean_reduced.csv')

# Corresponding paths inside the data folder
customer_data_path_clean = os.path.join(data_folder, clean_filename)
customer_data_path_clean_reduced = os.path.join(data_folder, clean_reduced_filename)

# Load the full cleaned data set, using the first CSV column as the row index
df = pd.read_csv(customer_data_path_clean, index_col=0)
df.head()

Unnamed: 0,b_specialisation_i,b_specialisation_h,b_specialisation_g,b_specialisation_f,b_specialisation_e,b_specialisation_d,b_specialisation_c,b_specialisation_b,b_specialisation_a,b_specialisation_j,...,q_2017 HHs: 5th Quintile (68.759 and above),q_5th Quint by Total HH,q_2017 Purchasing Power: Per Capita,q_2017 Total Population,"q_2017 Pop 15+/Edu: University, Fachhochschule",q_Uni by Total Pop,q_2017 Personal Care: Per Capita,q_2017 Medical Products: Per Capita,q_2017 Personal Effects: Per Capita,b_gekauft_gesamt
0,0,0,0,0,0,0,0,0,0,1,...,195228,0.186735,23730.35,2194057,324656,0.147971,593.19,326.28,235.75,1
1,0,0,0,1,1,1,0,0,0,0,...,168291,0.179848,23343.26,1966798,281130,0.142938,588.54,320.36,230.66,1
2,0,0,0,1,0,1,0,0,0,1,...,189787,0.184935,23674.18,2149624,320326,0.149015,592.88,325.57,234.82,1
3,0,1,1,0,0,0,0,0,0,0,...,189634,0.184523,23572.26,2161445,310981,0.143876,590.84,324.06,233.81,0
4,0,0,0,0,0,1,0,0,0,1,...,207590,0.189871,23821.63,2302959,334940,0.145439,593.56,327.75,237.41,1


In [5]:
# Load the reduced-feature variant of the cleaned data; the first CSV column becomes the row index
df_reduced = pd.read_csv(customer_data_path_clean_reduced, index_col=0)
# Bare expression: the frame is rendered as the cell output
df_reduced

Unnamed: 0,b_specialisation_i,b_specialisation_h,b_specialisation_g,b_specialisation_f,b_specialisation_e,b_specialisation_d,b_specialisation_c,b_specialisation_b,b_specialisation_a,b_specialisation_j,q_OpeningDays,q_OpeningHours,q_2017 Average Household Size,q_5th Quint by Total HH,q_2017 Purchasing Power: Per Capita,q_2017 Total Population,q_Uni by Total Pop,q_2017 Medical Products: Per Capita,b_gekauft_gesamt
0,0,0,0,0,0,0,0,0,0,1,4,36.0,2.1,0.186735,23730.35,2194057,0.147971,326.28,1
1,0,0,0,1,1,1,0,0,0,0,4,20.0,2.1,0.179848,23343.26,1966798,0.142938,320.36,1
2,0,0,0,1,0,1,0,0,0,1,0,0.0,2.1,0.184935,23674.18,2149624,0.149015,325.57,1
3,0,1,1,0,0,0,0,0,0,0,4,20.0,2.1,0.184523,23572.26,2161445,0.143876,324.06,0
4,0,0,0,0,0,1,0,0,0,1,5,33.0,2.1,0.189871,23821.63,2302959,0.145439,327.75,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3768,0,0,0,0,0,0,0,0,0,0,0,0.0,2.2,0.203542,23869.52,327760,0.125418,325.57,0
3769,0,0,0,0,0,0,0,0,0,1,0,0.0,2.1,0.184899,23677.97,2150320,0.149148,326.32,0
3770,0,0,0,0,0,0,0,0,0,0,0,0.0,2.1,0.186292,23721.89,2167556,0.149137,326.22,0
3771,0,0,0,0,0,0,0,0,0,1,0,0.0,2.2,0.222034,24711.33,328260,0.116490,342.68,0


#### Train - test split

In [8]:
# Name of the label column; reused for the split of the full data set as well
target = "b_gekauft_gesamt"
# Hold out 20% of the rows as test set; random_state fixes the shuffle so the split is reproducible
df_reduced_train, df_reduced_test = train_test_split(
    df_reduced, test_size=0.2, random_state=5, stratify=df_reduced[target])
# Stratifying on the target keeps the class proportions the same in train and test;
# without stratify the proportions could shift between the two splits
df_reduced_train.shape, df_reduced_test.shape

((3018, 19), (755, 19))

In [10]:
# Same 80/20 stratified split (same random_state) applied to the full-feature data set
df_train, df_test = train_test_split(
    df, test_size=0.2, random_state=5, stratify=df[target])
# Row/column counts of both splits as a sanity check
df_train.shape, df_test.shape

((3018, 24), (755, 24))

In [18]:
# Rename the label column of the training split to 'Expected', the name passed as `target` to setup().
# NOTE(review): df_test is not renamed and keeps 'b_gekauft_gesamt' — confirm this is intended.
df_train = df_train.rename(columns={'b_gekauft_gesamt': 'Expected'})
df_train.head()

Unnamed: 0,b_specialisation_i,b_specialisation_h,b_specialisation_g,b_specialisation_f,b_specialisation_e,b_specialisation_d,b_specialisation_c,b_specialisation_b,b_specialisation_a,b_specialisation_j,...,q_2017 HHs: 5th Quintile (68.759 and above),q_5th Quint by Total HH,q_2017 Purchasing Power: Per Capita,q_2017 Total Population,"q_2017 Pop 15+/Edu: University, Fachhochschule",q_Uni by Total Pop,q_2017 Personal Care: Per Capita,q_2017 Medical Products: Per Capita,q_2017 Personal Effects: Per Capita,Expected
1475,0,1,1,0,1,0,0,0,0,0,...,193787,0.184929,23643.99,2198616,322983,0.146903,592.36,325.87,234.87,0
65,0,0,1,0,0,0,0,0,0,0,...,42641,0.201188,24036.65,454573,69464,0.152812,589.09,326.34,237.5,1
2226,0,1,1,0,1,0,0,0,0,1,...,173260,0.179979,23443.76,2015517,291043,0.144401,590.05,323.33,232.02,0
2034,1,0,0,1,1,0,0,0,0,0,...,22659,0.207529,23029.64,254994,18344,0.071939,564.32,330.21,238.39,0
2643,0,0,0,0,0,0,0,0,0,0,...,33686,0.222787,24672.97,335485,38724,0.115427,592.58,341.76,252.89,0


In [23]:
# Preview the held-out split; its label column still carries the original name 'b_gekauft_gesamt'
df_test.head()

Unnamed: 0,b_specialisation_i,b_specialisation_h,b_specialisation_g,b_specialisation_f,b_specialisation_e,b_specialisation_d,b_specialisation_c,b_specialisation_b,b_specialisation_a,b_specialisation_j,...,q_2017 HHs: 5th Quintile (68.759 and above),q_5th Quint by Total HH,q_2017 Purchasing Power: Per Capita,q_2017 Total Population,"q_2017 Pop 15+/Edu: University, Fachhochschule",q_Uni by Total Pop,q_2017 Personal Care: Per Capita,q_2017 Medical Products: Per Capita,q_2017 Personal Effects: Per Capita,b_gekauft_gesamt
742,0,1,0,0,1,0,0,0,0,0,...,2193,0.159306,20812.26,32416,1311,0.040443,536.11,318.8,213.21,0
592,0,0,0,0,0,0,0,0,0,0,...,165741,0.176143,23279.07,1962729,295134,0.150369,589.14,320.47,229.28,0
112,0,0,0,0,0,0,0,0,0,1,...,205969,0.189726,23825.9,2285107,334839,0.146531,593.74,327.78,237.38,0
3408,0,0,0,0,0,0,0,0,0,0,...,6289,0.196734,22036.98,76357,4057,0.053132,543.27,333.26,230.41,0
3567,1,0,1,1,1,0,0,0,0,1,...,46642,0.202471,23712.27,503689,70850,0.140662,581.46,323.27,235.25,0


# Step 1: Feature Selection + Initial Baseline Model

- Repeatedly run next two cells 
- Each time reduce features (using `n_features_to_select`)
- Try to get the best model (CatBoost, initially obtained from `compare_models`) with the highest cross-validated F1 score (originally 3-fold; the `compare_models` call below uses `fold=5`) using the minimum number of features

We are using a tree-based ensemble model (LightGBM) to select the best features here; see the documentation of the `setup` function, which explains the feature-selection options well.

Why this step? Running models with fewer features is faster, so it is a good way to check whether you really need all of the features. In addition, dropping irrelevant features tends to make the model more robust.

In [20]:
# Initialise the PyCaret classification experiment on the full-feature training split.
clf = setup(
    data=df_train,
    target='Expected',
    session_id=13,  # fixed seed for reproducibility
    # remove features whose inter-correlation exceeds the threshold
    remove_multicollinearity=True,
    multicollinearity_threshold=0.9,
    # feature selection is switched off in this run; to repeat the experiment
    # with fewer features, enable it and pass e.g. n_features_to_select=19
    feature_selection=False,
    experiment_name='swiss1',
    use_gpu=False,
    n_jobs=-1,
)

Unnamed: 0,Description,Value
0,Session id,13
1,Target,Expected
2,Target type,Binary
3,Original data shape,"(3018, 24)"
4,Transformed data shape,"(3018, 18)"
5,Transformed train set shape,"(2112, 18)"
6,Transformed test set shape,"(906, 18)"
7,Numeric features,23
8,Preprocess,True
9,Imputation type,simple


In [22]:
# Run the compare_models() function once to see which model is best:
# candidates are scored with 5-fold cross-validation and the result table is sorted by F1.
best_model = clf.compare_models(fold=5, sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.983,0.673,0.1238,0.2733,0.1667,0.1607,0.1753,0.024
ada,Ada Boost Classifier,0.983,0.6422,0.0667,0.3,0.1071,0.1018,0.1332,0.086
qda,Quadratic Discriminant Analysis,0.7813,0.5264,0.2762,0.0574,0.0507,0.0354,0.0439,0.028
gbc,Gradient Boosting Classifier,0.9749,0.6234,0.0333,0.05,0.04,0.0281,0.0285,0.174
dt,Decision Tree Classifier,0.9659,0.5068,0.0333,0.0333,0.0333,0.0167,0.0163,0.03
lr,Logistic Regression,0.9848,0.5322,0.0,0.0,0.0,0.0,0.0,0.58
knn,K Neighbors Classifier,0.9848,0.4792,0.0,0.0,0.0,0.0,0.0,0.054
nb,Naive Bayes,0.9848,0.5496,0.0,0.0,0.0,0.0,0.0,0.026
svm,SVM - Linear Kernel,0.9848,0.0,0.0,0.0,0.0,0.0,0.0,0.036
ridge,Ridge Classifier,0.9848,0.0,0.0,0.0,0.0,0.0,0.0,0.028


## Try with pre-selected important features using XGBoost

In [24]:
# Training split restricted to the pre-selected features; the first CSV column becomes the row index
train_data_path = os.path.join(data_folder, 'train_data_selected.csv')
train_data = pd.read_csv(train_data_path, index_col=0)
train_data.head()

Unnamed: 0,b_specialisation_e,b_specialisation_d,b_specialisation_i,b_specialisation_f,q_2017 Personal Care: Per Capita,"q_2017 Pop 15+/Edu: University, Fachhochschule",q_2017 Personal Effects: Per Capita,b_specialisation_j,q_2017 Total Households,b_specialisation_h,b_specialisation_a,q_2017 Medical Products: Per Capita,b_specialisation_b,b_specialisation_g,q_2017 HHs: 5th Quintile (68.759 and above),q_OpeningHours,q_2017 Total Population,b_gekauft_gesamt
1475,1,0,0,0,592.36,322983,234.87,0,1047901,1,0,325.87,0,1,193787,20.0,2198616,0
65,0,0,0,0,589.09,69464,237.5,0,211946,0,0,326.34,0,1,42641,0.0,454573,1
2226,1,0,0,0,590.05,291043,232.02,1,962669,1,0,323.33,0,1,173260,0.0,2015517,0
2034,1,0,1,1,564.32,18344,238.39,0,109185,0,0,330.21,0,0,22659,27.0,254994,0
2643,0,0,0,0,592.58,38724,252.89,0,151203,0,0,341.76,0,0,33686,20.0,335485,0


In [27]:
# Matching test split with the same pre-selected features; the first CSV column becomes the row index
test_data_path = os.path.join(data_folder, 'test_data_selected.csv')
test_data = pd.read_csv(test_data_path, index_col=0)
test_data.head()

Unnamed: 0,b_specialisation_e,b_specialisation_d,b_specialisation_i,b_specialisation_f,q_2017 Personal Care: Per Capita,"q_2017 Pop 15+/Edu: University, Fachhochschule",q_2017 Personal Effects: Per Capita,b_specialisation_j,q_2017 Total Households,b_specialisation_h,b_specialisation_a,q_2017 Medical Products: Per Capita,b_specialisation_b,b_specialisation_g,q_2017 HHs: 5th Quintile (68.759 and above),q_OpeningHours,q_2017 Total Population,b_gekauft_gesamt
742,1,0,0,0,536.11,1311,213.21,0,13766,1,0,318.8,0,0,2193,27.0,32416,0
592,0,0,0,0,589.14,295134,229.28,0,940947,0,0,320.47,0,0,165741,29.5,1962729,0
112,0,0,0,0,593.74,334839,237.38,1,1085613,0,0,327.78,0,0,205969,0.0,2285107,0
3408,0,0,0,0,543.27,4057,230.41,0,31967,0,0,333.26,0,0,6289,20.0,76357,0
3567,1,0,1,1,581.46,70850,235.25,1,230364,0,0,323.27,0,1,46642,0.0,503689,0


In [30]:
# Experiment on the pre-selected feature set. `test_data` is passed explicitly,
# so it is used as the hold-out set instead of an internal split of `train_data`.
clf_xgb_selected = setup(
    data=train_data,
    target='b_gekauft_gesamt',
    session_id=14,  # fixed seed for reproducibility
    test_data=test_data,
    # `normalize` is a boolean flag. The former string 'True' only enabled it
    # because any non-empty string is truthy ('False' would have enabled it too).
    normalize=True,
    normalize_method='zscore',
    # NOTE(review): 'xbg_selected' looks like a typo for 'xgb_selected'; left
    # unchanged because it is the name the experiment is logged under.
    experiment_name='xbg_selected',
    n_jobs=-1,
)

Unnamed: 0,Description,Value
0,Session id,14
1,Target,b_gekauft_gesamt
2,Target type,Binary
3,Original data shape,"(3773, 18)"
4,Transformed data shape,"(3773, 18)"
5,Transformed train set shape,"(3018, 18)"
6,Transformed test set shape,"(755, 18)"
7,Numeric features,17
8,Rows with missing values,0.0%
9,Preprocess,True


In [31]:
# Compare the candidate models on the pre-selected features: 5-fold cross-validation, table sorted by F1
best_model_xgb_selected = compare_models(fold=5, sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9838,0.7017,0.1511,0.37,0.21,0.2038,0.2265,0.022
qda,Quadratic Discriminant Analysis,0.9775,0.596,0.1733,0.2085,0.1862,0.1751,0.1773,0.022
nb,Naive Bayes,0.9692,0.7052,0.2178,0.1601,0.1813,0.1664,0.1699,0.022
knn,K Neighbors Classifier,0.9838,0.6279,0.1067,0.3,0.1571,0.1518,0.1726,0.034
lr,Logistic Regression,0.9851,0.71,0.0867,0.6667,0.1497,0.1464,0.2314,0.674
gbc,Gradient Boosting Classifier,0.9778,0.7114,0.1089,0.1621,0.1288,0.1181,0.1212,0.258
xgboost,Extreme Gradient Boosting,0.9831,0.7164,0.0644,0.1667,0.0923,0.0874,0.097,0.148
ada,Ada Boost Classifier,0.9838,0.7249,0.0444,0.4,0.08,0.0762,0.1285,0.108
dt,Decision Tree Classifier,0.9705,0.5354,0.0867,0.0904,0.0782,0.064,0.0687,0.026
lightgbm,Light Gradient Boosting Machine,0.9834,0.6606,0.0444,0.2,0.0727,0.0687,0.0894,38.692
