In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os

sys.path.append('..')
sys.path.append('../src')

import pandas as pd
import numpy as np
from datetime import timedelta

import constants as cst

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

from xgboost import XGBClassifier, plot_tree, plot_importance

In [None]:
# Read the feature table and the target table; both paths are resolved relative
# to the parent directory and the first CSV column becomes the index.
training_data, training_target = (
    pd.read_csv(os.path.join('..', csv_path), index_col=0)
    for csv_path in (cst.FEATURES_PATH, cst.TRAIN_TARGET_PATH)
)

In [None]:
# Attach the targets to the features on 'client_id' (left join keeps every
# feature row), then discard any row that still contains a missing value.
full_training_data = (
    training_data
    .merge(training_target, on='client_id', how='left')
    .dropna()
)

In [None]:
# Hold out 30% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
train_set, test_set = train_test_split(full_training_data, test_size=0.3, random_state=42)

## Extract label

In [None]:
# Separate the 'is_churn' label from the features of each split.
# The splits are rebound to the result of a non-in-place drop: dropping in place
# on frames that were derived from another DataFrame can raise pandas'
# SettingWithCopyWarning.
y_train = train_set['is_churn']
train_set = train_set.drop(columns=['is_churn'])

y_test = test_set['is_churn']
test_set = test_set.drop(columns=['is_churn'])

## Transformer

In [None]:
# Columns sent through StandardScaler.
# 'client_age', 'time_from_last_purchase' and 'client_lifetime' are deliberately
# absent here; they are listed in drop_cols below.
scale_nume_cols = [
    'mean_qty', 'mean_sales',
    'n_branch', 'n_product',
    'purchase_freq', 'n_purchases',
]

# Numeric columns forwarded to the model without scaling. The repeated
# '<prefix>1' .. '<prefix>4' families are generated instead of spelled out.
unscaled_num_cols = [
    'max_qty', 'min_qty', 'std_qty',
    *[f'last_qty_{rank}' for rank in range(1, 5)],
    'max_sales', 'min_sales', 'std_sales',
    *[f'last_sales_{rank}' for rank in range(1, 5)],
    *[f'delay_purchase_n{rank}' for rank in range(1, 5)],
]

# Columns removed before modelling.
drop_cols = [
    'client_id',
    'frequency',
    'client_category',
    'time_from_last_purchase',
    'client_lifetime',
    'client_age',
]

In [None]:
# Route each column group to its treatment. The order of the entries below is
# the order of the columns in the transformed output.
transformer = make_column_transformer(
    (StandardScaler(), scale_nume_cols),  # standardise these columns
    ('passthrough', unscaled_num_cols),  # forward these columns unchanged
    ('drop', drop_cols)  # remove these columns
)

## Fitting model

In [None]:
# Chain the column transformer with an XGBoost classifier built with its
# default constructor arguments.
pipeline = make_pipeline(transformer, XGBClassifier())

In [None]:
# Fit the column transformer and the classifier on the training split only.
pipeline.fit(train_set, y_train)

## Evaluating model

In [None]:
# Class predictions for both splits, training split first.
train_pred, test_pred = (
    pipeline.predict(split_features)
    for split_features in (train_set, test_set)
)

### Train eval

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. Passing the
# predictions first swaps precision with recall and transposes the confusion
# matrix, so the ground truth is given first here.
print(f'Accuracy score: {accuracy_score(y_train, train_pred)}')
print(f'Precision score: {precision_score(y_train, train_pred)}')
print(f'Recall score: {recall_score(y_train, train_pred)}')
print(f'F1 score: {f1_score(y_train, train_pred)}')
# Confusion matrix layout: rows are the true classes, columns the predicted ones.
print(f'{confusion_matrix(y_train, train_pred)}')

### Test eval

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. Passing the
# predictions first swaps precision with recall and transposes the confusion
# matrix, so the ground truth is given first here.
print(f'Accuracy score: {accuracy_score(y_test, test_pred)}')
print(f'Precision score: {precision_score(y_test, test_pred)}')
print(f'Recall score: {recall_score(y_test, test_pred)}')
print(f'F1 score: {f1_score(y_test, test_pred)}')
# Confusion matrix layout: rows are the true classes, columns the predicted ones.
print(f'{confusion_matrix(y_test, test_pred)}')

In [None]:
# Label the booster's features with readable names: every name reported by the
# column transformer is split on '__' and the second piece is kept.
fitted_classifier = pipeline.steps[1][1]
readable_feature_names = [
    prefixed_name.split('__')[1]
    for prefixed_name in pipeline.steps[0][1].get_feature_names_out()
]
fitted_classifier.get_booster().feature_names = readable_feature_names

# Plot the feature importances of the fitted classifier.
plot_importance(fitted_classifier)

## Interpretability

In [None]:
import shap

In [None]:
# load JS visualization code to notebook
shap.initjs()

In [None]:
# Build a SHAP tree explainer for the fitted classifier (second pipeline step).
explainer = shap.TreeExplainer(pipeline.steps[1][1])
# Use transform(), not fit_transform(): the column transformer (first pipeline
# step) was already fitted by pipeline.fit, and refitting it here would
# overwrite that fitted state as a side effect of computing explanations.
shap_values = explainer.shap_values(pipeline.steps[0][1].transform(train_set))

In [None]:
# Summary plot of the SHAP values over the training split.
# The features are produced with transform() rather than fit_transform() so the
# already-fitted column transformer is reused instead of being refitted.
# Feature names are the transformer's output names split on '__', second piece kept.
shap.summary_plot(
    shap_values,
    features=pipeline.steps[0][1].transform(train_set),
    feature_names=[name.split('__')[1] for name in pipeline.steps[0][1].get_feature_names_out()],
)

In [None]:
# Position of the first training row predicted as class 1.
# NOTE: this raises IndexError when no training row is predicted as class 1.
i = np.argwhere(train_pred==1)[0][0]
# Force plot for that single row. The features are produced with transform()
# rather than fit_transform() so the already-fitted column transformer is
# reused instead of being refitted.
shap.force_plot(
    explainer.expected_value,
    shap_values[i],
    features=pipeline.steps[0][1].transform(train_set)[i],
    feature_names=[name.split('__')[1] for name in pipeline.steps[0][1].get_feature_names_out()]
)

## Validation

In [None]:
# retrieve non-churners
val_transactions = pd.read_csv(os.path.join('..', cst.VALIDATION_DATA_PATH), index_col=0)
non_churners = val_transactions['client_id'].unique()

In [None]:
# check whether a client has churned in the validation period
train_set.loc[train_set['client_id'].isin(non_churners), 'val_is_churn'] = 0
train_set.loc[~(train_set['client_id'].isin(non_churners)), 'val_is_churn'] = 1

In [None]:
# Share of each 'val_is_churn' value in the training split
# (normalize=True returns fractions instead of counts).
train_set['val_is_churn'].value_counts(normalize=True)

In [None]:
# Score the training-split predictions against the validation-period labels.
# scikit-learn metrics take (y_true, y_pred) in that order. Passing the
# predictions first swaps precision with recall and transposes the confusion
# matrix, so the labels are given first here.
print(f"Accuracy score: {accuracy_score(train_set['val_is_churn'], train_pred)}")
print(f"Precision score: {precision_score(train_set['val_is_churn'], train_pred)}")
print(f"Recall score: {recall_score(train_set['val_is_churn'], train_pred)}")
print(f"F1 score: {f1_score(train_set['val_is_churn'], train_pred)}")
# Confusion matrix layout: rows are the true classes, columns the predicted ones.
print(f"{confusion_matrix(train_set['val_is_churn'], train_pred)}")

## Saving predictions

In [None]:
# Score every row of the full data set with the fitted pipeline.
# 'churn_prob' is the second column of predict_proba; 'churn_pred' is the
# predicted class.
class_probabilities = pipeline.predict_proba(full_training_data)
full_training_data['churn_prob'] = class_probabilities[:, 1]
full_training_data['churn_pred'] = pipeline.predict(full_training_data)

In [None]:
# Flag clients that are among the non-churners yet are predicted to churn.
# Only matching rows are set to 1; other rows are left untouched (missing when
# the 'actionable' column is first created by this assignment).
is_non_churner = full_training_data['client_id'].isin(non_churners)
is_predicted_churn = full_training_data['churn_pred'] == 1
full_training_data.loc[is_non_churner & is_predicted_churn, 'actionable'] = 1

In [None]:
# Write the full frame, including the prediction columns added above, to CSV.
full_training_data.to_csv(os.path.join('..', cst.FULL_PREDICTIONS_PATH))

In [None]:
# Build a SHAP tree explainer for the fitted classifier (second pipeline step).
explainer = shap.TreeExplainer(pipeline.steps[1][1])
# Use transform(), not fit_transform(): refitting the column transformer on
# full_training_data would rescale the inputs differently from what the
# classifier was trained on, and would also alter the transformer stored inside
# the pipeline object that is pickled later in this notebook.
shap_values = explainer.shap_values(pipeline.steps[0][1].transform(full_training_data))

In [None]:
# Save the SHAP values to disk in NumPy's binary format
# (np.save appends '.npy' when the path does not already end with it).
np.save(os.path.join('..', cst.SHAP_VALUES_PATH), shap_values)

In [None]:
import pickle

# Serialise the SHAP explainer to disk with pickle.
explainer_path = os.path.join('..', cst.EXPLAINER_PATH)
with open(explainer_path, 'wb') as explainer_file:
    pickle.dump(explainer, explainer_file)

In [None]:
# Serialise the fitted pipeline (column transformer + classifier) to disk with pickle.
pipeline_path = os.path.join('..', cst.PIPELINE_PATH)
with open(pipeline_path, 'wb') as pipeline_file:
    pickle.dump(pipeline, pipeline_file)

In [None]:
# Display the rows whose 'actionable' flag equals 1.
full_training_data[full_training_data['actionable']==1]