In [None]:
import sys
import os
from pathlib import PurePath

# Add the root directory of the custom Python modules to sys.path.
from typing import Dict
root_path = PurePath(os.getcwd()).parents[2].joinpath('src')
if root_path not in sys.path:
    sys.path.insert(0, str(root_path))
sys.path

In [None]:
import warnings
import lightgbm as lgb
import shap
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from typing import Tuple, Dict
from sklearn import metrics




In [None]:
def _oversampling(train_data: pd.DataFrame, config: Dict) -> pd.DataFrame:
    counts = dict(train_data[config["dependent_var"]].value_counts())

    train_data_class_0 = train_data[train_data[config["dependent_var"]] == 0]
    train_data_class_1 = train_data[train_data[config["dependent_var"]] == 1]

    if counts[1] > counts[0]:
        train_data_class_0_over = train_data_class_0.sample(counts[1], replace=True, random_state=0)
        train_data_over = pd.concat([train_data_class_1, train_data_class_0_over], axis=0)
    else:
        train_data_class_1_over = train_data_class_1.sample(counts[0], replace=True, random_state=0)
        train_data_over = pd.concat([train_data_class_0, train_data_class_1_over], axis=0)

    return train_data_over

In [None]:

def prepare_data(data_for_modeling: pd.DataFrame, config: Dict)-> Tuple:
    """Split the modelling data into an oversampled training set and a validation set.

    A single stratified shuffle split (10% validation, fixed seed) is drawn on
    the dependent variable. Only the training part is passed through
    ``_oversampling``; the validation part keeps its original class balance.

    Args:
        data_for_modeling: Frame holding the feature columns and the
            dependent-variable column.
        config: Mapping with ``"feature_vars"`` (list of feature column names)
            and ``"dependent_var"`` (target column name).

    Returns:
        ``((X_training, y_training), (X_validation, y_validation))``.
    """
    feature_cols = config["feature_vars"]
    target_col = config["dependent_var"]

    X = data_for_modeling[feature_cols]
    y = data_for_modeling[[target_col]]

    # Only the first split produced by the splitter is used.
    sss = StratifiedShuffleSplit(test_size=0.1, random_state=0)
    train_positions, validation_positions = next(sss.split(X, y))

    # The split yields positional indices, hence ``iloc``.
    train_data = data_for_modeling.iloc[train_positions]
    train_data_oversampled = _oversampling(train_data, config)

    X_training = train_data_oversampled[feature_cols]
    y_training = train_data_oversampled[target_col]

    validation_data = data_for_modeling.iloc[validation_positions]
    X_validation = validation_data[feature_cols]
    y_validation = validation_data[target_col]

    return (X_training, y_training), (X_validation, y_validation)


def build_model(X_training, y_training, X_validatin, y_validation, config: Dict) -> "lgb.Booster":
    """Train a LightGBM binary classifier with early stopping on the validation set.

    Args:
        X_training: Training features.
        y_training: Training labels.
        X_validatin: Validation features. The misspelled name is kept so that
            existing keyword callers keep working.
        y_validation: Validation labels.
        config: Currently unused; accepted for interface compatibility.

    Returns:
        The model object returned by ``lgb.train``.
    """
    lgb_train = lgb.Dataset(X_training, y_training)
    # Use the argument that was passed in. The original body read a
    # notebook-level ``X_validation`` because of the parameter typo.
    lgb_val = lgb.Dataset(X_validatin, y_validation, reference=lgb_train)

    # NOTE(review): ``early_stopping_rounds`` / ``verbose_eval`` appear to have
    # been removed from ``lgb.train`` in LightGBM 4.0 in favour of callbacks;
    # confirm against the pinned LightGBM version before upgrading.
    model = lgb.train({"boosting_type": "gbdt", "objective": "binary", "metric": "binary_logloss"},
                      lgb_train,
                      valid_sets=[lgb_train, lgb_val],
                      early_stopping_rounds=5,
                      verbose_eval=True,
                     )

    return model


In [None]:
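# Load data (placeholder): `data_for_modeling`, `config` and `brand` are used
# by the cells below and must be defined before those cells are run.
# data_for_modeling = ...
# data_for_modeling.info()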

In [None]:
# Split into an oversampled training set and a validation set.
# Requires `data_for_modeling` and `config` to be defined beforehand.
(X_training, y_training), (X_validation, y_validation) = prepare_data(data_for_modeling, config)

In [None]:
# Train the LightGBM model on the training split; the validation split is
# passed along for early stopping.
model = build_model(X_training, y_training, X_validation, y_validation, config)

In [None]:
def evaluate(model, weights, brand, expected_y, predicted_y, features=None):
    """Report the AUC and compute weighted, signed global SHAP values for one brand.

    Args:
        model: Trained tree model accepted by ``shap.TreeExplainer``.
        weights: Frame with a ``"_SYS_RESPONDENT_WEIGHT"`` column, one row per
            row of the feature matrix.
        brand: Label used as the index of the single result row.
        expected_y: True labels.
        predicted_y: Predicted scores.
        features: Feature matrix to explain. When omitted, the notebook-level
            ``X`` is used, as the original implementation always did.

    Returns:
        A one-row DataFrame (index ``brand``, one column per feature) holding
        the weight-normalised sum of the signed local SHAP values.
    """
    if features is None:
        # Backward compatibility: existing calls rely on the global ``X``.
        features = X

    # The AUC used to be computed and silently discarded; surface it.
    auc = metrics.roc_auc_score(expected_y, predicted_y)
    print(f"AUC: {auc:.4f}")

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")
        # NOTE(review): index [1] assumes shap returns one array per class for
        # this model; confirm against the installed shap version.
        shap_values_local = explainer.shap_values(features)[1]

    respondent_weights = weights["_SYS_RESPONDENT_WEIGHT"]

    # Flip the sign of the contribution wherever the feature value is 0.
    signed_features = features.replace(0, -1)

    # Scale each row by its respondent weight positionally. Passing two
    # DataFrames to np.multiply aligns them on column labels in pandas 2.x,
    # which pairs the feature columns with the weight column and yields NaN.
    weighted_contributions = signed_features.mul(shap_values_local).mul(
        respondent_weights.to_numpy(), axis=0
    )
    shap_values_global = weighted_contributions.sum(axis=0) / respondent_weights.sum()

    modeling_result = shap_values_global.to_frame(name=brand).T

    return modeling_result




In [None]:
# Feature matrix over all rows of the modelling data (feature columns only).
X = data_for_modeling[config["feature_vars"]]
# Preview the first rows instead of rendering the whole frame.
X.head()

In [None]:
# Time one prediction pass over the full feature matrix X.
%time predicted_y = model.predict(X)

In [None]:
# Same prediction as the timed cell above, without the timing magic.
predicted_y = model.predict(X)

In [None]:
# Respondent weights stay a one-column DataFrame; evaluate() reads the
# "_SYS_RESPONDENT_WEIGHT" column from it.
weights = data_for_modeling[["_SYS_RESPONDENT_WEIGHT"]]
expected_y = data_for_modeling[config["dependent_var"]]
# NOTE(review): `brand` is not assigned in any cell shown here; it must be
# defined before this cell runs.
evaluate(model, weights, brand, expected_y, predicted_y)


In [None]:
# Repeats the evaluation from the previous cell with the same inputs.
weights = data_for_modeling[["_SYS_RESPONDENT_WEIGHT"]]
expected_y = data_for_modeling[config["dependent_var"]]
# NOTE(review): `brand` is not assigned in any cell shown here; it must be
# defined before this cell runs.
evaluate(model, weights, brand, expected_y, predicted_y)



In [None]:
def evaluate(model, weights, brand, expected_y, predicted_y, features=None):
    """Report the AUC and compute weighted, signed global SHAP values for one brand.

    This cell redefines ``evaluate``; once run, it replaces the definition
    from the earlier cell.

    Args:
        model: Trained tree model accepted by ``shap.TreeExplainer``.
        weights: Frame with a ``"_SYS_RESPONDENT_WEIGHT"`` column, one row per
            row of the feature matrix.
        brand: Label used as the index of the single result row.
        expected_y: True labels.
        predicted_y: Predicted scores.
        features: Feature matrix to explain. When omitted, the notebook-level
            ``X`` is used, as the original implementation always did.

    Returns:
        A one-row DataFrame (index ``brand``, one column per feature) holding
        the weight-normalised sum of the signed local SHAP values.
    """
    if features is None:
        # Backward compatibility: existing calls rely on the global ``X``.
        features = X

    # The AUC used to be computed and silently discarded; surface it.
    auc = metrics.roc_auc_score(expected_y, predicted_y)
    print(f"AUC: {auc:.4f}")

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")
        # NOTE(review): index [1] assumes shap returns one array per class for
        # this model; confirm against the installed shap version.
        shap_values_local = explainer.shap_values(features)[1]

    respondent_weights = weights["_SYS_RESPONDENT_WEIGHT"]

    # Flip the sign of the contribution wherever the feature value is 0.
    signed_features = features.replace(0, -1)

    # Scale each row by its respondent weight positionally. Passing two
    # DataFrames to np.multiply aligns them on column labels in pandas 2.x,
    # which pairs the feature columns with the weight column and yields NaN.
    weighted_contributions = signed_features.mul(shap_values_local).mul(
        respondent_weights.to_numpy(), axis=0
    )
    shap_values_global = weighted_contributions.sum(axis=0) / respondent_weights.sum()

    modeling_result = shap_values_global.to_frame(name=brand).T

    return modeling_result




In [None]:
# Feature matrix over all rows of the modelling data (feature columns only).
X = data_for_modeling[config["feature_vars"]]
# Preview the first rows instead of rendering the whole frame.
X.head()

In [None]:
%time
predicted_y = model.predict(X)

In [None]:
# Recompute the predictions for the full feature matrix X (no timing).
predicted_y = model.predict(X)

In [None]:
# Respondent weights stay a one-column DataFrame; evaluate() reads the
# "_SYS_RESPONDENT_WEIGHT" column from it.
weights = data_for_modeling[["_SYS_RESPONDENT_WEIGHT"]]
expected_y = data_for_modeling[config["dependent_var"]]
# NOTE(review): `brand` is not assigned in any cell shown here; it must be
# defined before this cell runs.
evaluate(model, weights, brand, expected_y, predicted_y)

