# Part05 Model Bias

In [14]:
# import modules

from typing import Dict, Union, List

import pandas as pd
import numpy as np
import joblib
import yaml

from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

from src.commons.Utils import impute_scale_and_convert_to_numpy

In [15]:
ohe_test: pd.DataFrame
churn_test: pd.DataFrame 

ohe_test = pd.read_csv(
    filepath_or_buffer="../../dataset/01_interim/ohe_test.csv"
)

churn_test = pd.read_csv(
    filepath_or_buffer="../../dataset/01_interim/churn_test.csv"
)

In [16]:
with open(file="../../config.yaml", mode="r") as file_stream: 
    yml = yaml.load(
        stream=file_stream,
        Loader=yaml.SafeLoader
    )
    mean_total_charges: float = yml["MEAN_TOTAL_CHARGES"]

Now, let's randomly select 100 customers that has

   * TechSupport_No
   * OnlineSecurity_No
   * Contract_Month-to-Month
   * InternetService_Fiber optic
   * OnlineBackup_No

In [17]:
# selecting customers that have high SHAP value indicators
condition =   (ohe_test["TechSupport_No"] == 1) \
            & (ohe_test["OnlineSecurity_No"]) \
            & (ohe_test["Contract_Month-to-month"]) \
            & (ohe_test["InternetService_Fiber optic"]) \
            & (ohe_test["OnlineBackup_No"])

base_ohe: pd.DataFrame = ohe_test[condition]
base_churn: pd.DataFrame = churn_test.loc[churn_test.index.isin(base_ohe.index)]

print(f"there are {len(base_ohe)} customers found with these characteristics")
print(f"len(base_churn) = {len(base_churn)}")

there are 363 customers found with these characteristics
len(base_churn) = 363


In [18]:
# Now, selecting customers that are NOT in the base_ohe (the complementary set)
compared_ohe: pd.DataFrame = ohe_test.loc[~ohe_test.index.isin(base_ohe.index)]

_compared_ohe: pd.DataFrame = compared_ohe.sample(n=len(base_ohe))

compared_ohe = _compared_ohe.copy()
compared_churn: pd.DataFrame = churn_test.loc[churn_test.index.isin(compared_ohe.index)] 

print(f"there are {len(compared_ohe)} customers complementary of base_ohe")
print(f"len(compared_churn) = {len(compared_churn)}")

there are 363 customers complementary of base_ohe
len(compared_churn) = 363


In [19]:
scaler_folder: str = "../../models/scaler"

feature_base_np: np.ndarray
churn_base_np: np.ndarray

feature_compared_np: np.ndarray
churn_compared_np: np.ndarray

feature_base_np, churn_base_np = impute_scale_and_convert_to_numpy(
    ohe_df=base_ohe,
    columns_with_nulls=["TotalCharges"],
    impute_val=[mean_total_charges],
    scaler_folder=scaler_folder,
    churn_df=base_churn
)

feature_compared_np, churn_compared_np = impute_scale_and_convert_to_numpy(
    ohe_df=compared_ohe,
    columns_with_nulls=["TotalCharges"],
    impute_val=[mean_total_charges],
    scaler_folder=scaler_folder,
    churn_df=compared_churn    
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ohe_df[[each_cols]] = ohe_df[[each_cols]].fillna(value=im_val, inplace=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ohe_df[[each_cols]] = ds_scaled
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ohe_df[[each_cols]] = ds_scaled
A value is trying to be set on a copy of a slice from a DataFra

In [21]:
# Load trained PCA
pca: PCA = joblib.load(
    filename="../../models/feature_pca.pkl"
)

feature_base_pca: np.ndarray = pca.transform(X=feature_base_np)
feature_compared_pca: np.ndarray = pca.transform(X=feature_compared_np)

In [22]:
# Load the best GradientBoostingClassifier model

gb_classifier: GradientBoostingClassifier = joblib.load(
    filename="../../models/gb_classifier_best.pkl"
)

In [23]:
# Predict base group
y_pred_base: np.ndarray = gb_classifier.predict(X=feature_base_pca)
base_accuracy_score: float = accuracy_score(
    y_true=churn_base_np.ravel(),
    y_pred=y_pred_base
)

In [24]:
# Predict the compared group
y_pred_compared: np.ndarray = gb_classifier.predict(X=feature_compared_pca)
compared_accuracy_score: float = accuracy_score(
    y_true=churn_compared_np.ravel(),
    y_pred=y_pred_compared
)

In [25]:
print(f"base group accuracy score {base_accuracy_score}")
print("----"*10)
print(f"compared group accuracy score {compared_accuracy_score}")

base group accuracy score 0.7630853994490359
----------------------------------------
compared group accuracy score 0.5509641873278237


As expected, the bias of the model is quite high. 

In fact, the accuracy of the model for predicting churn for customers that have high SHAP value characteristics is about 1.5x better than those who don't

This bias is caused by the imbalanced distribution found in the training dataset itself.