# Model insights

In [1]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import mlflow

# render matplotlib figures directly in the notebook output
%matplotlib inline
# default figure size as (width, height) in inches for plots created in this notebook
plt.rcParams["figure.figsize"] = (14, 8)

## Config

In [2]:
# file paths
# NOTE: both directories are relative, so they resolve against the notebook's
# current working directory
INPUT_DIR = Path("../input")

OUTPUT_DIR = Path("../output")

# data
# training CSV that explain_models reads on every call
TRAIN_DATA = INPUT_DIR / "train.csv"

# columns in the data
# column used as the DataFrame index when the CSV is read
INDEX_COL = "Transaction_ID"

# label column: extracted as y and dropped from the feature frame
TARGET_COL = "MERCHANT_CATEGORIZED_AS"

# columns handed to pd.read_csv(parse_dates=...)
DATETIME_COLS = ["MERCHANT_CATEGORIZED_AT", "PURCHASED_AT"]

# random seed
# shared by the StratifiedKFold splitter and PermutationImportance
RANDOM_SEED = 98765

# mlflow config
# SQLite-backed tracking store whose database file sits under OUTPUT_DIR;
# the URI is registered with mlflow as soon as this cell runs
MLFLOW_TRACKING_URI = f"sqlite:///{OUTPUT_DIR}/mlruns.db"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

## Extracting model insights

In [3]:
try:
    from src import utils
except ImportError:
    import sys
    !{sys.executable} -m pip install -e .. -q

In [4]:
# helper functions
from eli5.sklearn import PermutationImportance
from sklearn.model_selection import StratifiedKFold
import eli5

def get_col_names(pipe) -> list:
    """Return the column labels for the matrix produced by the preprocessing step.

    The labels are assembled in a fixed order: the feature names reported by
    the ``merch_name_vec`` transformer, then the four ``purchased_*``
    timestamp parts, then ``IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY`` and
    ``USER_HOUSEHOLD``, and finally the remaining numeric/user columns.

    Parameters
    ----------
    pipe
        Object exposing ``named_steps["columntransformer"]`` whose
        ``named_transformers_`` mapping contains ``"merch_name_vec"``.

    Returns
    -------
    list
        Feature names, with the ``merch_name_vec`` names first.
    """
    column_transformer = pipe.named_steps["columntransformer"]
    vectorizer = column_transformer.named_transformers_["merch_name_vec"]

    # start from a fresh list so the transformer's own output is never mutated
    names = list(vectorizer.get_feature_names_out())

    # NOTE(review): the order below is hard-coded; presumably it mirrors the
    # column order emitted by the ColumnTransformer - confirm against its setup
    names.extend(f"purchased_{part}" for part in ("month", "day", "weekday", "hour"))
    names.extend(["IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY", "USER_HOUSEHOLD"])
    names.extend(["log_purchase", "log_income", "USER_ID", "USER_GENDER", "age"])
    return names

def explain_models(run_id: str) -> None:
    """Display a permutation-importance table for each fold model of a run.

    The models returned by ``utils.load_models(run_id)`` are paired, by
    position, with the validation folds of a shuffled ``StratifiedKFold``
    built with as many splits as there are models. For every fold the
    validation rows are transformed with the pipeline's ``columntransformer``
    step, permutation importance is fitted on the pipeline's second step, and
    the resulting eli5 weights table is displayed.

    Parameters
    ----------
    run_id : str
        Identifier forwarded to ``utils.load_models``.

    Returns
    -------
    None
        The tables are rendered with ``display``; nothing is returned.

    Notes
    -----
    NOTE(review): this presumably assumes the models were trained on folds
    produced by the same splitter settings (``shuffle=True``,
    ``random_state=RANDOM_SEED``) - confirm against the training code.
    """
    # one pipeline per cross-validation fold
    pipelines = utils.load_models(run_id)

    # training data, separated into target and features
    data = pd.read_csv(TRAIN_DATA, index_col=INDEX_COL, parse_dates=DATETIME_COLS)
    y = data[TARGET_COL]
    X = data.drop(columns=TARGET_COL)

    # as many stratified folds as there are loaded pipelines
    splitter = StratifiedKFold(
        n_splits=len(pipelines), shuffle=True, random_state=RANDOM_SEED
    )

    for fold, (_, holdout_idx) in enumerate(splitter.split(X, y)):
        pipeline = pipelines[fold]

        # validation rows of this fold
        X_holdout = X.iloc[holdout_idx]
        y_holdout = y.iloc[holdout_idx]

        # apply the pipeline's own preprocessing to the validation features
        preprocessor = pipeline.named_steps["columntransformer"]
        X_holdout_trans = preprocessor.transform(X_holdout)

        # labels for the transformed columns
        names = get_col_names(pipeline)

        # second pipeline step (index 1) is taken as the model to explain
        final_estimator = pipeline.steps[1][1]
        importance = PermutationImportance(final_estimator, random_state=RANDOM_SEED)
        importance.fit(X_holdout_trans, y_holdout)
        display(eli5.show_weights(importance, feature_names=names))

  from pandas import MultiIndex, Int64Index


In [5]:
# histogram-based GBT
# Shows one permutation-importance table per fold model of this run.
# NOTE(review): the hard-coded id is forwarded to utils.load_models - presumably
# an MLflow run id in the configured tracking store; confirm it still exists.
explain_models("71e0f0e65613406d88c6e63129f8b5e5")

INFO: Finished 'load_models' in 1.0585 seconds


Weight,Feature
0.1627  ± 0.0662,USER_ID
0.1307  ± 0.0569,log_purchase
0.1253  ± 0.0644,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0933  ± 0.0584,safaricom
0.0613  ± 0.1048,purchased_hour
0.0587  ± 0.0687,log_income
0.0587  ± 0.0399,purchased_weekday
0.0533  ± 0.0608,purchased_month
0.0453  ± 0.0272,USER_HOUSEHOLD
0.0347  ± 0.0131,USER_GENDER


Weight,Feature
0.1040  ± 0.0354,log_purchase
0.0827  ± 0.0311,safaricom
0.0587  ± 0.0213,USER_HOUSEHOLD
0.0480  ± 0.0433,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0480  ± 0.0320,purchased_month
0.0427  ± 0.0930,USER_ID
0.0347  ± 0.0213,purchased_weekday
0.0347  ± 0.0213,log_income
0.0240  ± 0.0662,purchased_day
0.0240  ± 0.0200,limited


Weight,Feature
0.0827  ± 0.0459,safaricom
0.0507  ± 0.0427,log_purchase
0.0240  ± 0.0200,USER_ID
0.0160  ± 0.0427,USER_GENDER
0.0080  ± 0.0574,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0053  ± 0.0574,purchased_month
0  ± 0.0000,im
0  ± 0.0000,impact
0  ± 0.0000,ipay
0  ± 0.0000,italia


Weight,Feature
0.1486  ± 0.0483,log_purchase
0.0919  ± 0.0397,safaricom
0.0784  ± 0.0432,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0757  ± 0.0367,USER_ID
0.0405  ± 0.0764,purchased_hour
0.0189  ± 0.0367,purchased_month
0.0162  ± 0.0202,USER_GENDER
0.0162  ± 0.0108,USER_HOUSEHOLD
0.0081  ± 0.0216,limited
0.0027  ± 0.0202,purchased_day


Weight,Feature
0.0811  ± 0.0616,safaricom
0.0649  ± 0.0692,log_purchase
0.0568  ± 0.0359,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0432  ± 0.0671,USER_ID
0.0378  ± 0.0626,purchased_weekday
0.0351  ± 0.0439,log_income
0.0243  ± 0.0861,purchased_hour
0.0216  ± 0.0216,USER_HOUSEHOLD
0.0162  ± 0.0397,purchased_day
0.0081  ± 0.0132,limited


In [6]:
# LightGBM
# Shows one permutation-importance table per fold model of this run.
# NOTE(review): the hard-coded id is forwarded to utils.load_models - presumably
# an MLflow run id in the configured tracking store; confirm it still exists.
explain_models("7caceff48db9434695cc7850f11347af")

INFO: Finished 'load_models' in 0.1982 seconds


Weight,Feature
0.1040  ± 0.0569,log_purchase
0.0880  ± 0.0433,USER_ID
0.0880  ± 0.0362,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0827  ± 0.0640,safaricom
0.0720  ± 0.0870,purchased_hour
0.0267  ± 0.0337,USER_GENDER
0.0240  ± 0.0392,log_income
0.0160  ± 0.0569,purchased_day
0.0160  ± 0.0107,limited
0.0133  ± 0.0377,USER_HOUSEHOLD


Weight,Feature
0.0800  ± 0.0239,safaricom
0.0640  ± 0.0662,log_purchase
0.0587  ± 0.0362,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0320  ± 0.0644,USER_ID
0.0267  ± 0.0608,log_income
0.0267  ± 0.0413,USER_HOUSEHOLD
0.0213  ± 0.0433,purchased_weekday
0.0160  ± 0.0107,limited
0.0133  ± 0.0169,purchased_month
0.0027  ± 0.0200,age


Weight,Feature
0.1067  ± 0.0337,safaricom
0.0507  ± 0.0544,USER_ID
0.0293  ± 0.0743,purchased_hour
0.0240  ± 0.0261,purchased_month
0.0213  ± 0.0272,purchased_weekday
0.0133  ± 0.0584,log_purchase
0.0107  ± 0.0311,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0  ± 0.0000,impact
0  ± 0.0000,ipay
0  ± 0.0000,italia


Weight,Feature
0.1297  ± 0.0607,log_purchase
0.1027  ± 0.0471,safaricom
0.1000  ± 0.0367,USER_ID
0.0919  ± 0.0524,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0351  ± 0.0557,purchased_hour
0.0243  ± 0.0397,log_income
0.0162  ± 0.0265,purchased_weekday
0.0135  ± 0.0171,limited
0.0081  ± 0.0132,age
0.0054  ± 0.0501,USER_HOUSEHOLD


Weight,Feature
0.0703  ± 0.0602,log_purchase
0.0595  ± 0.0501,safaricom
0.0216  ± 0.0501,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0189  ± 0.0276,USER_GENDER
0.0054  ± 0.0991,purchased_hour
0.0027  ± 0.0397,purchased_weekday
0  ± 0.0000,kariuki
0  ± 0.0000,impact
0  ± 0.0000,ipay
0  ± 0.0000,jackline


In [7]:
# CatBoost
# Shows one permutation-importance table per fold model of this run.
# NOTE(review): the hard-coded id is forwarded to utils.load_models - presumably
# an MLflow run id in the configured tracking store; confirm it still exists.
explain_models("5a150d8abae84d87944a3c947a14f68b")

INFO: Finished 'load_models' in 0.9919 seconds


Weight,Feature
0.1253  ± 0.0549,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.1253  ± 0.0644,safaricom
0.0560  ± 0.0200,USER_ID
0.0400  ± 0.0239,purchased_day
0.0347  ± 0.0272,USER_HOUSEHOLD
0.0267  ± 0.0169,USER_GENDER
0.0240  ± 0.0311,purchased_month
0.0240  ± 0.0354,log_purchase
0.0240  ± 0.0311,purchased_hour
0.0240  ± 0.0311,purchased_weekday


Weight,Feature
0.0720  ± 0.0549,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0720  ± 0.0495,safaricom
0.0427  ± 0.0200,USER_HOUSEHOLD
0.0267  ± 0.0446,log_income
0.0240  ± 0.0311,log_purchase
0.0187  ± 0.0131,age
0.0133  ± 0.0000,corporate
0.0133  ± 0.0000,kplc
0.0133  ± 0.0000,viva
0.0133  ± 0.0000,mburu


Weight,Feature
0.0933  ± 0.0413,safaricom
0.0293  ± 0.0459,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0133  ± 0.0169,naivas
0.0133  ± 0.0000,enterprises
0.0133  ± 0.0000,raphael
0.0133  ± 0.0000,zillions
0.0133  ± 0.0000,nhif
0.0133  ± 0.0446,log_purchase
0.0107  ± 0.0107,peter
0.0027  ± 0.0107,rubis


Weight,Feature
0.1162  ± 0.0439,safaricom
0.1000  ± 0.0405,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0378  ± 0.0202,purchased_day
0.0243  ± 0.0202,purchased_hour
0.0189  ± 0.0439,USER_ID
0.0135  ± 0.0000,total
0.0135  ± 0.0000,nyambura
0.0135  ± 0.0000,butchery
0.0135  ± 0.0000,srt
0.0135  ± 0.0000,telkom


Weight,Feature
0.0703  ± 0.0359,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0676  ± 0.0382,safaricom
0.0297  ± 0.0108,kenda
0.0216  ± 0.0630,log_purchase
0.0189  ± 0.0132,USER_GENDER
0.0135  ± 0.0000,enterprises
0.0135  ± 0.0000,jerusa
0.0135  ± 0.0000,petrocity
0.0108  ± 0.0202,log_income
0.0108  ± 0.0315,USER_HOUSEHOLD


In [8]:
# XGBoost
# Shows one permutation-importance table per fold model of this run.
# NOTE(review): the hard-coded id is forwarded to utils.load_models - presumably
# an MLflow run id in the configured tracking store; confirm it still exists.
explain_models("ce4da2287b8f424eb974adfe7fa46086")

INFO: Finished 'load_models' in 0.4225 seconds


Weight,Feature
0.1040  ± 0.0311,USER_ID
0.0933  ± 0.0608,safaricom
0.0613  ± 0.0622,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0533  ± 0.0653,log_purchase
0.0373  ± 0.0200,kenda
0.0320  ± 0.0574,purchased_hour
0.0213  ± 0.0320,USER_HOUSEHOLD
0.0133  ± 0.0292,purchased_month
0.0080  ± 0.0272,USER_GENDER
0.0080  ± 0.0213,purchased_weekday


Weight,Feature
0.1040  ± 0.0311,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0880  ± 0.0362,log_purchase
0.0853  ± 0.0213,safaricom
0.0613  ± 0.0272,USER_HOUSEHOLD
0.0427  ± 0.0517,USER_ID
0.0293  ± 0.0200,limited
0.0293  ± 0.0200,kenda
0.0213  ± 0.0213,purchased_month
0.0187  ± 0.0362,log_income
0.0107  ± 0.0392,USER_GENDER


Weight,Feature
0.1093  ± 0.0459,safaricom
0.0747  ± 0.0495,log_purchase
0.0587  ± 0.0362,purchased_month
0.0373  ± 0.0200,USER_ID
0.0320  ± 0.0320,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0267  ± 0.0631,purchased_weekday
0.0267  ± 0.0000,kenda
0.0187  ± 0.0272,purchased_day
0.0160  ± 0.0107,naivas
0.0133  ± 0.0000,kileleshwa


Weight,Feature
0.1216  ± 0.0820,log_purchase
0.1162  ± 0.0653,safaricom
0.0649  ± 0.0495,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0568  ± 0.0465,purchased_hour
0.0459  ± 0.0367,kenda
0.0405  ± 0.0382,USER_ID
0.0351  ± 0.0216,USER_GENDER
0.0243  ± 0.0465,USER_HOUSEHOLD
0.0162  ± 0.0202,limited
0.0162  ± 0.0315,log_income


Weight,Feature
0.0892  ± 0.0367,log_purchase
0.0865  ± 0.0471,safaricom
0.0703  ± 0.0108,kenda
0.0486  ± 0.0439,IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY
0.0351  ± 0.0405,USER_HOUSEHOLD
0.0351  ± 0.0276,purchased_weekday
0.0324  ± 0.0216,log_income
0.0324  ± 0.0717,purchased_hour
0.0297  ± 0.0359,USER_ID
0.0216  ± 0.0216,USER_GENDER
