In [None]:
import pandas as pd
import numpy as np
import os
import sys
import joblib
import json
import xgboost as xgb
import shap
from pathlib import Path
import warnings
from datetime import datetime
warnings.filterwarnings('ignore')
sys.path.append(os.path.abspath('..'))
from sentinel.validation import SentinelFeatureQuality
from sentinel.evaluation import SentinelEvaluator
from sentinel.inference import SentinelInference
from sentinel.modeling import SentinelTrainer
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
import time

In [41]:
start_time = time.time()

In [36]:
collaps = (time.time() - start_time) 
collaps

39.212315797805786

In [12]:
r = round(100*np.array(y).sum()/len(y), 2)
print(r)

26.67


In [2]:
date = datetime.now().replace(microsecond=0).isoformat()
print(date)

2026-02-04T15:13:50


In [42]:
base_path = Path('../')

In [None]:
# # uncomment this cell and run it to download the dataset
# from scripts.download_data import download_dataset
# download_dataset(output_dir= base_path / 'data/raw', force=True)

#### **Train models**

In [43]:
df = pd.read_csv(base_path / 'data/raw/train_raw.csv')

In [5]:
df.head(12).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
TransactionID,2987000,2987001,2987002,2987003,2987004,2987005,2987006,2987007,2987008,2987009,2987010,2987011
isFraud,0,0,0,0,0,0,0,0,0,0,0,0
TransactionDT,86400,86401,86469,86499,86506,86510,86522,86529,86535,86536,86549,86555
TransactionAmt,68.5,29.0,59.0,50.0,50.0,49.0,159.0,422.5,15.0,117.0,75.887,16.495
ProductCD,W,W,W,W,H,W,W,W,H,W,C,C
...,...,...,...,...,...,...,...,...,...,...,...,...
id_36,,,,,F,,,,F,,F,F
id_37,,,,,T,,,,F,,T,T
id_38,,,,,T,,,,T,,T,T
DeviceType,,,,,mobile,,,,mobile,,desktop,desktop


In [44]:
trainer = SentinelTrainer(base_path=base_path)

In [45]:
init_features = df.columns.tolist()
init_obj_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
cat_cols = [

    'ProductCD', 'card4', 'card6', 'DeviceType',
    'os_type', 'browser_type', 'device_vendor',
    'addr1', 'card1', 'card2',
    'P_emaildomain_vendor_id', 
    'P_emaildomain_suffix_id',
    
    'device_info_combo', 'card_email_combo', 
    'product_network_combo', 'card1_addr1_combo', 
    'os_browser_combo'
] + init_obj_cols

cat_cols = list(set(cat_cols))

In [46]:
data_split_config = {'train_size': 0.85,
 'nan_thresh': 0.95,
 'corr_thresh': 0.97, 
 'cat_cols': cat_cols}
trainer.prepare_data(df, **data_split_config)


üöÄ Starting Data Preparation (Train Split=0.85)
   Train size: 501,959 (85.0%)
   Test size:  88,581 (15.0%)
 
  ‚öôÔ∏è  Running SentinelPreprocessing...
--- Fitting Sentinel Preprocessor ---
Dropped 9 columns
--- Sentinel Preprocessor Fitted ---
--- Transforming 501959 rows ---
--- Transforming 88581 rows ---
   üõ†Ô∏è  Running SentinelFeatureEngineering...
--- Fitting Sentinel v7.0 on 501,959 rows ---
   > PCA Group_0: 11 cols -> 2 comps (97% var)
   > PCA Group_1: 23 cols -> 6 comps (97% var)
   > PCA Group_2: 18 cols -> 5 comps (97% var)
   > PCA Group_3: 22 cols -> 7 comps (97% var)
   > PCA Group_4: 20 cols -> 7 comps (97% var)
   > PCA Group_5: 43 cols -> 10 comps (97% var)
   > PCA Group_6: 29 cols -> 7 comps (97% var)
   > PCA Group_7: 31 cols -> 8 comps (97% var)
   > PCA Group_8: 19 cols -> 7 comps (97% var)
   > PCA Group_9: 46 cols -> 8 comps (97% var)
   > PCA Group_10: 16 cols -> 6 comps (97% var)
   > PCA Group_11: 32 cols -> 10 comps (97% var)
   > PCA Group_12: 11

In [47]:
X_train_eng, y_train, X_test_eng, y_test = trainer.X_train, trainer.y_train, trainer.X_test, trainer.y_test

In [48]:
X_train_eng.head(14).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
TransactionID,2987000,2987001,2987002,2987003,2987004,2987005,2987006,2987007,2987008,2987009,2987010,2987011,2987012,2987013
TransactionDT,86400,86401,86469,86499,86506,86510,86522,86529,86535,86536,86549,86555,86564,86585
TransactionAmt,68.5,29.0,59.0,50.0,50.0,49.0,159.0,422.5,15.0,117.0,75.887001,16.495001,50.0,40.0
ProductCD,1,1,1,1,2,1,1,1,2,1,3,3,1,1
card1,13926,2755,4663,18132,4497,5937,12308,12695,2803,17399,16496,4461,3786,12866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PCA_Group_13_2,0.109226,0.109226,0.109226,0.109226,-0.816481,0.109226,0.109226,0.109226,-0.816481,0.109226,0.109226,0.109226,0.109226,0.109226
PCA_Group_13_3,0.107902,0.107902,0.107902,0.107902,-0.839058,0.107902,0.107902,0.107902,-0.839058,0.107902,0.107902,0.107902,0.107902,0.107902
PCA_Group_13_4,-0.093709,-0.093709,-0.093709,-0.093709,0.951159,-0.093709,-0.093709,-0.093709,0.951159,-0.093709,-0.093709,-0.093709,-0.093709,-0.093709
PCA_Group_13_5,0.022433,0.022433,0.022433,0.022433,-0.099798,0.022433,0.022433,0.022433,-0.099798,0.022433,0.022433,0.022433,0.022433,0.022433


In [49]:
all_cols = X_train_eng.columns.tolist()
print('All columns: ', all_cols)

All columns:  ['TransactionID', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_09', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_28', 'id_29', 'id_32', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'country', 'hour_of_day', 'day_of_week', 'day_of_month', 'month_year', 'P_emaildomain_length', 'P_emaildomain_has_digits', 'P_emaildomain_is_free', 'P_emaildomain_vendor_id', 'P_emaildomain_suffix_id', 'P_emaildomain_country_id', 'P_emaildomain_risk_score', 'R_emaildomain_length', 'R_emaildomain_has_digits',

In [31]:
quality_check = SentinelFeatureQuality(
    X_train=X_train_eng,
    y_train=y_train,
    X_test=X_test_eng,
    time_col='TransactionDT'
)

üìä Sentinel Feature Quality Initialized
   Train: 501,959 rows | Fraud Rate: 3.5023%
   Test:  88,581 rows (Drift Analysis Ready)


In [50]:
D_cols = [f'D{i}' for i in range(1, 16) if f'D{i}' in X_train_eng.columns]
statistical_drops = [
    'C2', 'C3', 'C10', 'C11', 'C12',          
    'UID_amt_sum_1h', 'UID_count_24h',      
    'id_04', 'id_06', 'id_10', 'id_17',   
    'id_19', 'id_20',
    'TransactionAmt_log_z',                 
    'screen_height',                    
    'is_99_cents',                      
    'D9_norm',                     
    'addr1_switch',              
    'Amt_div_card4_mean'    
]

features_to_drop_final = [
    'TransactionID',   
    'TransactionDT',   
    'UID',          
    'UID_hash',        
    'P_emaildomain',   
    'R_emaildomain',   
    'country',
     
    'card1', 'card2', 'card3', 'card5', 
    'addr1', 'addr2',                  
    
    *D_cols,        

    'month_year',       
    'day_of_month',     
    'hour_of_day',      
    'days_since_first_txn', 
    'UID_dt_diff',     
    
    *statistical_drops,
    
    'multi_entity_sharing',
    'device_vendor_degree',
    'unique_addrs_per_card',
    'unique_emails_per_card',
    'cards_per_device',
    'addr1_degree',
    
    'card1_addr1_combo', 
    'device_info_combo', 
    'D10_norm',          
    'D15_norm',        
    
    'cents_value',                
    'UID_velocity_1h',            
    'UID_velocity_12h',
    'UID_count_1h',
    'UID_count_12h',
    'email_country_mismatch',     
    'P_emaildomain_country_id',   
    'R_emaildomain_country_id',
    'P_emaildomain_has_digits',
    'R_emaildomain_has_digits',
    
    'PCA_Group_0_1'
]

In [None]:
class StandardColumns:
    # --- CORE TRANSACTION DATA ---
    TRANSACTION_ID = "transaction_id"
    TIMESTAMP      = "timestamp"
    AMOUNT         = "amount"
    SCORE          = "score"      # The output of your ML model
    IS_FRAUD       = "is_fraud"   # Ground truth
    
    # --- STRATEGY CATEGORIES ---
    PRODUCT        = "product"
    EMAIL          = "email_domain"
    DEVICE         = "device_vendor"
    
    # --- BEHAVIORAL & EXPLAINABILITY ---
    VELOCITY       = "velocity_24h"
    HIST_RISK      = "fraud_rate_historical"
    
    # --- FORENSIC DEEP-DIVE ---
    CARD_ID        = "card_id"
    DIST_SIGNAL    = "distance_signal"

COLUMN_MAPPING: Dict[str, str] = {
    # 1. CORE IDENTITY & FINANCIALS
    'TransactionID':        StandardColumns.TRANSACTION_ID,
    'timestamp':            StandardColumns.TIMESTAMP,
    'TransactionAmt':       StandardColumns.AMOUNT,
    'isFraud':              StandardColumns.IS_FRAUD,
    'composite_risk_score': StandardColumns.SCORE, 

    # 2. CATEGORICAL (For Sunburst & Bar Charts)
    'ProductCD':            StandardColumns.PRODUCT,
    'P_emaildomain':        StandardColumns.EMAIL,
    'device_vendor':        StandardColumns.DEVICE,
    'DeviceInfo':           StandardColumns.DEVICE,  # Alias from producer

    # 3. BEHAVIORAL (For Scatter Plots & Explainability)
    'UID_velocity_24h':     StandardColumns.VELOCITY,
    'UID_vel':              StandardColumns.VELOCITY, # Alias from producer
    'card_email_combo_fraud_rate': StandardColumns.HIST_RISK,
    
    # 4. FORENSIC DETAILS (For the Search Table / Detail Panel)
    'card1':                StandardColumns.CARD_ID,
    'dist1':                StandardColumns.DIST_SIGNAL
}

COLUMN_TYPES: Dict[str, str] = {
    StandardColumns.AMOUNT:     'currency',    # e.g. 150.0 -> $150.00
    StandardColumns.SCORE:      'percentage',  # e.g. 0.92 -> 92.0%
    StandardColumns.HIST_RISK:  'percentage',  # e.g. 0.05 -> 5.0%
    StandardColumns.TIMESTAMP:  'datetime',    # e.g. ISO -> Feb 02, 11:00
    StandardColumns.IS_FRAUD:   'boolean'      # e.g. 1 -> "FRAUD"
}

In [5]:
#quality_report = quality_check.score_features(to_analyse)

In [51]:
FIXED_PARAMS = {
            'lgb': {'objective': 'binary', 'boosting_type': 'gbdt', 'metric': 'auc', 'n_jobs': -1, 'verbose': -1, 'random_state': 42},
            'cb': {'loss_function': 'Logloss', 'eval_metric': 'AUC', 'grow_policy': 'SymmetricTree','bootstrap_type': 'Bernoulli', 'thread_count': -1, 'task_type': 'GPU', 'verbose': 0, 'random_state': 42},
            'xgb': {'objective': 'binary:logistic', 'booster': 'gbtree', 'eval_metric': 'auc', 'grow_policy': 'depthwise', 'device': 'cuda', 'tree_method': 'hist', 'early_stopping_rounds': 50, 'n_jobs': -1, 'verbosity': 0, 'random_state': 42}
        }

##### **Train LightGBM**

In [57]:
print(f"\n--- 1. Baseline Training for Feature Selection ---")
lgb_baseline_result = trainer.train_model(
    model_type='lgb',
    params=None,        
    top_n_features=None, 
    full_data=False,    
    n_folds=6,
    cols_to_drop=cols_to_drop,
    save_artifacts=False
)


--- 1. Baseline Training for Feature Selection ---

üöÄ Training LGB Model...

üöÄ Training Final LGB on 255 features | Full Data: False

üöÄ Training LGB Model...
   -> Fold 1/6...
   --- Size: Train=71711, Val=71708 ---
[LightGBM] [Info] Number of positive: 1922, number of negative: 69789
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.431382 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 49363
[LightGBM] [Info] Number of data points in the train set: 71711, number of used features: 251
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.026802 -> initscore=-3.592110
[LightGBM] [Info] Start training from score -3.592110
   -> Fold 2/6...
   --- Size: Train=143419, Val=71708 ---
[LightGBM] [Info] Number of positive: 3732, number of negative: 139687
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was

In [58]:
lgb_top_features = lgb_baseline_result['importances'].groupby('feature')['importance'].mean().sort_values(ascending=False).index.tolist()

In [59]:
print(f"\n--- 1. Baseline Training for Feature Selection ---")
lgb_baseline_result = trainer.train_model(
    model_type='lgb',
    params=None,        
    top_n_features=lgb_top_features[:200], 
    full_data=False,    
    n_folds=6,
    cols_to_drop=cols_to_drop,
    save_artifacts=False
)


--- 1. Baseline Training for Feature Selection ---

üöÄ Training LGB Model...

üöÄ Training Final LGB on 200 features | Full Data: False

üöÄ Training LGB Model...
   -> Fold 1/6...
   --- Size: Train=71711, Val=71708 ---
[LightGBM] [Info] Number of positive: 1922, number of negative: 69789
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.248363 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 46378
[LightGBM] [Info] Number of data points in the train set: 71711, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.026802 -> initscore=-3.592110
[LightGBM] [Info] Start training from score -3.592110
   -> Fold 2/6...
   --- Size: Train=143419, Val=71708 ---
[LightGBM] [Info] Number of positive: 3732, number of negative: 139687
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was

In [None]:
lgb_best_params = trainer.optimize_hyperparameters(model_type='lgb', n_trials=5, n_folds=3, top_n_features=lgb_top_features[:200])


üîç Optuna: Optimizing LGB on 150 features, (20 trials)...


[I 2026-01-27 23:30:39,330] A new study created in memory with name: no-name-53535de4-ff24-48e5-af95-b7a691f049f4
[I 2026-01-27 23:32:42,985] Trial 0 finished with value: 0.9337392818900874 and parameters: {'n_estimators': 2118, 'learning_rate': 0.009662375788586234, 'num_leaves': 134, 'max_depth': 9, 'subsample': 0.8873097738117048, 'colsample_bytree': 0.8863586374306982, 'reg_alpha': 0.04614015967044038, 'reg_lambda': 0.06395495235273566, 'min_child_samples': 23}. Best is trial 0 with value: 0.9337392818900874.
[I 2026-01-27 23:33:40,796] Trial 1 finished with value: 0.9328453865844626 and parameters: {'n_estimators': 3056, 'learning_rate': 0.15693254125681155, 'num_leaves': 227, 'max_depth': 9, 'subsample': 0.9958935902820945, 'colsample_bytree': 0.9501417805654493, 'reg_alpha': 1.4420985095947494e-08, 'reg_lambda': 1.5691560535327646e-05, 'min_child_samples': 79}. Best is trial 0 with value: 0.9337392818900874.
[I 2026-01-27 23:35:15,089] Trial 2 finished with value: 0.938095483287

   ‚úÖ Best params: {'n_estimators': 1728, 'learning_rate': 0.0052086467147919365, 'num_leaves': 254, 'max_depth': 11, 'subsample': 0.7480403373947704, 'colsample_bytree': 0.5017667080472615, 'reg_alpha': 0.07922401383323642, 'reg_lambda': 1.3782534615346387e-05, 'min_child_samples': 40}


In [None]:
lgb_best_params.update(FIXED_PARAMS['lgb'])
lgb_report = trainer.final_train('lgb', lgb_best_params, lgb_top_features[:200])


üöÄ Training LGB Model...

üèÅ Final Test Results (LGB):
   Optimum Threshold: 0.3169
   AUC: 0.8862
   Precision: 49.35%
   Recall: 52.87%
   Net Profit Impact: $181,705.82
üíæ Saved lgb model & 150 features.


##### **Train CatBoost**

In [19]:
print(f"\n--- 1. Baseline Training for Feature Selection ---")
cb_baseline_result = trainer.train_model(
    model_type='cb',
    params=None,        
    top_n_features=None, 
    full_data=False,    
    n_folds=6,
    cols_to_drop=features_to_drop_final,
    save_artifacts=False
)


--- 1. Baseline Training for Feature Selection ---

üöÄ Training CB Model...

üöÄ Training Final CB on 211 features | Full Data: False

üöÄ Training CB Model...
   -> Fold 1/6...
   --- Size: Train=71711, Val=71708 ---


Default metric period is 5 because AUC is/are not implemented for GPU


   -> Fold 2/6...
   --- Size: Train=143419, Val=71708 ---


Default metric period is 5 because AUC is/are not implemented for GPU


   -> Fold 3/6...
   --- Size: Train=215127, Val=71708 ---


Default metric period is 5 because AUC is/are not implemented for GPU


   -> Fold 4/6...
   --- Size: Train=286835, Val=71708 ---


Default metric period is 5 because AUC is/are not implemented for GPU


   -> Fold 5/6...
   --- Size: Train=358543, Val=71708 ---


Default metric period is 5 because AUC is/are not implemented for GPU


   -> Fold 6/6...
   --- Size: Train=430251, Val=71708 ---


Default metric period is 5 because AUC is/are not implemented for GPU



üèÅ Final Test Set Evaluation:

üèÅ Final Test Results (CB):
   Optimum Threshold: 0.3763
   AUC: 0.8620
   Precision: 44.46%
   Recall: 48.82%
   Net Profit Impact: $154,438.81


In [20]:
cb_top_features = cb_baseline_result['importances'].groupby('feature')['importance'].mean().sort_values(ascending=False).index.tolist()
cb_best_params = trainer.optimize_hyperparameters(model_type='cb', n_trials=20, n_folds=5, top_n_features=cb_top_features[:150])
cb_best_params.update(FIXED_PARAMS['cb'])


üîç Optuna: Optimizing CB on 150 features, (20 trials)...


[I 2026-01-28 00:47:40,928] A new study created in memory with name: no-name-67b3a89a-db57-4fed-aa93-8681c75e104d
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2026-01-28 00:49:09,757] Trial 0 finished with value: 0.9454065801698119 and parameters: {'iterations': 2628, 'learning_rate': 0.15118510911583524, 'depth': 4, 'l2_leaf_reg': 7.024678290229622, 'subsample': 0.7309519457710606, 'random_strength': 5.559065905054413e-08}. Best is trial 0 with value: 0.9454065801698119.
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Defau

   ‚úÖ Best params: {'iterations': 3799, 'learning_rate': 0.006575600361961698, 'depth': 10, 'l2_leaf_reg': 28.72327658931732, 'subsample': 0.5560267687485813, 'random_strength': 0.03336729742516672}


In [21]:
cb_report = trainer.final_train('cb', cb_best_params, cb_top_features[:150])


üöÄ Training CB Model...


Default metric period is 5 because AUC is/are not implemented for GPU



üèÅ Final Test Results (CB):
   Optimum Threshold: 0.5247
   AUC: 0.8731
   Precision: 54.02%
   Recall: 48.17%
   Net Profit Impact: $170,147.86
üíæ Saved cb model & 150 features.


##### **Train Xgboost**

In [22]:
print(f"\n--- 1. Baseline Training for Feature Selection ---")
xgb_baseline_result = trainer.train_model(
    model_type='xgb',
    params=None,        
    top_n_features=None, 
    full_data=False,    
    n_folds=6,
    cols_to_drop=features_to_drop_final,
    save_artifacts=False
)


--- 1. Baseline Training for Feature Selection ---

üöÄ Training XGB Model...

üöÄ Training Final XGB on 211 features | Full Data: False

üöÄ Training XGB Model...
   -> Fold 1/6...
   --- Size: Train=71711, Val=71708 ---
   -> Fold 2/6...
   --- Size: Train=143419, Val=71708 ---
   -> Fold 3/6...
   --- Size: Train=215127, Val=71708 ---
   -> Fold 4/6...
   --- Size: Train=286835, Val=71708 ---
   -> Fold 5/6...
   --- Size: Train=358543, Val=71708 ---
   -> Fold 6/6...
   --- Size: Train=430251, Val=71708 ---

üèÅ Final Test Set Evaluation:

üèÅ Final Test Results (XGB):
   Optimum Threshold: 0.0694
   AUC: 0.8709
   Precision: 47.21%
   Recall: 49.92%
   Net Profit Impact: $169,865.33


In [23]:
xgb_top_features = xgb_baseline_result['importances'].groupby('feature')['importance'].mean().sort_values(ascending=False).index.tolist()
xgb_best_params = trainer.optimize_hyperparameters(model_type='xgb', n_trials=20, n_folds=3, top_n_features=xgb_top_features[:150])



üîç Optuna: Optimizing XGB on 150 features, (20 trials)...


[I 2026-01-28 05:47:36,867] A new study created in memory with name: no-name-ba1da086-6461-4fa1-afe1-53d74d7e9f8d
[I 2026-01-28 05:53:54,923] Trial 0 finished with value: 0.9497161496428811 and parameters: {'n_estimators': 4563, 'learning_rate': 0.009343798676910084, 'max_depth': 10, 'min_child_weight': 2, 'subsample': 0.7905613458421644, 'colsample_bytree': 0.872652238330978, 'gamma': 2.381061331230639e-07, 'reg_alpha': 3.818134098819682, 'reg_lambda': 2.881266370086273e-07}. Best is trial 0 with value: 0.9497161496428811.
[I 2026-01-28 05:55:01,212] Trial 1 finished with value: 0.9443039482955862 and parameters: {'n_estimators': 3269, 'learning_rate': 0.0391546819064918, 'max_depth': 6, 'min_child_weight': 7, 'subsample': 0.9520801256095032, 'colsample_bytree': 0.8937036067706721, 'gamma': 1.9441503720234865e-06, 'reg_alpha': 0.5850772791985083, 'reg_lambda': 5.111143638939098}. Best is trial 0 with value: 0.9497161496428811.
[I 2026-01-28 06:00:56,379] Trial 2 finished with value: 0

   ‚úÖ Best params: {'n_estimators': 4935, 'learning_rate': 0.01623652280283671, 'max_depth': 10, 'min_child_weight': 1, 'subsample': 0.7934008192507429, 'colsample_bytree': 0.9843954527397509, 'gamma': 0.004357648635267788, 'reg_alpha': 8.901631981922412, 'reg_lambda': 1.1377762447313336e-08}


In [24]:
xgb_best_params.update(FIXED_PARAMS['xgb'])
del xgb_best_params['early_stopping_rounds']
xgb_report = trainer.final_train('xgb', xgb_best_params, xgb_top_features[:150]) 


üöÄ Training XGB Model...

üèÅ Final Test Results (XGB):
   Optimum Threshold: 0.1090
   AUC: 0.9041
   Precision: 50.29%
   Recall: 59.65%
   Net Profit Impact: $224,249.90
üíæ Saved xgb model & 150 features.


In [25]:
trainer.select_best_model()


üèÜ Selecting Best Configuration based on Financial Impact...

   Evaluating 13 combinations using Cost Strategy...
   ‚≠ê New Leader: [lgb:1.0] Net Savings=$181,705.82
   ‚≠ê New Leader: [xgb:1.0] Net Savings=$224,249.90

üéâ WINNER: {'xgb': 1.0}
   AUC: 0.9041
   Net Savings: $224,249.90
   Optimal Threshold: 0.1090
üíæ Saved production_config.json to ../models/prod_v1


{'weights': {'xgb': 1.0},
 'threshold': 0.10898989898989898,
 'profit': 224249.9,
 'auc': 0.9040543687007886}

In [12]:
xgb_features = json.load(open("../models/prod_v1/xgb_features.json"))

#### **Post-processing & inference**

In [52]:
test_raw = pd.read_csv(base_path / 'data/raw/test_raw.csv')

In [53]:
test_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88581 entries, 0 to 88580
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: float64(399), int64(4), object(31)
memory usage: 293.3+ MB


In [54]:
int_col = [col for col in test_raw.columns if test_raw[col].dtype in ['int8', 'int16', 'int32', 'int64']]
obj_col = [col for col in test_raw.columns if test_raw[col].dtype in ['object', 'category']]
num_col = [col for col in test_raw.columns if test_raw[col].dtype in ['float16', 'float32', 'float64']]
print('Integer columns: ', int_col)
print('Object/Category columns: ', obj_col)

Integer columns:  ['TransactionID', 'isFraud', 'TransactionDT', 'card1']
Object/Category columns:  ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']


In [55]:
# df = test_raw.replace(['nan', 'NaN', 'Nan', np.nan], None)
# df = df.where(pd.notnull(df), None)
records = test_raw.replace({np.nan: None}).to_dict('records') # df.to_dict('records')

In [56]:
sub_records = [records[i] for i in range(1000)]

In [57]:
rev_records = pd.DataFrame(sub_records)
rev_records = rev_records.replace({None: np.nan})

In [58]:
for col in int_col:
    rev_records[col] = pd.to_numeric(rev_records[col], errors='coerce').astype('Int64')

for col in obj_col:
    rev_records[col] = rev_records[col].astype('object')

for col in num_col:
    rev_records[col] = pd.to_numeric(rev_records[col], errors='coerce').astype('float64')

In [59]:
#inference  = SentinelInference(model_dir=base_path/'models/prod_v1')
processor = joblib.load(base_path / 'models/prod_v1/sentinel_preprocessor.pkl')
engineer = joblib.load(base_path / 'models/prod_v1/sentinel_engineer.pkl')
with open(base_path / 'models/prod_v1/categorical_features.json', 'r') as f: 
    cat_features = json.load(f)
xgb_model = joblib.load(base_path / 'models/prod_v1/xgb_model.pkl')
cb_model = joblib.load(base_path / 'models/prod_v1/cb_model.pkl')
lgb_model = joblib.load(base_path / 'models/prod_v1/lgb_model.pkl')
print("‚úÖ Successfully loaded processor and engineer!")

‚úÖ Successfully loaded processor and engineer!


In [60]:
df_clean = processor.transform(rev_records)
df_features =engineer.transform(df_clean)

--- Transforming 1000 rows ---
--- Transforming 1,000 rows ---


In [90]:
cols = test_raw.columns.tolist()

In [113]:
test_raw.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3488959,0,13151880,57.95,W,7919,194.0,150.0,mastercard,166.0,...,,,,,,,,,,
1,3488960,0,13151945,47.95,W,1764,158.0,150.0,visa,226.0,...,,,,,,,,,,
2,3488961,0,13152098,209.95,W,2455,321.0,150.0,visa,226.0,...,,,,,,,,,,
3,3488962,0,13152112,107.95,W,7919,194.0,150.0,mastercard,166.0,...,,,,,,,,,,
4,3488963,0,13152147,58.95,W,10838,143.0,150.0,visa,226.0,...,,,,,,,,,,


In [91]:
DASHBOARD_FEATURES = [
            'P_emaildomain', 'TransactionAmt', 'ProductCD','TransactionAmt_log', 'TransactionAmt_suspicious', 'cents_value',
            'country', 'composite_risk_score', 'DeviceType', 'os_browser_combo', 
            'UID_velocity_1h', 'UID_velocity_12h', 'UID_velocity_24h',
            'multi_entity_sharing', 'card_email_combo', 'device_info_combo', 
            'P_emaildomain_risk_score', 'email_match_status', 'P_emaildomain_is_free',
            'ProductCD_switch', 'user_amt_zscore', 'Amt_div_card1_mean', 
            'hour_of_day', 'day_of_week', 'time_gap_anomaly',
            'screen_area', 'addr1_fraud_rate', 'addr1_degree'
        ]

In [77]:
['P_emaildomain', 'TransactionAmt', 'ProductCD', 'TransactionAmt_log', 
'TransactionAmt_suspicious', 'cents_value', 'country', 'composite_risk_score',
 'DeviceType', 'os_browser_combo', 'UID_velocity_1h', 'UID_velocity_12h', 
 'UID_velocity_24h', 'multi_entity_sharing', 'card_email_combo', 
 'device_info_combo', 'P_emaildomain_risk_score', 'email_match_status', 
 'P_emaildomain_is_free', 'ProductCD_switch', 'user_amt_zscore', 
 'Amt_div_card1_mean', 'hour_of_day', 'day_of_week', 'time_gap_anomaly', 
 'screen_area', 'addr1_fraud_rate', 'addr1_degree']

['P_emaildomain',
 'TransactionAmt',
 'ProductCD',
 'TransactionAmt_log',
 'TransactionAmt_suspicious',
 'cents_value',
 'country',
 'composite_risk_score',
 'DeviceType',
 'os_browser_combo',
 'UID_velocity_1h',
 'UID_velocity_12h',
 'UID_velocity_24h',
 'multi_entity_sharing',
 'card_email_combo',
 'device_info_combo',
 'P_emaildomain_risk_score',
 'email_match_status',
 'P_emaildomain_is_free',
 'ProductCD_switch',
 'user_amt_zscore',
 'Amt_div_card1_mean',
 'hour_of_day',
 'day_of_week',
 'time_gap_anomaly',
 'screen_area',
 'addr1_fraud_rate',
 'addr1_degree']

In [105]:
def extract_data_for_dashboard(data_original: pd.DataFrame, df_features: pd.DataFrame):
        """
        Extract data for dashboard.
        """
        DASHBOARD_FEATURES = [
            'P_emaildomain', 'TransactionAmt', 'ProductCD','TransactionAmt_log', 'TransactionAmt_suspicious', 'cents_value',
            'country', 'composite_risk_score', 'DeviceType', 'os_browser_combo', 
            'UID_velocity_1h', 'UID_velocity_12h', 'UID_velocity_24h',
            'multi_entity_sharing', 'card_email_combo', 'device_info_combo', 
            'P_emaildomain_risk_score', 'email_match_status', 'P_emaildomain_is_free',
            'ProductCD_switch', 'user_amt_zscore', 'Amt_div_card1_mean', 
            'hour_of_day', 'day_of_week', 'time_gap_anomaly',
            'screen_area', 'addr1_fraud_rate', 'addr1_degree'
        ]

        orinigal_cols = [c for c in data_original.columns if c in DASHBOARD_FEATURES]
        feature_cols = [col for col in DASHBOARD_FEATURES if col not in orinigal_cols]

        missing_cols = [c for c in DASHBOARD_FEATURES if c not in data_original.columns and c not in df_features.columns]
        
        for col in missing_cols:
            df_features[col] = np.nan
        
        export_df = data_original[orinigal_cols].copy()
        eng_df = df_features[feature_cols].copy()
        
        dash_df = pd.concat([export_df, eng_df], axis=1)

        return dash_df

In [106]:
sub_test = test_raw.head(10)
test_clean = processor.transform(sub_test)
test_features =engineer.transform(test_clean)

--- Transforming 10 rows ---
--- Transforming 10 rows ---


In [109]:
dash_df = extract_data_for_dashboard(sub_test, test_features)

In [110]:
test_features.shape

(10, 274)

In [112]:
dash_df

Unnamed: 0,TransactionAmt,ProductCD,P_emaildomain,DeviceType,TransactionAmt_log,TransactionAmt_suspicious,cents_value,country,composite_risk_score,os_browser_combo,...,P_emaildomain_is_free,ProductCD_switch,user_amt_zscore,Amt_div_card1_mean,hour_of_day,day_of_week,time_gap_anomaly,screen_area,addr1_fraud_rate,addr1_degree
0,57.95,W,anonymous.com,,4.07669,1,950,USA/Global,0.029226,0,...,0,0,-0.228482,0.572162,5,1,1,,0.013608,1
1,47.95,W,gmail.com,,3.8908,1,950,USA/Global,0.037126,0,...,1,0,-0.717842,0.471908,5,1,1,,0.017447,2
2,209.95,W,gmail.com,,5.351621,1,949,USA/Global,0.037529,0,...,1,0,-459.76001,0.785005,5,1,1,,0.026946,1
3,107.95,W,yahoo.com,,4.690889,1,949,USA/Global,0.02896,0,...,1,0,0.0,1.06583,5,1,1,,0.026949,1
4,58.95,W,gmail.com,,4.093511,1,950,USA/Global,0.031623,0,...,1,0,-0.307249,0.568443,5,1,1,,0.014253,1
5,224.0,W,,,5.416101,1,0,Unknown,0.029107,0,...,0,0,5.062349,2.211635,5,1,1,,0.022216,1
6,70.825,C,yahoo.com,mobile,4.274232,0,824,USA/Global,0.048029,0,...,1,0,0.0,0.57573,5,1,1,,0.114033,2
7,77.95,W,yahoo.com,,4.368815,0,949,USA/Global,0.028141,0,...,1,0,1.111463,0.769629,5,1,1,,0.018759,1
8,12.296,C,gmail.com,desktop,2.587463,0,295,USA/Global,0.081554,1,...,1,0,-0.916746,0.275608,5,1,1,,0.114033,2
9,57.95,W,gmail.com,,4.07669,1,950,USA/Global,0.029939,0,...,1,0,0.0,0.572162,5,1,1,,0.017447,2


In [61]:
cols = df_features.columns.tolist()
print(cols)

['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_09', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_28', 'id_29', 'id_32', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'country', 'hour_of_day', 'day_of_week', 'day_of_month', 'month_year', 'P_emaildomain_length', 'P_emaildomain_has_digits', 'P_emaildomain_is_free', 'P_emaildomain_vendor_id', 'P_emaildomain_suffix_id', 'P_emaildomain_country_id', 'P_emaildomain_risk_score', 'R_emaildomain_length', 'R_emaildomain_has_digits', 'R

In [None]:
['TransactionID', 'isFraud', 'TransactionAmt', 'ProductCD', 'TransactionAmt_log', 'TransactionAmt_suspicious', 'country',
'composite_risk_score', 'DeviceType', 'os_browser_combo', 'ID_velocity_1h', 'UID_velocity_12h', 'UID_velocity_24h',
'multi_entity_sharing', 'card_email_combo', 'device_info_combo', 'P_emaildomain_risk_score', 'email_match_status', 'P_emaildomain_is_free',
'composite_risk_score', 'composite_risk_score', 'ProductCD_switch', 'user_amt_zscore', 'Amt_div_card1_mean', 'hour_of_day', 'day_of_week'] 

In [None]:
['TransactionID', 'isFraud', 'TransactionAmt', 'ProductCD','TransactionAmt_log', 'TransactionAmt_suspicious', 'cents_value',
'country', 'composite_risk_score', 'DeviceType', 'os_browser_combo', 
'UID_velocity_1h', 'UID_velocity_12h', 'UID_velocity_24h',
'multi_entity_sharing', 'card_email_combo', 'device_info_combo', 
'P_emaildomain_risk_score', 'email_match_status', 'P_emaildomain_is_free',
'ProductCD_switch', 'user_amt_zscore', 'Amt_div_card1_mean', 
'hour_of_day', 'day_of_week', 'time_gap_anomaly',
'screen_area', 'addr1_fraud_rate', 'addr1_degree']

In [None]:
['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 
'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 
'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 
'id_09', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_28', 'id_29', 'id_32', 'id_34', 'id_35', 'id_36', 
'id_37', 'id_38', 'DeviceType', 'country', 'hour_of_day', 'day_of_week', 'day_of_month', 'month_year', 'P_emaildomain_length', 'P_emaildomain_has_digits', 'P_emaildomain_is_free',
 'P_emaildomain_vendor_id', 'P_emaildomain_suffix_id', 'P_emaildomain_country_id', 'P_emaildomain_risk_score', 'R_emaildomain_length', 'R_emaildomain_has_digits', 
 'R_emaildomain_is_free', 'R_emaildomain_vendor_id', 'R_emaildomain_suffix_id', 'R_emaildomain_country_id', 'R_emaildomain_risk_score', 
 'email_match_status', 'email_country_mismatch', 'screen_width', 'screen_height', 'os_type', 'browser_type', 'device_vendor', 'D1_norm', 
 'D2_norm', 'D3_norm', 'D4_norm', 'D5_norm', 'D6_norm', 'D7_norm', 'D8_norm', 'D9_norm', 'D10_norm', 'D11_norm', 'D12_norm', 'D13_norm', 'D14_norm',
  'D15_norm', 'UID', 'device_info_combo', 'card_email_combo', 'hour_sin', 'hour_cos', 'days_since_first_txn', 'UID_dt_diff', 
  'UID_count_1h', 'UID_amt_sum_1h', 'UID_velocity_1h', 'UID_count_12h', 'UID_amt_sum_12h', 'UID_velocity_12h', 'UID_count_24h',
   'UID_amt_sum_24h', 'UID_velocity_24h', 'TransactionAmt_log', 'TransactionAmt_log_z', 'TransactionAmt_suspicious', 'txn_sequence', 'amt_change_abs', 
   'ProductCD_switch', 'addr1_switch', 'device_vendor_switch', 'time_gap_anomaly', 'screen_area', 'screen_aspect_ratio', 'product_network_combo', 
   'card1_addr1_combo', 'os_browser_combo', 'device_vendor_degree', 'addr1_degree', 'unique_addrs_per_card', 'unique_emails_per_card', 'multi_entity_sharing', 
   'cents_value', 'is_exact_dollars', 'is_99_cents', 'TransactionAmt_decimal', 'UID_freq_enc', 'card1_freq_enc', 'addr1_freq_enc', 'card4_freq_enc', 
   'P_emaildomain_vendor_id_freq_enc', 'device_vendor_freq_enc', 'addr1_fraud_rate', 'card4_fraud_rate', 'P_emaildomain_vendor_id_fraud_rate', 'device_vendor_fraud_rate',
    'device_info_combo_fraud_rate', 'card_email_combo_fraud_rate', 'ProductCD_fraud_rate', 'card6_fraud_rate', 'os_type_fraud_rate', 'browser_type_fraud_rate', 
    'hour_of_day_fraud_rate', 'composite_risk_score', 'user_amt_zscore', 'Amt_div_card1_mean', 'Amt_div_card4_mean', 'Amt_div_addr1_mean', 'Amt_div_P_emaildomain_vendor_id_mean',
     'Amt_div_ProductCD_mean', 'PCA_Group_0_0', 'PCA_Group_0_1', 'PCA_Group_1_0', 'PCA_Group_1_1', 'PCA_Group_1_2', 'PCA_Group_1_3', 'PCA_Group_1_4', 'PCA_Group_1_5', 
     'PCA_Group_2_0', 'PCA_Group_2_1', 'PCA_Group_2_2', 'PCA_Group_2_3', 'PCA_Group_2_4', 'PCA_Group_3_0', 'PCA_Group_3_1', 'PCA_Group_3_2', 'PCA_Group_3_3', 
     'PCA_Group_3_4', 'PCA_Group_3_5', 'PCA_Group_3_6', 'PCA_Group_4_0', 'PCA_Group_4_1', 'PCA_Group_4_2', 'PCA_Group_4_3', 'PCA_Group_4_4', 'PCA_Group_4_5', 
     'PCA_Group_4_6', 'PCA_Group_5_0', 'PCA_Group_5_1', 'PCA_Group_5_2', 'PCA_Group_5_3', 'PCA_Group_5_4', 'PCA_Group_5_5', 'PCA_Group_5_6', 'PCA_Group_5_7', 
     'PCA_Group_5_8', 'PCA_Group_5_9', 'PCA_Group_6_0', 'PCA_Group_6_1', 'PCA_Group_6_2', 'PCA_Group_6_3', 'PCA_Group_6_4', 'PCA_Group_6_5', 'PCA_Group_6_6', 
     'PCA_Group_7_0', 'PCA_Group_7_1', 'PCA_Group_7_2', 'PCA_Group_7_3', 'PCA_Group_7_4', 'PCA_Group_7_5', 'PCA_Group_7_6', 'PCA_Group_7_7', 'PCA_Group_8_0', 
     'PCA_Group_8_1', 'PCA_Group_8_2', 'PCA_Group_8_3', 'PCA_Group_8_4', 'PCA_Group_8_5', 'PCA_Group_8_6', 'PCA_Group_9_0', 'PCA_Group_9_1', 'PCA_Group_9_2', 
     'PCA_Group_9_3', 'PCA_Group_9_4', 'PCA_Group_9_5', 'PCA_Group_9_6', 'PCA_Group_9_7', 'PCA_Group_10_0', 'PCA_Group_10_1', 'PCA_Group_10_2', 'PCA_Group_10_3', 
     'PCA_Group_10_4', 'PCA_Group_10_5', 'PCA_Group_11_0', 'PCA_Group_11_1', 'PCA_Group_11_2', 'PCA_Group_11_3', 'PCA_Group_11_4', 'PCA_Group_11_5', 'PCA_Group_11_6', 
     'PCA_Group_11_7', 'PCA_Group_11_8', 'PCA_Group_11_9', 'PCA_Group_12_0', 'PCA_Group_12_1', 'PCA_Group_12_2', 'PCA_Group_12_3', 'PCA_Group_12_4', 'PCA_Group_12_5', 
     'PCA_Group_12_6', 'PCA_Group_12_7', 'PCA_Group_13_0', 'PCA_Group_13_1', 'PCA_Group_13_2', 'PCA_Group_13_3', 'PCA_Group_13_4', 'PCA_Group_13_5', 'UID_hash']

In [25]:
xgb_features = xgb_model.get_booster().feature_names
cb_features = cb_model.feature_names_
lgb_features = lgb_model.feature_name_

xgb_booster = xgb_model.get_booster()

In [26]:
xgb_pred = xgb_model.predict_proba(df_features[xgb_features])[:, 1]

In [27]:
background_summary = shap.kmeans(df_features[xgb_features], 200)
background_df = pd.DataFrame(background_summary.data, columns=xgb_features)

In [29]:
xgb_model.set_params(predictor="cpu_predictor")

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'binary:logistic'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,'gbtree'
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.9843954527397509
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",'cuda'
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,True


In [39]:
xgb_explainer = shap.TreeExplainer(
        model=xgb_model,
        # No data parameter for tree_path_dependent
        model_output='raw',  # Log-odds for binary classification
        feature_perturbation='tree_path_dependent'
    )

In [153]:
test_row = df_features[xgb_features].head(100)

In [152]:
xgb_model.n_estimators

4935

In [166]:
dmat = xgb.DMatrix(test_row, enable_categorical=True)
start = time.time()
    # pred_contribs=True calculates EXACT TreeSHAP
    # validate_features=False saves a few milliseconds of overhead
shap_matrix = xgb_model.get_booster().predict(
    dmat,
    pred_contribs=True,
    validate_features=False,
    iteration_range=(0, xgb_model.n_estimators)
)
duration = (time.time() - start) * 1000

In [183]:
batch_explanations = []
        
for i in range(len(test_row)):
    feature_impacts = shap_matrix[i][:-1]
    
    raw_values = test_row.iloc[i].values
    
    impacts_list = []
    for name, val, impact in zip(xgb_features, raw_values, feature_impacts):
        if abs(impact) > 1e-4:
            impacts_list.append({
                "feature": name,
                "value": str(round(val, 3)),
                "impact": float(impact)
            })
    
    top_n = sorted(impacts_list, key=lambda x: abs(x['impact']), reverse=True)[:10]
    batch_explanations.append(top_n)

In [197]:
import plotly.graph_objects as go

def notebook_shap_bar(explanation_list):
    """
    explanation_list: List of {'feature': str, 'value': str, 'impact': float}
    """
    # Sort data so biggest positive impacts are at the top
    data = sorted(explanation_list, key=lambda x: x['impact'])
    
    features = [d['feature'] for d in data]
    values = [d['value'] for d in data]
    impacts = [d['impact'] for d in data]
    
    # Create labels that include the actual data point
    #labels = [f"<b>{f}</b><br>Value: {v}" for f, v in zip(features, values)]
    labels=[f"<b>{d['feature']}</b> ({d['value']})" for d in data]
    
    # Red for positive (Fraud), Green for negative (Legit)
    colors = ['#EF553B' if x > 0 else '#00CC96' for x in impacts]

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=impacts,
        y=labels,
        orientation='h',
        marker_color=colors,
        text=[f"{x:+.2f}" for x in impacts], # Show impact value on bar
        textposition='outside'
    ))

    fig.update_layout(
        title="<b>Transaction Risk Evidence</b><br><sup>What pushed the model toward Fraud vs Legit</sup>",
        template="plotly_dark", # Built-in dark theme
        xaxis_title="Impact on Score (Log-Odds): Legit Signal <---> Fraud Signal",
        yaxis=dict(ticksuffix="  "),
        height=len(features) * 50, # Responsive height
        margin=dict(l=200), # Space for feature names
        showlegend=False
    )
    
    # Add a strong zero line
    fig.add_vline(x=0, line_width=2, line_color="white", line_dash="dash")
    
    return fig

# TEST IT:
# Assuming batch_explanations[0] contains your result
# fig = notebook_shap_bar(batch_explanations[0])
# fig.show()

In [None]:
def notebook_shap_waterfall(explanation_list, base_value):
    """
    explanation_list: List of {'feature': str, 'value': str, 'impact': float}
    base_value: The starting point (explainer.expected_value)
    """
    # Sort by absolute impact
    data = sorted(explanation_list, key=lambda x: abs(x['impact']), reverse=True)
    
    features = ["Starting Base"] + [d['feature'] for d in data]
    impacts = [base_value] + [d['impact'] for d in data]
    
    # Calculate measures for Plotly waterfall
    # First is absolute, rest are relative
    measures = ["absolute"] + ["relative"] * len(data)

    fig = go.Figure(go.Waterfall(
        orientation = "h",
        measure = measures,
        x = impacts,
        y = features,
        connector = {"line":{"color":"rgb(63, 63, 63)"}},
        increasing = {"marker":{"color":"#EF553B"}}, # Red
        decreasing = {"marker":{"color":"#00CC96"}}, # Green
        totals = {"marker":{"color":"#636EFA"}}      # Blue
    ))

    fig.update_layout(
        title = "<b>Risk Score Build-up</b>",
        template = "plotly_dark",
        height = 600,
        showlegend = False
    )

    return fig

# TEST IT:
# fig = notebook_shap_waterfall(batch_explanations[0], xgb_explainer.expected_value)
# fig.show()

In [189]:
def notebook_risk_gauge(base_value, explanation_list):
    # Calculate final probability
    total_impact = sum([d['impact'] for d in explanation_list])
    final_log_odds = base_value + total_impact
    prob = (1 / (1 + np.exp(-final_log_odds))) * 100

    fig = go.Figure(go.Indicator(
        mode = "gauge+number",
        value = prob,
        number = {'suffix': "%", 'font': {'size': 60, 'color': 'white'}},
        title = {'text': "Final Risk Score", 'font': {'size': 24}},
        gauge = {
            'axis': {'range': [0, 100], 'tickwidth': 1, 'tickcolor': "white"},
            'bar': {'color': "#636EFA"},
            'bgcolor': "rgba(0,0,0,0)",
            'borderwidth': 2,
            'bordercolor': "gray",
            'steps': [
                {'range': [0, 30], 'color': "#00CC96"},  # Safe
                {'range': [30, 70], 'color': "#FFA15A"}, # Warning
                {'range': [70, 100], 'color': "#EF553B"} # Danger
            ],
            'threshold': {
                'line': {'color': "white", 'width': 4},
                'thickness': 0.75,
                'value': prob
            }
        }
    ))

    fig.update_layout(template="plotly_dark", height=400)
    return fig

# RUN: notebook_risk_gauge(shap_matrix[0][-1], batch_explanations[0]).show()

In [None]:
def notebook_dot_plot(explanation_list):
    data = sorted(explanation_list, key=lambda x: x['impact'])
    
    fig = go.Figure()

    # Add background shapes for Fraud vs Legit zones
    fig.add_vrect(x0=0, x1=max([d['impact'] for d in data])*1.2, fillcolor="red", opacity=0.05, layer="below", line_width=0)
    fig.add_vrect(x0=min([d['impact'] for d in data])*1.2, x1=0, fillcolor="green", opacity=0.05, layer="below", line_width=0)

    fig.add_trace(go.Scatter(
        x=[d['impact'] for d in data],
        y=[f"<b>{d['feature']}</b> ({d['value']})" for d in data],
        mode='markers+text',
        marker=dict(
            color=['#EF553B' if d['impact'] > 0 else '#00CC96' for d in data],
            size=14,
            symbol='diamond'
        ),
        text=[f" {d['impact']:+.2f}" for d in data],
        textposition="middle right",
    ))

    fig.update_layout(
        title="<b>Risk Evidence Strength</b>",
        template="plotly_dark",
        xaxis_title="Legit Signal <---> Fraud Signal",
        yaxis=dict(showgrid=True, gridcolor="#232a35"),
        height=600,
        margin=dict(l=200)
    )
    fig.add_vline(x=0, line_width=1, line_color="white")
    
    return fig

# RUN: notebook_dot_plot(batch_explanations[0]).show()

In [221]:
fig = notebook_shap_bar(batch_explanations[1])
fig.show()

In [191]:
notebook_dot_plot(batch_explanations[0])

In [208]:
def _get_action(prob: float, soft: float, hard: float) -> str:
    if prob >= hard: return "BLOCK" 
    elif prob >= soft: return "REVIEW"
    return "APPROVE" 

In [215]:
list([0, 0, 0, 0])

[0, 0, 0, 0]

In [205]:
import plotly.graph_objects as go

def notebook_risk_radar(explanation_list):
    # We take the absolute impact to show 'strength' of signals
    data = sorted(explanation_list, key=lambda x: abs(x['impact']), reverse=True)[:]
    
    fig = go.Figure()

    fig.add_trace(go.Scatterpolar(
        r=[abs(d['impact']) for d in data],
        theta=[d['feature'] for d in data],
        fill='toself',
        marker=dict(color='#00CC96'),
        line=dict(color='#00CC96'),
        hoverinfo="text",
        text=[f"{d['feature']}<br>Value: {d['value']}<br>Impact: {d['impact']:+.2f}" for d in data]
    ))

    fig.update_layout(
        polar=dict(
            bgcolor='rgba(0,0,0,0)',
            radialaxis=dict(visible=True, range=[0, max([abs(d['impact']) for d in data]) * 1.1], showticklabels=False),
        ),
        showlegend=False,
        template="plotly_dark",
        title="<b>Transaction Risk Fingerprint</b><br><sup>Top 8 Signal Strengths</sup>",
        height=500
    )
    return fig

# RUN: notebook_risk_radar(batch_explanations[0]).show()

In [206]:
notebook_risk_gauge(bias, data_to_plot)

In [187]:
# 1. Extract the bias from the last column of your matrix for row 0
row_index = 0
bias = float(shap_matrix[row_index][-1])

# 2. Get your processed Top 10 list for row 0
data_to_plot = batch_explanations[row_index]

# 3. Show the chart
fig = notebook_shap_waterfall(data_to_plot, bias)
fig.show()


In [None]:
notebook_shap_waterfall(explanation_list, base_value)

In [158]:
#shap_matrix

In [None]:
import os
# Force the system to hide the GPU from this process
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import shap
import xgboost as xgb
import pandas as pd
import joblib

# 1. Load your model as usual
# (Assume xgb_model is loaded here)

# 2. Force the model parameters to CPU at the "Booster" level
if hasattr(xgb_model, "get_booster"):
    # For Scikit-learn wrapper
    xgb_model.get_booster().set_param({"device": "cpu", "predictor": "cpu_predictor"})
else:
    # For native xgb.Booster
    xgb_model.set_param({"device": "cpu", "predictor": "cpu_predictor"})

# 3. Initialize the explainer
# Since CUDA_VISIBLE_DEVICES is "", it cannot even "see" the GPU
xgb_explainer = shap.TreeExplainer(
    xgb_model, 
    data=None, 
    feature_perturbation='tree_path_dependent'
)


print("‚úÖ Success! The GPU was bypassed and SHAP was calculated on CPU.")
print(f"SHAP shape: {shap_values.shape}")

‚úÖ Success! The GPU was bypassed and SHAP was calculated on CPU.
SHAP shape: (1, 150)


In [114]:
# 4. Test with a single row
test_row = df_features[xgb_features].head(1)
shap_values = xgb_explainer.shap_values(test_row, tree_limit=500)


In [157]:
#shap_values

In [156]:
#shap_values

In [83]:
import shap
print(shap.explainers._tree.assert_import("cext"))

None


In [42]:
import shap
import pandas as pd
import joblib

# 1. Simply sample 200 REAL rows from your 500k dataset
# This is instant and mathematically robust
xgb_explainer = shap.TreeExplainer(
    xgb_model, 
    data=None, 
    feature_perturbation='tree_path_dependent'
)

# 3. Test on a small sample
print("Testing with shap.sample...")
test_sample = df_features[xgb_features].iloc[:2]
print("‚úÖ SUCCESS! SHAP calculated using real-row sampling.")

# 4. Save
#joblib.dump(explainer, 'shap_explainer.pkl')

Testing with shap.sample...
‚úÖ SUCCESS! SHAP calculated using real-row sampling.


In [38]:
shap_values_obj = explainer(test_sample, max_evals=len(xgb_features) * 2 + 1)

print("‚úÖ SUCCESS! Permutation SHAP calculated.")

‚úÖ SUCCESS! Permutation SHAP calculated.


In [None]:
pred = inference.predict(test_raw.head(15000))

In [21]:
xgbfeatures = inference._get_features(xgb_model, 'xgb')
cbfeatures = inference._get_features(cb_model, 'cb')
lgbfeatures = inference._get_features(lgb_model, 'lgb')

In [155]:
df_clean = processor.transform(rev_records)
df_features =engineer.transform(df_clean)

--- Transforming 2000 rows ---
--- Transforming 2,000 rows ---


In [23]:
missing = list(set(lgbfeatures) - set(df_features.columns.tolist()))
missing

['device_info_combo_fraud_rate',
 'screen_area',
 'UID_amt_sum_12h',
 'UID_amt_sum_24h',
 'UID_velocity_24h',
 'screen_width',
 'screen_aspect_ratio']

In [24]:
for col in missing:
    df_features[col] = np.nan

In [2]:
# for col in cbfeatures:
#     if col in cat_features:
#         df_features[col] = df_features[col].astype('category')

# def _consistent_features(self, df, features, model_name):

#     missing_features = list(set(features) - set(df.columns.tolist()))
#     for col in missing_features:
#         df[col] = np.nan
#     if model_name == 'cb': pass
#     else:
#         for col in df.columns.tolist():
#             if col in self.cat_features:
#                     df[col] = df[col].astype('category') 
#     tem_df = df[features].copy()

#     return tem_df


# def _get_feat4board(self, data: Union[Dict, pd.DataFrame]=None, 
#                     features: List[str]=[
#                         'TransactionAmt',      
#                         'ProductCD',
#                         'card_email_combo_fraud_rate',      
#                         'P_emaildomain', 'R_emaildomain_is_free',  
#                         'UID_velocity_24h',   
#                         'dist1',              
#                         'addr1', 'card1_freq_enc',                        
#                         'D15',                
#                         'device_vendor',        
#                         'C13', 'C1', 'C14', 'UID_vel'
#                     ]) -> Dict[str, Any]:
#     """
#     Get features for dashboard, prioritizing explainability and impact.
#     """
#     if data is None:
#         data = self.df_features

#     feat4board = {}
#     if 'TransactionDT' in data:
#             feat4board['hour_of_day'] = (data['TransactionDT'] // 3600) % 24

#     for col in features:
#         if col in data: 
#             feat4board[col] = data[col].values.tolist()
                    
#     return feat4board


#df['UID_hash'] = (df['UID'].apply(lambda x: hash(str(x)) % 10000)).astype('i4')

In [25]:
temp_df = df_features[xgbfeatures].copy()

In [26]:
xgb_model.predict_proba(temp_df) 

array([[9.9965411e-01, 3.4586628e-04],
       [9.9864173e-01, 1.3582949e-03],
       [9.9624014e-01, 3.7598354e-03]], dtype=float32)

In [None]:
with open(base_path / 'models/prod_v1/xgb_features.json') as f:
    features = json.load(f)
print("‚úÖ Successfully loaded feature list!")

In [69]:
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 0 to 3
Columns: 150 entries, ProductCD to screen_width
dtypes: float32(110), float64(4), int16(1), int32(3), int8(7), uint16(1), uint8(24)
memory usage: 2.1 KB


In [30]:
display(temp_df.head(3))

Unnamed: 0,card_email_combo_fraud_rate,PCA_Group_9_0,C4,card_email_combo,C14,C8,C5,PCA_Group_6_2,C1,PCA_Group_11_0,...,M9,PCA_Group_7_5,id_01,PCA_Group_11_1,PCA_Group_1_0,PCA_Group_5_5,D5_norm,Amt_div_P_emaildomain_vendor_id_mean,PCA_Group_7_2,browser_type
0,0.026452,-2.276176,0,2,132,0,115,0.062642,150,-0.65718,...,-1,-0.022948,,-0.286271,1.124099,0.044865,121.22084,0.345907,-0.286416,0
1,0.082285,-2.276176,0,0,5,0,7,0.062642,7,-0.65718,...,1,-0.022948,,-0.286271,1.560474,0.044865,95.221588,0.374975,-0.286416,0
2,0.033621,-2.276176,0,1,1,0,1,0.062642,1,-0.65718,...,1,-0.022948,,-0.286271,1.560474,0.044865,,1.641835,-0.286416,0


In [29]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 0 to 3
Columns: 273 entries, TransactionID to screen_width
dtypes: float32(173), float64(4), int16(6), int32(5), int8(23), object(4), uint16(9), uint32(3), uint8(46)
memory usage: 3.5+ KB


In [19]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 400 entries, 0 to 399
Columns: 284 entries, TransactionID to UID_hash
dtypes: float32(188), float64(3), int16(9), int32(6), int8(30), object(4), uint16(12), uint32(3), uint8(29)
memory usage: 372.3+ KB
