# Model Evaluation Results Summary

### Imports

In [1]:
import os
import json
import gc
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Default locations, written relative to the notebook's starting directory.
# RESULTS_DIR is assigned again from the environment in the constants cell of
# the Config section, so the value here only acts as an initial placeholder.
RESULTS_DIR = Path('../results')
MODELS_DIR = Path('../models')

# Plot appearance: matplotlib whitegrid style first, then the seaborn 'husl' palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')


### Config

In [2]:
# Run the notebook from the project root: when the kernel was started inside a
# folder named notebooks/, step up one level; otherwise keep the current directory.
PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name == 'notebooks':
    PROJECT_ROOT = PROJECT_ROOT.parent
os.chdir(PROJECT_ROOT)

In [3]:
# Load environment variables with python-dotenv (no explicit path is passed, so
# the library's default lookup applies — presumably a .env file; confirm).
# The constants cell that follows reads DATA_DIR and RESULTS_DIR from the environment.
load_dotenv()

True

In [4]:
# Constants: each value comes from the environment when set, else from the default
# Results directory
RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')
# Data directory
DATA_DIR = os.environ.get('DATA_DIR', './data')

In [5]:
# Convert RESULTS_DIR to Path if it's a string
RESULTS_DIR = Path(RESULTS_DIR)


def _read_json(path):
    """Read the JSON document at ``path`` and return the decoded Python object.

    The file is opened explicitly as UTF-8 so that parsing does not depend on
    the platform's default text encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Find the latest evaluation results file.
# The file names seen so far end in a sortable timestamp
# (e.g. ovr_grouped_evaluation_20260115_034238.json), so the greatest path is
# taken as the newest — NOTE(review): confirm the naming stays zero-padded.
eval_files = list(RESULTS_DIR.glob('ovr_grouped_evaluation_*.json'))
if not eval_files:
    raise FileNotFoundError('No evaluation results found in results directory')
latest_eval_file = max(eval_files)
print(f'Loading evaluation results from: {latest_eval_file}')

# Load evaluation results
eval_results = _read_json(latest_eval_file)

# Load thresholds (required: a missing file raises FileNotFoundError from open())
thresholds = _read_json(RESULTS_DIR / 'ovr_group_thresholds.json')

# Load MLflow model info if available; mlflow_info stays None when no file exists
mlflow_files = list(RESULTS_DIR.glob('mlflow_model_info_*.json'))
if mlflow_files:
    mlflow_info = _read_json(max(mlflow_files))
    print(f'MLflow run ID: {mlflow_info.get("run_id", "N/A")}')
else:
    mlflow_info = None
    print('No MLflow info found')


Loading evaluation results from: results/ovr_grouped_evaluation_20260115_034238.json
MLflow run ID: f8d21b1657f24b27a1f0bdf1be0fe56f


### Overall Model Performance


In [6]:
# Extract overall metrics: the 'overall' entry of the loaded evaluation JSON,
# which the summary table in the next cell reads its values from
overall = eval_results['overall']

In [7]:
# Overall metrics as a two-column (Metric, Value) table: scores are rendered
# with four decimals, the product count is converted to text unchanged
overall_df = pd.DataFrame(
    [
        ('Mean AUC-ROC', format(overall['mean_auc'], '.4f')),
        ('Std AUC-ROC', format(overall['std_auc'], '.4f')),
        ('Mean Average Precision', format(overall['mean_avg_precision'], '.4f')),
        ('MAP@7', format(overall['map_at_7'], '.4f')),
        ('Precision@7', format(overall['precision_at_7'], '.4f')),
        ('Precision@7 (with thresholds)', format(overall['precision_at_7_with_thresholds'], '.4f')),
        ('Products Evaluated', str(overall['n_products_evaluated'])),
    ],
    columns=['Metric', 'Value'],
)
display(overall_df)


Unnamed: 0,Metric,Value
0,Mean AUC-ROC,0.9025
1,Std AUC-ROC,0.0753
2,Mean Average Precision,0.2668
3,MAP@7,0.7659
4,Precision@7,0.057
5,Precision@7 (with thresholds),0.0564
6,Products Evaluated,21.0


### Performance by Product Group


In [8]:
# Extract per-group metrics: the 'per_group' entry of the loaded evaluation JSON.
# The next cell iterates it as a mapping of group name -> metrics dict.
per_group = eval_results['per_group']

In [9]:
# Group comparison table: one row per product group, highest mean AUC first
# (the original row positions are kept as the index)
group_df = pd.DataFrame([
    {
        'Group': name.capitalize(),
        'Mean AUC': stats['mean_auc'],
        'N Products': stats['n_products'],
    }
    for name, stats in per_group.items()
]).sort_values('Mean AUC', ascending=False)
display(group_df)


Unnamed: 0,Group,Mean AUC,N Products
2,Rare,0.949124,7
0,Frequent,0.90175,8
1,Mid,0.849006,6


### Model Performance by Product


In [10]:
# Per-product metrics from the evaluation JSON
per_product = eval_results['per_product']

# One row per product; 'target_' is stripped from the product key for display
product_df = (
    pd.DataFrame([
        {
            'Product': name.replace('target_', ''),
            'AUC-ROC': scores['auc_roc'],
            'Avg Precision': scores['avg_precision'],
            'Precision': scores['precision'],
            'Recall': scores['recall'],
            'F1': scores['f1'],
            'Log Loss': scores['log_loss'],
            'Positive Rate': scores['positive_rate'],
            'Threshold': scores['threshold'],
        }
        for name, scores in per_product.items()
    ])
    # Highest AUC first, with a fresh 0..n-1 index
    .sort_values('AUC-ROC', ascending=False)
    .reset_index(drop=True)
)
print(f'Total products evaluated: {len(product_df)}')
display(product_df)


Total products evaluated: 21


Unnamed: 0,Product,AUC-ROC,Avg Precision,Precision,Recall,F1,Log Loss,Positive Rate,Threshold
0,ahor_fin,1.0,1.0,1.7e-05,1.0,3.4e-05,0.000162,1.2e-05,2.543875e-08
1,ctju_fin,0.999675,0.630616,0.329787,0.775,0.462687,0.003846,0.000465,0.9773621
2,viv_fin,0.995513,0.012087,0.009174,0.714286,0.018116,0.017172,8.1e-05,0.2916048
3,hip_fin,0.984138,0.001419,0.00015,1.0,0.0003,0.033607,3.5e-05,0.0125474
4,cco_fin,0.956881,0.631468,0.343595,0.790607,0.479013,0.608289,0.044817,0.9074373
5,ctma_fin,0.94966,0.235532,0.254596,0.541016,0.34625,0.523193,0.005954,0.9520002
6,ctop_fin,0.949247,0.213653,0.373874,0.367257,0.370536,0.018926,0.002628,0.3492027
7,ctpp_fin,0.924375,0.014043,0.008065,0.015267,0.010554,0.052902,0.001523,0.6327205
8,nomina,0.915487,0.506285,0.482759,0.660714,0.557889,0.491696,0.063818,0.8076984
9,nom_pens,0.914574,0.502101,0.466979,0.686196,0.555751,0.470344,0.064108,0.7821116


Main takeaways:

- Only 21 products have been evaluated;
Three products (aval_fin, deco_fin, deme_fin) are very rare and had zero positive cases in the evaluation dataset (no customers added these products in the test month), so AUC-ROC cannot be calculated for them: it requires both positive and negative examples. As a result, all 3 missing products keep the default threshold of 0.5.

- The model overfitted on the rare group of products;
The model for ahor_fin appears to perform perfectly, and the AUC for ctju_fin and viv_fin is also close to 1, but this is statistical noise rather than real model performance. These products are so rare that the model learned not to recommend them to anyone rather than finding any real pattern.

- Model performs well for mid and frequent groups of products.

### Threshold Analysis


In [11]:
# Thresholds table: one row per product, with 'target_' stripped from the key
threshold_df = pd.DataFrame([
    {
        'Product': name.replace('target_', ''),
        'Optimized Threshold': value,
    }
    for name, value in thresholds.items()
])
# Largest threshold first, renumbering the rows from 0
threshold_df = threshold_df.sort_values('Optimized Threshold', ascending=False).reset_index(drop=True)

display(threshold_df)


Unnamed: 0,Product,Optimized Threshold
0,ctju_fin,0.9773621
1,ctma_fin,0.9520002
2,ecue_fin,0.9312269
3,cco_fin,0.9074373
4,dela_fin,0.9017012
5,reca_fin,0.889197
6,cno_fin,0.8768995
7,nomina,0.8076984
8,nom_pens,0.7821116
9,ctpp_fin,0.6327205


Main takeaways:

- The extremely low threshold (2.54e-08) for the ahor_fin product suggests that the model barely learned anything meaningful for it.

### Performance Summary Statistics


In [12]:
# Summary statistics for all metrics
# Products listed here are dropped first: their metrics are unreliable (too few positive examples)
rare_products_to_exclude = ['ahor_fin', 'ctju_fin', 'viv_fin', 'aval_fin', 'deco_fin', 'deme_fin']
product_df_filtered = product_df.loc[~product_df['Product'].isin(rare_products_to_exclude)]

print(f'Excluding {len(rare_products_to_exclude)} rare products with unreliable metrics: {rare_products_to_exclude}')
# NOTE(review): the total of 24 is hard-coded rather than derived from the data
print(f'Products remaining: {len(product_df_filtered)} out of 24')

# count / mean / std / min / quartiles / max for the selected metric columns
summary_stats = product_df_filtered.loc[:, ['AUC-ROC', 'Precision', 'Recall', 'F1', 'Log Loss']].describe()
display(summary_stats)


Excluding 6 rare products with unreliable metrics: ['ahor_fin', 'ctju_fin', 'viv_fin', 'aval_fin', 'deco_fin', 'deme_fin']
Products remaining: 18 out of 24


Unnamed: 0,AUC-ROC,Precision,Recall,F1,Log Loss
count,18.0,18.0,18.0,18.0,18.0
mean,0.886484,0.209987,0.540845,0.234521,0.292418
std,0.071477,0.239687,0.307767,0.240884,0.223662
min,0.723547,0.00015,0.014337,0.0003,0.018713
25%,0.874026,0.00579,0.384431,0.010521,0.052975
50%,0.90123,0.126708,0.556222,0.1821,0.29963
75%,0.922153,0.366304,0.685102,0.468245,0.486358
max,0.984138,0.76838,1.0,0.557889,0.637508


### Summary

According to the summary statistics (computed after excluding the 6 rare products), the final OvR grouped CatBoost model ranks customers well (average AUC 0.886), catches about half of the actual buyers (average Recall 0.541), and outputs reasonably trustworthy probabilities that a customer will buy the recommended product (average Log Loss 0.292). Precision is low, however, because very few customers actually add new products each month (~1-10% for most products).

For further improvement of model performance for rare products, content-based features can be added or another model architecture can be tested.

In summary, the final model is ready for production, and its recommendations can have a positive impact on the business.