In [35]:
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    precision_score,
    recall_score
)

import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)


# Part 1: Baseline Evaluation

In [5]:
# Read the train and test splits of the e-commerce returns data
# (paths are relative to the notebook's working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ecommerce_returns_train.csv',
        'data/ecommerce_returns_test.csv',
    )
)

In [12]:
def preprocess(df):
    """Replicate the baseline_model.py preprocessing on a copy of ``df``.

    The weaknesses called out in the comments below are left in place on
    purpose: this function reproduces the baseline, it does not improve it.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw orders. Must contain ``product_category``, ``size_purchased``,
        ``is_return`` and the numeric feature columns selected below.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds the nine baseline feature columns and
        ``y`` is the ``is_return`` column.

    Notes
    -----
    When every ``size_purchased`` value is missing, ``size_encoded`` is never
    created and the final column selection fails.
    """
    frame = df.copy()

    # Known weakness: integer codes suggest an ordering (0 < 1 < 2) that a
    # nominal product category does not have.
    frame['product_category_encoded'] = LabelEncoder().fit_transform(
        frame['product_category']
    )

    # Known weakness: filling missing sizes with the most frequent one adds
    # noise for categories that have no size (Electronics / Home Decor).
    sizes = frame['size_purchased']
    if sizes.notna().any():
        modal_size = sizes.mode()[0]
        frame['size_purchased'] = sizes.fillna(modal_size)
        frame['size_encoded'] = LabelEncoder().fit_transform(
            frame['size_purchased']
        )

    baseline_features = [
        'customer_age',
        'customer_tenure_days',
        'product_category_encoded',
        'product_price',
        'days_since_last_purchase',
        'previous_returns',
        'product_rating',
        'size_encoded',
        'discount_applied',
    ]
    return frame[baseline_features], frame['is_return']

In [17]:
# Build the feature matrix and target for each split
X_train, y_train = preprocess(train)
X_test, y_test = preprocess(test)

# Standardise features: the scaling statistics are learned on the training
# split only and then reused unchanged for the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the baseline logistic regression (fit() returns the estimator itself)
baseline_model = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_scaled, y_train
)

# Hard class labels and second-column (class 1) probabilities for the test split
y_pred = baseline_model.predict(X_test_scaled)
y_prob = baseline_model.predict_proba(X_test_scaled)[:, 1]


In [27]:
# Basic evaluation: headline accuracy plus the per-class report
print("Baseline Model Performance")
print("=" * 50)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
# zero_division=0 states explicitly how a never-predicted class is scored,
# rather than relying on the global UndefinedMetricWarning filter; it matches
# the report printed in the comprehensive-evaluation section.
print(classification_report(y_test, y_pred, zero_division=0))


Baseline Model Performance
Accuracy: 0.7475

Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86      1495
           1       0.00      0.00      0.00       505

    accuracy                           0.75      2000
   macro avg       0.37      0.50      0.43      2000
weighted avg       0.56      0.75      0.64      2000



In [16]:
# Save artifacts
# Persist the fitted classifier and the scaler it was trained with (the model
# was fit on scaled features, so both are written out). Paths are relative to
# the current working directory.
joblib.dump(baseline_model, 'baseline_model.pkl')
# Last expression of the cell: its return value is what the notebook echoes
# as the cell output.
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

## COMPREHENSIVE EVALUATION

In [39]:
print("1. Overall Metrics")

# Class-1 precision/recall are computed once and reused below. zero_division=0
# is passed to every metric that can hit a 0/0 case, so the result does not
# depend on the global UndefinedMetricWarning filter.
class1_precision = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
class1_recall = recall_score(y_test, y_pred, pos_label=1, zero_division=0)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
# ROC-AUC is threshold-independent, so it is computed from probabilities
print(f"ROC-AUC: {roc_auc_score(y_test, y_prob):.4f}")
print(f"Precision (Class 1): {class1_precision:.4f}")
print(f"Recall (Class 1): {class1_recall:.4f}")
print(f"Macro F1 Score: {f1_score(y_test, y_pred, average='macro', zero_division=0):.4f}")
print("\nClass 1 (Returned):")
print(f"Class 1 Recall:    {class1_recall:.4f} (Crucial for catching returns)")
print(f"Class 1 Precision: {class1_precision:.4f} (Crucial for intervention cost)")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

1. Overall Metrics
Accuracy: 0.7475
ROC-AUC: 0.5622
Precision (Class 1): 0.0000
Recall (Class 1): 0.0000
Macro F1 Score: 0.4278

Class 1 (Returned):
Class 1 Recall:    0.0000 (Crucial for catching returns)
Class 1 Precision: 0.0000 (Crucial for intervention cost)

Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86      1495
           1       0.00      0.00      0.00       505

    accuracy                           0.75      2000
   macro avg       0.37      0.50      0.43      2000
weighted avg       0.56      0.75      0.64      2000



In [29]:
print("2. Confusion Matrix Interpretation")
# Pin the label order so the matrix is always 2x2. Without `labels`, a run in
# which only one class appears in both y_test and y_pred yields a 1x1 matrix
# and the four-way unpack below raises ValueError.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
# Row = actual class, column = predicted class, so ravel() gives tn, fp, fn, tp
tn, fp, fn, tp = cm.ravel()

# Costs used below: 3 per intervention on a flagged order, 18 per missed return
print(f"True Negatives (Kept & Predicted Kept): {tn}")
print(f"False Positives (Kept but Predicted Return): {fp} (Cost: ${fp} * 3 = ${fp*3})")
print(f"False Negatives (Returned but Predicted Kept): {fn} (Cost: ${fn} * 18 = ${fn*18})")
print(f"True Positives (Returned & Predicted Return): {tp}")


2. Confusion Matrix Interpretation
True Negatives (Kept & Predicted Kept): 1495
False Positives (Kept but Predicted Return): 0 (Cost: $0 * 3 = $0)
False Negatives (Returned but Predicted Kept): 505 (Cost: $505 * 18 = $9090)
True Positives (Returned & Predicted Return): 0


In [33]:
print("3. Performance by Product Category")
# Merge predictions back to test set for segmentation
# Work on a copy so the raw `test` frame is not modified.
test_results = test.copy()
# y_test is the `is_return` column of a copy of `test`, so it shares test's index;
# y_pred comes from model.predict and is assumed to be in the same row order.
test_results['y_true'] = y_test
test_results['y_pred'] = y_pred

# Aggregate per product category. Each lambda receives one group's y_true
# values (`x`) and looks up the matching predictions through x.index, because
# agg() on a single column does not hand over the other columns.
category_metrics = test_results.groupby('product_category').agg({
    'y_true': [
        # (output column name, aggregation) pairs
        ('Accuracy', lambda x: accuracy_score(x, test_results.loc[x.index, 'y_pred'])),
        # zero_division=0: a group with no positive predictions/labels scores 0 instead of warning
        ('Recall (Catch Rate)', lambda x: recall_score(x, test_results.loc[x.index, 'y_pred'], zero_division=0)),
        ('Precision', lambda x: precision_score(x, test_results.loc[x.index, 'y_pred'], zero_division=0)),
        # Number of non-null y_true rows in the group
        ('Count', 'count')
    ]
# Drop the outer 'y_true' column level and turn the category index back into a column
}).droplevel(0, axis=1).reset_index()

print(category_metrics)

3. Performance by Product Category
  product_category  Accuracy  Recall (Catch Rate)  Precision  Count
0      Electronics  0.828666                  0.0        0.0    607
1          Fashion  0.686594                  0.0        0.0   1104
2       Home_Decor  0.809689                  0.0        0.0    289


In [34]:
print("4. Business Financial Check")

# Reference point (no model): every actual return, caught or missed, costs $18
actual_returns = tp + fn
baseline_cost = actual_returns * 18

# With the model: each flagged order (TP + FP) receives a $3 intervention ...
flagged_orders = tp + fp
intervention_cost = flagged_orders * 3

# ... missed returns (FN) still cost the full $18, and so do the 65% of caught
# returns (TP) on which the intervention does not work (35% success rate).
missed_return_cost = fn * 18
failed_intervention_cost = tp * (1 - 0.35) * 18
unprevented_return_cost = missed_return_cost + failed_intervention_cost

model_total_cost = intervention_cost + unprevented_return_cost

print(f"Cost without Model: ${baseline_cost:,.2f}")
print(f"Cost with Baseline Model: ${model_total_cost:,.2f}")
print(f"Net Profit/Loss: ${baseline_cost - model_total_cost:,.2f}")

4. Business Financial Check
Cost without Model: $9,090.00
Cost with Baseline Model: $9,090.00
Net Profit/Loss: $0.00


## Evaluation Findings

The baseline model has converged into a "Majority Class Classifier" (or a "Null Classifier"). Because approximately 75% of the training data consists of non-returns, the model has learned that the safest way to minimize error (and maximize accuracy) is to simply predict "No Return" (Class 0) for every single order.
The model generates zero savings. It is financially equivalent to having no model at all.

1. Key Metrics & Justification
- Recall: Total Failure. This measures how many actual returns the model catches. If Recall is low, the model is failing its primary purpose, preventing returns.
- Precision: Undefined/Zero. Since the model never predicted a return, precision is technically undefined (it is reported as 0). The model spent nothing on interventions, but it also saved nothing.
- ROC-AUC: Near Random. A score of 0.5 is random guessing; the observed 0.56 indicates that even if we lowered the decision threshold, the model has almost no ability to distinguish between a returner and a keeper.
2. Confusion Matrix Interpretation:
- True Negatives (1495): The model correctly ignored all customers who kept their items.
- False Negatives (505): The model missed 100% of the returns. This resulted in 9,090 in return costs that went completely unaddressed.
- True Positives (0): Not a single intervention was triggered.
3. Performance by Product Category:
- The variation in accuracy across categories is not due to model intelligence, but due to the underlying return rates of those categories:
- Electronics (82.8% Accuracy): This category likely has a lower return rate around 17%, so guessing "0" works better here by luck.
- Fashion (68.6% Accuracy): This category has a higher return rate around 31%, so the "guess 0" strategy fails more often here.
- The model is equally blind across all categories; it is not picking up on category-specific signals (like sizing in Fashion).
4. Business Financial Check:
- Baseline Cost (No Model): 9,090.00 (505 returns * 18)
- Model Cost: 9,090.00 (0 in intervention spend + 9,090 in unprevented returns)
- Net ROI: $0.00
- The model generates zero savings. It is financially equivalent to having no model at all.
5. Root Causes & Weakness Identification:
- Class Imbalance Bias: The Logistic Regression default threshold is 0.5. Since the probability of a return rarely crosses 50% for any given customer in this dataset, the model never pulls the trigger.
- The "Prior" Probability: In your dataset, only about 25% of orders are returns (505 out of 2000). Without looking at any specific features, the "base" probability of any random order being a return is 0.25.
- Weak Signal: The ROC-AUC of 0.56 suggests that the current features (Age, Price, etc.) in their current format (linear scaling, label encoding) provide very little signal to separate the classes.
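- Improper Metric Optimization: Logistic regression is trained to minimize log-loss, which treats both kinds of error symmetrically, and its predictions were then judged on plain accuracy. Neither reflects a cost-sensitive objective that penalizes missing a return (False Negative) more than a false alarm.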


The baseline is unfit for production. The improved solution must address the class imbalance and use a model capable of finding non-linear patterns to improve the ROC-AUC score before optimization can even begin.

# Part 2: Business-Aligned Metrics

1. Success means the model consistently generates positive net financial savings by accurately identifying high-return-risk orders where intervention yields more benefit than cost. In practice, this requires high precision on high-risk orders, strong recall among costliest potential returns, and intervention decisions based on an ROI-optimized probability threshold.
2. Recommended Metrics:
- Net Savings ($): The estimated dollar amount saved after subtracting intervention costs and remaining return costs.
- Precision (Class 1): The Critical Guardrail. Intervening on an order costs $3, and a successful intervention reduces the return probability by 35%, yielding an expected benefit of 0.35×18=6.30. The net financial gain of a True Positive is therefore 6.30−3=3.30, whereas a False Positive costs 3 with no savings. The resulting "Breakeven Precision" is high: 3.00/(3.00+3.30)≈47.6%. Precision is therefore the primary performance constraint: the model must reliably identify only high-risk orders before we care about how many it captures. Precision ensures each intervention makes money.
- Recall (Class 1): Secondary to precision. Once we establish a profitable precision tier, we maximize recall to scale the savings. Recall determines how much total money we can make.

In [44]:
def preprocess_fixed(df):
    """Build the baseline feature matrix and target from a copy of ``df``.

    Same feature set as the baseline preprocessing, implemented without any
    in-place mutation of the caller's frame. Unlike the baseline version, a
    frame in which every ``size_purchased`` value is missing no longer fails
    with ``KeyError``: ``size_encoded`` is filled with the constant ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw orders. Must contain ``product_category``, ``size_purchased``,
        ``is_return`` and the numeric feature columns selected below.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds the nine feature columns and ``y`` is the
        ``is_return`` column.

    Notes
    -----
    Both encoders and the imputation mode are fitted on the frame passed in.
    Calling this separately on train and test therefore only yields matching
    codes when both frames contain the same set of categories and sizes.
    """
    df_processed = df.copy()

    # Integer-code the product category (codes are assigned per call from the
    # values present in this frame).
    df_processed['product_category_encoded'] = LabelEncoder().fit_transform(
        df_processed['product_category']
    )

    sizes = df_processed['size_purchased']
    if sizes.notna().any():
        # Impute missing sizes with the most frequent one; assignment, not inplace
        df_processed['size_purchased'] = sizes.fillna(sizes.mode()[0])
        df_processed['size_encoded'] = LabelEncoder().fit_transform(
            df_processed['size_purchased']
        )
    else:
        # No size recorded anywhere: keep the column so the feature selection
        # below still works; a constant carries no information for the model.
        df_processed['size_encoded'] = 0

    feature_cols = [
        'customer_age', 'customer_tenure_days', 'product_category_encoded',
        'product_price', 'days_since_last_purchase', 'previous_returns',
        'product_rating', 'size_encoded', 'discount_applied'
    ]
    return df_processed[feature_cols], df_processed['is_return']

def calculate_financials(y_true, y_prob, threshold,
                         cost_return=18.0, cost_intervention=3.0, success_rate=0.35):
    """Estimate the net savings of intervening on orders flagged at ``threshold``.

    An order is flagged when its predicted return probability is at least
    ``threshold``. Every flagged order costs ``cost_intervention``; an
    intervention on an order that would really be returned prevents the return
    with probability ``success_rate``. Every return that is not prevented
    costs ``cost_return``.

    Parameters
    ----------
    y_true : array-like of {0, 1}
        Actual outcomes, 1 meaning the order was returned.
    y_prob : array-like of float
        Predicted return probabilities, same length as ``y_true``.
    threshold : float
        Probability cut-off at or above which an order is flagged.
    cost_return : float, default 18.0
        Cost of one return.
    cost_intervention : float, default 3.0
        Cost of intervening on one flagged order.
    success_rate : float, default 0.35
        Share of correctly flagged returns that the intervention prevents.

    Returns
    -------
    dict
        Keys ``Threshold``, ``Net_Savings``, ``Precision``, ``Recall``,
        ``Interventions``, ``TP`` and ``FP``. Precision and recall are 0.0
        when their denominator is zero.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_prob`` differ in length.
    """
    is_return = np.asarray(y_true).ravel() == 1
    flagged = np.asarray(y_prob, dtype=float).ravel() >= threshold
    if is_return.shape != flagged.shape:
        raise ValueError(
            f"y_true and y_prob must have the same length, "
            f"got {is_return.shape[0]} and {flagged.shape[0]}"
        )

    # Counted directly from the masks: this stays well-defined when only one
    # class is present (e.g. nothing is flagged and nothing was returned),
    # where unpacking an inferred-label confusion matrix would fail.
    tp = int(np.sum(is_return & flagged))
    fp = int(np.sum(~is_return & flagged))
    fn = int(np.sum(is_return & ~flagged))

    # 1. Baseline cost (no model): every actual return is paid in full
    baseline_cost = (tp + fn) * cost_return

    # 2. Model cost
    # Intervention spend: one intervention per flagged order (TP + FP)
    intervention_spend = (tp + fp) * cost_intervention
    # Remaining return cost: missed returns (FN) in full, plus the share of
    # caught returns (TP) on which the intervention did not work
    unprevented_returns_cost = (fn * cost_return) + (tp * (1 - success_rate) * cost_return)
    total_model_cost = intervention_spend + unprevented_returns_cost

    # 3. Metrics
    net_savings = baseline_cost - total_model_cost
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    return {
        "Threshold": threshold,
        "Net_Savings": net_savings,
        "Precision": precision,
        "Recall": recall,
        "Interventions": tp + fp,
        "TP": tp,
        "FP": fp
    }

In [43]:
# Rebuild features and targets with the non-mutating preprocessing
X_train, y_train = preprocess_fixed(train)
X_test, y_test = preprocess_fixed(test)

# Scaling statistics come from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Same logistic regression settings as the baseline; only the second-column
# (class 1) probabilities are kept for the threshold analysis.
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)
y_probs = model.predict_proba(X_test_scaled)[:, 1]


In [45]:
# Candidate thresholds 0.05, 0.06, ..., 0.94. The grid is built from integers
# because np.arange with a fractional step accumulates rounding error and its
# element count is not guaranteed.
thresholds = np.arange(5, 95) / 100

# One financial summary per candidate threshold
results = [calculate_financials(y_test, y_probs, t) for t in thresholds]
df_results = pd.DataFrame(results)

# Find Optimal
# idxmax returns the first row that attains the maximum, so ties in
# Net_Savings resolve to the lowest threshold.
best_run = df_results.loc[df_results['Net_Savings'].idxmax()]

print("=== FINANCIAL ANALYSIS RESULTS ===")
print(f"Optimal Threshold: {best_run['Threshold']:.2f}")
print(f"Max Net Savings:   ${best_run['Net_Savings']:.2f}")
print(f"Precision at Opt:  {best_run['Precision']:.2%}")
print(f"Recall at Opt:     {best_run['Recall']:.2%}")
print(f"Interventions:     {int(best_run['Interventions'])}")
print("-" * 30)
# Breakeven precision = intervention cost / expected saving per true positive
# = 3 / (0.35 * 18) ~= 0.476
print("Breakeven Precision Required: ~47.6%")

=== FINANCIAL ANALYSIS RESULTS ===
Optimal Threshold: 0.40
Max Net Savings:   $0.00
Precision at Opt:  0.00%
Recall at Opt:     0.00%
Interventions:     0
------------------------------
Breakeven Precision Required: ~47.6%


In [47]:
# Report whether the best threshold found by the sweep yields any positive saving
if not best_run['Net_Savings'] <= 0:
    print("\nThe model found a profitable pocket of customers.")
else:
    print(
        "\nThe model NEVER makes a profit.",
        "Reason: The model cannot isolate a segment of users with >47.6% return probability.",
        sep="\n",
    )


The model NEVER makes a profit.
Reason: The model cannot isolate a segment of users with >47.6% return probability.


3. The Breakeven Point:
- To break even, we need  Precision×($18×0.35)>$3.00
- The model must be at least 47.6% precise.
- The baseline Logistic Regression likely never reaches this precision on the test set because the features are too weak (ROC-AUC ~0.56). Thus it cannot find any group of customers whose return rate exceeds the breakeven level.
4. Threshold Selection Rationale: The selection of 0.40 is essentially arbitrary in this specific context; it is simply the lowest threshold in the sweep that is high enough that no orders crossed it, so no interventions are triggered and Net Savings is 0.
- If we lowered the threshold to 0.25: We would catch some returns (High Recall), but we would flag so many non-returners (Low Precision) that the intervention costs would skyrocket, leading to negative savings.
- To determine the optimal threshold rigorously, use Profit Curve Optimization (a form of Cost-Sensitive Learning): evaluate the net savings at each candidate threshold and keep the most profitable one. In terms of TP and FP, the breakeven condition is 3.30×P(TP)>3.00×P(FP).
Benefit of a True Positive (TP):
Identify a return correctly, intervene (3 cost), save the return 35% of the time (18 saving). Net Value: (18×0.35)−3=+3.30
Cost of a False Positive (FP):
Flag a loyal customer who wasn't going to return anyway, intervene (3 cost), no savings occur. Net Value: −3.
The Breakeven Ratio: To make money, the expected value must be positive: 3.30×P(TP)>3.00×P(FP). This implies we need roughly 1 TP for every 1 FP (more precisely, at least 10 TPs for every 11 FPs, i.e. a precision of about 47.6%).
5. A "Good Enough to Deploy" Criteria: This model is NOT good enough to deploy. To justify deployment, a future iteration must
- Push Probabilities Higher: We need a model that outputs probabilities > 50% for specific high-risk segments ("Young customers buying XXL Fashion items on discount").
- Achieve Positive ROI: We need to see Net Savings > $0.


What's the optimal balance between catching returns (recall) and avoiding wasted interventions (precision)?
- In this specific financial scenario, Precision is significantly more important than Recall. It must prioritize avoiding wasted interventions over catching every return. The optimal balance requires a "Sniper" strategy (High Precision / Lower Recall), rather than a "Shotgun" strategy.
- Low Precision / High Recall: Bankrupts the program (Wasted intervention costs exceed savings).
- High Precision / Low Recall: Profitable but small scale (Positive ROI, but leaves money on the table by missing returns).