In [33]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report



In [34]:
n = 1000

order_amount = np.random.uniform(100, 2000, n)
delay_minutes = np.random.randint(0, 90, n)
previous_refunds = np.random.randint(0, 5, n)
fraud_score = np.random.uniform(0, 1, n)
complaint_severity = np.random.randint(1, 6, n)

refund_prob = (
    0.3 * (delay_minutes > 30) +
    0.4 * (complaint_severity > 3) -
    0.5 * (fraud_score > 0.7)
)

refund_prob = 1 / (1 + np.exp(-refund_prob))
refunded = np.random.binomial(1, refund_prob)

data = pd.DataFrame({
    "order_amount": order_amount,
    "delay_minutes": delay_minutes,
    "previous_refunds": previous_refunds,
    "fraud_score": fraud_score,
    "complaint_severity": complaint_severity,
    "refunded": refunded
})

data.head()


Unnamed: 0,order_amount,delay_minutes,previous_refunds,fraud_score,complaint_severity,refunded
0,158.697991,25,0,0.434278,5,1
1,528.17565,82,1,0.920281,2,0
2,1932.932072,28,4,0.726028,3,0
3,417.849562,56,2,0.859765,5,1
4,183.191519,41,0,0.350026,4,0


In [35]:
data["rule_prediction"] = [
    rule_based_decision(v, d, p)
    for v, d, p in zip(
        data["order_amount"],
        data["delay_minutes"],
        data["previous_refunds"]
    )
]

rule_accuracy = calculate_accuracy(
    data["rule_prediction"],
    data["refunded"]
)

print("Rule-Based Accuracy:", rule_accuracy)


Rule-Based Accuracy: 0.549


In [36]:
X = data.drop(columns=["refunded", "rule_prediction"])
y = data["refunded"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

ml_predictions = model.predict(X_test)

ml_accuracy = accuracy_score(y_test, ml_predictions)

print("ML Accuracy:", ml_accuracy)
print("ML Confusion Matrix:")
print(confusion_matrix(y_test, ml_predictions))
print("\nML Classification Report:")
print(classification_report(y_test, ml_predictions))


ML Accuracy: 0.535
ML Confusion Matrix:
[[26 60]
 [33 81]]

ML Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.30      0.36        86
           1       0.57      0.71      0.64       114

    accuracy                           0.54       200
   macro avg       0.51      0.51      0.50       200
weighted avg       0.52      0.54      0.52       200



In [37]:
retention_value = 500
fraud_threshold = 0.7

def calculate_cost(df, predictions):
    total_cost = 0
    for i, pred in enumerate(predictions):
        row = df.iloc[i]
        if pred == 1:
            total_cost += row["order_amount"]
            if row["fraud_score"] > fraud_threshold:
                total_cost += row["order_amount"]
        else:
            if row["refunded"] == 1:
                total_cost += retention_value
    return total_cost

rule_cost = calculate_cost(data, data["rule_prediction"])
ml_cost = calculate_cost(data.loc[X_test.index], ml_predictions)

print("Total Economic Cost (Rule-Based):", rule_cost)
print("Total Economic Cost (ML-Based):", ml_cost)


Total Economic Cost (Rule-Based): 922017.6156404524
Total Economic Cost (ML-Based): 195572.2890551412


In [38]:
import sys
import os

sys.path.append(os.getcwd())

from rule_engine import rule_based_decision
from model import train_model, predict, predict_proba
from metrics import calculate_total_refund_cost, calculate_accuracy


In [39]:
from metrics import calculate_total_refund_cost

# Rule-based cost (full dataset)
rule_cost = calculate_total_refund_cost(
    data["rule_prediction"],
    data["order_amount"]
)

# ML cost (test set only)
ml_cost = calculate_total_refund_cost(
    ml_predictions,
    data.loc[X_test.index, "order_amount"]
)

print("Rule-Based Total Cost:", rule_cost)
print("ML-Based Total Cost:", ml_cost)


Rule-Based Total Cost: 640385.1381845224
ML-Based Total Cost: 151078.46660442764
