In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report



In [7]:
n = 1000

order_amount = np.random.uniform(100, 2000, n)
delay_minutes = np.random.randint(0, 90, n)
previous_refunds = np.random.randint(0, 5, n)
fraud_score = np.random.uniform(0, 1, n)
complaint_severity = np.random.randint(1, 6, n)

refund_prob = (
    0.3 * (delay_minutes > 30) +
    0.4 * (complaint_severity > 3) -
    0.5 * (fraud_score > 0.7)
)

refund_prob = 1 / (1 + np.exp(-refund_prob))
refunded = np.random.binomial(1, refund_prob)

data = pd.DataFrame({
    "order_amount": order_amount,
    "delay_minutes": delay_minutes,
    "previous_refunds": previous_refunds,
    "fraud_score": fraud_score,
    "complaint_severity": complaint_severity,
    "refunded": refunded
})

data.head()


Unnamed: 0,order_amount,delay_minutes,previous_refunds,fraud_score,complaint_severity,refunded
0,1824.562347,76,4,0.002091,3,0
1,1745.695941,30,2,0.043922,2,1
2,1974.436198,26,1,0.082287,5,1
3,1183.113163,81,3,0.393517,2,0
4,1852.294986,22,0,0.220481,5,0


In [8]:
data["rule_prediction"] = [
    rule_based_decision(v, d, p)
    for v, d, p in zip(
        data["order_amount"],
        data["delay_minutes"],
        data["previous_refunds"]
    )
]

rule_accuracy = calculate_accuracy(
    data["rule_prediction"],
    data["refunded"]
)

print("Rule-Based Accuracy:", rule_accuracy)


Rule-Based Accuracy: 0.533


In [9]:
X = data.drop(columns=["refunded", "rule_prediction"])
y = data["refunded"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

ml_predictions = model.predict(X_test)

ml_accuracy = accuracy_score(y_test, ml_predictions)

print("ML Accuracy:", ml_accuracy)
print("ML Confusion Matrix:")
print(confusion_matrix(y_test, ml_predictions))
print("\nML Classification Report:")
print(classification_report(y_test, ml_predictions))


ML Accuracy: 0.535
ML Confusion Matrix:
[[15 71]
 [22 92]]

ML Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.17      0.24        86
           1       0.56      0.81      0.66       114

    accuracy                           0.54       200
   macro avg       0.48      0.49      0.45       200
weighted avg       0.50      0.54      0.48       200



In [10]:
retention_value = 500
fraud_threshold = 0.7

def calculate_cost(df, predictions):
    total_cost = 0
    for i, pred in enumerate(predictions):
        row = df.iloc[i]
        if pred == 1:
            total_cost += row["order_amount"]
            if row["fraud_score"] > fraud_threshold:
                total_cost += row["order_amount"]
        else:
            if row["refunded"] == 1:
                total_cost += retention_value
    return total_cost

rule_cost = calculate_cost(data, data["rule_prediction"])
ml_cost = calculate_cost(data.loc[X_test.index], ml_predictions)

print("Total Economic Cost (Rule-Based):", rule_cost)
print("Total Economic Cost (ML-Based):", ml_cost)


Total Economic Cost (Rule-Based): 974290.8233123241
Total Economic Cost (ML-Based): 208426.33709229212


In [11]:
import sys
import os

sys.path.append(os.getcwd())

from rule_engine import rule_based_decision
from model import train_model, predict, predict_proba
from metrics import calculate_total_refund_cost, calculate_accuracy


In [12]:
from metrics import calculate_total_refund_cost

# Rule-based cost (full dataset)
rule_cost = calculate_total_refund_cost(
    data["rule_prediction"],
    data["order_amount"]
)

# ML cost (test set only)
ml_cost = calculate_total_refund_cost(
    ml_predictions,
    data.loc[X_test.index, "order_amount"]
)

print("Rule-Based Total Cost:", rule_cost)
print("ML-Based Total Cost:", ml_cost)


Rule-Based Total Cost: 673239.7386137495
ML-Based Total Cost: 162748.39376669176


In [13]:
from metrics import calculate_total_refund_cost


In [14]:
# Fair evaluation: compare both models on test set only

rule_test_cost = calculate_total_refund_cost(
    data.loc[X_test.index, "rule_prediction"],
    data.loc[X_test.index, "order_amount"]
)

ml_test_cost = calculate_total_refund_cost(
    ml_predictions,
    data.loc[X_test.index, "order_amount"]
)

print("Rule Test Cost:", rule_test_cost)
print("ML Test Cost:", ml_test_cost)


Rule Test Cost: 128401.94660672765
ML Test Cost: 162748.39376669176
