# Modeling và Prediction

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from scipy.stats import gaussian_kde
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import MinMaxScaler
from pywaffle import Waffle
from wordcloud import WordCloud
import squarify
import plotly.express as px
import plotly.graph_objects as go

df = pd.read_csv("Amazon_SaleData_Cleaned.csv")

# Tạo các biến phụ trợ cho phân tích
df["OrderDate"] = pd.to_datetime(df["OrderDate"])
df["Year"] = df["OrderDate"].dt.year
df["Month"] = df["OrderDate"].dt.month
df["Quarter"] = df["OrderDate"].dt.quarter
df["YearMonth"] = df["OrderDate"].dt.to_period("M")
df["DayOfWeek"] = df["OrderDate"].dt.day_name()
df["MonthName"] = df["OrderDate"].dt.month_name()

# Tính toán các metrics bổ sung
df["Revenue"] = df["TotalAmount"]  # Doanh thu

print(f"Dataset shape: {df.shape}")
print(f"Thời gian: {df['OrderDate'].min()} đến {df['OrderDate'].max()}")
print(f"\nĐã tải dữ liệu và tạo các biến phụ trợ thành công!")

Dataset shape: (100000, 27)
Thời gian: 2020-01-01 00:00:00 đến 2024-12-29 00:00:00

Đã tải dữ liệu và tạo các biến phụ trợ thành công!


In [13]:
import numpy as np
import pandas as pd

# 1) Cờ NotDelivered (phòng khi chưa tạo)
df["NotDelivered"] = df["OrderStatus"].isin(["Cancelled", "Returned"]).astype(int)

# 2) DiscountGroup theo % giảm giá
df["DiscountGroup"] = pd.cut(
    df["Discount"],
    bins=[-0.001, 0, 0.05, 0.10, 0.20, 0.30],
    labels=["0%", "0-5%", "5-10%", "10-20%", "20-30%"],
    include_lowest=True,
)

# 3) AmountGroup theo TotalAmount (chia 4 nhóm theo quartile)
df["AmountGroup"] = pd.qcut(
    df["TotalAmount"],
    4,
    labels=["Low", "Medium", "High", "Very High"],
)

# 4) Customer_type: One-time Buyer vs Repeat Buyer
#    - Tính số đơn Delivered theo khách
df_delivered = df[df["OrderStatus"] == "Delivered"].copy()
orders_per_customer = df_delivered.groupby("CustomerID")["OrderID"].nunique()


def categorize_customer(n: int) -> str:
    return "One-time Buyer" if n == 1 else "Repeat Buyer"


customer_type = orders_per_customer.map(categorize_customer)

# Gán nhãn phân khúc cho df
df["Customer_type"] = df["CustomerID"].map(customer_type)

# Nếu có khách chưa từng Delivered => NaN, gán mặc định là One-time
df["Customer_type"] = df["Customer_type"].fillna("One-time Buyer")

high_risk_mask = (
    # Cash On Delivery – High – 20–30%
    (
        (df["PaymentMethod"] == "Cash On Delivery")
        & (df["AmountGroup"] == "High")
        & (df["DiscountGroup"] == "20-30%")
    )
    |
    # Cash On Delivery – Very High – 20–30%
    (
        (df["PaymentMethod"] == "Cash On Delivery")
        & (df["AmountGroup"] == "Very High")
        & (df["DiscountGroup"] == "20-30%")
    )
    |
    # Cash On Delivery – High – 10–20%
    (
        (df["PaymentMethod"] == "Cash On Delivery")
        & (df["AmountGroup"] == "High")
        & (df["DiscountGroup"] == "10-20%")
    )
    |
    # Cash On Delivery – High – 0–5%
    (
        (df["PaymentMethod"] == "Cash On Delivery")
        & (df["AmountGroup"] == "High")
        & (df["DiscountGroup"] == "0-5%")
    )
    |
    # Amazon Pay – Medium – 20–30%
    (
        (df["PaymentMethod"] == "Amazon Pay")
        & (df["AmountGroup"] == "Medium")
        & (df["DiscountGroup"] == "20-30%")
    )
    |
    # Amazon Pay – High – 20–30%
    (
        (df["PaymentMethod"] == "Amazon Pay")
        & (df["AmountGroup"] == "High")
        & (df["DiscountGroup"] == "20-30%")
    )
    |
    # Net Banking – Very High – 10–20%
    (
        (df["PaymentMethod"] == "Net Banking")
        & (df["AmountGroup"] == "Very High")
        & (df["DiscountGroup"] == "10-20%")
    )
)

df["HighRiskProfile"] = high_risk_mask.astype(int)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
)

# Lọc đúng status và tạo target
df_model = df[df["OrderStatus"].isin(["Delivered", "Cancelled", "Returned"])].copy()
df_model["NotDelivered"] = (
    df_model["OrderStatus"].isin(["Cancelled", "Returned"]).astype(int)
)

# ==== THÊM HighRiskProfile VÀO FEATURES ====
feature_cols = [
    "TotalAmount",
    "Discount",
    "ShippingCost",
    "Quantity",
    "PaymentMethod",
    "DiscountGroup",
    "AmountGroup",
    "Customer_type",
    "HighRiskProfile",  # <—
]

X = df_model[feature_cols].copy()
y = df_model["NotDelivered"]

# bỏ missing để tránh lỗi
mask = X.notna().all(axis=1) & y.notna()
X, y = X[mask], y[mask]

# numeric & categorical columns
num_cols = ["TotalAmount", "Discount", "ShippingCost", "Quantity", "HighRiskProfile"]
cat_cols = ["PaymentMethod", "DiscountGroup", "AmountGroup", "Customer_type"]

preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("clf", LogisticRegression(max_iter=2000, class_weight="balanced")),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("PR-AUC :", average_precision_score(y_test, y_proba))

              precision    recall  f1-score   support

           0     0.9558    0.7766    0.8570     14926
           1     0.1692    0.5588    0.2598      1215

    accuracy                         0.7602     16141
   macro avg     0.5625    0.6677    0.5584     16141
weighted avg     0.8966    0.7602    0.8120     16141

[[11592  3334]
 [  536   679]]
ROC-AUC: 0.6721969397449916
PR-AUC : 0.14567540683100336
