# Modeling & So sánh mô hình (Random Forest vs XGBoost)

Notebook này đọc dữ liệu đã làm sạch từ `inventory_clean.csv`, thực hiện Encoder/Scale, train 2 mô hình (RF, XGBoost) cho 2 bài toán Regression: Demand, Classification: Stockout, so sánh metric và lưu mô hình tốt nhất.


In [1]:
# Import thư viện

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
)

from xgboost import XGBRegressor, XGBClassifier
import pickle


In [2]:
# Đọc dữ liệu sạch
clean_csv_path = r'C:\Users\Admin\AdvancedPython\12423030_12423TN_HoangThiMinhTam\data\inventory_clean.csv'
df = pd.read_csv(clean_csv_path)
print('Loaded:', clean_csv_path)
print('Shape:', df.shape)
df.head()


Loaded: C:\Users\Admin\AdvancedPython\12423030_12423TN_HoangThiMinhTam\data\inventory_clean.csv
Shape: (76000, 24)


Unnamed: 0,Date,Store ID,Product ID,Category,Region,Inventory Level,Units Sold,Units Ordered,Price,Discount,...,Epidemic,Demand,Year,Month,DayOfWeek,IsWeekend,Stockout,Effective_Price,Has_Discount,Promo_Epidemic
0,2022-01-01,S001,P0001,Electronics,North,195,102,252,72.72,5,...,0,115,2022,1,5,1,0,69.084,1,0
1,2022-01-01,S003,P0015,Groceries,East,166,131,500,98.46,10,...,0,133,2022,1,5,1,0,88.614,1,0
2,2022-01-01,S004,P0008,Furniture,West,281,59,0,100.64,0,...,0,80,2022,1,5,1,0,100.64,0,0
3,2022-01-01,S002,P0005,Electronics,South,155,103,334,92.35,0,...,0,94,2022,1,5,1,0,92.35,0,0
4,2022-01-01,S001,P0015,Toys,North,119,55,74,14.39,5,...,0,64,2022,1,5,1,0,13.6705,1,0


In [3]:
# Khai báo target & danh sách feature 
target_reg = 'Demand'
target_clf = 'Stockout'

num_features = [
    'Inventory Level',
    'Units Sold',
    'Units Ordered',
    'Price',
    'Effective_Price',
    'Discount',
    'Competitor Pricing',
    'Month',
    'DayOfWeek',
    'IsWeekend'
]

bin_features = [
    'Promotion',
    'Epidemic',
    'Has_Discount',
    'Promo_Epidemic'
]

cat_features = [
    'Category',
    'Region',
    'Weather Condition',
    'Seasonality'
]

# Đảm bảo các cột tồn tại
num_features = [c for c in num_features if c in df.columns]
bin_features = [c for c in bin_features if c in df.columns]
cat_features = [c for c in cat_features if c in df.columns]

y_reg = df[target_reg]
y_clf = df[target_clf]

drop_cols = [target_reg, target_clf]
if 'Date' in df.columns:
    drop_cols.append('Date')

X = df.drop(columns=drop_cols)
print('X shape:', X.shape)


X shape: (76000, 21)


In [4]:
# Chia train/test
X_train, X_test, y_train_reg, y_test_reg, y_train_clf, y_test_clf = train_test_split(
    X, y_reg, y_clf, test_size=0.2, random_state=42
)
print('Train:', X_train.shape, ' Test:', X_test.shape)


Train: (60800, 21)  Test: (15200, 21)


In [5]:
# Encoder (One-Hot) + Scale (StandardScaler)
#    fit trên train, transform cho test để tránh data leakage
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_cat = ohe.fit_transform(X_train[cat_features])
X_test_cat  = ohe.transform(X_test[cat_features])
cat_names = ohe.get_feature_names_out(cat_features)

X_train_cat_df = pd.DataFrame(X_train_cat, columns=cat_names, index=X_train.index)
X_test_cat_df  = pd.DataFrame(X_test_cat,  columns=cat_names, index=X_test.index)

scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[num_features])
X_test_num  = scaler.transform(X_test[num_features])

X_train_num_df = pd.DataFrame(X_train_num, columns=num_features, index=X_train.index)
X_test_num_df  = pd.DataFrame(X_test_num,  columns=num_features, index=X_test.index)

X_train_bin_df = X_train[bin_features].copy()
X_test_bin_df  = X_test[bin_features].copy()

X_train_final = pd.concat([X_train_num_df, X_train_bin_df, X_train_cat_df], axis=1)
X_test_final  = pd.concat([X_test_num_df,  X_test_bin_df,  X_test_cat_df], axis=1)

print('X_train_final:', X_train_final.shape)
print('X_test_final :', X_test_final.shape)


X_train_final: (60800, 31)
X_test_final : (15200, 31)


In [6]:
# Train 2 mô hình: Random Forest vs XGBoost 
# Random Forest 
rf_reg = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
rf_reg.fit(X_train_final, y_train_reg)

rf_clf = RandomForestClassifier(
    n_estimators=400, random_state=42, n_jobs=-1, class_weight='balanced'
)
rf_clf.fit(X_train_final, y_train_clf)

# XGBoost
# scale_pos_weight giúp xử lý lệch lớp cho bài toán Stockout
pos = (y_train_clf == 1).sum()
neg = (y_train_clf == 0).sum()
scale_pos_weight = (neg / pos) if pos > 0 else 1.0

xgb_reg = XGBRegressor(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1
)
xgb_reg.fit(X_train_final, y_train_reg)

xgb_clf = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)
xgb_clf.fit(X_train_final, y_train_clf)

In [7]:
# Đánh giá & so sánh metric

def eval_reg(name, model, X_te, y_te):
    pred = model.predict(X_te)
    mae = mean_absolute_error(y_te, pred)
    rmse = np.sqrt(mean_squared_error(y_te, pred))
    r2 = r2_score(y_te, pred)
    return {'Model': name, 'MAE': mae, 'RMSE': rmse, 'R2': r2}

def eval_clf(name, model, X_te, y_te):
    pred = model.predict(X_te)
    # predict_proba có sẵn cho RF/XGB
    proba = model.predict_proba(X_te)[:, 1]
    prec = precision_score(y_te, pred, zero_division=0)
    rec = recall_score(y_te, pred, zero_division=0)
    f1 = f1_score(y_te, pred, zero_division=0)
    auc = roc_auc_score(y_te, proba)
    cm = confusion_matrix(y_te, pred)
    return {'Model': name, 'Precision': prec, 'Recall': rec, 'F1': f1, 'ROC_AUC': auc, 'CM': cm}

reg_results = [
    eval_reg('RandomForestRegressor', rf_reg, X_test_final, y_test_reg),
    eval_reg('XGBRegressor', xgb_reg, X_test_final, y_test_reg)
]
reg_df = pd.DataFrame(reg_results)
reg_df


Unnamed: 0,Model,MAE,RMSE,R2
0,RandomForestRegressor,12.623461,16.896876,0.867827
1,XGBRegressor,11.944533,15.844291,0.883782


In [8]:
clf_results = [
    eval_clf('RandomForestClassifier', rf_clf, X_test_final, y_test_clf),
    eval_clf('XGBClassifier', xgb_clf, X_test_final, y_test_clf)
]

clf_df = pd.DataFrame([{k:v for k,v in d.items() if k!='CM'} for d in clf_results])
clf_df


Unnamed: 0,Model,Precision,Recall,F1,ROC_AUC
0,RandomForestClassifier,0.917532,0.796954,0.853003,0.993587
1,XGBClassifier,0.945513,0.998308,0.971193,0.999585


In [9]:
# In Confusion Matrix
for d in clf_results:
    print('\n===', d['Model'], '===')
    print('Confusion Matrix:\n', d['CM'])



=== RandomForestClassifier ===
Confusion Matrix:
 [[13300   127]
 [  360  1413]]

=== XGBClassifier ===
Confusion Matrix:
 [[13325   102]
 [    3  1770]]


In [10]:
# Chọn mô hình tốt nhất & lưu .pkl vào thư mục demo
#    - Regression: chọn RMSE thấp nhất
#    - Classification: chọn F1 cao nhất

best_reg_row = reg_df.sort_values('RMSE', ascending=True).iloc[0]
best_reg_name = best_reg_row['Model']
best_reg_model = rf_reg if best_reg_name == 'RandomForestRegressor' else xgb_reg

best_clf_row = clf_df.sort_values('F1', ascending=False).iloc[0]
best_clf_name = best_clf_row['Model']
best_clf_model = rf_clf if best_clf_name == 'RandomForestClassifier' else xgb_clf

print('Best Regression:', best_reg_name, ' | RMSE =', round(float(best_reg_row['RMSE']), 4))
print('Best Classification:', best_clf_name, ' | F1 =', round(float(best_clf_row['F1']), 4))

best_bundle = {
    'best_reg': {
        'model_name': best_reg_name,
        'model': best_reg_model,
    },
    'best_clf': {
        'model_name': best_clf_name,
        'model': best_clf_model,
    },
    'scaler': scaler,
    'ohe': ohe,
    'num_features': num_features,
    'bin_features': bin_features,
    'cat_features': cat_features,
    'cat_names': list(cat_names)
}

# Đường dẫn lưu model
save_dir = r"C:\Users\Admin\AdvancedPython\12423030_12423TN_HoangThiMinhTam\demo"
os.makedirs(save_dir, exist_ok=True)

best_model_path = os.path.join(save_dir, "best_models.pkl")

with open(best_model_path, "wb") as f:
    pickle.dump(best_bundle, f)

print("Saved best models to:", best_model_path)


Best Regression: XGBRegressor  | RMSE = 15.8443
Best Classification: XGBClassifier  | F1 = 0.9712
Saved best models to: C:\Users\Admin\AdvancedPython\12423030_12423TN_HoangThiMinhTam\demo\best_models.pkl
