In [1]:
import pandas as pd
import numpy as np
import joblib
import os
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# File names: DATA_FILE is the CSV read by retrain_logical_model();
# MODEL_FILE is where the fitted XGBoost model is written with joblib.
DATA_FILE = 'queue_data (2).csv'
MODEL_FILE = 'xgb_queue_model.pkl'

def parse_custom_date(date_str):
    """Parse a ``DD-MM-YYYY HH.MM`` string into a ``pandas.Timestamp``.

    The time part uses a dot between hours and minutes (e.g. ``"01-02-2023 09.30"``).
    Leading/trailing whitespace and repeated blanks between the date and the
    time are tolerated.

    Args:
        date_str: The raw cell value. Anything that is not a ``str`` (NaN,
            numbers, None) is treated as missing.

    Returns:
        The parsed ``pandas.Timestamp``, or ``None`` when the value is not a
        string or does not match the expected layout.
    """
    if not isinstance(date_str, str):
        return None
    try:
        # split() without an argument splits on any run of whitespace, so
        # padded values are parsed instead of being silently discarded.
        date_part, time_part = date_str.split()
        hour, minute = time_part.split('.')
        return pd.to_datetime(f"{date_part} {hour}:{minute}", format="%d-%m-%Y %H:%M")
    except (ValueError, TypeError):
        # Wrong number of parts or a value that does not fit the format.
        # Only parsing errors are swallowed; KeyboardInterrupt and other
        # unrelated exceptions propagate.
        return None

def retrain_logical_model():
    """Rebuild the wait-time target from a simple rule and retrain the XGBoost model.

    Steps:
      1. Read ``DATA_FILE`` and derive ``hour`` / ``day_of_week`` from the
         ``arrival_time`` column.
      2. Create a synthetic target ``logical_wait_minutes`` equal to
         ``queue_length * 2.0 * U(0.8, 1.2)``, multiplied by 1.1 for arrivals
         whose hour is 9, 10, 11, 14, 15 or 16.
      3. Fit an ``XGBRegressor`` on an 80/20 split, print MAE and R2 for the
         held-out part, and dump the fitted model to ``MODEL_FILE``.

    Because the target is generated from the features, the printed scores
    describe how well the model recovers that rule.

    Returns:
        None. Prints a message and returns early when ``DATA_FILE`` is missing.

    Raises:
        ValueError: If no row has both a parseable ``arrival_time`` and a
            ``queue_length`` value.
    """
    if not os.path.exists(DATA_FILE):
        print(f"‚ùå Kh√¥ng t√¨m th·∫•y {DATA_FILE}")
        return

    print("--- 1. ƒê·ªåC V√Ä CHU·∫®N H√ìA D·ªÆ LI·ªÜU ---")
    df = pd.read_csv(DATA_FILE)
    # Coerce to datetime64 explicitly: when every value fails to parse, the
    # apply() result is an object column of None and the .dt accessor raises.
    df['arrival_dt'] = pd.to_datetime(
        df['arrival_time'].apply(parse_custom_date), errors='coerce'
    )

    # Create the time-based features.
    df['hour'] = df['arrival_dt'].dt.hour
    df['day_of_week'] = df['arrival_dt'].dt.dayofweek
    # Drop rows without a usable timestamp, and rows without queue_length:
    # the latter would give a NaN target, which the regressor cannot be fit on.
    # .copy() detaches the result so the column assignments below are safe.
    df = df.dropna(subset=['hour', 'queue_length']).copy()
    if df.empty:
        raise ValueError(
            f"No usable rows in {DATA_FILE}: need a parseable 'arrival_time' "
            "and a non-missing 'queue_length'."
        )

    # =================================================================
    # MOST IMPORTANT STEP: BUILD A "REASONABLE" RULE-BASED TARGET
    # =================================================================
    print("--- 2. T√ÅI C·∫§U TR√öC TARGET (LOGIC H√ìA) ---")

    # Assumption: each customer takes 2 minutes on average, with random variation.
    # Formula: wait = queue_length * 2.0 * U(0.8, 1.2)   (multiplicative +-20%)
    np.random.seed(42)  # seeds NumPy's global RNG so the noise is reproducible
    random_factor = np.random.uniform(0.8, 1.2, size=len(df))
    base_service_time = 2.0  # average minutes per person

    # Compute the new wait time from the rule above.
    df['logical_wait_minutes'] = df['queue_length'] * base_service_time * random_factor

    # Peak-hour factor: arrivals in hours 9-11 and 14-16 get 10% more wait time.
    is_peak_hour = df['hour'].isin([9, 10, 11, 14, 15, 16])
    df.loc[is_peak_hour, 'logical_wait_minutes'] *= 1.1

    print(f"   -> ƒê√£ t·∫°o c·ªôt 'logical_wait_minutes' m√¥ ph·ªèng th·ª±c t·∫ø.")
    print(f"   -> V√≠ d·ª•: Queue=10 -> Wait ~ {10*2} ph√∫t.")

    # =================================================================
    # TRAIN XGBOOST ON THE NEW TARGET
    # =================================================================
    features = ['queue_length', 'hour', 'day_of_week']
    target = 'logical_wait_minutes'

    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("\n--- 3. TRAIN XGBOOST ---")
    model = XGBRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=4,
        random_state=42
    )
    model.fit(X_train, y_train)

    # Evaluate on the held-out 20%.
    preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print(f"   MAE (Sai s·ªë): {mae:.2f} ph√∫t")
    print(f"   R2 Score (ƒê·ªô ch√≠nh x√°c): {r2:.4f} (C√†ng g·∫ßn 1 c√†ng t·ªët)")

    joblib.dump(model, MODEL_FILE)
    print(f"\n‚úÖ ƒê√É L∆ØU MODEL M·ªöI: {MODEL_FILE}")
    print("üëâ H√£y ch·∫°y l·∫°i file test, b·∫°n s·∫Ω th·∫•y k·∫øt qu·∫£ c·ª±c k·ª≥ h·ª£p l√Ω!")

# Entry point: run the retraining when this cell/script is executed directly.
if __name__ == "__main__":
    retrain_logical_model()

--- 1. ƒê·ªåC V√Ä CHU·∫®N H√ìA D·ªÆ LI·ªÜU ---
--- 2. T√ÅI C·∫§U TR√öC TARGET (LOGIC H√ìA) ---
   -> ƒê√£ t·∫°o c·ªôt 'logical_wait_minutes' m√¥ ph·ªèng th·ª±c t·∫ø.
   -> V√≠ d·ª•: Queue=10 -> Wait ~ 20 ph√∫t.

--- 3. TRAIN XGBOOST ---
   MAE (Sai s·ªë): 7.22 ph√∫t
   R2 Score (ƒê·ªô ch√≠nh x√°c): 0.8895 (C√†ng g·∫ßn 1 c√†ng t·ªët)

‚úÖ ƒê√É L∆ØU MODEL M·ªöI: xgb_queue_model.pkl
üëâ H√£y ch·∫°y l·∫°i file test, b·∫°n s·∫Ω th·∫•y k·∫øt qu·∫£ c·ª±c k·ª≥ h·ª£p l√Ω!


In [2]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Name of the input data file read by train_and_compare().
DATA_FILE = 'queue_data (2).csv'

# Models to train and compare. Each entry holds:
#   'name'  - label shown in the printed comparison table
#   'file'  - path the fitted model is dumped to with joblib
#   'model' - the estimator instance; train_and_compare() fits it in place
MODELS = {
    'rf': {
        'name': 'Random Forest',
        'file': 'rf_logical_model.pkl',
        'model': RandomForestRegressor(n_estimators=100, min_samples_leaf=5, random_state=42)
    },
    'xgb': {
        'name': 'XGBoost',
        'file': 'xgb_logical_model.pkl',
        'model': XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)
    },
    'lgbm': {
        'name': 'LightGBM',
        'file': 'lgbm_logical_model.pkl',
        'model': LGBMRegressor(n_estimators=200, learning_rate=0.05, random_state=42, verbose=-1)
    }
}

def parse_custom_date(date_str):
    """Parse a ``DD-MM-YYYY HH.MM`` string into a ``pandas.Timestamp``.

    The time part uses a dot between hours and minutes (e.g. ``"01-02-2023 09.30"``).
    Leading/trailing whitespace and repeated blanks between the date and the
    time are tolerated.

    Args:
        date_str: The raw cell value. Anything that is not a ``str`` (NaN,
            numbers, None) is treated as missing.

    Returns:
        The parsed ``pandas.Timestamp``, or ``None`` when the value is not a
        string or does not match the expected layout.
    """
    if not isinstance(date_str, str):
        return None
    try:
        # split() without an argument splits on any run of whitespace, so
        # padded values are parsed instead of being silently discarded.
        date_part, time_part = date_str.split()
        hour, minute = time_part.split('.')
        return pd.to_datetime(f"{date_part} {hour}:{minute}", format="%d-%m-%Y %H:%M")
    except (ValueError, TypeError):
        # Wrong number of parts or a value that does not fit the format.
        # Only parsing errors are swallowed; KeyboardInterrupt and other
        # unrelated exceptions propagate.
        return None

def train_and_compare():
    """Train every estimator in ``MODELS`` on the rule-based target and compare them.

    Steps:
      1. Read ``DATA_FILE`` and derive ``hour`` / ``day_of_week`` from the
         ``arrival_time`` column.
      2. Create the synthetic target ``logical_wait_minutes`` equal to
         ``queue_length * 2.0 * U(0.8, 1.2)``, multiplied by 1.1 for arrivals
         whose hour is 9, 10, 11, 14, 15 or 16.
      3. Fit each model on the same 80/20 split, print its MAE and R2 on the
         held-out part, dump it to its configured file, and report the model
         with the lowest MAE.

    The estimator objects stored in ``MODELS`` are fitted in place.

    Returns:
        None. Prints a message and returns early when ``DATA_FILE`` is missing.

    Raises:
        ValueError: If no row has both a parseable ``arrival_time`` and a
            ``queue_length`` value.
    """
    if not os.path.exists(DATA_FILE):
        print(f"‚ùå Kh√¥ng t√¨m th·∫•y {DATA_FILE}")
        return

    print("--- 1. X·ª¨ L√ù D·ªÆ LI·ªÜU & T·∫†O LOGIC M·ªöI ---")
    df = pd.read_csv(DATA_FILE)
    # Coerce to datetime64 explicitly: when every value fails to parse, the
    # apply() result is an object column of None and the .dt accessor raises.
    df['arrival_dt'] = pd.to_datetime(
        df['arrival_time'].apply(parse_custom_date), errors='coerce'
    )

    # Feature engineering.
    df['hour'] = df['arrival_dt'].dt.hour
    df['day_of_week'] = df['arrival_dt'].dt.dayofweek
    # Drop rows without a usable timestamp, and rows without queue_length:
    # the latter would give a NaN target, which the regressors cannot be fit on.
    # .copy() detaches the result so the column assignments below are safe.
    df = df.dropna(subset=['hour', 'queue_length']).copy()
    if df.empty:
        raise ValueError(
            f"No usable rows in {DATA_FILE}: need a parseable 'arrival_time' "
            "and a non-missing 'queue_length'."
        )

    # ---------------------------------------------------------
    # BUILD THE SIMULATED (RULE-BASED) TARGET
    # ---------------------------------------------------------
    np.random.seed(42)  # seeds NumPy's global RNG so the noise is reproducible
    random_factor = np.random.uniform(0.8, 1.2, size=len(df))  # +-20% variation
    base_service_time = 2.0  # average minutes per person

    df['logical_wait_minutes'] = df['queue_length'] * base_service_time * random_factor

    # Peak hours (9-11 and 14-16) get 10% more wait time.
    is_peak_hour = df['hour'].isin([9, 10, 11, 14, 15, 16])
    df.loc[is_peak_hour, 'logical_wait_minutes'] *= 1.1

    print(f"   -> ƒê√£ t·∫°o xong d·ªØ li·ªáu gi·∫£ l·∫≠p logic.")

    # ---------------------------------------------------------
    # PREPARE THE TRAINING DATA
    # ---------------------------------------------------------
    features = ['queue_length', 'hour', 'day_of_week']
    target = 'logical_wait_minutes'

    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("\n--- 2. B·∫ÆT ƒê·∫¶U SO S√ÅNH 3 MODELS ---")
    print(f"{'MODEL':<15} | {'MAE (Ph√∫t)':<12} | {'R2 Score':<10}")
    print("-" * 45)

    best_mae = float('inf')
    best_model_name = ""

    # Only the per-model settings are needed; the dictionary keys are unused.
    for config in MODELS.values():
        model = config['model']
        name = config['name']
        filename = config['file']

        # Train
        model.fit(X_train, y_train)

        # Evaluate on the held-out 20%.
        preds = model.predict(X_test)
        mae = mean_absolute_error(y_test, preds)
        r2 = r2_score(y_test, preds)

        # Print one row of the comparison table.
        print(f"{name:<15} | {mae:<12.4f} | {r2:.4f}")

        # Save the fitted model.
        joblib.dump(model, filename)

        # Track the model with the lowest MAE (first one wins on ties).
        if mae < best_mae:
            best_mae = mae
            best_model_name = name

    print("-" * 45)
    print(f"üèÜ MODEL CHI·∫æN TH·∫ÆNG: {best_model_name} (Sai s·ªë th·∫•p nh·∫•t: {best_mae:.4f} ph√∫t)")
    print(f"‚úÖ ƒê√£ l∆∞u c·∫£ 3 file .pkl. B·∫°n h√£y ch·ªçn file c·ªßa {best_model_name} ƒë·ªÉ d√πng!")

# Entry point: run the model comparison when this cell/script is executed directly.
if __name__ == "__main__":
    train_and_compare()

--- 1. X·ª¨ L√ù D·ªÆ LI·ªÜU & T·∫†O LOGIC M·ªöI ---
   -> ƒê√£ t·∫°o xong d·ªØ li·ªáu gi·∫£ l·∫≠p logic.

--- 2. B·∫ÆT ƒê·∫¶U SO S√ÅNH 3 MODELS ---
MODEL           | MAE (Ph√∫t)   | R2 Score  
---------------------------------------------
Random Forest   | 7.2646       | 0.8886
XGBoost         | 7.2175       | 0.8895
LightGBM        | 7.3610       | 0.8875
---------------------------------------------
üèÜ MODEL CHI·∫æN TH·∫ÆNG: XGBoost (Sai s·ªë th·∫•p nh·∫•t: 7.2175 ph√∫t)
‚úÖ ƒê√£ l∆∞u c·∫£ 3 file .pkl. B·∫°n h√£y ch·ªçn file c·ªßa XGBoost ƒë·ªÉ d√πng!
