## Configuration và Setup

In [1]:
# ===============================================
# MAIN CONFIGURATION
# ===============================================
FORCE_RETRAIN = True  # Set True để train lại models đã có
SKIP_ON_ERROR = True   # Set True để skip configuration bị lỗi và tiếp tục
SAVE_DETAILED_LOGS = True  # Set True để lưu logs chi tiết

# Model selection (có thể chọn train một phần)
TRAIN_XGBOOST = True
TRAIN_LSTM = True

# Configurations to train (có thể customize)
CONFIGS_TO_TRAIN = ['7n_1n', '30n_1n', '30n_7n', '30n_30n', '90n_7n', '90n_30n']
# CONFIGS_TO_TRAIN = ['7n_1n', '30n_1n']  # Uncomment để test với configs nhỏ

print(f"🎯 Training Configuration:")
print(f"  Force Retrain: {FORCE_RETRAIN}")
print(f"  Skip on Error: {SKIP_ON_ERROR}")
print(f"  Train XGBoost: {TRAIN_XGBOOST}")
print(f"  Train LSTM: {TRAIN_LSTM}")
print(f"  Configurations: {CONFIGS_TO_TRAIN}")
print(f"  Total Models to Train: {len(CONFIGS_TO_TRAIN) * (int(TRAIN_XGBOOST) + int(TRAIN_LSTM))}")

🎯 Training Configuration:
  Force Retrain: True
  Skip on Error: True
  Train XGBoost: True
  Train LSTM: True
  Configurations: ['7n_1n', '30n_1n', '30n_7n', '30n_30n', '90n_7n', '90n_30n']
  Total Models to Train: 12


In [2]:
import sys
sys.path.append('../src')
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import time
import warnings
from datetime import datetime
from pathlib import Path

# Check TensorFlow availability
TF_AVAILABLE = False
try:
    import tensorflow as tf
    TF_AVAILABLE = True
    print(f"✅ TensorFlow {tf.__version__} available")
    
    # Check for GPU
    if tf.config.list_physical_devices('GPU'):
        print(f"🚀 GPU acceleration available")
    else:
        print(f"💻 Using CPU for training")
        
except ImportError as e:
    print(f"❌ TensorFlow not available: {e}")
    print(f"⚠️ LSTM training will fail")

# Import trainers
from xgboost_trainer import train_xgboost_model

if TF_AVAILABLE:
    from lstm_trainer import train_lstm_model
    print(f"✅ LSTM trainer imported successfully")
else:
    print(f"⚠️ LSTM trainer not available - TensorFlow missing")

from config import EXPERIMENTS, XGBOOST_PARAMS, LSTM_PARAMS, RANDOM_SEED

warnings.filterwarnings('ignore')
np.random.seed(RANDOM_SEED)

# Setup plotting
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 8)
sns.set_palette("husl")

print(f"✅ Libraries imported successfully")
print(f"🎲 Random seed: {RANDOM_SEED}")
print(f"📊 Available experiments: {list(EXPERIMENTS.keys())}")

# Update training flags based on availability
if not TF_AVAILABLE and TRAIN_LSTM:
    print(f"⚠️ Disabling LSTM training due to missing TensorFlow")
    TRAIN_LSTM = False

2025-10-03 23:03:55.746341: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-03 23:03:55.747196: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-03 23:03:55.855961: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-03 23:03:57.327618: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off,

✅ TensorFlow 2.20.0 available
💻 Using CPU for training
✅ LSTM trainer imported successfully
✅ Libraries imported successfully
🎲 Random seed: 28112001
📊 Available experiments: ['7n_1n', '30n_1n', '30n_7n', '30n_30n', '90n_7n', '90n_30n']


E0000 00:00:1759507438.258922   70494 cuda_executor.cc:1309] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1759507438.263447   70494 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


## Training Progress Tracking

In [3]:
class TrainingTracker:
    def __init__(self):
        self.start_time = time.time()
        self.results = []
        self.errors = []
        self.current_step = 0
        self.total_steps = 0
        
    def set_total_steps(self, total):
        self.total_steps = total
        
    def start_training(self, config_name, model_type):
        self.current_step += 1
        self.current_config = config_name
        self.current_model = model_type
        self.step_start_time = time.time()
        
        elapsed = time.time() - self.start_time
        print(f"\n{'='*60}")
        print(f"🚀 STEP {self.current_step}/{self.total_steps}: {config_name.upper()} - {model_type.upper()}")
        print(f"⏱️ Elapsed: {elapsed/60:.1f}min | ETA: {elapsed/self.current_step*(self.total_steps-self.current_step)/60:.1f}min")
        print(f"📊 Experiment: {EXPERIMENTS[config_name]['description']}")
        print(f"{'='*60}")
        
    def end_training(self, success=True, error_msg=None, results=None):
        step_time = time.time() - self.step_start_time
        
        result = {
            'config': self.current_config,
            'model_type': self.current_model,
            'success': success,
            'training_time': step_time,
            'timestamp': datetime.now().isoformat()
        }
        
        if success and results:
            result.update({
                'test_rmse': results.get('test_metrics', {}).get('RMSE', None),
                'test_r2': results.get('test_metrics', {}).get('R2', None),
                'best_params': results.get('best_params', {})
            })
            print(f"✅ SUCCESS: {self.current_config} {self.current_model} - {step_time/60:.1f}min")
            if results.get('test_metrics'):
                print(f"   RMSE: {results['test_metrics']['RMSE']:.4f} | R²: {results['test_metrics']['R2']:.4f}")
        else:
            result['error'] = error_msg
            self.errors.append(result)
            print(f"❌ FAILED: {self.current_config} {self.current_model} - {step_time/60:.1f}min")
            print(f"   Error: {error_msg}")
            
        self.results.append(result)
        
    def get_summary(self):
        total_time = time.time() - self.start_time
        successful = [r for r in self.results if r['success']]
        failed = [r for r in self.results if not r['success']]
        
        return {
            'total_time': total_time,
            'total_models': len(self.results),
            'successful': len(successful),
            'failed': len(failed),
            'success_rate': len(successful) / len(self.results) * 100 if self.results else 0,
            'results': self.results,
            'errors': self.errors
        }

# Initialize tracker
tracker = TrainingTracker()
print("✅ Training tracker initialized")

✅ Training tracker initialized


## Data Validation

In [4]:
def validate_data_availability():
    """Kiểm tra data đã sẵn sàng cho tất cả configurations"""
    data_folder = '../data'
    missing_data = []
    
    print("🔍 Checking data availability...")
    
    for config in CONFIGS_TO_TRAIN:
        # Check XGBoost data
        if TRAIN_XGBOOST:
            xgb_folder = f"{data_folder}/{config}_xgb"
            if not os.path.exists(xgb_folder):
                missing_data.append(f"{config}_xgb")
            else:
                required_files = ['X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv', 'metadata.json']
                for file in required_files:
                    if not os.path.exists(f"{xgb_folder}/{file}"):
                        missing_data.append(f"{config}_xgb/{file}")
        
        # Check LSTM data
        if TRAIN_LSTM:
            lstm_folder = f"{data_folder}/{config}_lstm"
            if not os.path.exists(lstm_folder):
                missing_data.append(f"{config}_lstm")
            else:
                required_files = ['X_train.npy', 'X_test.npy', 'y_train.npy', 'y_test.npy', 'metadata.json']
                for file in required_files:
                    if not os.path.exists(f"{lstm_folder}/{file}"):
                        missing_data.append(f"{config}_lstm/{file}")
    
    if missing_data:
        print("❌ Missing data detected:")
        for item in missing_data:
            print(f"   - {item}")
        print("\n💡 Please run notebook 02_feature_engineering.ipynb first")
        return False
    else:
        print("✅ All required data is available")
        return True

def check_existing_models():
    """Kiểm tra models đã train"""
    models_folder = '../models'
    existing_models = []
    
    print("\n🔍 Checking existing models...")
    
    for config in CONFIGS_TO_TRAIN:
        if TRAIN_XGBOOST:
            xgb_model = f"{models_folder}/{config}_xgb/best_model.pkl"
            if os.path.exists(xgb_model):
                existing_models.append(f"{config}_xgb")
                
        if TRAIN_LSTM:
            lstm_model = f"{models_folder}/{config}_lstm/best_model.h5"
            if os.path.exists(lstm_model):
                existing_models.append(f"{config}_lstm")
    
    if existing_models:
        print(f"⚠️ Found {len(existing_models)} existing models:")
        for model in existing_models:
            print(f"   - {model}")
        if not FORCE_RETRAIN:
            print(f"\n💡 These will be skipped (set FORCE_RETRAIN=True to retrain)")
        else:
            print(f"\n🔄 These will be retrained (FORCE_RETRAIN=True)")
    else:
        print("✅ No existing models found, will train all")
        
    return existing_models

# Run validation
data_ready = validate_data_availability()
existing_models = check_existing_models()

if not data_ready:
    raise RuntimeError("Data not ready for training")
    
print(f"\n🎯 Ready to train models!")

🔍 Checking data availability...
✅ All required data is available

🔍 Checking existing models...
⚠️ Found 3 existing models:
   - 7n_1n_xgb
   - 7n_1n_lstm
   - 30n_1n_xgb

🔄 These will be retrained (FORCE_RETRAIN=True)

🎯 Ready to train models!


## Training Execution Plan

In [5]:
def create_training_plan():
    """Tạo kế hoạch training"""
    plan = []
    
    for config in CONFIGS_TO_TRAIN:
        if TRAIN_XGBOOST:
            model_exists = f"{config}_xgb" in [m.replace('../models/', '').replace('/best_model.pkl', '') for m in existing_models]
            if not model_exists or FORCE_RETRAIN:
                plan.append((config, 'xgboost'))
                
        if TRAIN_LSTM and TF_AVAILABLE:
            model_exists = f"{config}_lstm" in [m.replace('../models/', '').replace('/best_model.keras', '') for m in existing_models]
            if not model_exists or FORCE_RETRAIN:
                plan.append((config, 'lstm'))
        elif TRAIN_LSTM and not TF_AVAILABLE:
            print(f"⚠️ Skipping LSTM for {config} - TensorFlow not available")
    
    return plan

# Create plan
training_plan = create_training_plan()

print(f"📋 TRAINING PLAN:")
print(f"   Total models to train: {len(training_plan)}")
print(f"\n📝 Training sequence:")
for i, (config, model_type) in enumerate(training_plan, 1):
    exp = EXPERIMENTS[config]
    print(f"   {i:2d}. {config.upper():<8} {model_type.upper():<8} ({exp['description']})")

# Set up tracker
tracker.set_total_steps(len(training_plan))

if len(training_plan) == 0:
    print("\n✅ All models already trained and FORCE_RETRAIN=False")
    print("💡 Set FORCE_RETRAIN=True if you want to retrain existing models")
else:
    print(f"\n🚀 Ready to start training {len(training_plan)} models!")

# Additional warnings
if TRAIN_LSTM and not TF_AVAILABLE:
    print(f"\n⚠️ WARNING: LSTM training requested but TensorFlow not available")
    print(f"   Please install TensorFlow: pip install tensorflow")

📋 TRAINING PLAN:
   Total models to train: 12

📝 Training sequence:
    1. 7N_1N    XGBOOST  (7 days input to predict water level at day 8 (not mean of days 8-14))
    2. 7N_1N    LSTM     (7 days input to predict water level at day 8 (not mean of days 8-14))
    3. 30N_1N   XGBOOST  (30 days input to predict water level at day 31)
    4. 30N_1N   LSTM     (30 days input to predict water level at day 31)
    5. 30N_7N   XGBOOST  (30 days input to predict water level at day 37 (not mean of days 31-37))
    6. 30N_7N   LSTM     (30 days input to predict water level at day 37 (not mean of days 31-37))
    7. 30N_30N  XGBOOST  (30 days input to predict water level at day 60)
    8. 30N_30N  LSTM     (30 days input to predict water level at day 60)
    9. 90N_7N   XGBOOST  (90 days input to predict water level at day 97)
   10. 90N_7N   LSTM     (90 days input to predict water level at day 97)
   11. 90N_30N  XGBOOST  (90 days input to predict water level at day 120)
   12. 90N_30N  LSTM   

## Main Training Loop

In [6]:
def train_single_model(config_name, model_type):
    """Train một model cho config và type cụ thể"""
    import os  # Move os import to the top
    
    try:
        if model_type == 'xgboost':
            # Suppress verbose output for XGBoost to reduce clutter
            old_stdout = os.dup(1)
            os.close(1)
            os.open(os.devnull, os.O_RDWR)
            
            try:
                trainer = train_xgboost_model(
                    config_name=config_name,
                    param_grid=XGBOOST_PARAMS,
                    data_folder='../data',
                    models_folder='../models',
                    cv_folds=3
                )
            finally:
                # Restore stdout
                os.dup2(old_stdout, 1)
                os.close(old_stdout)
            
            # Load results
            results_file = f"../models/{config_name}_xgb/results.json"
            with open(results_file, 'r') as f:
                results = json.load(f)
                
        elif model_type == 'lstm':
            if not TF_AVAILABLE:
                raise RuntimeError("TensorFlow not available for LSTM training")
                
            print(f"   📊 Starting LSTM training for {config_name}")
            print(f"   📁 Data folder: ../data/{config_name}_lstm")
            print(f"   🎯 Parameters: {LSTM_PARAMS}")
            
            # Check if LSTM data exists
            lstm_data_folder = f"../data/{config_name}_lstm"
            required_files = ['X_train.npy', 'X_test.npy', 'y_train.npy', 'y_test.npy']
            for file in required_files:
                file_path = f"{lstm_data_folder}/{file}"
                if not os.path.exists(file_path):
                    raise FileNotFoundError(f"Required LSTM data file not found: {file_path}")
            
            # Extract epochs and patience from param_grid
            lstm_param_grid = LSTM_PARAMS.copy()
            epochs = lstm_param_grid.pop('epochs', [100])[0] if isinstance(lstm_param_grid.get('epochs', [100]), list) else lstm_param_grid.pop('epochs', 100)
            patience = lstm_param_grid.pop('patience', [5])[0] if isinstance(lstm_param_grid.get('patience', [5]), list) else lstm_param_grid.pop('patience', 5)
                    
            trainer = train_lstm_model(
                config_name=config_name,
                param_grid=lstm_param_grid,  # Pass grid without epochs/patience
                data_folder='../data',
                models_folder='../models',
                epochs=epochs,  # Pass as separate parameter
                patience=patience,  # Pass as separate parameter
                verbose=0  # Reduce verbosity
            )
            
            # Load results
            results_file = f"../models/{config_name}_lstm/results.json"
            print(f"   📄 Loading results from: {results_file}")
            
            if not os.path.exists(results_file):
                raise FileNotFoundError(f"Results file not found: {results_file}")
                
            with open(results_file, 'r') as f:
                results = json.load(f)
        
        return True, results, None
        
    except Exception as e:
        import traceback
        error_msg = str(e)
        traceback_msg = traceback.format_exc()
        print(f"\n💥 Error training {config_name} {model_type}:")
        print(f"   Error: {error_msg}")
        # Only print traceback for debugging if needed
        if "TensorFlow not available" not in error_msg and "Results file not found" not in error_msg:
            print(f"   Traceback: {traceback_msg}")
        return False, None, error_msg

# Execute training plan
if len(training_plan) > 0:
    print(f"\n🚀 Starting training execution...")
    print(f"⏰ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"📊 Total models to train: {len(training_plan)}")
    
    for config_name, model_type in training_plan:
        tracker.start_training(config_name, model_type)
        
        # Clear output periodically to prevent truncation
        if tracker.current_step % 3 == 0:
            from IPython.display import clear_output
            clear_output(wait=True)
            print(f"🔄 Progress: {tracker.current_step}/{len(training_plan)} models trained")
        
        success, results, error_msg = train_single_model(config_name, model_type)
        tracker.end_training(success, error_msg, results)
        
        # Skip on error if configured
        if not success and not SKIP_ON_ERROR:
            print(f"\n🛑 Training stopped due to error (SKIP_ON_ERROR=False)")
            break
            
        # Brief pause between models
        time.sleep(0.5)
    
    print(f"\n🏁 Training execution completed!")
    print(f"⏰ End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
else:
    print(f"\n⏭️ No models to train (all exist and FORCE_RETRAIN=False)")

🔄 Progress: 12/12 models trained
   📊 Starting LSTM training for 90n_30n
   📁 Data folder: ../data/90n_30n_lstm
   🎯 Parameters: {'units': [32, 64], 'n_layers': [1, 2], 'dropout': [0.2, 0.5], 'batch_size': [32], 'epochs': [100], 'patience': [10]}
Loaded data for 90n_30n:
  X_train: (7328, 90, 6)
  X_test: (1743, 90, 6)
  y_train: (7328,) (scaled)
  y_test: (1743,) (scaled)
  Target scaler fitted: mean=0.8946, std=0.5896
\nStarting grid search for 90n_30n...
Parameter combinations: 8
\nTesting combination 1: {'batch_size': 32, 'dropout': 0.2, 'n_layers': 1, 'units': 32}
  Val Loss: 0.182778, Epochs: 73
  >>> New best model!
\nTesting combination 2: {'batch_size': 32, 'dropout': 0.2, 'n_layers': 1, 'units': 64}
  Val Loss: 0.180188, Epochs: 38
  >>> New best model!
\nTesting combination 3: {'batch_size': 32, 'dropout': 0.2, 'n_layers': 2, 'units': 32}
  Val Loss: 0.164282, Epochs: 77
  >>> New best model!
\nTesting combination 4: {'batch_size': 32, 'dropout': 0.2, 'n_layers': 2, 'units':



\n=== MODEL EVALUATION (Original Scale) ===
Training metrics:
  MAE: 0.151183
  MSE: 0.056175
  RMSE: 0.237012
  R2: 0.838400
\nTest metrics:
  MAE: 0.348969
  MSE: 0.267996
  RMSE: 0.517683
  R2: 0.178812
\nResults saved to ../models/90n_30n_lstm/
  - best_model.h5
  - grid_search_results_full.csv (all combinations)
  - grid_analysis.csv (analysis)
  - training_history.csv
  - results.json (summary)
\n✅ LSTM training completed for 90n_30n
   📄 Loading results from: ../models/90n_30n_lstm/results.json
✅ SUCCESS: 90n_30n lstm - 104.1min
   RMSE: 0.5177 | R²: 0.1788

🏁 Training execution completed!
⏰ End time: 2025-10-04 04:50:35


## Training Results Summary

In [7]:
# Get comprehensive summary
summary = tracker.get_summary()

print(f"\n" + "="*80)
print(f"📊 TRAINING SUMMARY")
print("="*80)

print(f"\n⏱️ Execution Time:")
print(f"   Total time: {summary['total_time']/3600:.1f}h {(summary['total_time']%3600)/60:.0f}min")
print(f"   Average per model: {summary['total_time']/max(1,summary['total_models'])/60:.1f}min")

print(f"\n📈 Success Rate:")
print(f"   Total models: {summary['total_models']}")
print(f"   Successful: {summary['successful']} ✅")
print(f"   Failed: {summary['failed']} ❌")
print(f"   Success rate: {summary['success_rate']:.1f}%")

# Detailed results table
if summary['results']:
    print(f"\n📋 Detailed Results:")
    results_df = pd.DataFrame(summary['results'])
    
    # Format for display
    display_cols = ['config', 'model_type', 'success', 'training_time', 'test_rmse', 'test_r2']
    display_df = results_df[display_cols].copy() if all(col in results_df.columns for col in display_cols) else results_df
    
    if 'training_time' in display_df.columns:
        display_df['training_time'] = display_df['training_time'].apply(lambda x: f"{x/60:.1f}min")
    if 'test_rmse' in display_df.columns:
        display_df['test_rmse'] = display_df['test_rmse'].apply(lambda x: f"{x:.4f}" if pd.notna(x) else "N/A")
    if 'test_r2' in display_df.columns:
        display_df['test_r2'] = display_df['test_r2'].apply(lambda x: f"{x:.4f}" if pd.notna(x) else "N/A")
    
    print(display_df.to_string(index=False))

# Error summary
if summary['errors']:
    print(f"\n❌ Failed Models:")
    for error in summary['errors']:
        print(f"   - {error['config']} {error['model_type']}: {error['error']}")

# Save summary to file
if SAVE_DETAILED_LOGS:
    summary_file = f"../models/training_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(summary_file, 'w') as f:
        json.dump(summary, f, indent=2, default=str)
    print(f"\n💾 Summary saved to: {summary_file}")

print(f"\n🎉 All done! Check individual model folders in ../models/ for detailed results.")


📊 TRAINING SUMMARY

⏱️ Execution Time:
   Total time: 5.8h 47min
   Average per model: 28.9min

📈 Success Rate:
   Total models: 12
   Successful: 12 ✅
   Failed: 0 ❌
   Success rate: 100.0%

📋 Detailed Results:
 config model_type  success training_time test_rmse test_r2
  7n_1n    xgboost     True        1.1min    0.2841  0.7598
  7n_1n       lstm     True        7.4min    0.5451  0.1162
 30n_1n    xgboost     True        4.6min    0.2126  0.8656
 30n_1n       lstm     True       33.8min    0.5901 -0.0353
 30n_7n    xgboost     True        4.5min    0.2535  0.8090
 30n_7n       lstm     True       34.4min    0.5110  0.2237
30n_30n    xgboost     True        4.6min    0.3692  0.5939
30n_30n       lstm     True       36.5min    0.5541  0.0853
 90n_7n    xgboost     True       13.1min    0.2291  0.8400
 90n_7n       lstm     True       88.6min    0.3511  0.6244
90n_30n    xgboost     True       13.7min    0.3073  0.7106
90n_30n       lstm     True      104.1min    0.5177  0.1788

💾 Summ

## Quick Performance Comparison

In [8]:
def load_all_model_results():
    """Load kết quả từ tất cả models đã train"""
    models_folder = '../models'
    all_results = []
    
    for config in EXPERIMENTS.keys():
        # XGBoost results
        xgb_results_file = f"{models_folder}/{config}_xgb/results.json"
        if os.path.exists(xgb_results_file):
            try:
                with open(xgb_results_file, 'r') as f:
                    results = json.load(f)
                results['config'] = config
                results['model_type'] = 'XGBoost'
                results['description'] = EXPERIMENTS[config]['description']
                all_results.append(results)
            except:
                pass
        
        # LSTM results
        lstm_results_file = f"{models_folder}/{config}_lstm/results.json"
        if os.path.exists(lstm_results_file):
            try:
                with open(lstm_results_file, 'r') as f:
                    results = json.load(f)
                results['config'] = config
                results['model_type'] = 'LSTM'
                results['description'] = EXPERIMENTS[config]['description']
                all_results.append(results)
            except:
                pass
    
    return all_results

# Load và hiển thị comparison
all_results = load_all_model_results()

if all_results:
    print(f"\n📊 PERFORMANCE COMPARISON ({len(all_results)} models)")
    print("="*80)
    
    # Create comparison DataFrame
    comparison_data = []
    for result in all_results:
        row = {
            'Config': result['config'],
            'Model': result['model_type'],
            'Description': result['description'],
            'RMSE': result.get('test_metrics', {}).get('RMSE', 'N/A'),
            'R²': result.get('test_metrics', {}).get('R2', 'N/A'),
            'MAE': result.get('test_metrics', {}).get('MAE', 'N/A')
        }
        comparison_data.append(row)
    
    comparison_df = pd.DataFrame(comparison_data)
    
    # Sort by R² descending
    if 'R²' in comparison_df.columns:
        comparison_df = comparison_df.sort_values('R²', ascending=False)
    
    print(comparison_df.to_string(index=False))
    
    # Quick insights
    print(f"\n💡 Quick Insights:")
    
    # Best model overall
    numeric_df = comparison_df.copy()
    numeric_df = numeric_df[numeric_df['R²'] != 'N/A']
    if not numeric_df.empty:
        best_model = numeric_df.loc[numeric_df['R²'].astype(float).idxmax()]
        print(f"   🏆 Best overall: {best_model['Config']} {best_model['Model']} (R² = {best_model['R²']})")
    
    # Model type comparison
    xgb_models = comparison_df[comparison_df['Model'] == 'XGBoost']
    lstm_models = comparison_df[comparison_df['Model'] == 'LSTM']
    
    if not xgb_models.empty and not lstm_models.empty:
        xgb_avg_r2 = xgb_models[xgb_models['R²'] != 'N/A']['R²'].astype(float).mean() if not xgb_models[xgb_models['R²'] != 'N/A'].empty else 0
        lstm_avg_r2 = lstm_models[lstm_models['R²'] != 'N/A']['R²'].astype(float).mean() if not lstm_models[lstm_models['R²'] != 'N/A'].empty else 0
        print(f"   📊 XGBoost avg R²: {xgb_avg_r2:.3f} | LSTM avg R²: {lstm_avg_r2:.3f}")
    
    # Save comparison
    comparison_file = '../models/all_models_comparison.csv'
    comparison_df.to_csv(comparison_file, index=False)
    print(f"\n💾 Comparison saved to: {comparison_file}")
    
else:
    print(f"\n⚠️ No model results found for comparison")
    print(f"   Please train some models first")

print(f"\n🎯 Next steps:")
print(f"   1. Run notebook 06_model_comparison.ipynb for detailed analysis")
print(f"   2. Check individual model folders for detailed results")
print(f"   3. Consider ensemble methods for best performing models")
print(f"   4. Deploy best model for production use")


📊 PERFORMANCE COMPARISON (12 models)
 Config   Model                                                             Description     RMSE        R²      MAE
 30n_1n XGBoost                          30 days input to predict water level at day 31 0.212606  0.865615 0.124513
 90n_7n XGBoost                          90 days input to predict water level at day 97 0.229131  0.839985 0.152529
 30n_7n XGBoost 30 days input to predict water level at day 37 (not mean of days 31-37) 0.253469  0.809011 0.146621
  7n_1n XGBoost    7 days input to predict water level at day 8 (not mean of days 8-14) 0.284123  0.759842 0.180669
90n_30n XGBoost                         90 days input to predict water level at day 120 0.307339  0.710566 0.233413
 90n_7n    LSTM                          90 days input to predict water level at day 97 0.351054  0.624388 0.213215
30n_30n XGBoost                          30 days input to predict water level at day 60 0.369226  0.593873 0.271261
 30n_7n    LSTM 30 days input to p

## Kết luận

Notebook này đã hoàn thành việc training tất cả các experiments cho cả XGBoost và LSTM models.

### Các file được tạo:
- `../models/{config}_{model_type}/`: Thư mục chứa model và kết quả chi tiết
- `../models/all_models_comparison.csv`: Bảng so sánh tất cả models
- `../models/training_summary_{timestamp}.json`: Log chi tiết quá trình training

### Tính năng chính:
- ✅ Tự động training tuần tự tất cả configurations
- ✅ Skip models đã train (có thể force retrain)
- ✅ Error handling và recovery
- ✅ Progress tracking và time estimation
- ✅ Comprehensive results summary
- ✅ Quick performance comparison

### Sử dụng tiếp:
1. Chạy `06_model_comparison.ipynb` để phân tích chi tiết
2. Chọn model tốt nhất cho production
3. Xem xét ensemble methods
4. Monitor performance trên data mới

## ✅ Fixes Applied to Notebooks 03 & 04

### Dựa trên kinh nghiệm từ notebook 05, đã fix các lỗi sau:

#### **Notebook 03 (XGBoost Dynamic):**
✅ **Config fix**: `365n_7n, 365n_30n` → `90n_7n, 90n_30n`  
✅ **Enhanced imports**: Thêm datetime, seaborn  
✅ **Better validation**: Function-based data validation như notebook 05  
✅ **Progress tracking**: Thêm timestamps và progress messages  
✅ **Error handling**: Try-catch với detailed error messages  
✅ **Troubleshooting**: Thêm section troubleshooting  

#### **Notebook 04 (LSTM Dynamic):**
✅ **Config fix**: `365n_7n, 365n_30n` → `90n_7n, 90n_30n`  
✅ **TensorFlow check**: Enhanced availability check như notebook 05  
✅ **Overfitting warnings**: Thêm cảnh báo về overfitting risk  
✅ **Data size check**: Warning khi dataset nhỏ (<1000 samples)  
✅ **Enhanced training**: Epochs/patience handling như notebook 05  
✅ **Better error handling**: Improved try-catch blocks  

#### **Improvements Applied:**
- 🔧 Consistent configuration management
- 📊 Better progress tracking và time estimation  
- ⚠️ Enhanced error handling và validation
- 🎯 Specific warnings for LSTM overfitting issues
- 💡 Troubleshooting sections added
- 🔄 Code consistency with notebook 05 best practices

#### **Expected Results:**
- Notebooks 03 & 04 should now run reliably
- Better error messages and validation
- Consistent behavior with notebook 05
- Warnings about LSTM performance issues