TRAIN MODEL

In [1]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import (
    RandomForestRegressor, 
    GradientBoostingRegressor,
    VotingRegressor,
    StackingRegressor
)
from xgboost import XGBRegressor

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error
)
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

import optuna
from optuna.samplers import TPESampler
import pandas as pd
import numpy as np
import joblib
import os
import json
import time
import warnings
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold

print("✅ All ML libraries imported successfully!")

✅ All ML libraries imported successfully!


DEFINE PREPROCESSOR CLASS

In [None]:
class RealEstatePreprocessor:
    """
    Complete preprocessing pipeline for real estate data.
    Prevents data leakage by fitting only on training data.
    """
    
    def __init__(self, n_folds=5, random_state=42):
        self.n_folds = n_folds
        self.random_state = random_state
        
        # Store encoders and scalers (FITTED ON TRAIN ONLY!)
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.target_encodings = {}
        self.fill_values = {}
        self.feature_columns = None  # CRITICAL: Store feature order
        self.scaled_columns = None   # CRITICAL: Store which columns were scaled
        
        # Features to drop (leakage)
        self.leakage_cols = ['Price_per_m2', 'Price_category', 'Area_category', 'Ward']
        
        # Location scores
        self.city_base_scores = {
            'Hồ Chí Minh': 50,
            'Hà Nội': 50,
        }
        
        self.district_scores = {
            'Hồ Chí Minh': {
                'Quận 1': 10, 'Quận 3': 9, 'Bình Thạnh': 8, 'Phú Nhuận': 8,
                'Quận 2': 7, 'Quận 7': 7, 'Quận 10': 7, 'Tân Bình': 7,
                'Quận 5': 6, 'Quận 6': 6, 'Gò Vấp': 6, 'Quận 8': 5,
                'Quận 9': 5, 'Thủ Đức': 5, 'Quận 12': 4, 'Tân Phú': 6,
                'Bình Tân': 4, 'Bình Chánh': 4, 'Hóc Môn': 3,
                'Củ Chi': 2, 'Nhà Bè': 3, 'Cần Giờ': 1
            },
            'Hà Nội': {
                'Hoàn Kiếm': 10, 'Ba Đình': 9, 'Đống Đa': 8, 'Hai Bà Trưng': 8,
                'Cầu Giấy': 7, 'Thanh Xuân': 7, 'Tây Hồ': 7,
                'Long Biên': 6, 'Hoàng Mai': 6, 'Nam Từ Liêm': 6, 'Bắc Từ Liêm': 6,
                'Hà Đông': 5, 'Đông Anh': 4, 'Gia Lâm': 4, 'Thanh Trì': 4,
                'Sóc Sơn': 3, 'Ba Vì': 2, 'Mỹ Đức': 2, 'Chương Mỹ': 2,
                'Thường Tín': 3, 'Mê Linh': 3, 'Hoài Đức': 3, 'Thạch Thất': 2
            }
        }
        
    def clean_strings(self, df):
        """Step 1: Normalize string columns"""
        print("\n" + "=" * 60)
        print("STEP 1: STRING NORMALIZATION")
        print("=" * 60)
        
        string_cols = ['City', 'District']
        for col in string_cols:
            if col in df.columns:
                df[col] = df[col].str.strip().str.title()
                df[col] = df[col].str.replace(r'\.$', '', regex=True)
                print(f"{col} unique values: {df[col].nunique()}")
        
        return df
    
    def remove_leakage(self, df):
        """Step 2: Remove leakage columns"""
        print("\n" + "=" * 60)
        print("STEP 2: REMOVE LEAKAGE FEATURES")
        print("=" * 60)
        
        cols_to_drop = [col for col in self.leakage_cols if col in df.columns]
        if cols_to_drop:
            print(f"Dropping: {cols_to_drop}")
            df = df.drop(columns=cols_to_drop)
        return df
    
    def handle_missing(self, df, is_train=True):
        """Step 3: Handle missing values"""
        print("\n" + "=" * 60)
        print("STEP 3: HANDLE MISSING VALUES")
        print("=" * 60)
        
        # Numeric: fill with median
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if df[col].isna().sum() > 0:
                if is_train:
                    self.fill_values[col] = df[col].median()
                fill_val = self.fill_values.get(col, df[col].median())
                df[col].fillna(fill_val, inplace=True)
                if is_train:
                    print(f"✓ {col}: filled with TRAIN median {fill_val:.2f}")
                else:
                    print(f"✓ {col}: filled with STORED median {fill_val:.2f}")
        
        # Categorical: fill with 'Unknown'
        cat_cols = df.select_dtypes(include=['object']).columns
        for col in cat_cols:
            if df[col].isna().sum() > 0:
                df[col].fillna('Unknown', inplace=True)
        
        return df
    
    def remove_bad_records(self, df):
        """Step 4: Remove invalid records"""
        print("\n" + "=" * 60)
        print("STEP 4: REMOVE BAD RECORDS")
        print("=" * 60)
        
        initial_shape = df.shape[0]
        df = df[(df['Price'] > 0) & (df['Area'] > 0)]
        df = df.drop_duplicates()
        
        print(f"Removed {initial_shape - df.shape[0]} bad records")
        return df
    
    def handle_outliers(self, df, method='percentile'):
        """Step 5: Handle outliers"""
        print("\n" + "=" * 60)
        print("STEP 5: HANDLE OUTLIERS")
        print("=" * 60)
        
        cols = ['Price', 'Area', 'Frontage', 'Access Road']
        
        for col in cols:
            if col not in df.columns:
                continue
                
            lower = df[col].quantile(0.01)
            upper = df[col].quantile(0.99)
            df[col] = df[col].clip(lower, upper)
            print(f"{col}: capped to [{lower:.2f}, {upper:.2f}]")
        
        return df
    
    def engineer_features(self, df):
        """Step 6: Feature engineering"""
        print("\n" + "=" * 60)
        print("STEP 6: FEATURE ENGINEERING")
        print("=" * 60)
        
        # Basic features
        if 'Total_rooms' not in df.columns:
            df['Total_rooms'] = df['Bedrooms'] + df['Bathrooms']
        
        df['Bedroom_Bathroom_ratio'] = df['Bedrooms'] / df['Bathrooms'].replace(0, 1)
        df['Area_per_floor'] = df['Area'] / df['Floors'].replace(0, 1)
        df['Room_density'] = df['Total_rooms'] / df['Area']
        
        # Luxury score
        binary_features = ['Has_Frontage', 'Has_Access_Road', 
                          'Has_House_Direction', 'Has_Balcony_Direction']
        df['Luxury_score'] = sum(df[col] for col in binary_features if col in df.columns)
        
        # Location features
        df['City_Base_Score'] = df['City'].map(self.city_base_scores).fillna(40)
        
        # District score (no lambda - pickle-safe)
        district_scores = []
        for idx, row in df.iterrows():
            city = row['City']
            district = row['District']
            score = self.district_scores.get(city, {}).get(district, 3)
            district_scores.append(score)
        df['District_Score'] = district_scores
        
        df['Location_Score'] = df['City_Base_Score'] + df['District_Score']
        df['Location_Tier'] = pd.cut(
            df['Location_Score'],
            bins=[0, 50, 55, 60, 100],
            labels=['Suburban', 'Urban', 'Premium', 'Elite']
        )
        
        # Other features
        df['Is_Apartment'] = df['Address'].str.contains('Dự án|Project', case=False, na=False).astype(int)
        df['Full_Legal'] = (df['Legal status'] == 'Have Certificate').astype(int)
        df['Full_Furniture'] = (df['Furniture state'] == 'Full').astype(int)
        
        print(f"✓ Created 13 new features")
        return df
    
    def target_encode_kfold(self, df, col, target='Price', is_train=True):
        """Target encoding with K-Fold (prevents leakage)"""
        if is_train:
            # Initialize column
            df[f'{col}_target_enc'] = 0.0
            
            kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
            
            # Use iloc instead of loc to avoid index issues
            for train_idx, val_idx in kf.split(df):
                # Calculate mean target per category on train fold
                train_data = df.iloc[train_idx]
                target_means = train_data.groupby(col)[target].mean()
                global_mean = train_data[target].mean()
                
                # Apply to validation fold using iloc
                val_data = df.iloc[val_idx]
                encoded_values = val_data[col].map(target_means).fillna(global_mean)
                df.iloc[val_idx, df.columns.get_loc(f'{col}_target_enc')] = encoded_values.values
            
            # Store global encoding for test data
            self.target_encodings[col] = df.groupby(col)[target].mean().to_dict()
            self.target_encodings[f'{col}_global_mean'] = df[target].mean()
            print(f"  ✓ {col}: K-Fold encoded (train)")
        else:
            global_mean = self.target_encodings.get(f'{col}_global_mean', 0)
            df[f'{col}_target_enc'] = df[col].map(self.target_encodings[col]).fillna(global_mean)
            print(f"  ✓ {col}: Applied stored encoding (test)")
        
        return df
    
    def encode_categoricals(self, df, is_train=True):
        """Step 7: Encode categorical variables"""
        print("\n" + "=" * 60)
        print("STEP 7: ENCODE CATEGORICAL FEATURES")
        print("=" * 60)
        
        # Target encoding
        df = self.target_encode_kfold(df, 'District', is_train=is_train)
        df = self.target_encode_kfold(df, 'City', is_train=is_train)
        
        # Label encoding
        label_cols = ['Legal status', 'Furniture state', 'Location_Tier']
        for col in label_cols:
            if col not in df.columns:
                continue
            
            if is_train:
                self.label_encoders[col] = LabelEncoder()
                df[f'{col}_encoded'] = self.label_encoders[col].fit_transform(df[col].astype(str))
                print(f"  ✓ {col}: Label encoded (train)")
            else:
                # Encode using stored encoder (no lambda - pickle-safe)
                encoded_values = []
                for value in df[col].astype(str):
                    if value in self.label_encoders[col].classes_:
                        encoded_values.append(self.label_encoders[col].transform([value])[0])
                    else:
                        encoded_values.append(-1)
                df[f'{col}_encoded'] = encoded_values
                print(f"  ✓ {col}: Applied stored encoding (test)")
        
        return df
    
    def scale_features(self, df, is_train=True):
        """Step 8: Scale numeric features"""
        print("\n" + "=" * 60)
        print("STEP 8: SCALE NUMERIC FEATURES")
        print("=" * 60)
        
        exclude = ['Price', 'Address']
        
        if is_train:
            numeric_cols = [col for col in df.select_dtypes(include=[np.number]).columns 
                           if col not in exclude]
            
            df[numeric_cols] = self.scaler.fit_transform(df[numeric_cols])
            
            # Store which columns were scaled
            self.scaled_columns = numeric_cols
            
            print(f"✓ Fitted scaler on {len(numeric_cols)} features (TRAIN)")
            print(f"✓ Stored scaled columns: {numeric_cols[:5]}...")
        else:
            # Use stored columns from training
            if self.scaled_columns is None:
                raise ValueError("Scaler not fitted yet. Call fit_transform first.")
            
            # Only scale columns that exist and were in training
            cols_to_scale = [col for col in self.scaled_columns if col in df.columns]
            
            df[cols_to_scale] = self.scaler.transform(df[cols_to_scale])
            print(f"✓ Applied scaler to {len(cols_to_scale)} features (TEST)")
        
        return df
    
    def fit_transform(self, df):
        """Complete preprocessing for TRAINING data"""
        print("\n" + "🚀 " * 20)
        print("PREPROCESSING PIPELINE - TRAINING DATA")
        print("🚀 " * 20)
        
        # CRITICAL: Reset index to avoid KeyError in K-Fold
        df = df.reset_index(drop=True)
        print("✓ Index reset for safe processing")
        
        df = self.clean_strings(df)
        df = self.remove_leakage(df)
        df = self.handle_missing(df, is_train=True)
        df = self.remove_bad_records(df)
        df = self.handle_outliers(df)
        df = self.engineer_features(df)
        df = self.encode_categoricals(df, is_train=True)
        
        # Separate target
        y = df['Price'].copy()
        X = df.drop(columns=['Price', 'Address', 'City', 'District', 
                             'Legal status', 'Furniture state', 'Location_Tier','House direction', 'Balcony direction'], 
                    errors='ignore')
        
        X = self.scale_features(X, is_train=True)
        
        # CRITICAL: Store feature columns in order
        self.feature_columns = list(X.columns)
        
        print("\n" + "✅ " * 20)
        print(f"FINAL SHAPE: X={X.shape}, y={y.shape}")
        print(f"✅ Feature columns stored: {len(self.feature_columns)}")
        print("✅ " * 20)
        
        return X, y
    
    def transform(self, df):
        """
        Apply preprocessing to TEST/NEW data.
        
        IMPORTANT: This method ALWAYS returns only X (features).
        If you need y (target), extract it from the original df before calling this method.
        
        Returns:
            X (pd.DataFrame): Preprocessed features
        """
        print("\n" + "🔧 " * 20)
        print("PREPROCESSING PIPELINE - TEST/NEW DATA")
        print("🔧 " * 20)
        
        # CRITICAL: Reset index to avoid KeyError
        df = df.reset_index(drop=True)
        print("✓ Index reset for safe processing")
        
        df = self.clean_strings(df)
        df = self.remove_leakage(df)
        df = self.handle_missing(df, is_train=False)
        df = self.engineer_features(df)
        df = self.encode_categoricals(df, is_train=False)
        
        # Drop all non-feature columns
        X = df.drop(columns=['Price', 'Address', 'City', 'District', 
                             'Legal status', 'Furniture state', 'Location_Tier',
                             'House direction', 'Balcony direction'], 
                    errors='ignore')
        X = self.scale_features(X, is_train=False)
        
        # CRITICAL: Ensure features match training (same columns, same order)
        if self.feature_columns is not None:
            missing_cols = set(self.feature_columns) - set(X.columns)
            extra_cols = set(X.columns) - set(self.feature_columns)
            
            if missing_cols:
                print(f"⚠️  Adding missing features: {missing_cols}")
                for col in missing_cols:
                    X[col] = 0
            
            if extra_cols:
                print(f"⚠️  Dropping extra features: {extra_cols}")
                X = X.drop(columns=list(extra_cols))
            
            # Reorder columns to match training exactly
            X = X[self.feature_columns]
            print(f"✓ Features reordered to match training")
        
        print("\n" + "✅ " * 20)
        print(f"FINAL SHAPE: X={X.shape}")
        print(f"✓ Feature order verified: {list(X.columns)[:5]}...")
        print("✅ " * 20)
        
        # ALWAYS return only X (no tuple, no conditional return)
        return X

print("✅ RealEstatePreprocessor class defined successfully!")

✅ RealEstatePreprocessor class defined successfully!


DATA CLEAN

In [3]:
df = pd.read_csv('../data/housing_data_processed.csv')

print("="*80)
print("DATA LOADED")
print("="*80)
print(f"Total records: {df.shape[0]:,}")
print(f"Total features: {df.shape[1]}")
print(f"\nFirst few rows:")
print(df.head(3))


DATA LOADED
Total records: 22,245
Total features: 23

First few rows:
                                             Address  Area  Frontage  \
0  Đường Nguyễn Văn Khối, Phường 11, Gò Vấp, Hồ C...  54.0       4.0   
1   Đường Quang Trung, Phường 8, Gò Vấp, Hồ Chí Minh  92.0       4.0   
2  Dự án Him Lam Thường Tín, Huyện Thường Tín, Hà...  74.0       5.0   

   Access Road  Floors  Bedrooms  Bathrooms      Legal status Furniture state  \
0          3.5     2.0       2.0        3.0  Have certificate            Full   
1          5.0     2.0       4.0        4.0  Have certificate            Full   
2         18.0     5.0       4.0        5.0  Have certificate            Full   

   Price  ... Price_per_m2 Total_rooms Bedroom_Bathroom_ratio  Area_per_floor  \
0   5.35  ...     0.099074         5.0               0.666667            27.0   
1   6.90  ...     0.075000         8.0               1.000000            46.0   
2   9.90  ...     0.133784         9.0               0.800000            

In [4]:
train_df, test_df = train_test_split(
    df, 
    test_size=0.2, 
    random_state=42,
    shuffle=True
)

print("="*80)
print("TRAIN-TEST SPLIT")
print("="*80)
print(f"📊 Training set: {train_df.shape[0]:,} records ({train_df.shape[0]/df.shape[0]*100:.1f}%)")
print(f"📊 Test set:     {test_df.shape[0]:,} records ({test_df.shape[0]/df.shape[0]*100:.1f}%)")
print(f"\nPrice distribution:")
print(f"  Train - Mean: {train_df['Price'].mean():.2f}, Median: {train_df['Price'].median():.2f}")
print(f"  Test  - Mean: {test_df['Price'].mean():.2f}, Median: {test_df['Price'].median():.2f}")

TRAIN-TEST SPLIT
📊 Training set: 17,796 records (80.0%)
📊 Test set:     4,449 records (20.0%)

Price distribution:
  Train - Mean: 6.18, Median: 6.20
  Test  - Mean: 6.22, Median: 6.20


In [5]:
preprocessor = RealEstatePreprocessor(
    n_folds=5,        # Number of folds for target encoding
    random_state=42   # For reproducibility
)

print("="*80)
print("PREPROCESSOR INITIALIZED")
print("="*80)
print(f"✓ K-Fold splits: {preprocessor.n_folds}")
print(f"✓ Random state: {preprocessor.random_state}")
print(f"✓ Leakage columns to remove: {preprocessor.leakage_cols}")
print(f"✓ Cities covered: {list(preprocessor.city_base_scores.keys())}")

PREPROCESSOR INITIALIZED
✓ K-Fold splits: 5
✓ Random state: 42
✓ Leakage columns to remove: ['Price_per_m2', 'Price_category', 'Area_category', 'Ward']
✓ Cities covered: ['Hồ Chí Minh', 'Hà Nội']


In [6]:
X_train, y_train = preprocessor.fit_transform(train_df)

print("\n" + "="*80)
print("TRAINING DATA PREPROCESSING COMPLETE")
print("="*80)
print(f"✅ X_train shape: {X_train.shape}")
print(f"✅ y_train shape: {y_train.shape}")
print(f"\nFeature names ({len(X_train.columns)}):")
for i, col in enumerate(X_train.columns, 1):
    print(f"  {i:2d}. {col}")


🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 
PREPROCESSING PIPELINE - TRAINING DATA
🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 
✓ Index reset for safe processing

STEP 1: STRING NORMALIZATION
City unique values: 4
District unique values: 73

STEP 2: REMOVE LEAKAGE FEATURES
Dropping: ['Price_per_m2', 'Price_category', 'Area_category', 'Ward']

STEP 3: HANDLE MISSING VALUES

STEP 4: REMOVE BAD RECORDS
Removed 24 bad records

STEP 5: HANDLE OUTLIERS
Price: capped to [1.68, 10.00]
Area: capped to [20.00, 160.00]
Frontage: capped to [3.00, 12.00]
Access Road: capped to [2.00, 21.00]

STEP 6: FEATURE ENGINEERING
✓ Created 13 new features

STEP 7: ENCODE CATEGORICAL FEATURES
  ✓ District: K-Fold encoded (train)
  ✓ City: K-Fold encoded (train)
  ✓ Legal status: Label encoded (train)
  ✓ Furniture state: Label encoded (train)
  ✓ Location_Tier: Label encoded (train)

STEP 8: SCALE NUMERIC FEATURES
✓ Fitted scaler on 26 features (TRAIN)

✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ 
FINAL SHAPE: X=(17772,

In [7]:

y_test = test_df['Price'].copy()

# Transform chỉ trả về X
X_test = preprocessor.transform(test_df)

print("\n" + "="*80)
print("TEST DATA PREPROCESSING COMPLETE")
print("="*80)
print(f"✅ X_test shape: {X_test.shape}")
print(f"✅ y_test shape: {y_test.shape}")


🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 
PREPROCESSING PIPELINE - TEST/NEW DATA
🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 
✓ Index reset for safe processing

STEP 1: STRING NORMALIZATION
City unique values: 5
District unique values: 63

STEP 2: REMOVE LEAKAGE FEATURES
Dropping: ['Price_per_m2', 'Price_category', 'Area_category', 'Ward']

STEP 3: HANDLE MISSING VALUES

STEP 6: FEATURE ENGINEERING
✓ Created 13 new features

STEP 7: ENCODE CATEGORICAL FEATURES
  ✓ District: Applied stored encoding (test)
  ✓ City: Applied stored encoding (test)
  ✓ Legal status: Applied stored encoding (test)
  ✓ Furniture state: Applied stored encoding (test)
  ✓ Location_Tier: Applied stored encoding (test)

STEP 8: SCALE NUMERIC FEATURES
✓ Applied scaler to 26 features (TEST)

✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ 
FINAL SHAPE: X=(4449, 26)
✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ ✅ 

TEST DATA PREPROCESSING COMPLETE
✅ X_test shape: (4449, 26)
✅ y_test shape: (4449,)


In [16]:
print("\n" + "🔍 " * 20)
print("DATA LEAKAGE VERIFICATION (CORRECTED)")
print("🔍 " * 20)

# Check 1: Feature names match
print("\n1. Feature consistency:")
if list(X_train.columns) == list(X_test.columns):
    print("   ✅ Train and test have SAME features")
    print(f"   Total features: {len(X_train.columns)}")
else:
    print("   ❌ WARNING: Feature mismatch!")
    missing_in_test = set(X_train.columns) - set(X_test.columns)
    missing_in_train = set(X_test.columns) - set(X_train.columns)
    if missing_in_test:
        print(f"   Missing in test: {missing_in_test}")
    if missing_in_train:
        print(f"   Missing in train: {missing_in_train}")

# Check 2: No Price in features
print("\n2. Target variable:")
if 'Price' not in X_train.columns and 'Price' not in X_test.columns:
    print("   ✅ Price NOT in features (correct!)")
else:
    print("   ❌ WARNING: Price found in features!")

# Check 3: Verify stored values
print("\n3. Stored preprocessing values:")
print(f"   ✅ Fill values stored: {len(preprocessor.fill_values)} numeric columns")
if len(preprocessor.fill_values) > 0:
    print(f"      Example: {list(preprocessor.fill_values.items())[:2]}")

print(f"   ✅ Label encoders stored: {len(preprocessor.label_encoders)} categorical columns")
if len(preprocessor.label_encoders) > 0:
    print(f"      Columns: {list(preprocessor.label_encoders.keys())}")

target_enc_cols = [k for k in preprocessor.target_encodings.keys() if not k.endswith('_global_mean')]
print(f"   ✅ Target encodings stored: {len(target_enc_cols)} columns")
if len(target_enc_cols) > 0:
    print(f"      Columns: {target_enc_cols}")

print(f"   ✅ Scaler fitted: {preprocessor.scaler.n_features_in_} features")

# Check 4: CORRECTED - Compare stored encoding values directly
print("\n4. Target encoding verification:")
print("   Checking that test uses stored train encodings...")

if 'District' in preprocessor.target_encodings:
    district_encodings = preprocessor.target_encodings['District']
    
    # Show sample of stored encodings
    sample_districts = list(district_encodings.items())[:3]
    print(f"\n   Stored District encodings (from train):")
    for district, encoding in sample_districts:
        print(f"      {district}: {encoding:.2f}")
    
    # Verify test data uses these exact values
    if 'District_target_enc' in X_test.columns:
        # Get a sample district from test_df that exists in training
        common_districts = set(test_df['District']) & set(district_encodings.keys())
        if common_districts:
            sample_district = list(common_districts)[0]
            stored_encoding = district_encodings[sample_district]
            
            # Find this encoding in X_test
            test_indices = test_df.index[test_df['District'] == sample_district].tolist()
            if test_indices:
                # Get the first matching index in test_df
                original_idx = test_indices[0]
                # Find corresponding position in X_test (which was reset to 0-based index)
                position_in_test = test_df.index.get_loc(original_idx)
                actual_test_encoding = X_test.iloc[position_in_test]['District_target_enc']
                
                print(f"\n   Verification for District='{sample_district}':")
                print(f"      Stored (from train): {stored_encoding:.6f}")
                print(f"      Applied (in test):   {actual_test_encoding:.6f}")
                
                if abs(stored_encoding - actual_test_encoding) < 1e-6:
                    print(f"      ✅ MATCH! Test uses train encoding")
                else:
                    print(f"      ❌ MISMATCH! Possible leakage!")

# Check 5: Scaler verification
print("\n5. Scaler verification:")
print(f"   Train data range (sample feature):")
sample_feature = X_train.columns[0]
print(f"      {sample_feature}: [{X_train[sample_feature].min():.4f}, {X_train[sample_feature].max():.4f}]")
print(f"   Test data range (same feature):")
print(f"      {sample_feature}: [{X_test[sample_feature].min():.4f}, {X_test[sample_feature].max():.4f}]")
print(f"   ℹ️  Test range can exceed train range (this is normal)")

# Check 6: No raw categorical columns in features
print("\n6. Categorical columns removed:")
categorical_originals = ['City', 'District', 'Legal status', 'Furniture state', 'Location_Tier']
found_in_train = [col for col in categorical_originals if col in X_train.columns]
found_in_test = [col for col in categorical_originals if col in X_test.columns]

if not found_in_train and not found_in_test:
    print(f"   ✅ Original categorical columns removed")
    print(f"      Removed: {categorical_originals}")
else:
    print(f"   ❌ WARNING: Original categorical columns still present!")
    if found_in_train:
        print(f"      In train: {found_in_train}")
    if found_in_test:
        print(f"      In test: {found_in_test}")

# Check 7: Leakage columns removed
print("\n7. Leakage columns removed:")
leakage_in_train = [col for col in preprocessor.leakage_cols if col in X_train.columns]
leakage_in_test = [col for col in preprocessor.leakage_cols if col in X_test.columns]

if not leakage_in_train and not leakage_in_test:
    print(f"   ✅ All leakage columns removed")
    print(f"      Target leakage columns: {preprocessor.leakage_cols}")
else:
    print(f"   ❌ WARNING: Leakage columns still present!")
    if leakage_in_train:
        print(f"      In train: {leakage_in_train}")
    if leakage_in_test:
        print(f"      In test: {leakage_in_test}")

# Summary
print("\n" + "=" * 60)
print("VERIFICATION SUMMARY")
print("=" * 60)

all_checks_passed = (
    list(X_train.columns) == list(X_test.columns) and
    'Price' not in X_train.columns and
    'Price' not in X_test.columns and
    len(preprocessor.fill_values) > 0 and
    len(preprocessor.label_encoders) > 0 and
    not found_in_train and
    not found_in_test and
    not leakage_in_train and
    not leakage_in_test
)

if all_checks_passed:
    print("✅ ALL CHECKS PASSED - No data leakage detected!")
    print("✅ Ready for model training")
else:
    print("⚠️  Some checks failed - review warnings above")

print("=" * 60)


🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 
DATA LEAKAGE VERIFICATION (CORRECTED)
🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 🔍 

1. Feature consistency:
   ✅ Train and test have SAME features
   Total features: 26

2. Target variable:
   ✅ Price NOT in features (correct!)

3. Stored preprocessing values:
   ✅ Fill values stored: 0 numeric columns
   ✅ Label encoders stored: 3 categorical columns
      Columns: ['Legal status', 'Furniture state', 'Location_Tier']
   ✅ Target encodings stored: 2 columns
      Columns: ['District', 'City']
   ✅ Scaler fitted: 26 features

4. Target encoding verification:
   Checking that test uses stored train encodings...

   Stored District encodings (from train):
      Ba Vì: 8.43
      Ba Đình: 7.29
      Bình Chánh: 5.28

   Verification for District='Ba Đình':
      Stored (from train): 7.294094
      Applied (in test):   1.397555
      ❌ MISMATCH! Possible leakage!

5. Scaler verification:
   Train data range (sample feature):
      Area: [-1.3661, 4.0915

In [17]:
# Thêm feature_columns vào preprocessor hiện tại
with open('../models/best_model_metadata.json', 'r') as f:
    metadata = json.load(f)

preprocessor.feature_columns = metadata['feature_names']

# Lưu lại
joblib.dump(preprocessor, '../models/preprocessor.joblib', compress=3)
print("✅ Preprocessor updated with feature_columns from metadata!")

✅ Preprocessor updated with feature_columns from metadata!


DEFINE EVALUATION METRICS

In [10]:
def evaluate_model(y_true, y_pred, model_name="Model"):
    """
    Calculate comprehensive regression metrics
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    
    # Adjusted R²
    n = len(y_true)
    p = X_train.shape[1]  # number of features
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    
    return {
        'Model': model_name,
        'MAE': mae,
        'RMSE': rmse,
        'R²': r2,
        'Adj_R²': adj_r2,
        'MAPE (%)': mape
    }


def print_metrics(metrics_dict):
    """Pretty print metrics"""
    print(f"\n{'='*70}")
    print(f"📊 {metrics_dict['Model']} Performance")
    print(f"{'='*70}")
    print(f"  MAE (Mean Absolute Error):        {metrics_dict['MAE']:>15,.2f}")
    print(f"  RMSE (Root Mean Squared Error):   {metrics_dict['RMSE']:>15,.2f}")
    print(f"  R² Score:                         {metrics_dict['R²']:>15.4f}")
    print(f"  Adjusted R²:                      {metrics_dict['Adj_R²']:>15.4f}")
    print(f"  MAPE (Mean Abs Percentage Error): {metrics_dict['MAPE (%)']:>15.2f}%")
    print(f"{'='*70}")

HYPERPARAMETER TUNING WITH OPTUNA

In [11]:
print("\n" + "🔬 " * 20)
print("HYPERPARAMETER TUNING WITH OPTUNA")
print("🔬 " * 20)

def tune_with_optuna(model_name, model_class, n_trials=50):
    """
    Tune hyperparameters using Optuna
    """
    print(f"\n{'='*70}")
    print(f"Tuning {model_name} with Optuna")
    print(f"{'='*70}")
    print(f"  Trials: {n_trials}")
    print(f"  Metric: MAE (lower is better)")
    print(f"  Sampler: TPE (Tree-structured Parzen Estimator)")
    
    def objective(trial):
        if model_name == 'Ridge_Regression':
            params = {
                'alpha': trial.suggest_float('alpha', 0.01, 10.0, log=True),
                'random_state': 42
            }
            model = Ridge(**params)
            
        elif model_name == 'Lasso_Regression':
            params = {
                'alpha': trial.suggest_float('alpha', 0.01, 10.0, log=True),
                'random_state': 42
            }
            model = Lasso(**params)
            
        elif model_name == 'Random_Forest':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 200),
                'max_depth': trial.suggest_int('max_depth', 10, 30),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
                'random_state': 42,
                'n_jobs': -1
            }
            model = RandomForestRegressor(**params)
            
        elif model_name == 'Gradient_Boosting':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 200),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'random_state': 42
            }
            model = GradientBoostingRegressor(**params)
            
        elif model_name == 'XGBoost':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 200),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                'gamma': trial.suggest_float('gamma', 0, 0.5),
                'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0, 1.0),
                'random_state': 42,
                'n_jobs': -1
            }
            model = XGBRegressor(**params)
        
        # Train and evaluate
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        
        return mae
    
    # Create study
    study = optuna.create_study(
        direction='minimize',
        sampler=TPESampler(seed=42)
    )
    
    start_time = time.time()
    
    # Optimize with progress bar suppression
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
    
    tuning_time = time.time() - start_time
    
    print(f"  ✅ Tuning completed in {tuning_time:.2f}s")
    print(f"  🎯 Best MAE: {study.best_value:,.2f}")
    print(f"  🎯 Best Trial: #{study.best_trial.number + 1}")
    print(f"  📋 Best Parameters:")
    for param, value in study.best_params.items():
        if isinstance(value, float):
            print(f"      {param}: {value:.6f}")
        else:
            print(f"      {param}: {value}")
    
    # Create model with best parameters
    if model_name == 'Ridge_Regression':
        best_model = Ridge(**study.best_params, random_state=42)
    elif model_name == 'Lasso_Regression':
        best_model = Lasso(**study.best_params, random_state=42)
    elif model_name == 'Random_Forest':
        best_model = RandomForestRegressor(**study.best_params, random_state=42, n_jobs=-1)
    elif model_name == 'Gradient_Boosting':
        best_model = GradientBoostingRegressor(**study.best_params, random_state=42)
    elif model_name == 'XGBoost':
        best_model = XGBRegressor(**study.best_params, random_state=42, n_jobs=-1)
    
    best_model.fit(X_train, y_train)
    
    return best_model, study.best_params, tuning_time, study


🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 
HYPERPARAMETER TUNING WITH OPTUNA
🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 🔬 


TUNING CONFIGURATION

In [12]:
print("\n" + "⚙️ " * 20)
print("TUNING CONFIGURATION")
print("⚙️ " * 20)

# Configuration
ENABLE_TUNING = True  # Set to False to skip tuning
N_TRIALS = 50  # Number of Optuna trials

print(f"\n✅ Hyperparameter Tuning: {'ENABLED' if ENABLE_TUNING else 'DISABLED'}")
print(f"✅ Optimization Method: Optuna (TPE Sampler)")
print(f"✅ Number of Trials: {N_TRIALS}")

# Models that will be tuned
TUNE_MODELS = ['Ridge_Regression', 'Lasso_Regression', 'Random_Forest', 
               'Gradient_Boosting', 'XGBoost']

print(f"✅ Models to tune: {', '.join([m.replace('_', ' ') for m in TUNE_MODELS])}")


# ============================================================================
# CELL 14: CREATE MODELS DIRECTORIES
# ============================================================================
models_dir = '../models'
tuned_models_dir = f'{models_dir}/tuned'
default_models_dir = f'{models_dir}/default'

os.makedirs(models_dir, exist_ok=True)
os.makedirs(tuned_models_dir, exist_ok=True)
os.makedirs(default_models_dir, exist_ok=True)

print(f"\n✅ Models directories created:")
print(f"   Main: {models_dir}")
print(f"   Tuned: {tuned_models_dir}")
print(f"   Default: {default_models_dir}")



⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ 
TUNING CONFIGURATION
⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ ⚙️ 

✅ Hyperparameter Tuning: ENABLED
✅ Optimization Method: Optuna (TPE Sampler)
✅ Number of Trials: 50
✅ Models to tune: Ridge Regression, Lasso Regression, Random Forest, Gradient Boosting, XGBoost

✅ Models directories created:
   Main: ../models
   Tuned: ../models/tuned
   Default: ../models/default


DEFINE AND TUNE MODELS

In [13]:
print("\n" + "🎯 " * 20)
print("DEFINING AND TUNING MODELS")
print("🎯 " * 20)

models = {}
tuning_results = {}

# 1. Linear Regression (no tuning needed)
print("\n[1/8] Linear Regression (no tuning)")
models['Linear_Regression'] = LinearRegression()

# 2-6. Models with optional tuning
tunable_models = {
    'Ridge_Regression': Ridge(alpha=1.0, random_state=42),
    'Lasso_Regression': Lasso(alpha=1.0, random_state=42),
    'Random_Forest': RandomForestRegressor(
        n_estimators=100, max_depth=20, random_state=42, n_jobs=-1
    ),
    'Gradient_Boosting': GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42
    ),
    'XGBoost': XGBRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=7, random_state=42, n_jobs=-1
    )
}

for i, (name, default_model) in enumerate(tunable_models.items(), 2):
    print(f"\n[{i}/8] {name.replace('_', ' ')}")
    
    # Save default model
    default_path = f'{default_models_dir}/{name}_default.joblib'
    joblib.dump(default_model, default_path, compress=3)
    print(f"  💾 Default model saved: {default_path}")
    
    if not ENABLE_TUNING or name not in TUNE_MODELS:
        models[name] = default_model
        print(f"  ⚠️  Using default parameters (tuning disabled)")
        
    else:
        try:
            tuned_model, best_params, tuning_time, study = tune_with_optuna(
                name, type(default_model), n_trials=N_TRIALS
            )
            
            # Save tuned model
            tuned_path = f'{tuned_models_dir}/{name}_tuned.joblib'
            joblib.dump(tuned_model, tuned_path, compress=3)
            print(f"  💾 Tuned model saved: {tuned_path}")
            
            models[name] = tuned_model
            tuning_results[name] = {
                'method': 'Optuna',
                'best_params': best_params,
                'tuning_time': tuning_time,
                'best_value': study.best_value,
                'n_trials': len(study.trials)
            }
        except Exception as e:
            print(f"  ❌ Tuning failed: {e}")
            models[name] = default_model

# 7. Voting Regressor (uses tuned models if available)
print(f"\n[7/8] Voting Regressor")
print(f"  📦 Ensemble of: Ridge, RF, GB, XGB")
models['Voting_Regressor'] = VotingRegressor(
    estimators=[
        ('ridge', models['Ridge_Regression']),
        ('rf', models['Random_Forest']),
        ('gb', models['Gradient_Boosting']),
        ('xgb', models['XGBoost'])
    ]
)

# 8. Stacking Regressor (uses tuned models if available)
print(f"\n[8/8] Stacking Regressor")
print(f"  📦 Base: Ridge, RF, GB, XGB | Final: Ridge")
models['Stacking_Regressor'] = StackingRegressor(
    estimators=[
        ('ridge', models['Ridge_Regression']),
        ('rf', models['Random_Forest']),
        ('gb', models['Gradient_Boosting']),
        ('xgb', models['XGBoost'])
    ],
    final_estimator=Ridge(alpha=1.0, random_state=42),
    cv=5
)

print(f"\n✅ All {len(models)} models defined!")


[I 2025-10-25 16:45:38,771] A new study created in memory with name: no-name-13eb994a-9cbc-409a-8aea-19a67c0d3d9e



🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 
DEFINING AND TUNING MODELS
🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 🎯 

[1/8] Linear Regression (no tuning)

[2/8] Ridge Regression
  💾 Default model saved: ../models/default/Ridge_Regression_default.joblib

Tuning Ridge_Regression with Optuna
  Trials: 50
  Metric: MAE (lower is better)
  Sampler: TPE (Tree-structured Parzen Estimator)
  ✅ Tuning completed in 0.36s
  🎯 Best MAE: 1.19
  🎯 Best Trial: #50
  📋 Best Parameters:
      alpha: 0.010065
  💾 Tuned model saved: ../models/tuned/Ridge_Regression_tuned.joblib

[3/8] Lasso Regression
  💾 Default model saved: ../models/default/Lasso_Regression_default.joblib

Tuning Lasso_Regression with Optuna
  Trials: 50
  Metric: MAE (lower is better)
  Sampler: TPE (Tree-structured Parzen Estimator)
  ✅ Tuning completed in 53.00s
  🎯 Best MAE: 1.19
  🎯 Best Trial: #50
  📋 Best Parameters:
      alpha: 0.010065
  💾 Tuned model saved: ../models/tuned/Lasso_Regression_tuned.joblib

[4/8] Random Forest
  💾 Def

[W 2025-10-25 16:53:08,510] Trial 43 failed with parameters: {'n_estimators': 184, 'learning_rate': 0.04977330916622023, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 5, 'subsample': 0.7446021525526079} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\Letha\AppData\Local\Programs\Python\Python313\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Letha\AppData\Local\Temp\ipykernel_1908\947159569.py", line 72, in objective
    model.fit(X_train, y_train)
    ~~~~~~~~~^^^^^^^^^^^^^^^^^^
  File "c:\Users\Letha\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Letha\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\ensemble\_gb.py", line 787, in fit
    n_stages = self._fit_stages(
        X_train,
    ...<8 lines>...
    

KeyboardInterrupt: 

TRAIN AND EVALUATE ALL MODELS

In [None]:
print("\n" + "🚀 " * 20)
print("TRAINING ALL MODELS")
print("🚀 " * 20)

# Store results
results = []
trained_models = {}

print(f"\nTraining data: {X_train.shape[0]:,} samples × {X_train.shape[1]} features")
print(f"Test data:     {X_test.shape[0]:,} samples × {X_test.shape[1]} features\n")

# Train each model
for i, (name, model) in enumerate(models.items(), 1):
    print(f"\n{'='*70}")
    print(f"[{i}/{len(models)}] Training: {name.replace('_', ' ')}")
    print(f"{'='*70}")
    
    try:
        # Start timer
        start_time = time.time()
        
        # Train model
        print("  ⏳ Fitting model...")
        model.fit(X_train, y_train)
        
        # Training time
        training_time = time.time() - start_time
        print(f"  ✅ Training completed in {training_time:.2f}s")
        
        # Predict
        print("  🔮 Making predictions...")
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        
        # Evaluate on train set
        train_metrics = evaluate_model(y_train, y_train_pred, f"{name} (Train)")
        
        # Evaluate on test set
        test_metrics = evaluate_model(y_test, y_test_pred, f"{name} (Test)")
        
        # Print quick summary
        print(f"  📊 Train R²: {train_metrics['R²']:.4f} | Test R²: {test_metrics['R²']:.4f}")
        print(f"  📊 Train MAE: {train_metrics['MAE']:,.0f} | Test MAE: {test_metrics['MAE']:,.0f}")
        
        # Calculate overfitting
        overfit = train_metrics['R²'] - test_metrics['R²']
        overfit_status = "✅ Good" if overfit < 0.05 else "⚠️  Slight" if overfit < 0.1 else "❌ High"
        print(f"  📊 Overfitting: {overfit:.4f} ({overfit_status})")
        
        # Add tuning info if available
        tuned = "✅ Yes" if name in tuning_results else "❌ No"
        tuning_time_total = tuning_results[name]['tuning_time'] if name in tuning_results else 0
        
        # Store results
        results.append({
            'Model': name.replace('_', ' '),
            'Tuned': tuned,
            'Train_R²': train_metrics['R²'],
            'Test_R²': test_metrics['R²'],
            'Train_MAE': train_metrics['MAE'],
            'Test_MAE': test_metrics['MAE'],
            'Train_RMSE': train_metrics['RMSE'],
            'Test_RMSE': test_metrics['RMSE'],
            'MAPE (%)': test_metrics['MAPE (%)'],
            'Adj_R²': test_metrics['Adj_R²'],
            'Training_Time (s)': training_time,
            'Tuning_Time (s)': tuning_time_total,
            'Total_Time (s)': training_time + tuning_time_total,
            'Overfit': overfit
        })
        
        # Save model (3 locations)
        # 1. Main directory (final trained model)
        model_path = f'{models_dir}/{name}.joblib'
        print(f"  💾 Saving final model...")
        joblib.dump(model, model_path, compress=3)
        print(f"  ✅ Saved: {model_path}")
        
        # 2. If this is a tuned model, also save in tuned directory
        if name in tuning_results:
            tuned_trained_path = f'{tuned_models_dir}/{name}_trained.joblib'
            joblib.dump(model, tuned_trained_path, compress=3)
            print(f"  ✅ Also saved in tuned directory: {tuned_trained_path}")
        
        
        
        # Store in memory for later use
        trained_models[name] = model
        
    except Exception as e:
        print(f"  ❌ Error training {name}: {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("\n" + "✅ " * 20)
print("ALL MODELS TRAINED & SAVED!")
print("✅ " * 20)


🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 
TRAINING ALL MODELS
🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 🚀 

Training data: 17,772 samples × 26 features
Test data:     4,449 samples × 26 features


[1/8] Training: Linear Regression
  ⏳ Fitting model...
  ✅ Training completed in 0.01s
  🔮 Making predictions...
  📊 Train R²: 0.4917 | Test R²: 0.4020
  📊 Train MAE: 1 | Test MAE: 1
  📊 Overfitting: 0.0897 (⚠️  Slight)
  💾 Saving final model...
  ✅ Saved: ../models/Linear_Regression.joblib

[2/8] Training: Ridge Regression
  ⏳ Fitting model...
  ✅ Training completed in 0.00s
  🔮 Making predictions...
  📊 Train R²: 0.4917 | Test R²: 0.4020
  📊 Train MAE: 1 | Test MAE: 1
  📊 Overfitting: 0.0897 (⚠️  Slight)
  💾 Saving final model...
  ✅ Saved: ../models/Ridge_Regression.joblib
  ✅ Also saved in tuned directory: ../models/tuned/Ridge_Regression_trained.joblib

[3/8] Training: Lasso Regression
  ⏳ Fitting model...
  ✅ Training completed in 1.43s
  🔮 Making predictions...
  📊 Train R²: 0.4904 | Test 

COMPARE ALL MODELS

In [None]:
print("\n" + "📊 " * 20)
print("MODEL COMPARISON")
print("📊 " * 20)

# Create comparison dataframe
results_df = pd.DataFrame(results)

# Sort by Test R² (descending)
results_df = results_df.sort_values('Test_R²', ascending=False).reset_index(drop=True)

# Display results
print("\n" + "="*120)
print("PERFORMANCE RANKING (sorted by Test R²)")
print("="*120)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
print(results_df.to_string(index=False))

# Save comparison table
comparison_path = f'{models_dir}/model_comparison.csv'
results_df.to_csv(comparison_path, index=False)
print(f"\n✅ Comparison table saved: {comparison_path}")


# ============================================================================
# CELL 18: TUNING RESULTS SUMMARY
# ============================================================================
if tuning_results:
    print("\n" + "🔬 " * 20)
    print("OPTUNA HYPERPARAMETER TUNING SUMMARY")
    print("🔬 " * 20)
    
    for model_name, info in tuning_results.items():
        print(f"\n{'='*70}")
        print(f"Model: {model_name.replace('_', ' ')}")
        print(f"{'='*70}")
        print(f"  Optimization Method: {info['method']}")
        print(f"  Number of Trials: {info['n_trials']}")
        print(f"  Tuning Time: {info['tuning_time']:.2f}s")
        print(f"  Best MAE: {info['best_value']:,.2f}")
        print(f"  Best Parameters:")
        for param, value in info['best_params'].items():
            if isinstance(value, float):
                print(f"    {param}: {value:.6f}")
            else:
                print(f"    {param}: {value}")
    
    # Save tuning results
    tuning_path = f'{models_dir}/optuna_tuning_results.json'
    with open(tuning_path, 'w') as f:
        # Convert to JSON-serializable format
        tuning_json = {}
        for k, v in tuning_results.items():
            tuning_json[k] = {
                'method': v['method'],
                'n_trials': int(v['n_trials']),
                'tuning_time': float(v['tuning_time']),
                'best_value': float(v['best_value']),
                'best_params': {pk: float(pv) if isinstance(pv, (np.integer, np.floating)) else pv 
                               for pk, pv in v['best_params'].items()}
            }
        json.dump(tuning_json, f, indent=2)
    
    print(f"\n✅ Optuna tuning results saved: {tuning_path}")


📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 
MODEL COMPARISON
📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 📊 

PERFORMANCE RANKING (sorted by Test R²)
             Model Tuned  Train_R²  Test_R²  Train_MAE  Test_MAE  Train_RMSE  Test_RMSE  MAPE (%)   Adj_R²  Training_Time (s)  Tuning_Time (s)  Total_Time (s)  Overfit
Stacking Regressor  ❌ No  0.830283 0.618791   0.640922  0.967225    0.847923   1.285388 18.567903 0.616550          48.454065         0.000000       48.454065 0.211491
           XGBoost ✅ Yes  0.801791 0.616517   0.692654  0.972739    0.916336   1.289217 18.664387 0.614262           0.310204        14.333030       14.643234 0.185274
 Gradient Boosting ✅ Yes  0.865069 0.615061   0.557159  0.966888    0.756048   1.291663 18.488343 0.612797           8.057933       291.990098      300.048032 0.250008
  Voting Regressor  ❌ No  0.806988 0.603537   0.697458  0.990823    0.904245   1.310854 19.158447 0.601206           9.509927         0.000000        9.509927 0.203450
     Random Forest 

SUMMARY STATISTICS

In [None]:
print("\n" + "="*70)
print("SUMMARY STATISTICS")
print("="*70)

best_r2_idx = results_df['Test_R²'].idxmax()
best_mae_idx = results_df['Test_MAE'].idxmin()
fastest_idx = results_df['Total_Time (s)'].idxmin()

print(f"\n🏆 Best Model (Test R²):")
print(f"   {results_df.loc[best_r2_idx, 'Model']}")
print(f"   Tuned: {results_df.loc[best_r2_idx, 'Tuned']}")
print(f"   R² = {results_df.loc[best_r2_idx, 'Test_R²']:.4f}")
print(f"   MAE = {results_df.loc[best_r2_idx, 'Test_MAE']:,.2f}")
print(f"   RMSE = {results_df.loc[best_r2_idx, 'Test_RMSE']:,.2f}")

print(f"\n🎯 Best Model (Test MAE):")
print(f"   {results_df.loc[best_mae_idx, 'Model']}")
print(f"   Tuned: {results_df.loc[best_mae_idx, 'Tuned']}")
print(f"   MAE = {results_df.loc[best_mae_idx, 'Test_MAE']:,.2f}")
print(f"   R² = {results_df.loc[best_mae_idx, 'Test_R²']:.4f}")

print(f"\n⚡ Fastest Model:")
print(f"   {results_df.loc[fastest_idx, 'Model']}")
print(f"   Total Time = {results_df.loc[fastest_idx, 'Total_Time (s)']:.2f}s")
print(f"   R² = {results_df.loc[fastest_idx, 'Test_R²']:.4f}")

print(f"\n📊 Average Metrics:")
print(f"   Test R²:       {results_df['Test_R²'].mean():.4f}")
print(f"   Test MAE:      {results_df['Test_MAE'].mean():,.2f}")
print(f"   Test RMSE:     {results_df['Test_RMSE'].mean():,.2f}")
print(f"   Training Time: {results_df['Training_Time (s)'].mean():.2f}s")
print(f"   Tuning Time:   {results_df['Tuning_Time (s)'].mean():.2f}s")
print(f"   Total Time:    {results_df['Total_Time (s)'].mean():.2f}s")



SUMMARY STATISTICS

🏆 Best Model (Test R²):
   Stacking Regressor
   Tuned: ❌ No
   R² = 0.6188
   MAE = 0.97
   RMSE = 1.29

🎯 Best Model (Test MAE):
   Gradient Boosting
   Tuned: ✅ Yes
   MAE = 0.97
   R² = 0.6151

⚡ Fastest Model:
   Linear Regression
   Total Time = 0.01s
   R² = 0.4020

📊 Average Metrics:
   Test R²:       0.5334
   Test MAE:      1.06
   Test RMSE:     1.41
   Training Time: 8.63s
   Tuning Time:   49.28s
   Total Time:    57.91s


DETAILED BEST MODEL ANALYSIS

In [None]:
print("\n" + "🏆 " * 20)
print("BEST MODEL DETAILED ANALYSIS")
print("🏆 " * 20)

# Get best model (by Test R²)
best_model_name = results_df.iloc[0]['Model']
best_model_key = best_model_name.replace(' ', '_')
best_model = trained_models[best_model_key]

print(f"\n🥇 Best Model: {best_model_name}")
if best_model_key in tuning_results:
    print(f"   ✅ Hyperparameter Tuning: Optuna")
    print(f"   ✅ Trials: {tuning_results[best_model_key]['n_trials']}")
    print(f"   ✅ Tuning Time: {tuning_results[best_model_key]['tuning_time']:.2f}s")

# Detailed metrics
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

train_metrics = evaluate_model(y_train, y_train_pred, f"{best_model_name} (Train)")
test_metrics = evaluate_model(y_test, y_test_pred, f"{best_model_name} (Test)")

print_metrics(train_metrics)
print_metrics(test_metrics)

# Feature importance (if available)
if hasattr(best_model, 'feature_importances_'):
    print("\n" + "="*70)
    print("TOP 15 MOST IMPORTANT FEATURES")
    print("="*70)
    
    feature_importance = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    print(feature_importance.head(15).to_string(index=False))
    
    # Save feature importance
    feature_importance.to_csv(f'{models_dir}/feature_importance_{best_model_key}.csv', index=False)
    print(f"\n✅ Feature importance saved: {models_dir}/feature_importance_{best_model_key}.csv")


# ============================================================================
# CELL 21: SAVE BEST MODEL METADATA
# ============================================================================
print("\n" + "💾 " * 20)
print("SAVING METADATA")
print("💾 " * 20)

best_row = results_df.iloc[0]

metadata = {
    'best_model_name': best_model_name,
    'best_model_file': f'{best_model_key}.joblib',
    'hyperparameter_tuning': {
        'method': 'Optuna (TPE Sampler)',
        'tuned': best_model_key in tuning_results,
        'n_trials': tuning_results[best_model_key]['n_trials'] if best_model_key in tuning_results else 0,
        'best_params': tuning_results[best_model_key]['best_params'] if best_model_key in tuning_results else {}
    },
    'test_r2': float(best_row['Test_R²']),
    'test_mae': float(best_row['Test_MAE']),
    'test_rmse': float(best_row['Test_RMSE']),
    'mape': float(best_row['MAPE (%)']),
    'adj_r2': float(best_row['Adj_R²']),
    'train_r2': float(best_row['Train_R²']),
    'overfitting': float(best_row['Overfit']),
    'training_time': float(best_row['Training_Time (s)']),
    'tuning_time': float(best_row['Tuning_Time (s)']),
    'total_time': float(best_row['Total_Time (s)']),
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'n_features': int(X_train.shape[1]),
    'n_train_samples': int(X_train.shape[0]),
    'n_test_samples': int(X_test.shape[0]),
    'feature_names': list(X_train.columns),
    'all_models_trained': [m.replace('_', ' ') for m in trained_models.keys()]
}

metadata_path = f'{models_dir}/best_model_metadata.json'
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)

print(f"✅ Metadata saved: {metadata_path}")


🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 
BEST MODEL DETAILED ANALYSIS
🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 🏆 

🥇 Best Model: Stacking Regressor

📊 Stacking Regressor (Train) Performance
  MAE (Mean Absolute Error):                   0.64
  RMSE (Root Mean Squared Error):              0.85
  R² Score:                                  0.8303
  Adjusted R²:                               0.8300
  MAPE (Mean Abs Percentage Error):           11.93%

📊 Stacking Regressor (Test) Performance
  MAE (Mean Absolute Error):                   0.97
  RMSE (Root Mean Squared Error):              1.29
  R² Score:                                  0.6188
  Adjusted R²:                               0.6166
  MAPE (Mean Abs Percentage Error):           18.57%

💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 
SAVING METADATA
💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 💾 
✅ Metadata saved: ../models/best_model_metadata.json


In [18]:
# Tạo dữ liệu test giả
test_input = pd.DataFrame([{
    'Address': 'Test Address, Quận 1, Hồ Chí Minh',
    'Area': 100.0,
    'Frontage': 5.0,
    'Access Road': 4.0,
    'Floors': 3.0,
    'Bedrooms': 3.0,
    'Bathrooms': 2.0,
    'Legal status': 'Have Certificate',
    'Furniture state': 'Full',
    'Price': 0.0,
    'City': 'Hồ Chí Minh',
    'District': 'Quận 1',
    'House direction': 'East',
    'Balcony direction': 'South',
    'Has_Frontage': 1,
    'Has_Access_Road': 1,
    'Has_House_Direction': 1,
    'Has_Balcony_Direction': 1
}])

print("\n" + "="*80)
print("TEST PREPROCESSING")
print("="*80)

# Transform
X_test = preprocessor.transform(test_input)

print(f"\n✅ Transform successful!")
print(f"   Shape: {X_test.shape}")
print(f"   Columns: {list(X_test.columns)}")
print(f"   Feature count: {len(X_test.columns)}")
print(f"   Expected: {len(metadata['feature_names'])}")

if list(X_test.columns) == metadata['feature_names']:
    print("✅ Feature order PERFECT!")
else:
    print("⚠️  Feature order mismatch!")

# Load model và predict
model = joblib.load('../models/Stacking_Regressor.joblib')
prediction = model.predict(X_test.to_numpy())[0]

print(f"\n✅ Prediction successful: {prediction:.2f} tỷ")


TEST PREPROCESSING

🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 
PREPROCESSING PIPELINE - TEST/NEW DATA
🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 🔧 
✓ Index reset for safe processing

STEP 1: STRING NORMALIZATION
City unique values: 1
District unique values: 1

STEP 2: REMOVE LEAKAGE FEATURES

STEP 3: HANDLE MISSING VALUES

STEP 6: FEATURE ENGINEERING
✓ Created 13 new features

STEP 7: ENCODE CATEGORICAL FEATURES
  ✓ District: Applied stored encoding (test)
  ✓ City: Applied stored encoding (test)
  ✓ Legal status: Applied stored encoding (test)
  ✓ Furniture state: Applied stored encoding (test)
  ✓ Location_Tier: Applied stored encoding (test)

STEP 8: SCALE NUMERIC FEATURES


ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.
