In [2]:
# #try random forest

# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# # Define file paths. Assuming files are in the current working directory.
# TARGET_COLUMN = 'HotelValue'
# train_data_path = 'train.csv'
# test_data_path = 'test.csv'


# # --- 1. Data Loading and Initial Setup ---
# print("--- 1. Data Loading and Initial Setup ---")
# # Attempt to load files (using a robust method to handle potential path changes)
# try:
#     df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
#     df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
# except FileNotFoundError:
#     df_train = pd.read_csv(train_data_path)
#     df_test = pd.read_csv(test_data_path)

# X_train = df_train.drop(columns=[TARGET_COLUMN])
# y_train = df_train[TARGET_COLUMN]
# X_test = df_test.copy()

# test_ids = X_test['Id']

# # Columns to drop due to irrelevance or excessive missing values
# columns_to_drop = [
#     'Id',
#     'PoolQuality',       # Excessive missing data
#     'BoundaryFence',     # Excessive missing data
#     'ExtraFacility',     # Excessive missing data
#     'ServiceLaneType',   # Excessive missing data
# ]

# X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
# X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

# # --- 2. Target Transformation ---
# # Log-transform the target variable
# y_train_log = np.log1p(y_train)


# # --- 3. Feature Engineering ---
# def engineer_features(df):
#     df = df.copy()
    
#     # 3.1 Age features
#     df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
#     # Use max(ConstructionYear, RenovationYear) to get the most recent date
#     df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)

#     # 3.2 Interaction feature
#     df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
#     # 3.3 Log transform selected skewed numerical features
#     for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
#         if col in df.columns:
#             # Impute with 0 for log transformation, then transform
#             temp_df = df[col].fillna(0)
#             df[col + '_Log'] = np.log1p(temp_df)

#     # Drop original year-related columns to avoid multicollinearity
#     df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    
#     return df

# X_train_fe = engineer_features(X_train)
# X_test_fe = engineer_features(X_test)


# # --- 4. Feature Classification and Preprocessing Pipelines ---
# numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
# categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

# # Numerical Transformer: Impute missing with median
# numerical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median'))
#     # No scaling is needed for Random Forest
# ])

# # Categorical Transformer: Impute missing with 'None' string, then One-Hot Encode
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_features),
#         ('cat', categorical_transformer, categorical_features)
#     ],
#     remainder='drop'
# )


# # --- 5. Model Training (Random Forest Regressor) ---

# # Using Random Forest for powerful non-linear modeling
# rf_model_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     # n_jobs=-1 uses all available cores. max_depth controls complexity.
#     ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=15))
# ])

# rf_model_pipeline.fit(X_train_fe, y_train_log)


# # --- 6. Prediction and Submission File Creation ---
# y_test_log_pred = rf_model_pipeline.predict(X_test_fe)

# # Reverse log-transformation to get predictions in the original dollar scale
# y_test_pred = np.expm1(y_test_log_pred)
# y_test_pred[y_test_pred < 0] = 0 # Ensure non-negative predictions

# # Create and save the submission DataFrame
# submission_df = pd.DataFrame({
#     'Id': test_ids,
#     TARGET_COLUMN: y_test_pred
# })

# submission_file_name = 'random_forest_submission.csv'
# submission_df.to_csv(submission_file_name, index=False)
# print(f"Submission file saved as '{submission_file_name}'")

--- 1. Data Loading and Initial Setup ---
Submission file saved as 'random_forest_submission.csv'


In [7]:
# Lasso regression (LassoCV); the polynomial-feature step is currently commented out

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error # <-- CHANGE HERE

# Target column name and the bare file names used by the working-directory fallback.
TARGET_COLUMN = 'HotelValue'
train_data_path = 'train.csv'
test_data_path = 'test.csv'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    # Preferred location: the dataset folder relative to the working directory.
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
except FileNotFoundError:
    # Either file missing from the folder: reload BOTH from the working directory.
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
    except FileNotFoundError as missing:
        print("Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {missing}")
        # Nothing to work with: surface the original error to stop the cell.
        raise
    else:
        print("Loaded data from current directory.")
else:
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")

# Split the training frame into predictors / target; work on a copy of the test frame.
X_train = df_train.drop(columns=[TARGET_COLUMN])
y_train = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']  # taken before 'Id' is dropped below; reused for the submission file

# Columns removed from both splits before modelling.
columns_to_drop = [
    'Id',
    'PoolQuality',
    'BoundaryFence',
    'ExtraFacility',
    'ServiceLaneType',
]
# errors='ignore' skips any listed column that a frame does not contain.
X_train, X_test = (
    frame.drop(columns=columns_to_drop, errors='ignore')
    for frame in (X_train, X_test)
)

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
# The model is fitted on log1p(target); predictions are mapped back with expm1 later.
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    """Return a new frame with derived age, interaction and log features.

    Adds ``HouseAge`` (YearSold - ConstructionYear), ``YearsSinceModification``
    (YearSold - the later of ConstructionYear / RenovationYear) and
    ``QualityArea`` (OverallQuality * UsableArea). For each listed skewed column
    that is present, appends ``<col>_Log`` = log1p of the column with missing
    values treated as 0. Finally drops the raw year/month columns.

    The input frame is left unmodified.
    """
    skewed_columns = ('RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea')
    date_columns = ['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold']

    year_sold = df['YearSold']
    last_modified = df[['ConstructionYear', 'RenovationYear']].max(axis=1)

    # Only columns that actually exist get a log companion; the original column is kept.
    log_features = {
        f'{name}_Log': np.log1p(df[name].fillna(0))
        for name in skewed_columns
        if name in df.columns
    }

    # assign() returns a new frame and appends the keyword columns in the order given.
    enriched = df.assign(
        HouseAge=year_sold - df['ConstructionYear'],
        YearsSinceModification=year_sold - last_modified,
        QualityArea=df['OverallQuality'] * df['UsableArea'],
        **log_features,
    )
    return enriched.drop(columns=date_columns, errors='ignore')

# Apply identical feature engineering to both splits.
X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

# Column groups are derived from the dtypes of the engineered training frame only.
numerical_features = list(X_train_fe.select_dtypes(include=np.number).columns)
categorical_features = list(X_train_fe.select_dtypes(include=['object', 'category']).columns)

print(
    "\n--- 3. Feature Engineering Complete ---",
    f"Number of numerical features: {len(numerical_features)}",
    f"Number of categorical features: {len(categorical_features)}",
    f"Final training features shape: {X_train_fe.shape}",
    sep="\n",
)

# --- 4. Preprocessing Pipelines ---

# Numerical transformer: fill missing values with the column median, then standardize.
# NOTE: no polynomial terms are generated -- the PolynomialFeatures step below is
# commented out (the submission file name further down still says "poly").
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
    # ('poly', PolynomialFeatures(degree=2, include_bias=False))  # disabled
])


# Categorical transformer: replace missing values with the literal string 'None',
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its transformer; remainder='drop' discards any
# column that is in neither numerical_features nor categorical_features.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)


# --- 5. Model Training (Lasso Regression with Cross-Validation) ---
print("\n--- 5. Model Training (LassoCV) ---")
# Preprocessing and the regressor live in one pipeline so both are fitted together.
lasso_model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LassoCV(cv=5, random_state=42, max_iter=10000)),
    ]
)
lasso_model_pipeline.fit(X_train_fe, y_train_log)
print("Model training complete.")

# In-sample check only: these numbers are computed on the data the model was fitted on.
y_train_log_pred = lasso_model_pipeline.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)  # undo the log1p applied to the target
y_train_pred = np.where(y_train_pred < 0, 0.0, y_train_pred)  # floor back-transformed values at 0

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(
    f"LassoCV Optimal Alpha: {lasso_model_pipeline['regressor'].alpha_:.6f}",
    f"Training RMSE (Original Scale): {rmse_train:,.2f}",
    f"Training R-squared: {r2_train:.4f}",
    sep="\n",
)


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = lasso_model_pipeline.predict(X_test_fe)

# Map predictions back from log1p space and floor them at 0.
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred = np.where(y_test_pred < 0, 0.0, y_test_pred)

# One row per test Id, in the order of the loaded test file.
submission_filename = 'lasso_poly_submission.csv'
submission_df = pd.DataFrame({'Id': test_ids, TARGET_COLUMN: y_test_pred})
submission_df.to_csv(submission_filename, index=False)

print("Prediction process complete.")
print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Training data shape: (1200, 75)
Test data shape: (260, 75)

--- 3. Feature Engineering Complete ---
Number of numerical features: 40
Number of categorical features: 39
Final training features shape: (1200, 79)

--- 5. Model Training (LassoCV) ---
Model training complete.
LassoCV Optimal Alpha: 0.000638
Training RMSE (Original Scale): 22,876.82
Training R-squared: 0.9131

--- 6. Prediction & Submission ---
Prediction process complete.
Submission file 'lasso_poly_submission.csv' created with 260 predictions.
First 5 test predictions:
     Id     HotelValue
0   893  151109.234540
1  1106  327772.164773
2   414  105405.649409
3   523  155654.118290
4  1037  320053.072460


In [12]:
#lasso + cleaned data

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

# Target column name and the bare file names used by the working-directory fallback.
TARGET_COLUMN = 'HotelValue'
train_data_path = 'train.csv'
test_data_path = 'test.csv'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    # Preferred location: the dataset folder relative to the working directory.
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
except FileNotFoundError:
    # Either file missing from the folder: reload BOTH from the working directory.
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
    except FileNotFoundError as missing:
        print("Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {missing}")
        # Nothing to work with: surface the original error to stop the cell.
        raise
    else:
        print("Loaded data from current directory.")
else:
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")

# Keep the unfiltered predictors / target around so the row filter below can be re-run.
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal (New Step) ---
# Training rows are filtered on the target value and on UsableArea / OverallQuality.
# NOTE: despite its name, `outlier_mask` is True for the rows that are KEPT.
initial_row_count = len(df_train)

# 1. Target-based filter: keep rows whose target lies within the 0.1%..99.9% quantile range.
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = y_train_raw.between(y_lower_bound, y_upper_bound)

# 2. Predictor-based filters (applied only when the columns exist).
if 'UsableArea' in X_train_raw.columns:
    # Keep rows with UsableArea strictly below 4000.
    # NOTE(review): a missing UsableArea compares False here, so such rows are dropped too.
    outlier_mask = outlier_mask & X_train_raw['UsableArea'].lt(4000)

    if 'OverallQuality' in X_train_raw.columns:
        # Drop rows that combine OverallQuality < 5 with UsableArea > 3000.
        outlier_mask = outlier_mask & ~(
            X_train_raw['OverallQuality'].lt(5) & X_train_raw['UsableArea'].gt(3000)
        )

# Same boolean mask for predictors and target keeps the two aligned.
X_train = X_train_raw.loc[outlier_mask].copy()
y_train = y_train_raw.loc[outlier_mask].copy()

# Test rows are never filtered, so this is simply the test Id column again.
test_ids_cleaned = X_test['Id']

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# Columns removed from both splits before modelling.
columns_to_drop = [
    'Id',
    'PoolQuality',
    'BoundaryFence',
    'ExtraFacility',
    'ServiceLaneType',
    'BasementHalfBaths',
    'LowQualityArea',
]
# errors='ignore' skips any listed column that a frame does not contain.
X_train, X_test = (
    frame.drop(columns=columns_to_drop, errors='ignore')
    for frame in (X_train, X_test)
)

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
# The model is fitted on log1p(target); predictions are mapped back with expm1 later.
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    """Return a new frame with derived age, interaction and log features.

    Adds ``HouseAge`` (YearSold - ConstructionYear), ``YearsSinceModification``
    (YearSold - the later of ConstructionYear / RenovationYear) and
    ``QualityArea`` (OverallQuality * UsableArea). For each listed skewed column
    that is present, appends ``<col>_Log`` = log1p of the column with missing
    values treated as 0. Finally drops the raw year/month columns.

    The input frame is left unmodified.
    """
    skewed_columns = ('RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea')
    date_columns = ['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold']

    year_sold = df['YearSold']
    last_modified = df[['ConstructionYear', 'RenovationYear']].max(axis=1)

    # Only columns that actually exist get a log companion; the original column is kept.
    log_features = {
        f'{name}_Log': np.log1p(df[name].fillna(0))
        for name in skewed_columns
        if name in df.columns
    }

    # assign() returns a new frame and appends the keyword columns in the order given.
    enriched = df.assign(
        HouseAge=year_sold - df['ConstructionYear'],
        YearsSinceModification=year_sold - last_modified,
        QualityArea=df['OverallQuality'] * df['UsableArea'],
        **log_features,
    )
    return enriched.drop(columns=date_columns, errors='ignore')

# Apply identical feature engineering to both splits.
X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

# Column groups are derived from the dtypes of the engineered training frame only.
numerical_features = list(X_train_fe.select_dtypes(include=np.number).columns)
categorical_features = list(X_train_fe.select_dtypes(include=['object', 'category']).columns)

print(
    "\n--- 3. Feature Engineering Complete ---",
    f"Number of numerical features: {len(numerical_features)}",
    f"Number of categorical features: {len(categorical_features)}",
    f"Final training features shape: {X_train_fe.shape}",
    sep="\n",
)

# --- 4. Preprocessing Pipelines ---

# Numerical transformer: fill missing values with the column median, then standardize.
# NOTE: no polynomial terms are generated -- the PolynomialFeatures step below is
# commented out.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
    # Degree-2 polynomial expansion, disabled. An earlier note here says it was
    # turned off to speed up fitting; re-enable by uncommenting the next line.
    # ('poly', PolynomialFeatures(degree=2, include_bias=False)) 
])


# Categorical transformer: replace missing values with the literal string 'None',
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its transformer; remainder='drop' discards any
# column that is in neither numerical_features nor categorical_features.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)


# --- 5. Model Training (Lasso Regression with Cross-Validation) ---
print("\n--- 5. Model Training (LassoCV) ---")
# Preprocessing and the regressor live in one pipeline so both are fitted together.
lasso_model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LassoCV(cv=5, random_state=42, max_iter=10000)),
    ]
)
lasso_model_pipeline.fit(X_train_fe, y_train_log)
print("Model training complete.")

# In-sample check only: these numbers are computed on the (outlier-filtered) rows
# the model was fitted on.
y_train_log_pred = lasso_model_pipeline.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)  # undo the log1p applied to the target
y_train_pred = np.where(y_train_pred < 0, 0.0, y_train_pred)  # floor back-transformed values at 0

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(
    f"LassoCV Optimal Alpha: {lasso_model_pipeline['regressor'].alpha_:.6f}",
    f"Training RMSE (Original Scale): {rmse_train:,.2f}",
    f"Training R-squared: {r2_train:.4f}",
    sep="\n",
)


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = lasso_model_pipeline.predict(X_test_fe)

# Map predictions back from log1p space and floor them at 0.
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred = np.where(y_test_pred < 0, 0.0, y_test_pred)

# One row per test Id, in the order of the loaded test file.
submission_filename = 'lasso_submission_cleaned.csv'  # name marks the outlier-filtered run
submission_df = pd.DataFrame({'Id': test_ids_cleaned, TARGET_COLUMN: y_test_pred})
submission_df.to_csv(submission_filename, index=False)

print("Prediction process complete.")
print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6
Cleaned training data shape: (1194, 73)
Test data shape: (260, 73)

--- 3. Feature Engineering Complete ---
Number of numerical features: 38
Number of categorical features: 39
Final training features shape: (1194, 77)

--- 5. Model Training (LassoCV) ---
Model training complete.
LassoCV Optimal Alpha: 0.000646
Training RMSE (Original Scale): 17,477.56
Training R-squared: 0.9453

--- 6. Prediction & Submission ---
Prediction process complete.
Submission file 'lasso_submission_cleaned.csv' created with 260 predictions.
First 5 test predictions:
     Id     HotelValue
0   893  151947.354104
1  1106  327944.333030
2   414  104910.258353
3   523  150776.861201
4  1037  313606.712862


In [11]:
# # Try Elastic Net (ElasticNetCV) in place of Lasso. NOTE: no separate maximum-likelihood step is implemented in this cell.

# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# # CHANGED: Import ElasticNetCV instead of LassoCV
# from sklearn.linear_model import ElasticNetCV
# from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

# # Define file paths
# train_data_path = 'train.csv'
# test_data_path = 'test.csv'
# TARGET_COLUMN = 'HotelValue'

# # --- 1. Data Loading and Initial Setup ---
# print("--- 1. Data Loading ---")
# try:
#     # Attempt to load from a common nested folder structure
#     df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
#     df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
#     print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
# except FileNotFoundError:
#     # Fallback to the current directory
#     try:
#         df_train = pd.read_csv(train_data_path)
#         df_test = pd.read_csv(test_data_path)
#         print("Loaded data from current directory.")
#     except FileNotFoundError as e:
#         print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
#         print(f"Details: {e}")
#         # Exit or raise error if data can't be loaded
#         raise

# # Separate features and target before dropping/cleaning
# X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
# y_train_raw = df_train[TARGET_COLUMN]
# X_test = df_test.copy()
# test_ids = X_test['Id']

# print(f"Initial training data shape: {X_train_raw.shape}")


# # --- 1.5 Outlier Removal ---
# # Remove samples based on Target Value and key Predictor columns.
# initial_row_count = len(df_train)

# # 1. Target-based cleaning: Remove extreme values (e.g., bottom 0.1% and top 0.1% of prices)
# y_lower_bound = y_train_raw.quantile(0.001)
# y_upper_bound = y_train_raw.quantile(0.999)
# outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

# # 2. Predictor-based cleaning (Common for this type of dataset)
# # Remove properties with extremely large UsableArea (e.g., > 4000 sq ft)
# if 'UsableArea' in X_train_raw.columns:
#     outlier_mask &= (X_train_raw['UsableArea'] < 4000)

# # Remove properties with poor OverallQuality and high UsableArea (often errors)
# if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
#     outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

# # Apply the mask to both features and target
# X_train = X_train_raw[outlier_mask].copy()
# y_train = y_train_raw[outlier_mask].copy()

# # Note: test_ids_cleaned is used for the submission df but X_test is not filtered
# test_ids_cleaned = X_test['Id'] 

# print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# # Columns to drop (based on the original code's specification)
# columns_to_drop = [
#     'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 'BasementHalfBaths', 'LowQualityArea'
# ]
# X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
# X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

# print(f"Cleaned training data shape: {X_train.shape}")
# print(f"Test data shape: {X_test.shape}")

# # --- 2. Target Transformation ---
# y_train_log = np.log1p(y_train)


# # --- 3. Feature Engineering ---
# def engineer_features(df):
#     df = df.copy()
#     # Time-based features
#     df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
#     df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
#     # Interaction feature
#     df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
#     # Log transformation for skewed numerical features
#     for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
#         if col in df.columns:
#             temp_df = df[col].fillna(0)
#             df[col + '_Log'] = np.log1p(temp_df)
    
#     # Drop source columns used for feature engineering
#     df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
#     return df

# X_train_fe = engineer_features(X_train)
# X_test_fe = engineer_features(X_test)

# numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
# categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

# print("\n--- 3. Feature Engineering Complete ---")
# print(f"Number of numerical features: {len(numerical_features)}")
# print(f"Number of categorical features: {len(categorical_features)}")
# print(f"Final training features shape: {X_train_fe.shape}")

# # --- 4. Preprocessing Pipelines ---

# # Numerical Transformer: Impute, Scale, and add Polynomial Features
# numerical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler()), 
#     # ('poly', PolynomialFeatures(degree=2, include_bias=False)) # Kept commented for speed
# ])


# # Categorical Transformer: Impute and One-Hot Encode
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_features),
#         ('cat', categorical_transformer, categorical_features)
#     ],
#     remainder='drop'
# )


# # --- 5. Model Training (Elastic Net Regression with Cross-Validation) ---
# print("\n--- 5. Model Training (ElasticNetCV) ---")
# # CHANGED: Use ElasticNetCV. l1_ratio defines the mix between L1 (Lasso) and L2 (Ridge).
# # We search over a range of ratios to find the best balance.
# elastic_net_model_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('regressor', ElasticNetCV(
#         l1_ratio=[.1, .5, .7, .9, .95, .99, 1], # Search for optimal L1/L2 mix
#         cv=5, 
#         random_state=42, 
#         max_iter=10000
#     )) 
# ])

# elastic_net_model_pipeline.fit(X_train_fe, y_train_log)
# print("Model training complete.")

# # Optional: Check model performance on training data
# y_train_log_pred = elastic_net_model_pipeline.predict(X_train_fe)
# y_train_pred = np.expm1(y_train_log_pred)
# y_train_pred[y_train_pred < 0] = 0 # Ensure no negative values after reverse transformation

# rmse_train = root_mean_squared_error(y_train, y_train_pred)
# r2_train = r2_score(y_train, y_train_pred)

# print(f"ElasticNetCV Optimal Alpha: {elastic_net_model_pipeline['regressor'].alpha_:.6f}")
# print(f"ElasticNetCV Optimal L1 Ratio: {elastic_net_model_pipeline['regressor'].l1_ratio_:.2f}")
# print(f"Training RMSE (Original Scale): {rmse_train:,.2f}")
# print(f"Training R-squared: {r2_train:.4f}")


# # --- 6. Prediction and Submission File Creation ---
# print("\n--- 6. Prediction & Submission ---")
# y_test_log_pred = elastic_net_model_pipeline.predict(X_test_fe)

# # Reverse log-transformation
# y_test_pred = np.expm1(y_test_log_pred)
# y_test_pred[y_test_pred < 0] = 0 # Final check to ensure non-negative values

# submission_df = pd.DataFrame({
#     'Id': test_ids_cleaned,
#     TARGET_COLUMN: y_test_pred
# })

# submission_filename = 'elasticnet_submission_cleaned.csv' # Changed filename
# submission_df.to_csv(submission_filename, index=False)

# print("Prediction process complete.")
# print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
# print("First 5 test predictions:")
# print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6
Cleaned training data shape: (1194, 73)
Test data shape: (260, 73)

--- 3. Feature Engineering Complete ---
Number of numerical features: 38
Number of categorical features: 39
Final training features shape: (1194, 77)

--- 5. Model Training (ElasticNetCV) ---
Model training complete.
ElasticNetCV Optimal Alpha: 0.000646
ElasticNetCV Optimal L1 Ratio: 1.00
Training RMSE (Original Scale): 17,477.56
Training R-squared: 0.9453

--- 6. Prediction & Submission ---
Prediction process complete.
Submission file 'elasticnet_submission_cleaned.csv' created with 260 predictions.
First 5 test predictions:
     Id     HotelValue
0   893  151947.354104
1  1106  327944.333030
2   414  104910.258353
3   523  150776.861201
4  1037  313606.712862
