In [2]:
# #try random forest

# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# # Define file paths. Assuming files are in the current working directory.
# TARGET_COLUMN = 'HotelValue'
# train_data_path = 'train.csv'
# test_data_path = 'test.csv'


# # --- 1. Data Loading and Initial Setup ---
# print("--- 1. Data Loading and Initial Setup ---")
# # Attempt to load files (using a robust method to handle potential path changes)
# try:
#     df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
#     df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
# except FileNotFoundError:
#     df_train = pd.read_csv(train_data_path)
#     df_test = pd.read_csv(test_data_path)

# X_train = df_train.drop(columns=[TARGET_COLUMN])
# y_train = df_train[TARGET_COLUMN]
# X_test = df_test.copy()

# test_ids = X_test['Id']

# # Columns to drop due to irrelevance or excessive missing values
# columns_to_drop = [
#     'Id',
#     'PoolQuality',       # Excessive missing data
#     'BoundaryFence',     # Excessive missing data
#     'ExtraFacility',     # Excessive missing data
#     'ServiceLaneType',   # Excessive missing data
# ]

# X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
# X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

# # --- 2. Target Transformation ---
# # Log-transform the target variable
# y_train_log = np.log1p(y_train)


# # --- 3. Feature Engineering ---
# def engineer_features(df):
#     df = df.copy()
    
#     # 3.1 Age features
#     df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
#     # Use max(ConstructionYear, RenovationYear) to get the most recent date
#     df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)

#     # 3.2 Interaction feature
#     df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
#     # 3.3 Log transform selected skewed numerical features
#     for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
#         if col in df.columns:
#             # Impute with 0 for log transformation, then transform
#             temp_df = df[col].fillna(0)
#             df[col + '_Log'] = np.log1p(temp_df)

#     # Drop original year-related columns to avoid multicollinearity
#     df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    
#     return df

# X_train_fe = engineer_features(X_train)
# X_test_fe = engineer_features(X_test)


# # --- 4. Feature Classification and Preprocessing Pipelines ---
# numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
# categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

# # Numerical Transformer: Impute missing with median
# numerical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median'))
#     # No scaling is needed for Random Forest
# ])

# # Categorical Transformer: Impute missing with 'None' string, then One-Hot Encode
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_features),
#         ('cat', categorical_transformer, categorical_features)
#     ],
#     remainder='drop'
# )


# # --- 5. Model Training (Random Forest Regressor) ---

# # Using Random Forest for powerful non-linear modeling
# rf_model_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     # n_jobs=-1 uses all available cores. max_depth controls complexity.
#     ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=15))
# ])

# rf_model_pipeline.fit(X_train_fe, y_train_log)


# # --- 6. Prediction and Submission File Creation ---
# y_test_log_pred = rf_model_pipeline.predict(X_test_fe)

# # Reverse log-transformation to get predictions in the original dollar scale
# y_test_pred = np.expm1(y_test_log_pred)
# y_test_pred[y_test_pred < 0] = 0 # Ensure non-negative predictions

# # Create and save the submission DataFrame
# submission_df = pd.DataFrame({
#     'Id': test_ids,
#     TARGET_COLUMN: y_test_pred
# })

# submission_file_name = 'random_forest_submission.csv'
# submission_df.to_csv(submission_file_name, index=False)
# print(f"Submission file saved as '{submission_file_name}'")

--- 1. Data Loading and Initial Setup ---
Submission file saved as 'random_forest_submission.csv'


In [7]:
#lasoo + polynomial regression

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error # <-- CHANGE HERE

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    # Attempt to load from a common nested folder structure
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    # Fallback to the current directory
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        # Exit or raise error if data can't be loaded
        raise

X_train = df_train.drop(columns=[TARGET_COLUMN])
y_train = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

# Columns to drop (based on the original code's specification)
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType'
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    # Time-based features
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    # Interaction feature
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    # Log transformation for skewed numerical features
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    # Drop source columns used for feature engineering
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines ---

# Numerical Transformer: Impute, Scale, and add Polynomial Features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
    # ('poly', PolynomialFeatures(degree=2, include_bias=False)) # <--- COMMENT THIS OUT
])


# Categorical Transformer: Impute and One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)


# --- 5. Model Training (Lasso Regression with Cross-Validation) ---
print("\n--- 5. Model Training (LassoCV) ---")
lasso_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LassoCV(cv=5, random_state=42, max_iter=10000)) 
])

lasso_model_pipeline.fit(X_train_fe, y_train_log)
print("Model training complete.")

# Optional: Check model performance on training data
y_train_log_pred = lasso_model_pipeline.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0 # Ensure no negative values after reverse transformation

# Change the function call:
rmse_train = root_mean_squared_error(y_train, y_train_pred) # <-- FIX: Use root_mean_squared_error
r2_train = r2_score(y_train, y_train_pred)

print(f"LassoCV Optimal Alpha: {lasso_model_pipeline['regressor'].alpha_:.6f}")
print(f"Training RMSE (Original Scale): {rmse_train:,.2f}")
print(f"Training R-squared: {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = lasso_model_pipeline.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 # Final check to ensure non-negative values

submission_df = pd.DataFrame({
    'Id': test_ids,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'lasso_poly_submission.csv'
submission_df.to_csv(submission_filename, index=False)

print("Prediction process complete.")
print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Training data shape: (1200, 75)
Test data shape: (260, 75)

--- 3. Feature Engineering Complete ---
Number of numerical features: 40
Number of categorical features: 39
Final training features shape: (1200, 79)

--- 5. Model Training (LassoCV) ---
Model training complete.
LassoCV Optimal Alpha: 0.000638
Training RMSE (Original Scale): 22,876.82
Training R-squared: 0.9131

--- 6. Prediction & Submission ---
Prediction process complete.
Submission file 'lasso_poly_submission.csv' created with 260 predictions.
First 5 test predictions:
     Id     HotelValue
0   893  151109.234540
1  1106  327772.164773
2   414  105405.649409
3   523  155654.118290
4  1037  320053.072460


In [10]:
#lasso + cleaned data

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    # Attempt to load from a common nested folder structure
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    # Fallback to the current directory
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        # Exit or raise error if data can't be loaded
        raise

# Separate features and target before dropping/cleaning
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal (New Step) ---
# Remove samples based on Target Value (extremely low/high values)
# and based on large/extreme values in key predictor columns (UsableArea and OverallQuality).
initial_row_count = len(df_train)

# 1. Target-based cleaning: Remove extreme values (e.g., bottom 0.1% and top 0.1% of prices)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

# 2. Predictor-based cleaning (Common for this type of dataset)
# Remove properties with extremely large UsableArea (e.g., > 4000 sq ft)
if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)

# Remove properties with poor OverallQuality and high UsableArea (often errors)
if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

# Apply the mask to both features and target
X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()

# Sync test_ids for the remaining rows
test_ids_cleaned = X_test['Id'] # No change to test IDs as we don't drop test rows

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# Columns to drop (based on the original code's specification)
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 'BasementHalfBaths', 'LowQualityArea'
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    # Time-based features
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    # Interaction feature
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    # Log transformation for skewed numerical features
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    # Drop source columns used for feature engineering
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines ---

# Numerical Transformer: Impute, Scale, and add Polynomial Features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
    # The degree=2 poly features were commented out in your previous code to speed things up. 
    # I'll keep them commented unless performance is a concern.
    # ('poly', PolynomialFeatures(degree=2, include_bias=False)) 
])


# Categorical Transformer: Impute and One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)


# --- 5. Model Training (Lasso Regression with Cross-Validation) ---
print("\n--- 5. Model Training (LassoCV) ---")
lasso_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LassoCV(cv=5, random_state=42, max_iter=10000)) 
])

lasso_model_pipeline.fit(X_train_fe, y_train_log)
print("Model training complete.")

# Optional: Check model performance on training data
y_train_log_pred = lasso_model_pipeline.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0 # Ensure no negative values after reverse transformation

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"LassoCV Optimal Alpha: {lasso_model_pipeline['regressor'].alpha_:.6f}")
print(f"Training RMSE (Original Scale): {rmse_train:,.2f}")
print(f"Training R-squared: {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = lasso_model_pipeline.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 # Final check to ensure non-negative values

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'lasso_submission_cleaned.csv' # Changed filename to reflect cleaning
submission_df.to_csv(submission_filename, index=False)

print("Prediction process complete.")
print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6
Cleaned training data shape: (1194, 73)
Test data shape: (260, 73)

--- 3. Feature Engineering Complete ---
Number of numerical features: 38
Number of categorical features: 39
Final training features shape: (1194, 77)

--- 5. Model Training (LassoCV) ---
Model training complete.
LassoCV Optimal Alpha: 0.000646
Training RMSE (Original Scale): 17,477.56
Training R-squared: 0.9453

--- 6. Prediction & Submission ---
Prediction process complete.
Submission file 'lasso_submission_cleaned.csv' created with 260 predictions.
First 5 test predictions:
     Id     HotelValue
0   893  151947.354104
1  1106  327944.333030
2   414  104910.258353
3   523  150776.861201
4  1037  313606.712862


In [11]:
# # try elastic nets + maximum likelihood

# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# # CHANGED: Import ElasticNetCV instead of LassoCV
# from sklearn.linear_model import ElasticNetCV
# from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

# # Define file paths
# train_data_path = 'train.csv'
# test_data_path = 'test.csv'
# TARGET_COLUMN = 'HotelValue'

# # --- 1. Data Loading and Initial Setup ---
# print("--- 1. Data Loading ---")
# try:
#     # Attempt to load from a common nested folder structure
#     df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
#     df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
#     print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
# except FileNotFoundError:
#     # Fallback to the current directory
#     try:
#         df_train = pd.read_csv(train_data_path)
#         df_test = pd.read_csv(test_data_path)
#         print("Loaded data from current directory.")
#     except FileNotFoundError as e:
#         print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
#         print(f"Details: {e}")
#         # Exit or raise error if data can't be loaded
#         raise

# # Separate features and target before dropping/cleaning
# X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
# y_train_raw = df_train[TARGET_COLUMN]
# X_test = df_test.copy()
# test_ids = X_test['Id']

# print(f"Initial training data shape: {X_train_raw.shape}")


# # --- 1.5 Outlier Removal ---
# # Remove samples based on Target Value and key Predictor columns.
# initial_row_count = len(df_train)

# # 1. Target-based cleaning: Remove extreme values (e.g., bottom 0.1% and top 0.1% of prices)
# y_lower_bound = y_train_raw.quantile(0.001)
# y_upper_bound = y_train_raw.quantile(0.999)
# outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

# # 2. Predictor-based cleaning (Common for this type of dataset)
# # Remove properties with extremely large UsableArea (e.g., > 4000 sq ft)
# if 'UsableArea' in X_train_raw.columns:
#     outlier_mask &= (X_train_raw['UsableArea'] < 4000)

# # Remove properties with poor OverallQuality and high UsableArea (often errors)
# if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
#     outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

# # Apply the mask to both features and target
# X_train = X_train_raw[outlier_mask].copy()
# y_train = y_train_raw[outlier_mask].copy()

# # Note: test_ids_cleaned is used for the submission df but X_test is not filtered
# test_ids_cleaned = X_test['Id'] 

# print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# # Columns to drop (based on the original code's specification)
# columns_to_drop = [
#     'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 'BasementHalfBaths', 'LowQualityArea'
# ]
# X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
# X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

# print(f"Cleaned training data shape: {X_train.shape}")
# print(f"Test data shape: {X_test.shape}")

# # --- 2. Target Transformation ---
# y_train_log = np.log1p(y_train)


# # --- 3. Feature Engineering ---
# def engineer_features(df):
#     df = df.copy()
#     # Time-based features
#     df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
#     df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
#     # Interaction feature
#     df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
#     # Log transformation for skewed numerical features
#     for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
#         if col in df.columns:
#             temp_df = df[col].fillna(0)
#             df[col + '_Log'] = np.log1p(temp_df)
    
#     # Drop source columns used for feature engineering
#     df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
#     return df

# X_train_fe = engineer_features(X_train)
# X_test_fe = engineer_features(X_test)

# numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
# categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

# print("\n--- 3. Feature Engineering Complete ---")
# print(f"Number of numerical features: {len(numerical_features)}")
# print(f"Number of categorical features: {len(categorical_features)}")
# print(f"Final training features shape: {X_train_fe.shape}")

# # --- 4. Preprocessing Pipelines ---

# # Numerical Transformer: Impute, Scale, and add Polynomial Features
# numerical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler()), 
#     # ('poly', PolynomialFeatures(degree=2, include_bias=False)) # Kept commented for speed
# ])


# # Categorical Transformer: Impute and One-Hot Encode
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_features),
#         ('cat', categorical_transformer, categorical_features)
#     ],
#     remainder='drop'
# )


# # --- 5. Model Training (Elastic Net Regression with Cross-Validation) ---
# print("\n--- 5. Model Training (ElasticNetCV) ---")
# # CHANGED: Use ElasticNetCV. l1_ratio defines the mix between L1 (Lasso) and L2 (Ridge).
# # We search over a range of ratios to find the best balance.
# elastic_net_model_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('regressor', ElasticNetCV(
#         l1_ratio=[.1, .5, .7, .9, .95, .99, 1], # Search for optimal L1/L2 mix
#         cv=5, 
#         random_state=42, 
#         max_iter=10000
#     )) 
# ])

# elastic_net_model_pipeline.fit(X_train_fe, y_train_log)
# print("Model training complete.")

# # Optional: Check model performance on training data
# y_train_log_pred = elastic_net_model_pipeline.predict(X_train_fe)
# y_train_pred = np.expm1(y_train_log_pred)
# y_train_pred[y_train_pred < 0] = 0 # Ensure no negative values after reverse transformation

# rmse_train = root_mean_squared_error(y_train, y_train_pred)
# r2_train = r2_score(y_train, y_train_pred)

# print(f"ElasticNetCV Optimal Alpha: {elastic_net_model_pipeline['regressor'].alpha_:.6f}")
# print(f"ElasticNetCV Optimal L1 Ratio: {elastic_net_model_pipeline['regressor'].l1_ratio_:.2f}")
# print(f"Training RMSE (Original Scale): {rmse_train:,.2f}")
# print(f"Training R-squared: {r2_train:.4f}")


# # --- 6. Prediction and Submission File Creation ---
# print("\n--- 6. Prediction & Submission ---")
# y_test_log_pred = elastic_net_model_pipeline.predict(X_test_fe)

# # Reverse log-transformation
# y_test_pred = np.expm1(y_test_log_pred)
# y_test_pred[y_test_pred < 0] = 0 # Final check to ensure non-negative values

# submission_df = pd.DataFrame({
#     'Id': test_ids_cleaned,
#     TARGET_COLUMN: y_test_pred
# })

# submission_filename = 'elasticnet_submission_cleaned.csv' # Changed filename
# submission_df.to_csv(submission_filename, index=False)

# print("Prediction process complete.")
# print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
# print("First 5 test predictions:")
# print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6
Cleaned training data shape: (1194, 73)
Test data shape: (260, 73)

--- 3. Feature Engineering Complete ---
Number of numerical features: 38
Number of categorical features: 39
Final training features shape: (1194, 77)

--- 5. Model Training (ElasticNetCV) ---
Model training complete.
ElasticNetCV Optimal Alpha: 0.000646
ElasticNetCV Optimal L1 Ratio: 1.00
Training RMSE (Original Scale): 17,477.56
Training R-squared: 0.9453

--- 6. Prediction & Submission ---
Prediction process complete.
Submission file 'elasticnet_submission_cleaned.csv' created with 260 predictions.
First 5 test predictions:
     Id     HotelValue
0   893  151947.354104
1  1106  327944.333030
2   414  104910.258353
3   523  150776.861201
4  1037  313606.712862


In [5]:
#try bayesian lasoo

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Using BayesianRidge for a Bayesian approach to linear regression
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import r2_score, root_mean_squared_error

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    # Attempt to load from a common nested folder structure
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    # Fallback to the current directory
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        raise

# Separate features and target before dropping/cleaning
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal ---
initial_row_count = len(df_train)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

# Predictor-based cleaning (UsableArea and OverallQuality)
if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)
if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

# Apply the mask to both features and target
X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()
test_ids_cleaned = X_test['Id'] 

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# Columns to drop
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType'
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines ---

# Numerical Transformer
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
    # ('poly', PolynomialFeatures(degree=2, include_bias=False)) 
])

# Categorical Transformer
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop',
    # CRITICAL FIX: Changed 'sparse_output=False' to 'sparse_threshold=0' 
    # for modern Scikit-learn versions (>= 1.2).
    sparse_threshold=0.0
)


# --- 5. Model Training (Bayesian Ridge Regression) ---
print("\n--- 5. Model Training (BayesianRidge) ---")
bayesian_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # FIX: Using 'max_iter' for modern scikit-learn compatibility
    ('regressor', BayesianRidge(max_iter=10000)) 
])

bayesian_model_pipeline.fit(X_train_fe, y_train_log)
print("Model training complete.")

# Optional: Check model performance on training data
y_train_log_pred = bayesian_model_pipeline.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0 

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

alpha_est = bayesian_model_pipeline['regressor'].alpha_
lambda_est = bayesian_model_pipeline['regressor'].lambda_

print(f"BayesianRidge Estimated Alpha (Noise Precision): {alpha_est:.6f}")
print(f"BayesianRidge Estimated Lambda (Weight Precision): {lambda_est:.6f}")
print(f"Training RMSE (Original Scale): {rmse_train:,.2f}")
print(f"Training R-squared: {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = bayesian_model_pipeline.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'bayesianridge_submission_cleaned.csv' 
submission_df.to_csv(submission_filename, index=False)

print("Prediction process complete.")
print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6
Cleaned training data shape: (1194, 75)
Test data shape: (260, 75)

--- 3. Feature Engineering Complete ---
Number of numerical features: 40
Number of categorical features: 39
Final training features shape: (1194, 79)

--- 5. Model Training (BayesianRidge) ---
Model training complete.
BayesianRidge Estimated Alpha (Noise Precision): 107.463893
BayesianRidge Estimated Lambda (Weight Precision): 1656.384294
Training RMSE (Original Scale): 16,770.82
Training R-squared: 0.9497

--- 6. Prediction & Submission ---
Prediction process complete.
Submission file 'bayesianridge_submission_cleaned.csv' created with 260 predictions.
First 5 test predictions:
     Id     HotelValue
0   893  152026.789240
1  1106  328480.329555
2   414  105060.132452
3   523  154274.255646
4  1037  315312.441403


In [7]:
#bayesian ridge

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Using BayesianRidge (closest native Scikit-learn Bayesian model)
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import r2_score, root_mean_squared_error

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        raise

# Separate features and target before dropping/cleaning
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal ---
initial_row_count = len(df_train)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)
if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()
test_ids_cleaned = X_test['Id'] 

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# Columns to drop
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType'
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines ---

# Numerical Transformer
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
])

# Categorical Transformer
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop',
    # FIX: Use 'sparse_threshold=0.0' for modern Scikit-learn dense output
    sparse_threshold=0.0
)


# --- 5. Model Training (Bayesian Ridge Regression - Closest Scikit-learn Bayesian Model) ---
print("\n--- 5. Model Training (BayesianRidge) ---")
# NOTE: Scikit-learn does not have a native 'BayesianLasso' model. 
# We use BayesianRidge, which assumes Gaussian priors (Ridge behavior).
# For true Bayesian Lasso (Laplace priors), a library like PyMC or Stan would be required.
bayesian_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', BayesianRidge(max_iter=10000)) 
])

bayesian_model_pipeline.fit(X_train_fe, y_train_log)
print("Model training complete.")

# Optional: Check model performance on training data
y_train_log_pred = bayesian_model_pipeline.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0 

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

alpha_est = bayesian_model_pipeline['regressor'].alpha_
lambda_est = bayesian_model_pipeline['regressor'].lambda_

print(f"BayesianRidge Estimated Alpha (Noise Precision): {alpha_est:.6f}")
print(f"BayesianRidge Estimated Lambda (Weight Precision): {lambda_est:.6f}")
print(f"Training RMSE (Original Scale): {rmse_train:,.2f}")
print(f"Training R-squared: {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = bayesian_model_pipeline.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'bayesianridge_submission_cleaned.csv' 
submission_df.to_csv(submission_filename, index=False)

print("Prediction process complete.")
print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6
Cleaned training data shape: (1194, 75)
Test data shape: (260, 75)

--- 3. Feature Engineering Complete ---
Number of numerical features: 40
Number of categorical features: 39
Final training features shape: (1194, 79)

--- 5. Model Training (BayesianRidge) ---
Model training complete.
BayesianRidge Estimated Alpha (Noise Precision): 107.463893
BayesianRidge Estimated Lambda (Weight Precision): 1656.384294
Training RMSE (Original Scale): 16,770.82
Training R-squared: 0.9497

--- 6. Prediction & Submission ---
Prediction process complete.
Submission file 'bayesianridge_submission_cleaned.csv' created with 260 predictions.
First 5 test predictions:
     Id     HotelValue
0   893  152026.789240
1  1106  328480.329555
2   414  105060.132452
3   523  154274.255646
4  1037  315312.441403


In [3]:
#xgboost try1

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split 
import xgboost as xgb
from sklearn.metrics import r2_score, root_mean_squared_error

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        raise

# Separate features and target before dropping/cleaning
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal ---
initial_row_count = len(df_train)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)
if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()
test_ids_cleaned = X_test['Id'] 

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# Columns to drop
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType'
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines (Simplified for Tree-Based Model) ---

# Numerical Transformer: Only Impute
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical Transformer
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop',
    sparse_threshold=0.0
)

# --- 4.5 Manual Preprocessing & Feature Alignment (CRITICAL FIX) ---

# 1. Fit the preprocessor on the ENTIRE training set (X_train_fe) to learn all categories/imputation values.
#    This ensures a consistent feature set size for all subsequent data splits.
X_train_processed = preprocessor.fit_transform(X_train_fe)
X_test_processed = preprocessor.transform(X_test_fe)

# 2. Split the *preprocessed* training data (now an array) for fitting and validation.
X_fit_processed, X_val_processed, y_fit, y_val = train_test_split(
    X_train_processed, y_train_log, test_size=0.1, random_state=42
)

# 3. Create a simplified pipeline that just contains the regressor (since preprocessing is done).
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror', 
    n_estimators=1000, 
    learning_rate=0.05, 
    max_depth=4, 
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=50,
)


# --- 5. Model Training (XGBRegressor - Boosting Method) ---
print("\n--- 5. Model Training (XGBRegressor) ---")

# The eval_set must contain tuples of (features, target)
eval_set_processed = [(X_val_processed, y_val)]

# Fit the XGBoost model directly (outside of a pipeline) on the numerical arrays.
xgb_model.fit(
    X_fit_processed, y_fit, 
    eval_set=eval_set_processed, 
    verbose=False
)
print("Model training complete.")

# Optional: Check model performance on training data
# Note: Predict on the full preprocessed training set (X_train_processed)
y_train_log_pred = xgb_model.predict(X_train_processed)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0 

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"XGBRegressor Best Iteration: {xgb_model.best_iteration}")
print(f"Training RMSE (Original Scale): {rmse_train:,.2f}")
print(f"Training R-squared: {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
# Predict on the preprocessed test set (X_test_processed)
y_test_log_pred = xgb_model.predict(X_test_processed)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'xgboost_submission_final.csv' 
submission_df.to_csv(submission_filename, index=False)

print("Prediction process complete.")
print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6
Cleaned training data shape: (1194, 75)
Test data shape: (260, 75)

--- 3. Feature Engineering Complete ---
Number of numerical features: 40
Number of categorical features: 39
Final training features shape: (1194, 79)

--- 5. Model Training (XGBRegressor) ---
Model training complete.
XGBRegressor Best Iteration: 252
Training RMSE (Original Scale): 11,267.07
Training R-squared: 0.9773

--- 6. Prediction & Submission ---
Prediction process complete.
Submission file 'xgboost_submission_final.csv' created with 260 predictions.
First 5 test predictions:
     Id     HotelValue
0   893  144160.406250
1  1106  307986.218750
2   414  110486.804688
3   523  145697.625000
4  1037  329098.656250


In [4]:
#xgboost try2

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split 
import xgboost as xgb
from sklearn.metrics import r2_score, root_mean_squared_error

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        raise

X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal ---
initial_row_count = len(df_train)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)
if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()
test_ids_cleaned = X_test['Id'] 

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# Columns to drop
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType'
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines ---

# Numerical Transformer: Only Impute
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical Transformer
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop',
    sparse_threshold=0.0
)

# --- 4.5 Manual Preprocessing & Feature Alignment ---

# Fit the preprocessor on the ENTIRE training set (X_train_fe)
X_train_processed = preprocessor.fit_transform(X_train_fe)
X_test_processed = preprocessor.transform(X_test_fe)

# Split the *preprocessed* training data (now an array) for fitting and validation.
X_fit_processed, X_val_processed, y_fit, y_val = train_test_split(
    X_train_processed, y_train_log, test_size=0.1, random_state=42
)

# 3. Create the XGBoost model with regularization parameters (to prevent overfitting).
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror', 
    n_estimators=2000, 
    learning_rate=0.01,         # Reduced learning rate (smaller steps)
    max_depth=3,                # Reduced max_depth (simpler trees)
    min_child_weight=5,         # Increased min_child_weight (more regularization)
    gamma=0.1,                  # Added minimum loss reduction for splits
    reg_alpha=0.1,              # Added L1 regularization (Lasso)
    reg_lambda=1.0,             # Added L2 regularization (Ridge)
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=100,  # Increased early stopping rounds
)


# --- 5. Model Training (XGBRegressor - Boosting Method) ---
print("\n--- 5. Model Training (XGBRegressor) ---")

eval_set_processed = [(X_val_processed, y_val)]

# Fit the XGBoost model directly on the numerical arrays.
xgb_model.fit(
    X_fit_processed, y_fit, 
    eval_set=eval_set_processed, 
    verbose=False # Set to True to see early stopping results
)
print("Model training complete.")

# Optional: Check model performance on training data
y_train_log_pred = xgb_model.predict(X_train_processed)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0 

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"XGBRegressor Best Iteration: {xgb_model.best_iteration}")
print(f"Training RMSE (Original Scale): {rmse_train:,.2f}")
print(f"Training R-squared: {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = xgb_model.predict(X_test_processed)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'xgboost_submission_regularized.csv' 
submission_df.to_csv(submission_filename, index=False)

print("Prediction process complete.")
print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6
Cleaned training data shape: (1194, 75)
Test data shape: (260, 75)

--- 3. Feature Engineering Complete ---
Number of numerical features: 40
Number of categorical features: 39
Final training features shape: (1194, 79)

--- 5. Model Training (XGBRegressor) ---
Model training complete.
XGBRegressor Best Iteration: 580
Training RMSE (Original Scale): 18,511.73
Training R-squared: 0.9387

--- 6. Prediction & Submission ---
Prediction process complete.
Submission file 'xgboost_submission_regularized.csv' created with 260 predictions.
First 5 test predictions:
     Id     HotelValue
0   893  140108.921875
1  1106  325542.875000
2   414  117880.695312
3   523  149607.937500
4  1037  310675.750000


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# CHANGED: Using LassoCV for regularization and feature selection
from sklearn.linear_model import LassoCV
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold # Kept for potential future use

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    # Attempt to load from a common nested folder structure
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    # Fallback to the current directory
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        raise

# Separate features and target before dropping/cleaning
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal (Kept) ---
initial_row_count = len(df_train)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)

if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()
test_ids_cleaned = X_test['Id'] 

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# --- 1.6 Advanced Feature Merging (Kept) ---
print("\nMerging basement features...")
basement_quality_map = {
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0
}

for df in [X_train, X_test]:
    df['BasementFacilitySF1'] = df['BasementFacilitySF1'].fillna(0)
    df['BasementFacilitySF2'] = df['BasementFacilitySF2'].fillna(0)
    
    df['Type1_Score'] = df['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
    df['Type2_Score'] = df['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)
    
    df['TotalBasementScore'] = (df['Type1_Score'] * df['BasementFacilitySF1']) + (df['Type2_Score'] * df['BasementFacilitySF2'])
    df['BasementFinishedSF'] = df['BasementFacilitySF1'] + df['BasementFacilitySF2']
    
    df['BasementAvgQuality'] = 0.0
    mask_has_finished = df['BasementFinishedSF'] > 0
    df.loc[mask_has_finished, 'BasementAvgQuality'] = (
        df.loc[mask_has_finished, 'TotalBasementScore'] / df.loc[mask_has_finished, 'BasementFinishedSF']
    )
    
    df.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                     'BasementFacilitySF1', 'BasementFacilitySF2',
                     'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)

print("Merging porch features...")
for df in [X_train, X_test]:
    df['TotalPorchArea'] = (
        df['EnclosedVerandaArea'].fillna(0) + 
        df['SeasonalPorchArea'].fillna(0) + 
        df['ScreenPorchArea'].fillna(0)
    )
    df.drop(columns=['EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea'], 
             errors='ignore', inplace=True)

# Columns to drop (including multicollinearity removal)
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 
    'BasementHalfBaths', 'LowQualityArea',
    # Multicollinearity removal
    'ParkingCapacity', 
    'GroundFloorArea', 
    'TotalRooms', 
    'UpperFloorArea', 
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering (Kept) ---
def engineer_features(df):
    df = df.copy()
    
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    
    df['RenovationYear'] = df['RenovationYear'].fillna(df['ConstructionYear'])
    df.loc[df['RenovationYear'] == 0, 'RenovationYear'] = df.loc[df['RenovationYear'] == 0, 'ConstructionYear']
    
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    if 'FullBaths' in df.columns and 'HalfBaths' in df.columns:
        df['TotalBathrooms'] = df['FullBaths'] + (0.5 * df['HalfBaths'])
    
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines (Modified) ---

# Numerical Transformer: Impute, Scale, and ADD Polynomial Features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
    # CRITICAL CHANGE: Add PolynomialFeatures to generate non-linear response
    ('poly', PolynomialFeatures(degree=2, include_bias=False)) 
])

# Categorical Transformer: Impute and One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)


# --- 5. Model Training (LassoCV - Regularized Non-linear Fit) ---
print("\n--- 5. Model Training (LassoCV with Polynomial Features) ---")

# LassoCV automatically tunes the regularization strength (alpha) using cross-validation.
lasso_poly_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LassoCV(cv=5, random_state=42, max_iter=10000, n_jobs=-1)) 
])

lasso_poly_pipeline.fit(X_train_fe, y_train_log)
print("Model training complete.")

# Evaluate optimized model
best_model = lasso_poly_pipeline

y_train_log_pred = best_model.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"LassoCV Optimal Alpha: {best_model['regressor'].alpha_:.6f}")
print(f"\nPolynomial Lasso RMSE (Train): {rmse_train:,.2f}")
print(f"Polynomial Lasso R² (Train): {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = best_model.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'lasso_poly_submission_final.csv'
submission_df.to_csv(submission_filename, index=False)

print("Prediction process complete.")
print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6

Merging basement features...
Merging porch features...
Cleaned training data shape: (1194, 66)
Test data shape: (260, 66)

--- 3. Feature Engineering Complete ---
Number of numerical features: 34
Number of categorical features: 37
Final training features shape: (1194, 71)

--- 5. Model Training (LassoCV with Polynomial Features) ---
Model training complete.
LassoCV Optimal Alpha: 0.001725

Polynomial Lasso RMSE (Train): 16,886.31
Polynomial Lasso R² (Train): 0.9490

--- 6. Prediction & Submission ---
Prediction process complete.
Submission file 'lasso_poly_submission_final.csv' created with 260 predictions.
First 5 test predictions:
     Id     HotelValue
0   893  148924.126945
1  1106  335121.773030
2   414  109544.419398
3   523  157249.584302
4  1037  315144.759281


In [1]:
#sid wala best

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    # Attempt to load from a common nested folder structure
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    # Fallback to the current directory
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        # Exit or raise error if data can't be loaded
        raise

# Separate features and target before dropping/cleaning
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal (New Step) ---
# Remove samples based on Target Value (extremely low/high values)
# and based on large/extreme values in key predictor columns (UsableArea and OverallQuality).
initial_row_count = len(df_train)

# 1. Target-based cleaning: Remove extreme values (e.g., bottom 0.1% and top 0.1% of prices)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

# 2. Predictor-based cleaning (Common for this type of dataset)
# Remove properties with extremely large UsableArea (e.g., > 4000 sq ft)
if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)

# Remove properties with poor OverallQuality and high UsableArea (often errors)
if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

# Apply the mask to both features and target
X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()

# Sync test_ids for the remaining rows
test_ids_cleaned = X_test['Id'] # No change to test IDs as we don't drop test rows

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# --- 1.6 Advanced Feature Merging (Before Dropping Columns) ---
# Merge Basement Features into Weighted Quality Score
print("\nMerging basement features...")
basement_quality_map = {
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0
}

for df in [X_train, X_test]:
    # Fill NaN values
    df['BasementFacilitySF1'] = df['BasementFacilitySF1'].fillna(0)
    df['BasementFacilitySF2'] = df['BasementFacilitySF2'].fillna(0)
    
    # Map types to scores
    df['Type1_Score'] = df['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
    df['Type2_Score'] = df['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)
    
    # Calculate weighted quality score
    df['TotalBasementScore'] = (df['Type1_Score'] * df['BasementFacilitySF1']) + (df['Type2_Score'] * df['BasementFacilitySF2'])
    df['BasementFinishedSF'] = df['BasementFacilitySF1'] + df['BasementFacilitySF2']
    
    # Average basement quality (cast to float to avoid dtype warning)
    df['BasementAvgQuality'] = 0.0
    mask_has_finished = df['BasementFinishedSF'] > 0
    df.loc[mask_has_finished, 'BasementAvgQuality'] = (
        df.loc[mask_has_finished, 'TotalBasementScore'] / df.loc[mask_has_finished, 'BasementFinishedSF']
    )
    
    # Drop original basement facility columns
    df.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                     'BasementFacilitySF1', 'BasementFacilitySF2',
                     'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)

# Merge Porch/Veranda Features
print("Merging porch features...")
for df in [X_train, X_test]:
    df['TotalPorchArea'] = (
        df['EnclosedVerandaArea'].fillna(0) + 
        df['SeasonalPorchArea'].fillna(0) + 
        df['ScreenPorchArea'].fillna(0)
    )
    df.drop(columns=['EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea'], 
            errors='ignore', inplace=True)

# Columns to drop (including multicollinearity removal)
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 
    'BasementHalfBaths', 'LowQualityArea',
    # Multicollinearity removal
    'ParkingCapacity',  # Keep ParkingArea
    'GroundFloorArea',  # Keep UsableArea
    'TotalRooms',       # Keep FullBaths
    'UpperFloorArea',   # Captured in UsableArea
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    
    # Time-based features
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    
    # Handle RenovationYear: if 0 or missing, use ConstructionYear
    df['RenovationYear'] = df['RenovationYear'].fillna(df['ConstructionYear'])
    df.loc[df['RenovationYear'] == 0, 'RenovationYear'] = df.loc[df['RenovationYear'] == 0, 'ConstructionYear']
    
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    
    # Interaction features
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    # Bathroom quality feature
    if 'FullBaths' in df.columns and 'HalfBaths' in df.columns:
        df['TotalBathrooms'] = df['FullBaths'] + (0.5 * df['HalfBaths'])
    
    # Log transformation for skewed numerical features
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    # Drop source columns used for feature engineering
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines ---

# Numerical Transformer: Impute, Scale, and add Polynomial Features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
    # The degree=2 poly features were commented out in your previous code to speed things up. 
    # I'll keep them commented unless performance is a concern.
    # ('poly', PolynomialFeatures(degree=2, include_bias=False)) 
])


# Categorical Transformer: Impute and One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

# --- 5. Model Training: Linear Regression + Bayesian Optimization ---
print("\n--- 5A. Baseline Linear Regression ---")

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold

# Baseline Linear Regression pipeline
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train baseline model
lr_pipeline.fit(X_train_fe, y_train_log)

# Evaluate baseline model
y_train_log_pred = lr_pipeline.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0

rmse_train_lr = root_mean_squared_error(y_train, y_train_pred)
r2_train_lr = r2_score(y_train, y_train_pred)

print(f"Linear Regression RMSE (Train): {rmse_train_lr:,.2f}")
print(f"Linear Regression R² (Train): {r2_train_lr:.4f}")


# --- 5B. Bayesian Optimization for Linear Regression ---
print("\n--- 5B. Bayesian Optimization for Linear Regression ---")

from skopt import BayesSearchCV
from skopt.space import Categorical, Real
from sklearn.linear_model import Ridge

# We'll wrap Linear Regression in Ridge to allow tuning small alpha (acts as regularization)
# Because pure LinearRegression has almost no hyperparameters to tune
ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(random_state=42, max_iter=10000))
])

# Define Bayesian search space
search_space = {
    'regressor__alpha': Real(1e-6, 1e1, prior='log-uniform'),  # small regularization
    'regressor__fit_intercept': Categorical([True, False]),
    'regressor__tol': Real(1e-5, 1e-2, prior='log-uniform')
}

cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

bayes_search = BayesSearchCV(
    estimator=ridge_pipeline,
    search_spaces=search_space,
    n_iter=30,
    cv=cv_strategy,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

bayes_search.fit(X_train_fe, y_train_log)

print("\nBest Bayesian Hyperparameters:")
print(bayes_search.best_params_)

# Evaluate optimized model
best_model = bayes_search.best_estimator_

y_train_log_pred_bayes = best_model.predict(X_train_fe)
y_train_pred_bayes = np.expm1(y_train_log_pred_bayes)
y_train_pred_bayes[y_train_pred_bayes < 0] = 0

rmse_train_bayes = root_mean_squared_error(y_train, y_train_pred_bayes)
r2_train_bayes = r2_score(y_train, y_train_pred_bayes)

print(f"\nBayesian Optimized Linear Regression RMSE (Train): {rmse_train_bayes:,.2f}")
print(f"Bayesian Optimized Linear Regression R² (Train): {r2_train_bayes:.4f}")


# --- Compare and Select Best Model ---
print("\n--- Model Comparison ---")
print(f"Baseline Linear Regression RMSE: {rmse_train_lr:,.2f} | R²: {r2_train_lr:.4f}")
print(f"Bayesian Optimized Regression RMSE: {rmse_train_bayes:,.2f} | R²: {r2_train_bayes:.4f}")

if rmse_train_bayes < rmse_train_lr:
    final_model = best_model
    print("✅ Using Bayesian-optimized Linear Regression as final model.")
else:
    final_model = lr_pipeline
    print("✅ Using baseline Linear Regression as final model (performed better or equal).")

# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = final_model.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 # Final check to ensure non-negative values

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'linear_submission_cleaned.csv' # Changed filename to reflect cleaning
submission_df.to_csv(submission_filename, index=False)

print("Prediction process complete.")
print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6

Merging basement features...
Merging porch features...
Cleaned training data shape: (1194, 66)
Test data shape: (260, 66)

--- 3. Feature Engineering Complete ---
Number of numerical features: 34
Number of categorical features: 37
Final training features shape: (1194, 71)

--- 5A. Baseline Linear Regression ---
Linear Regression RMSE (Train): 15,318.01
Linear Regression R² (Train): 0.9580

--- 5B. Bayesian Optimization for Linear Regression ---
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candi

In [None]:
#polynomial bayesian

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge # Changed from LassoCV to Ridge for Bayes search
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
from skopt import BayesSearchCV
from skopt.space import Categorical, Real, Integer

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    # Attempt to load from a common nested folder structure
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    # Fallback to the current directory
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        raise

# Separate features and target before dropping/cleaning
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal ---
initial_row_count = len(df_train)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)

if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()
test_ids_cleaned = X_test['Id'] 

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# --- 1.6 Advanced Feature Merging ---
print("\nMerging basement features...")
basement_quality_map = {
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0
}

for df in [X_train, X_test]:
    df['BasementFacilitySF1'] = df['BasementFacilitySF1'].fillna(0)
    df['BasementFacilitySF2'] = df['BasementFacilitySF2'].fillna(0)
    
    df['Type1_Score'] = df['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
    df['Type2_Score'] = df['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)
    
    df['TotalBasementScore'] = (df['Type1_Score'] * df['BasementFacilitySF1']) + (df['Type2_Score'] * df['BasementFacilitySF2'])
    df['BasementFinishedSF'] = df['BasementFacilitySF1'] + df['BasementFacilitySF2']
    
    df['BasementAvgQuality'] = 0.0
    mask_has_finished = df['BasementFinishedSF'] > 0
    df.loc[mask_has_finished, 'BasementAvgQuality'] = (
        df.loc[mask_has_finished, 'TotalBasementScore'] / df.loc[mask_has_finished, 'BasementFinishedSF']
    )
    
    df.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                     'BasementFacilitySF1', 'BasementFacilitySF2',
                     'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)

print("Merging porch features...")
for df in [X_train, X_test]:
    df['TotalPorchArea'] = (
        df['OpenVerandaArea'].fillna(0)+
        df['EnclosedVerandaArea'].fillna(0) + 
        df['SeasonalPorchArea'].fillna(0) + 
        df['ScreenPorchArea'].fillna(0)
    )
    df.drop(columns=['EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea', 'OpenVarandaArea'], 
             errors='ignore', inplace=True)

# Columns to drop (including multicollinearity removal)
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 
    'BasementHalfBaths', 'LowQualityArea',
    'ParkingCapacity', 
    'GroundFloorArea', 
    'TotalRooms', 
    'UpperFloorArea', 
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    
    df['RenovationYear'] = df['RenovationYear'].fillna(df['ConstructionYear'])
    df.loc[df['RenovationYear'] == 0, 'RenovationYear'] = df.loc[df['RenovationYear'] == 0, 'ConstructionYear']
    
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    if 'FullBaths' in df.columns and 'HalfBaths' in df.columns:
        df['TotalBathrooms'] = df['FullBaths'] + (0.5 * df['HalfBaths'])
    
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines ---

# Numerical Transformer: Impute and Scale (Polynomials will be added in the final pipeline)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
])


# Categorical Transformer: Impute and One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)


# --- 5. Model Training (Polynomial Regression + Bayesian Optimization) ---
print("\n--- 5. Model Training (Polynomial Regression + Bayesian Optimization) ---")

# Define the pipeline: Preprocessor + PolynomialFeatures + Ridge Regression
poly_ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('regressor', Ridge(random_state=42))
])

# Define search space for Bayesian Optimization
search_space = {
    'poly__degree': Integer(1, 3),      # Non-linearity via degree
    'regressor__alpha': Real(1e-3, 100, prior='log-uniform') # Regularization
}

cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

bayes_search = BayesSearchCV(
    estimator=poly_ridge_pipeline,
    search_spaces=search_space,
    n_iter=20, # number of optimization iterations
    cv=cv_strategy,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

bayes_search.fit(X_train_fe, y_train_log)

print("\nBest Bayesian Hyperparameters:")
print(bayes_search.best_params_)

# Train final model using best parameters
final_model = bayes_search.best_estimator_

# Evaluate optimized model on training data
y_train_log_pred = final_model.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"\nPolynomial Ridge RMSE (Train): {rmse_train:,.2f}")
print(f"Polynomial Ridge R² (Train): {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = final_model.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 # Final check to ensure non-negative values

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'poly_bayesian_submission_final.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6

Merging basement features...
Merging porch features...
Cleaned training data shape: (1194, 66)
Test data shape: (260, 66)

--- 3. Feature Engineering Complete ---
Number of numerical features: 34
Number of categorical features: 37
Final training features shape: (1194, 71)

--- 5. Model Training (Polynomial Regression + Bayesian Optimization) ---
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [5]:
#try1

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
from skopt import BayesSearchCV
from skopt.space import Categorical, Real, Integer

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        raise

X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal ---
initial_row_count = len(df_train)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)

if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()
test_ids_cleaned = X_test['Id'] 

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# --- 1.6 Advanced Feature Merging & Quality Index ---

# Helper function to create a general quality index
def create_quality_index(df):
    # Mapping for General Quality/Condition features (e.g., Ex, Gd, Ta, Fa, Po)
    quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0, 'No': 0, 'NaN': 0}
    
    quality_cols = [
        'ExteriorQuality', 'ExteriorCondition', 'KitchenQuality', 'HeatingQuality',
        'BasementCondition', 'BasementExposure', 'LoungeQuality', 'ParkingQuality'
    ]
    
    # Sum scores from available quality columns
    df['TotalQualityScore'] = 0
    
    for col in quality_cols:
        if col in df.columns:
            # Map, fill NaNs, and add to total score
            score = df[col].fillna('None').astype(str).str.upper().map(quality_map).fillna(0)
            df['TotalQualityScore'] += score
            # Drop individual quality columns to reduce noise/multicollinearity
            df.drop(columns=[col], errors='ignore', inplace=True)
    
    return df

X_train = create_quality_index(X_train)
X_test = create_quality_index(X_test)
print("\nMerged quality/condition features into TotalQualityScore.")

# Merge Basement Features into Weighted Quality Score (Original)
print("Merging basement features...")
basement_quality_map = {
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0
}

for df in [X_train, X_test]:
    df['BasementFacilitySF1'] = df['BasementFacilitySF1'].fillna(0)
    df['BasementFacilitySF2'] = df['BasementFacilitySF2'].fillna(0)
    
    df['Type1_Score'] = df['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
    df['Type2_Score'] = df['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)
    
    df['TotalBasementScore'] = (df['Type1_Score'] * df['BasementFacilitySF1']) + (df['Type2_Score'] * df['BasementFacilitySF2'])
    df['BasementFinishedSF'] = df['BasementFacilitySF1'] + df['BasementFacilitySF2']
    
    df['BasementAvgQuality'] = 0.0
    mask_has_finished = df['BasementFinishedSF'] > 0
    df.loc[mask_has_finished, 'BasementAvgQuality'] = (
        df.loc[mask_has_finished, 'TotalBasementScore'] / df.loc[mask_has_finished, 'BasementFinishedSF']
    )
    
    df.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                     'BasementFacilitySF1', 'BasementFacilitySF2',
                     'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)

# Merge Porch/Veranda Features (Original)
print("Merging porch features...")
for df in [X_train, X_test]:
    # Use 'OpenPorchArea' as the correct column name based on prior troubleshooting
    df['TotalPorchArea'] = (
        df['OpenVerandaArea'].fillna(0) +
        df['EnclosedVerandaArea'].fillna(0) + 
        df['SeasonalPorchArea'].fillna(0) + 
        df['ScreenPorchArea'].fillna(0)
    )
    df.drop(columns=['OpenPorchArea', 'EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea'], 
             errors='ignore', inplace=True)

# Columns to drop (including multicollinearity removal)
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 
    'BasementHalfBaths', 'LowQualityArea',
    'ParkingCapacity', 
    'GroundFloorArea', 
    'TotalRooms', 
    'UpperFloorArea', 
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering (Original) ---
def engineer_features(df):
    df = df.copy()
    
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    
    df['RenovationYear'] = df['RenovationYear'].fillna(df['ConstructionYear'])
    df.loc[df['RenovationYear'] == 0, 'RenovationYear'] = df.loc[df['RenovationYear'] == 0, 'ConstructionYear']
    
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    if 'FullBaths' in df.columns and 'HalfBaths' in df.columns:
        df['TotalBathrooms'] = df['FullBaths'] + (0.5 * df['HalfBaths'])
    
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines (Original) ---

# Numerical Transformer: Impute and Scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
])

# Categorical Transformer: Impute and One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)


# --- 5. Model Training (Bayesian Optimized Ridge) ---
print("\n--- 5. Model Training (Bayesian Optimized Ridge) ---")

poly_ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Note: Using degree=1 here to ensure a linear model for this test
    ('poly', PolynomialFeatures(degree=1, include_bias=False)), 
    ('regressor', Ridge(random_state=42))
])

# Define search space for Bayesian Optimization
search_space = {
    # Only optimizing alpha and tolerance since degree is fixed at 1 (linear fit)
    'regressor__alpha': Real(1e-6, 100, prior='log-uniform'), 
    'regressor__fit_intercept': Categorical([True, False]),
}

cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

bayes_search = BayesSearchCV(
    estimator=poly_ridge_pipeline,
    search_spaces=search_space,
    n_iter=20, 
    cv=cv_strategy,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

bayes_search.fit(X_train_fe, y_train_log)

print("\nBest Bayesian Hyperparameters:")
print(bayes_search.best_params_)

# Train final model using best parameters
final_model = bayes_search.best_estimator_

# Evaluate optimized model on training data
y_train_log_pred = final_model.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"\nWeighted Quality Index + Optimized Linear Regression RMSE (Train): {rmse_train:,.2f}")
print(f"Weighted Quality Index + Optimized Linear Regression R² (Train): {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = final_model.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'bayesian_weighted_quality_test1.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6

Merged quality/condition features into TotalQualityScore.
Merging basement features...
Merging porch features...
Cleaned training data shape: (1194, 59)
Test data shape: (260, 59)

--- 3. Feature Engineering Complete ---
Number of numerical features: 35
Number of categorical features: 29
Final training features shape: (1194, 64)

--- 5. Model Training (Bayesian Optimized Ridge) ---
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates

In [4]:
#try2

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
from skopt import BayesSearchCV
from skopt.space import Categorical, Real

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        raise

X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal ---
initial_row_count = len(df_train)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)

if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()
test_ids_cleaned = X_test['Id'] 

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# --- 1.6 Advanced Feature Merging ---
print("\nMerging basement features...")
basement_quality_map = {
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0
}

for df in [X_train, X_test]:
    df['BasementFacilitySF1'] = df['BasementFacilitySF1'].fillna(0)
    df['BasementFacilitySF2'] = df['BasementFacilitySF2'].fillna(0)
    
    df['Type1_Score'] = df['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
    df['Type2_Score'] = df['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)
    
    df['TotalBasementScore'] = (df['Type1_Score'] * df['BasementFacilitySF1']) + (df['Type2_Score'] * df['BasementFacilitySF2'])
    df['BasementFinishedSF'] = df['BasementFacilitySF1'] + df['BasementFacilitySF2']
    
    df['BasementAvgQuality'] = 0.0
    mask_has_finished = df['BasementFinishedSF'] > 0
    df.loc[mask_has_finished, 'BasementAvgQuality'] = (
        df.loc[mask_has_finished, 'TotalBasementScore'] / df.loc[mask_has_finished, 'BasementFinishedSF']
    )
    
    df.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                     'BasementFacilitySF1', 'BasementFacilitySF2',
                     'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)

print("Merging porch features...")
for df in [X_train, X_test]:
    # Merged porch area features
    df['TotalPorchArea'] = (
        df['OpenVerandaArea'].fillna(0) + # Corrected name from previous version
        df['EnclosedVerandaArea'].fillna(0) + 
        df['SeasonalPorchArea'].fillna(0) + 
        df['ScreenPorchArea'].fillna(0)
    )
    df.drop(columns=['OpenPorchArea', 'EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea'], 
             errors='ignore', inplace=True)

# Columns to drop (including multicollinearity removal)
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 
    'BasementHalfBaths', 'LowQualityArea',
    'ParkingCapacity', 
    'GroundFloorArea', 
    'TotalRooms', 
    'UpperFloorArea', 
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    
    df['RenovationYear'] = df['RenovationYear'].fillna(df['ConstructionYear'])
    df.loc[df['RenovationYear'] == 0, 'RenovationYear'] = df.loc[df['RenovationYear'] == 0, 'ConstructionYear']
    
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    if 'FullBaths' in df.columns and 'HalfBaths' in df.columns:
        df['TotalBathrooms'] = df['FullBaths'] + (0.5 * df['HalfBaths'])
    
    # NOTE: The log transforms here are applied to features that WILL BE ZERO-IMPUTED later. 
    # This is fine since the missing values are handled by the manual imputation inside this function.
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

# --- Define Feature Lists for Targeted Imputation ---
all_numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

# Define features where NaN means 0 (Area, Length, etc.)
zero_impute_features = [
    'RoadAccessLength', 'TotalPorchArea', 'BasementTotalSF', 'BasementFinishedSF', 
    'FacadeArea', 'ParkingArea', 'TotalBasementScore', 'Fireplaces', 'LandArea',
    'RoadAccessLength_Log', 'FacadeArea_Log', 'BasementTotalSF_Log', 'ParkingArea_Log', 
    'LandArea_Log'
]
# Ensure zero_impute_features only contains columns that actually exist
zero_impute_features = [col for col in zero_impute_features if col in all_numerical_features]

# The rest of the numerical features (where NaN is best filled by the median)
general_numerical_features = [
    col for col in all_numerical_features if col not in zero_impute_features
]

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features (Total): {len(all_numerical_features)}")
print(f"Zero-imputed features: {len(zero_impute_features)}")
print(f"Median-imputed features: {len(general_numerical_features)}")


# --- 4. Preprocessing Pipelines (Targeted Imputation) ---

# Pipeline 1: Zero Imputation and Scaling (for areas/lengths where NaN=0)
zero_impute_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)), # FIX: Zero Imputation
    ('scaler', StandardScaler()), 
])

# Pipeline 2: Median Imputation and Scaling (for general metrics)
median_impute_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # Original Median Imputation
    ('scaler', StandardScaler()), 
])

# Categorical Transformer (Unchanged)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('zero_num', zero_impute_transformer, zero_impute_features), # Added Zero Imputation
        ('median_num', median_impute_transformer, general_numerical_features), # Kept Median Imputation
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)


# --- 5. Model Training (Bayesian Optimized Ridge) ---
print("\n--- 5. Model Training (Bayesian Optimized Ridge with Targeted Imputation) ---")

poly_ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=1, include_bias=False)), # Linear Model
    ('regressor', Ridge(random_state=42))
])

# Define search space for Bayesian Optimization
search_space = {
    'regressor__alpha': Real(1e-6, 100, prior='log-uniform'), 
    'regressor__fit_intercept': Categorical([True, False]),
}

cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

bayes_search = BayesSearchCV(
    estimator=poly_ridge_pipeline,
    search_spaces=search_space,
    n_iter=20, 
    cv=cv_strategy,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

bayes_search.fit(X_train_fe, y_train_log)

print("\nBest Bayesian Hyperparameters:")
print(bayes_search.best_params_)

# Train final model using best parameters
final_model = bayes_search.best_estimator_

# Evaluate optimized model on training data
y_train_log_pred = final_model.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"\nTargeted Imputation + Optimized Linear Regression RMSE (Train): {rmse_train:,.2f}")
print(f"Targeted Imputation + Optimized Linear Regression R² (Train): {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = final_model.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'bayesian_targeted_imputation_test2.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6

Merging basement features...
Merging porch features...
Cleaned training data shape: (1194, 66)
Test data shape: (260, 66)

--- 3. Feature Engineering Complete ---
Number of numerical features (Total): 34
Zero-imputed features: 13
Median-imputed features: 21

--- 5. Model Training (Bayesian Optimized Ridge with Targeted Imputation) ---
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1

In [7]:
#try3

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# NEW: Random Forest Regressor and Hyperparameter Optimization
from sklearn.ensemble import RandomForestRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup (Unchanged) ---
print("--- 1. Data Loading ---")
try:
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        raise

X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal (Unchanged) ---
initial_row_count = len(df_train)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)

if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()
test_ids_cleaned = X_test['Id'] 

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# --- 1.6 Advanced Feature Merging (Unchanged) ---
print("\nMerging basement features...")
basement_quality_map = {
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0
}

for df in [X_train, X_test]:
    df['BasementFacilitySF1'] = df['BasementFacilitySF1'].fillna(0)
    df['BasementFacilitySF2'] = df['BasementFacilitySF2'].fillna(0)
    
    df['Type1_Score'] = df['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
    df['Type2_Score'] = df['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)
    
    df['TotalBasementScore'] = (df['Type1_Score'] * df['BasementFacilitySF1']) + (df['Type2_Score'] * df['BasementFacilitySF2'])
    df['BasementFinishedSF'] = df['BasementFacilitySF1'] + df['BasementFacilitySF2']
    
    df['BasementAvgQuality'] = 0.0
    mask_has_finished = df['BasementFinishedSF'] > 0
    df.loc[mask_has_finished, 'BasementAvgQuality'] = (
        df.loc[mask_has_finished, 'TotalBasementScore'] / df.loc[mask_has_finished, 'BasementFinishedSF']
    )
    
    df.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                     'BasementFacilitySF1', 'BasementFacilitySF2',
                     'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)

print("Merging porch features...")
for df in [X_train, X_test]:
    # Corrected name for porch area
    df['TotalPorchArea'] = (
        df['OpenVerandaArea'].fillna(0) + 
        df['EnclosedVerandaArea'].fillna(0) + 
        df['SeasonalPorchArea'].fillna(0) + 
        df['ScreenPorchArea'].fillna(0)
    )
    df.drop(columns=['OpenPorchArea', 'EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea'], 
             errors='ignore', inplace=True)

# Columns to drop (Unchanged)
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 
    'BasementHalfBaths', 'LowQualityArea',
    'ParkingCapacity', 
    'GroundFloorArea', 
    'TotalRooms', 
    'UpperFloorArea', 
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation (Unchanged) ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering (Unchanged) ---
def engineer_features(df):
    df = df.copy()
    
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    
    df['RenovationYear'] = df['RenovationYear'].fillna(df['ConstructionYear'])
    df.loc[df['RenovationYear'] == 0, 'RenovationYear'] = df.loc[df['RenovationYear'] == 0, 'ConstructionYear']
    
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    if 'FullBaths' in df.columns and 'HalfBaths' in df.columns:
        df['TotalBathrooms'] = df['FullBaths'] + (0.5 * df['HalfBaths'])
    
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines (Simplified for Random Forest) ---

# Numerical Transformer: Only Impute (Scaling is NOT needed for tree models)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    # REMOVED: Scaling is not needed
])


# Categorical Transformer: Impute and One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop',
    sparse_threshold=0.0 # Force dense output for better compatibility
)


# --- 5. Model Training (Optimized Random Forest) ---
print("\n--- 5. Model Training (Optimized Random Forest Regressor) ---")

rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Random Forest is the best bagging method for this data type
    ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1))
])

# Define Bayesian search space for Random Forest (focus on complexity/regularization)
search_space_rf = {
    'regressor__n_estimators': Integer(200, 800), # Number of trees
    'regressor__max_depth': Integer(5, 15),       # Max depth of each tree (crucial for controlling overfitting)
    'regressor__min_samples_split': Integer(2, 10),
    'regressor__min_samples_leaf': Integer(1, 5),
    'regressor__max_features': Categorical(['sqrt', 0.5, 0.7]), # Number of features to consider at each split
}

cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

bayes_search_rf = BayesSearchCV(
    estimator=rf_pipeline,
    search_spaces=search_space_rf,
    n_iter=20, # Number of optimization iterations
    cv=cv_strategy,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# Run optimization
bayes_search_rf.fit(X_train_fe, y_train_log)

print("\nBest Random Forest Hyperparameters:")
print(bayes_search_rf.best_params_)

# Train final model using best parameters
final_model = bayes_search_rf.best_estimator_

# Evaluate optimized model on training data
y_train_log_pred = final_model.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"\nRandom Forest RMSE (Train): {rmse_train:,.2f}")
print(f"Random Forest R² (Train): {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = final_model.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'random_forest_optimized_test3.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6

Merging basement features...
Merging porch features...
Cleaned training data shape: (1194, 66)
Test data shape: (260, 66)

--- 3. Feature Engineering Complete ---
Number of numerical features: 34
Number of categorical features: 37
Final training features shape: (1194, 71)

--- 5. Model Training (Optimized Random Forest Regressor) ---
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 

In [9]:
#try4

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
from skopt import BayesSearchCV
from skopt.space import Categorical, Real, Integer

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        raise

# Separate features and target before dropping/cleaning
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal ---
initial_row_count = len(df_train)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)

if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()
test_ids_cleaned = X_test['Id'] 

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# --- 1.6 Advanced Feature Merging ---
print("\nMerging basement features...")
basement_quality_map = {
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0
}

for df in [X_train, X_test]:
    df['BasementFacilitySF1'] = df['BasementFacilitySF1'].fillna(0)
    df['BasementFacilitySF2'] = df['BasementFacilitySF2'].fillna(0)
    
    df['Type1_Score'] = df['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
    df['Type2_Score'] = df['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)
    
    df['TotalBasementScore'] = (df['Type1_Score'] * df['BasementFacilitySF1']) + (df['Type2_Score'] * df['BasementFacilitySF2'])
    df['BasementFinishedSF'] = df['BasementFacilitySF1'] + df['BasementFacilitySF2']
    
    df['BasementAvgQuality'] = 0.0
    mask_has_finished = df['BasementFinishedSF'] > 0
    df.loc[mask_has_finished, 'BasementAvgQuality'] = (
        df.loc[mask_has_finished, 'TotalBasementScore'] / df.loc[mask_has_finished, 'BasementFinishedSF']
    )
    
    df.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                     'BasementFacilitySF1', 'BasementFacilitySF2',
                     'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)

print("Merging porch features...")
for df in [X_train, X_test]:
    df['TotalPorchArea'] = (
        df['OpenVerandaArea'].fillna(0) + 
        df['EnclosedVerandaArea'].fillna(0) + 
        df['SeasonalPorchArea'].fillna(0) + 
        df['ScreenPorchArea'].fillna(0)
    )
    df.drop(columns=['OpenVerandaArea', 'EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea'], 
             errors='ignore', inplace=True)

# Columns to drop (Original Multicollinearity/Redundancy list)
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 
    'BasementHalfBaths', 'LowQualityArea',
    'ParkingCapacity', 
    'GroundFloorArea', 
    'TotalRooms', 
    'UpperFloorArea', 
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    
    df['RenovationYear'] = df['RenovationYear'].fillna(df['ConstructionYear'])
    df.loc[df['RenovationYear'] == 0, 'RenovationYear'] = df.loc[df['RenovationYear'] == 0, 'ConstructionYear']
    
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    if 'FullBaths' in df.columns and 'HalfBaths' in df.columns:
        df['TotalBathrooms'] = df['FullBaths'] + (0.5 * df['HalfBaths'])
    
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

# --- 3.5 Multicollinearity Pruning (NEW STEP) ---
# Dropping the highly correlated numerical features (rho > 0.85) to stabilize the linear model.
multicollinearity_drop_features = [
    'BasementFinishedSF',   # Retain TotalBasementScore
    'UsableArea',           # Retain QualityArea
    'FullBaths',            # Retain TotalBathrooms
    'RoadAccessLength'      # Retain RoadAccessLength_Log
]
X_train_fe = X_train_fe.drop(columns=multicollinearity_drop_features, errors='ignore')
X_test_fe = X_test_fe.drop(columns=multicollinearity_drop_features, errors='ignore')


numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering & Pruning Complete ---")
print(f"Number of numerical features (Pruned): {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines ---

# Numerical Transformer: Impute and Scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
])

# Categorical Transformer: Impute and One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)


# --- 5. Model Training (Bayesian Optimized Ridge with Pruning) ---
print("\n--- 5. Model Training (Bayesian Optimized Ridge with Pruning) ---")

poly_ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=1, include_bias=False)), # Linear Model (degree=1)
    ('regressor', Ridge(random_state=42, max_iter=10000))
])

# Define Bayesian search space
search_space = {
    'regressor__alpha': Real(1e-6, 1e1, prior='log-uniform'), 
    'regressor__fit_intercept': Categorical([True, False]),
}

cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

bayes_search = BayesSearchCV(
    estimator=poly_ridge_pipeline,
    search_spaces=search_space,
    n_iter=20, # Number of optimization iterations
    cv=cv_strategy,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

bayes_search.fit(X_train_fe, y_train_log)

print("\nBest Bayesian Hyperparameters:")
print(bayes_search.best_params_)

# Train final model using best parameters
final_model = bayes_search.best_estimator_

# Evaluate optimized model on training data
y_train_log_pred = final_model.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"\nMulticollinearity Pruned + Optimized Linear Regression RMSE (Train): {rmse_train:,.2f}")
print(f"Multicollinearity Pruned + Optimized Linear Regression R² (Train): {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = final_model.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'bayesian_pruning_final.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6

Merging basement features...
Merging porch features...
Cleaned training data shape: (1194, 65)
Test data shape: (260, 65)

--- 3. Feature Engineering & Pruning Complete ---
Number of numerical features (Pruned): 29
Number of categorical features: 37
Final training features shape: (1194, 66)

--- 5. Model Training (Bayesian Optimized Ridge with Pruning) ---
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting

In [10]:
#try5

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
from skopt import BayesSearchCV
from skopt.space import Categorical, Real, Integer

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        raise

# Separate features and target before dropping/cleaning
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal ---
initial_row_count = len(df_train)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)

if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()
test_ids_cleaned = X_test['Id'] 

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# --- 1.6 Advanced Feature Merging ---
print("\nMerging basement features...")
basement_quality_map = {
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0
}

for df in [X_train, X_test]:
    df['BasementFacilitySF1'] = df['BasementFacilitySF1'].fillna(0)
    df['BasementFacilitySF2'] = df['BasementFacilitySF2'].fillna(0)
    
    df['Type1_Score'] = df['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
    df['Type2_Score'] = df['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)
    
    df['TotalBasementScore'] = (df['Type1_Score'] * df['BasementFacilitySF1']) + (df['Type2_Score'] * df['BasementFacilitySF2'])
    df['BasementFinishedSF'] = df['BasementFacilitySF1'] + df['BasementFacilitySF2']
    
    df['BasementAvgQuality'] = 0.0
    mask_has_finished = df['BasementFinishedSF'] > 0
    df.loc[mask_has_finished, 'BasementAvgQuality'] = (
        df.loc[mask_has_finished, 'TotalBasementScore'] / df.loc[mask_has_finished, 'BasementFinishedSF']
    )
    
    df.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                     'BasementFacilitySF1', 'BasementFacilitySF2',
                     'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)

print("Merging porch features...")
for df in [X_train, X_test]:
    df['TotalPorchArea'] = (
        df['OpenVerandaArea'].fillna(0) + 
        df['EnclosedVerandaArea'].fillna(0) + 
        df['SeasonalPorchArea'].fillna(0) + 
        df['ScreenPorchArea'].fillna(0)
    )
    df.drop(columns=['OpenVerandaArea', 'EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea'], 
             errors='ignore', inplace=True)

# Columns to drop (Original Multicollinearity/Redundancy list)
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 
    'BasementHalfBaths', 'LowQualityArea',
    'ParkingCapacity', 
    'GroundFloorArea', 
    'TotalRooms', 
    'UpperFloorArea', 
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    
    df['RenovationYear'] = df['RenovationYear'].fillna(df['ConstructionYear'])
    df.loc[df['RenovationYear'] == 0, 'RenovationYear'] = df.loc[df['RenovationYear'] == 0, 'ConstructionYear']
    
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    if 'FullBaths' in df.columns and 'HalfBaths' in df.columns:
        df['TotalBathrooms'] = df['FullBaths'] + (0.5 * df['HalfBaths'])
    
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

# --- 3.5 Multicollinearity Pruning (NEW STEP) ---
# Dropping the highly correlated numerical features (rho > 0.85) to stabilize the linear model.
# Retain the engineered/log-transformed features where possible.
multicollinearity_drop_features = [
    'BasementFinishedSF',   # Retain TotalBasementScore
    'UsableArea',           # Retain QualityArea
    'FullBaths',            # Retain TotalBathrooms
    'RoadAccessLength'      # Retain RoadAccessLength_Log
]
X_train_fe = X_train_fe.drop(columns=multicollinearity_drop_features, errors='ignore')
X_test_fe = X_test_fe.drop(columns=multicollinearity_drop_features, errors='ignore')


numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering & Pruning Complete ---")
print(f"Number of numerical features (Pruned): {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines ---

# Numerical Transformer: Impute and Scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
])

# Categorical Transformer: Impute and One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)


# --- 5. Model Training (Bayesian Optimized Ridge with Pruning) ---
print("\n--- 5. Model Training (Bayesian Optimized Ridge with Pruning) ---")

poly_ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=1, include_bias=False)), # Linear Model (degree=1)
    ('regressor', Ridge(random_state=42, max_iter=10000))
])

# Define Bayesian search space
search_space = {
    'regressor__alpha': Real(1e-6, 1e1, prior='log-uniform'), 
    'regressor__fit_intercept': Categorical([True, False]),
}

cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

bayes_search = BayesSearchCV(
    estimator=poly_ridge_pipeline,
    search_spaces=search_space,
    n_iter=20, # Number of optimization iterations
    cv=cv_strategy,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

bayes_search.fit(X_train_fe, y_train_log)

print("\nBest Bayesian Hyperparameters:")
print(bayes_search.best_params_)

# Train final model using best parameters
final_model = bayes_search.best_estimator_

# Evaluate optimized model on training data
y_train_log_pred = final_model.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"\nMulticollinearity Pruned + Optimized Linear Regression RMSE (Train): {rmse_train:,.2f}")
print(f"Multicollinearity Pruned + Optimized Linear Regression R² (Train): {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = final_model.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'bayesian_pruning_final.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6

Merging basement features...
Merging porch features...
Cleaned training data shape: (1194, 65)
Test data shape: (260, 65)

--- 3. Feature Engineering & Pruning Complete ---
Number of numerical features (Pruned): 29
Number of categorical features: 37
Final training features shape: (1194, 66)

--- 5. Model Training (Bayesian Optimized Ridge with Pruning) ---
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting

In [1]:
#try6

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
from skopt import BayesSearchCV
from skopt.space import Categorical, Real, Integer

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        raise

# Separate features and target before dropping/cleaning
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal ---
initial_row_count = len(df_train)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)

if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()
test_ids_cleaned = X_test['Id'] 

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# --- 1.6 Advanced Feature Merging ---
print("\nMerging basement features...")
basement_quality_map = {
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0
}

for df in [X_train, X_test]:
    df['BasementFacilitySF1'] = df['BasementFacilitySF1'].fillna(0)
    df['BasementFacilitySF2'] = df['BasementFacilitySF2'].fillna(0)
    
    df['Type1_Score'] = df['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
    df['Type2_Score'] = df['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)
    
    df['TotalBasementScore'] = (df['Type1_Score'] * df['BasementFacilitySF1']) + (df['Type2_Score'] * df['BasementFacilitySF2'])
    df['BasementFinishedSF'] = df['BasementFacilitySF1'] + df['BasementFacilitySF2']
    
    df['BasementAvgQuality'] = 0.0
    mask_has_finished = df['BasementFinishedSF'] > 0
    df.loc[mask_has_finished, 'BasementAvgQuality'] = (
        df.loc[mask_has_finished, 'TotalBasementScore'] / df.loc[mask_has_finished, 'BasementFinishedSF']
    )
    
    df.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                     'BasementFacilitySF1', 'BasementFacilitySF2',
                     'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)

print("Merging porch features...")
for df in [X_train, X_test]:
    # Correcting column name to 'OpenVerandaArea'
    df['TotalPorchArea'] = (
        df['OpenVerandaArea'].fillna(0) + 
        df['EnclosedVerandaArea'].fillna(0) + 
        df['SeasonalPorchArea'].fillna(0) + 
        df['ScreenPorchArea'].fillna(0)
    )
    df.drop(columns=['OpenVerandaArea', 'EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea'], 
             errors='ignore', inplace=True)

# Columns to drop (Original Multicollinearity/Redundancy list)
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 
    'BasementHalfBaths', 'LowQualityArea',
    'ParkingCapacity', 
    'GroundFloorArea', 
    'TotalRooms', 
    'UpperFloorArea', 
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    
    df['RenovationYear'] = df['RenovationYear'].fillna(df['ConstructionYear'])
    df.loc[df['RenovationYear'] == 0, 'RenovationYear'] = df.loc[df['RenovationYear'] == 0, 'ConstructionYear']
    
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    if 'FullBaths' in df.columns and 'HalfBaths' in df.columns:
        df['TotalBathrooms'] = df['FullBaths'] + (0.5 * df['HalfBaths'])
    
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

# --- 3.5 Multicollinearity Pruning ---
multicollinearity_drop_features = [
    'BasementFinishedSF',   
    'UsableArea',           
    'FullBaths',            
    'RoadAccessLength'      
]
X_train_fe = X_train_fe.drop(columns=multicollinearity_drop_features, errors='ignore')
X_test_fe = X_test_fe.drop(columns=multicollinearity_drop_features, errors='ignore')


numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering & Pruning Complete ---")
print(f"Number of numerical features (Pruned): {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines ---

# Numerical Transformer: Impute and Scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
])

# Categorical Transformer: Impute and One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)


# --- 5. Model Training (Bayesian Optimized Ridge with 75/25 Split) ---
print("\n--- 5. Model Training (Bayesian Optimized Ridge with 75/25 Split) ---")

poly_ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=1, include_bias=False)), # Linear Model (degree=1)
    ('regressor', Ridge(random_state=42, max_iter=10000))
])

# Define Bayesian search space
search_space = {
    'regressor__alpha': Real(1e-6, 1e1, prior='log-uniform'), 
    'regressor__fit_intercept': Categorical([True, False]),
}

# CHANGE: Using n_splits=4 for KFold to achieve 75% training / 25% validation split
cv_strategy = KFold(n_splits=4, shuffle=True, random_state=42)

bayes_search = BayesSearchCV(
    estimator=poly_ridge_pipeline,
    search_spaces=search_space,
    n_iter=20, # Number of optimization iterations
    cv=cv_strategy, # Changed to 4 folds
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

bayes_search.fit(X_train_fe, y_train_log)

print("\nBest Bayesian Hyperparameters:")
print(bayes_search.best_params_)

# Train final model using best parameters
final_model = bayes_search.best_estimator_

# Evaluate optimized model on training data
y_train_log_pred = final_model.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"\n75/25 Split CV RMSE (Train): {rmse_train:,.2f}")
print(f"75/25 Split CV R² (Train): {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = final_model.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'bayesian_75_25_split_final.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6

Merging basement features...
Merging porch features...
Cleaned training data shape: (1194, 65)
Test data shape: (260, 65)

--- 3. Feature Engineering & Pruning Complete ---
Number of numerical features (Pruned): 29
Number of categorical features: 37
Final training features shape: (1194, 66)

--- 5. Model Training (Bayesian Optimized Ridge with 75/25 Split) ---
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fitting 4 folds for each of 1 candidates, totalling 4 fits
Fit

In [2]:
#try7

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
from skopt import BayesSearchCV
from skopt.space import Categorical, Real, Integer

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        raise

# Separate features and target before dropping/cleaning
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal ---
initial_row_count = len(df_train)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)

if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 5) & (X_train_raw['UsableArea'] > 3000))

X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()
test_ids_cleaned = X_test['Id'] 

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# --- 1.6 Advanced Feature Merging ---
print("\nMerging basement features...")
basement_quality_map = {
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0
}

for df in [X_train, X_test]:
    df['BasementFacilitySF1'] = df['BasementFacilitySF1'].fillna(0)
    df['BasementFacilitySF2'] = df['BasementFacilitySF2'].fillna(0)
    
    df['Type1_Score'] = df['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
    df['Type2_Score'] = df['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)
    
    df['TotalBasementScore'] = (df['Type1_Score'] * df['BasementFacilitySF1']) + (df['Type2_Score'] * df['BasementFacilitySF2'])
    df['BasementFinishedSF'] = df['BasementFacilitySF1'] + df['BasementFacilitySF2']
    
    df['BasementAvgQuality'] = 0.0
    mask_has_finished = df['BasementFinishedSF'] > 0
    df.loc[mask_has_finished, 'BasementAvgQuality'] = (
        df.loc[mask_has_finished, 'TotalBasementScore'] / df.loc[mask_has_finished, 'BasementFinishedSF']
    )
    
    df.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                     'BasementFacilitySF1', 'BasementFacilitySF2',
                     'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)

print("Merging porch features...")
for df in [X_train, X_test]:
    df['TotalPorchArea'] = (
        df['OpenVerandaArea'].fillna(0) + 
        df['EnclosedVerandaArea'].fillna(0) + 
        df['SeasonalPorchArea'].fillna(0) + 
        df['ScreenPorchArea'].fillna(0)
    )
    df.drop(columns=['OpenVerandaArea', 'EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea'], 
             errors='ignore', inplace=True)

# Columns to drop (Original Multicollinearity/Redundancy list)
columns_to_drop = [
    'Id', 'PoolQuality', 'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 
    'BasementHalfBaths', 'LowQualityArea',
    'ParkingCapacity', 
    'GroundFloorArea', 
    'TotalRooms', 
    'UpperFloorArea', 
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    
    df['RenovationYear'] = df['RenovationYear'].fillna(df['ConstructionYear'])
    df.loc[df['RenovationYear'] == 0, 'RenovationYear'] = df.loc[df['RenovationYear'] == 0, 'ConstructionYear']
    
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    if 'FullBaths' in df.columns and 'HalfBaths' in df.columns:
        df['TotalBathrooms'] = df['FullBaths'] + (0.5 * df['HalfBaths'])
    
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

# --- 3.5 Multicollinearity Pruning ---
multicollinearity_drop_features = [
    'BasementFinishedSF',   
    'UsableArea',           
    'FullBaths',            
    'RoadAccessLength'      
]
X_train_fe = X_train_fe.drop(columns=multicollinearity_drop_features, errors='ignore')
X_test_fe = X_test_fe.drop(columns=multicollinearity_drop_features, errors='ignore')


numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering & Pruning Complete ---")
print(f"Number of numerical features (Pruned): {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines ---

# Numerical Transformer: Impute and Scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
])

# Categorical Transformer: Impute and One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)


# --- 5. Model Training (Bayesian Optimized Ridge with 6 Folds) ---
print("\n--- 5. Model Training (Bayesian Optimized Ridge with 6 Folds) ---")

poly_ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=1, include_bias=False)), # Linear Model (degree=1)
    ('regressor', Ridge(random_state=42, max_iter=10000))
])

# Define Bayesian search space
search_space = {
    'regressor__alpha': Real(1e-6, 1e1, prior='log-uniform'), 
    'regressor__fit_intercept': Categorical([True, False]),
}

# CHANGE: Using n_splits=6 for KFold to achieve 6-fold cross-validation
cv_strategy = KFold(n_splits=6, shuffle=True, random_state=42)

bayes_search = BayesSearchCV(
    estimator=poly_ridge_pipeline,
    search_spaces=search_space,
    n_iter=20, 
    cv=cv_strategy, # Changed to 6 folds
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

bayes_search.fit(X_train_fe, y_train_log)

print("\nBest Bayesian Hyperparameters:")
print(bayes_search.best_params_)

# Train final model using best parameters
final_model = bayes_search.best_estimator_

# Evaluate optimized model on training data
y_train_log_pred = final_model.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0

rmse_train = root_mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"\n6-Fold CV RMSE (Train): {rmse_train:,.2f}")
print(f"6-Fold CV R² (Train): {r2_train:.4f}")


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = final_model.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'bayesian_6_fold_final.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6

Merging basement features...
Merging porch features...
Cleaned training data shape: (1194, 65)
Test data shape: (260, 65)

--- 3. Feature Engineering & Pruning Complete ---
Number of numerical features (Pruned): 29
Number of categorical features: 37
Final training features shape: (1194, 66)

--- 5. Model Training (Bayesian Optimized Ridge with 6 Folds) ---
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting

In [4]:
##overallQuality<3
##ParkingCap taken

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    # Attempt to load from a common nested folder structure
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    # Fallback to the current directory
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        # Exit or raise error if data can't be loaded
        raise

# Separate features and target before dropping/cleaning
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal (New Step) ---
# Remove samples based on Target Value (extremely low/high values)
# and based on large/extreme values in key predictor columns (UsableArea and OverallQuality).
initial_row_count = len(df_train)

# 1. Target-based cleaning: Remove extreme values (e.g., bottom 0.1% and top 0.1% of prices)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

# 2. Predictor-based cleaning (Common for this type of dataset)
# Remove properties with extremely large UsableArea (e.g., > 4000 sq ft)
if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)

# Remove properties with poor OverallQuality and high UsableArea (often errors)
if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 3) & (X_train_raw['UsableArea'] > 3000))

# Apply the mask to both features and target
X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()

# Sync test_ids for the remaining rows
test_ids_cleaned = X_test['Id'] # No change to test IDs as we don't drop test rows

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# --- 1.6 Advanced Feature Merging (Before Dropping Columns) ---
# Merge Basement Features into Weighted Quality Score
print("\nMerging basement features...")
basement_quality_map = {
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0
}

for df in [X_train, X_test]:
    # Fill NaN values
    df['BasementFacilitySF1'] = df['BasementFacilitySF1'].fillna(0)
    df['BasementFacilitySF2'] = df['BasementFacilitySF2'].fillna(0)
    
    # Map types to scores
    df['Type1_Score'] = df['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
    df['Type2_Score'] = df['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)
    
    # Calculate weighted quality score
    df['TotalBasementScore'] = (df['Type1_Score'] * df['BasementFacilitySF1']) + (df['Type2_Score'] * df['BasementFacilitySF2'])
    df['BasementFinishedSF'] = df['BasementFacilitySF1'] + df['BasementFacilitySF2']
    
    # Average basement quality (cast to float to avoid dtype warning)
    df['BasementAvgQuality'] = 0.0
    mask_has_finished = df['BasementFinishedSF'] > 0
    df.loc[mask_has_finished, 'BasementAvgQuality'] = (
        df.loc[mask_has_finished, 'TotalBasementScore'] / df.loc[mask_has_finished, 'BasementFinishedSF']
    )
    
    # Drop original basement facility columns
    df.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                     'BasementFacilitySF1', 'BasementFacilitySF2',
                     'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)


# --- NEW SECTION: Feature Engineering for Pool ---
print("Engineering Pool features...")
# Define a quality map for PoolQuality. 
# 'None' (or NaN) = 0, 'Fa' (Fair) = 1, 'Ex' (Excellent) = 4.
# Added 'TA' (Typical) and 'Gd' (Good) as they are common.
pool_quality_map = {
    'None': 0,
    'Fa': 1,
    'Ex': 2,
}

for df in [X_train, X_test]:
    # Fill NaN values first. 'PoolArea' NaNs mean 0 area.
    df['SwimmingPoolArea'] = df['SwimmingPoolArea'].fillna(0)
    df['PoolQuality'] = df['PoolQuality'].fillna('None')
    
    # Map quality strings to numeric scores
    df['PoolQuality_Score'] = df['PoolQuality'].map(pool_quality_map).fillna(0)
    
    # Create the new feature by multiplying quality by area
    df['TotalPoolScore'] = df['PoolQuality_Score'] * df['SwimmingPoolArea']
    
    # Now drop the original columns since they are combined
    df.drop(columns=['PoolQuality', 'SwimmingPoolArea','PoolQuality_Score'],
            errors='ignore', inplace=True)
    
# Merge Porch/Veranda Features
print("Merging porch features...")
for df in [X_train, X_test]:
    df['TotalPorchArea'] = (
        df['OpenVerandaArea'].fillna(0)+
        df['EnclosedVerandaArea'].fillna(0) + 
        df['SeasonalPorchArea'].fillna(0) + 
        df['ScreenPorchArea'].fillna(0)
    )
    df.drop(columns=['OpenVerandaArea','EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea'], 
            errors='ignore', inplace=True)
# --- END OF NEW SECTION ---
# Columns to drop (including multicollinearity removal)
columns_to_drop = [
    'Id',  'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 
    'BasementHalfBaths', 'LowQualityArea',
    # Multicollinearity removal
    'ParkingArea',  # Keep ParkingCapacity
    # 'GroundFloorArea',  # Keep UsableArea
    # 'TotalRooms',       # Keep FullBaths
    # 'UpperFloorArea',   # Captured in UsableArea
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    
# --- NEW ORDINAL PARKING MAPPING ---
    # Define maps for ordinal parking features
    quality_map_5pt = {
        'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0
    }
    parking_finish_map = {
        'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0
    }
    
    # Overwrite categorical columns with their new numerical scores
    
    # Impute and map Quality
    df['ParkingQuality'] = df['ParkingQuality'].fillna('None').map(quality_map_5pt).fillna(0)
    
    # Impute and map Condition
    df['ParkingCondition'] = df['ParkingCondition'].fillna('None').map(quality_map_5pt).fillna(0)
    
    # Impute and map Finish
    df['ParkingFinish'] = df['ParkingFinish'].fillna('None').map(parking_finish_map).fillna(0)
    

    # --- NEW PROPERTY FUNCTIONALITY MAPPING ---
    # This feature represents deductions from 'Typical'
    functionality_map = {
        'Typ': 7,  # Typical
        'Min1': 6, # Minor Deductions 1
        'Min2': 5, # Minor Deductions 2
        'Mod': 4,  # Moderate Deductions
        'Maj1': 3, # Major Deductions 1
        'Maj2': 2, # Major Deductions 2
        'Sev': 1,  # Severely Damaged
        'None': 0  # Assuming 'None' is worse than 'Sev' or not applicable
    }
    ##--- NEW EXTERIOR QUALITY/CONDITION MAPPING ---
    df['ExteriorQuality'] = df['ExteriorQuality'].fillna('None').map(quality_map_5pt).fillna(0)
    df['ExteriorCondition'] = df['ExteriorCondition'].fillna('None').map(quality_map_5pt).fillna(0)
    
    # # --- NEW BASEMENT FEATURES MAPPING ---
    # # BasementHeight (uses 5-point map)
    # df['BasementHeight'] = df['BasementHeight'].fillna('None').map(quality_map_5pt).fillna(0)
    
    # BasementCondition (uses 5-point map)
    df['BasementCondition'] = df['BasementCondition'].fillna('None').map(quality_map_5pt).fillna(0)
    
    # BasementExposure (custom map)
    exposure_map = {
        'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0
    }
    df['BasementExposure'] = df['BasementExposure'].fillna('None').map(exposure_map).fillna(0)
    

    # --- NEW KITCHEN/HEATING QUALITY MAPPING ---
    df['KitchenQuality'] = df['KitchenQuality'].fillna('None').map(quality_map_5pt).fillna(0)
    df['HeatingQuality'] = df['HeatingQuality'].fillna('None').map(quality_map_5pt).fillna(0)
    # --- END NEW KITCHEN/HEATING SECTION ---

    # Impute and map PropertyFunctionality. 
    # Use fillna('Typ') if 'None' should be treated as 'Typical'
    df['PropertyFunctionality'] = df['PropertyFunctionality'].fillna('None').map(functionality_map).fillna(0)
    # --- END NEW FUNCTIONALITY SECTION ---
    # By overwriting the columns, they will now be automatically
    # treated as 'numerical' features by the rest of the script.
    # Time-based features
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    
    # Handle RenovationYear: if 0 or missing, use ConstructionYear
    df['RenovationYear'] = df['RenovationYear'].fillna(df['ConstructionYear'])
    df.loc[df['RenovationYear'] == 0, 'RenovationYear'] = df.loc[df['RenovationYear'] == 0, 'ConstructionYear']
    
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    
    # Interaction features
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    # Bathroom quality feature
    if 'FullBaths' in df.columns and 'HalfBaths' in df.columns:
        df['TotalBathrooms'] = df['FullBaths'] + (0.5 * df['HalfBaths'])
    
    # Log transformation for skewed numerical features
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    # Drop source columns used for feature engineering
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines ---

# Numerical Transformer: Impute, Scale, and add Polynomial Features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
    # The degree=2 poly features were commented out in your previous code to speed things up. 
    # I'll keep them commented unless performance is a concern.
    # ('poly', PolynomialFeatures(degree=2, include_bias=False)) 
])


# Categorical Transformer: Impute and One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

# --- 5. Model Training: Linear Regression + Bayesian Optimization ---
print("\n--- 5A. Baseline Linear Regression ---")

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold

# Baseline Linear Regression pipeline
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train baseline model
lr_pipeline.fit(X_train_fe, y_train_log)

# Evaluate baseline model
y_train_log_pred = lr_pipeline.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0

rmse_train_lr = root_mean_squared_error(y_train, y_train_pred)
r2_train_lr = r2_score(y_train, y_train_pred)

print(f"Linear Regression RMSE (Train): {rmse_train_lr:,.2f}")
print(f"Linear Regression R² (Train): {r2_train_lr:.4f}")


# --- 5B. Bayesian Optimization for Linear Regression ---
print("\n--- 5B. Bayesian Optimization for Linear Regression ---")

from skopt import BayesSearchCV
from skopt.space import Categorical, Real
from sklearn.linear_model import Ridge

# We'll wrap Linear Regression in Ridge to allow tuning small alpha (acts as regularization)
# Because pure LinearRegression has almost no hyperparameters to tune
ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(random_state=42, max_iter=10000))
])

# Define Bayesian search space
search_space = {
    'regressor__alpha': Real(1e-6, 1e1, prior='log-uniform'),  # small regularization
    'regressor__fit_intercept': Categorical([True, False]),
    'regressor__tol': Real(1e-5, 1e-2, prior='log-uniform')
}

cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

bayes_search = BayesSearchCV(
    estimator=ridge_pipeline,
    search_spaces=search_space,
    n_iter=30,
    cv=cv_strategy,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

bayes_search.fit(X_train_fe, y_train_log)

print("\nBest Bayesian Hyperparameters:")
print(bayes_search.best_params_)

# Evaluate optimized model
best_model = bayes_search.best_estimator_

y_train_log_pred_bayes = best_model.predict(X_train_fe)
y_train_pred_bayes = np.expm1(y_train_log_pred_bayes)
y_train_pred_bayes[y_train_pred_bayes < 0] = 0

rmse_train_bayes = root_mean_squared_error(y_train, y_train_pred_bayes)
r2_train_bayes = r2_score(y_train, y_train_pred_bayes)

print(f"\nBayesian Optimized Linear Regression RMSE (Train): {rmse_train_bayes:,.2f}")
print(f"Bayesian Optimized Linear Regression R² (Train): {r2_train_bayes:.4f}")


# --- Compare and Select Best Model ---
print("\n--- Model Comparison ---")
print(f"Baseline Linear Regression RMSE: {rmse_train_lr:,.2f} | R²: {r2_train_lr:.4f}")
print(f"Bayesian Optimized Regression RMSE: {rmse_train_bayes:,.2f} | R²: {r2_train_bayes:.4f}")

if rmse_train_bayes < rmse_train_lr:
    final_model = best_model
    print("✅ Using Bayesian-optimized Linear Regression as final model.")
else:
    final_model = lr_pipeline
    print("✅ Using baseline Linear Regression as final model (performed better or equal).")


# --- NEW SECTION: Display Final Features ---
print("\n--- Final Features Considered by the Model ---")
try:
    # Access the 'preprocessor' step from the final fitted pipeline
    preprocessor_step = final_model.named_steps['preprocessor']
    
    # Get the feature names out
    final_feature_names = preprocessor_step.get_feature_names_out()
    
    print(f"Total number of features after preprocessing: {len(final_feature_names)}")
    print("List of all features fed into the regressor:")
    
    # Print all feature names
    for i, name in enumerate(final_feature_names):
        print(f"  {i+1}: {name}")

except Exception as e:
    print(f"Could not retrieve feature names: {e}")
# --- END OF NEW SECTION ---


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = final_model.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 # Final check to ensure non-negative values

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'linear_selfMod2Parking_submission_cleaned.csv' # Changed filename to reflect cleaning
submission_df.to_csv(submission_filename, index=False)

print("Prediction process complete.")
print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6

Merging basement features...
Engineering Pool features...
Merging porch features...
Cleaned training data shape: (1194, 68)
Test data shape: (260, 68)

--- 3. Feature Engineering Complete ---
Number of numerical features: 45
Number of categorical features: 27
Final training features shape: (1194, 72)

--- 5A. Baseline Linear Regression ---
Linear Regression RMSE (Train): 16,263.18
Linear Regression R² (Train): 0.9527

--- 5B. Bayesian Optimization for Linear Regression ---
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fittin

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    # Attempt to load from a common nested folder structure
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    # Fallback to the current directory
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        # Exit or raise error if data can't be loaded
        raise

# Separate features and target before dropping/cleaning
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal (New Step) ---
# Remove samples based on Target Value (extremely low/high values)
# and based on large/extreme values in key predictor columns (UsableArea and OverallQuality).
initial_row_count = len(df_train)

# 1. Target-based cleaning: Remove extreme values (e.g., bottom 0.1% and top 0.1% of prices)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

# 2. Predictor-based cleaning (Common for this type of dataset)
# Remove properties with extremely large UsableArea (e.g., > 4000 sq ft)
if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)

# Remove properties with poor OverallQuality and high UsableArea (often errors)
if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 3) & (X_train_raw['UsableArea'] > 3000))

# Apply the mask to both features and target
X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()

# Sync test_ids for the remaining rows
test_ids_cleaned = X_test['Id'] # No change to test IDs as we don't drop test rows

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# --- 1.6 Advanced Feature Merging (Before Dropping Columns) ---
# Merge Basement Features into Weighted Quality Score
print("\nMerging basement features...")
basement_quality_map = {
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0
}

for df in [X_train, X_test]:
    # Fill NaN values
    df['BasementFacilitySF1'] = df['BasementFacilitySF1'].fillna(0)
    df['BasementFacilitySF2'] = df['BasementFacilitySF2'].fillna(0)
    
    # Map types to scores
    df['Type1_Score'] = df['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
    df['Type2_Score'] = df['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)
    
    # Calculate weighted quality score
    df['TotalBasementScore'] = (df['Type1_Score'] * df['BasementFacilitySF1']) + (df['Type2_Score'] * df['BasementFacilitySF2'])
    df['BasementFinishedSF'] = df['BasementFacilitySF1'] + df['BasementFacilitySF2']
    
    # The 'BasementAvgQuality' calculation is removed as requested.
    # The intermediate score columns are still dropped to clean up the feature set.
    
    # Drop original basement facility columns
    df.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                     'BasementFacilitySF1', 'BasementFacilitySF2',
                     'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)


# --- NEW SECTION: Feature Engineering for Pool ---
print("Engineering Pool features...")
# Define a quality map for PoolQuality. 
# 'None' (or NaN) = 0, 'Fa' (Fair) = 1, 'Ex' (Excellent) = 4.
# Added 'TA' (Typical) and 'Gd' (Good) as they are common.
pool_quality_map = {
    'None': 0,
    'Fa': 1,
    'Ex': 2,
}

for df in [X_train, X_test]:
    # Fill NaN values first. 'PoolArea' NaNs mean 0 area.
    df['SwimmingPoolArea'] = df['SwimmingPoolArea'].fillna(0)
    df['PoolQuality'] = df['PoolQuality'].fillna('None')
    
    # Map quality strings to numeric scores
    df['PoolQuality_Score'] = df['PoolQuality'].map(pool_quality_map).fillna(0)
    
    # Create the new feature by multiplying quality by area
    df['TotalPoolScore'] = df['PoolQuality_Score'] * df['SwimmingPoolArea']
    
    # Now drop the original columns since they are combined
    df.drop(columns=['PoolQuality', 'SwimmingPoolArea','PoolQuality_Score'],
             errors='ignore', inplace=True)
    
# Merge Porch/Veranda Features
print("Merging porch features...")
for df in [X_train, X_test]:
    df['TotalPorchArea'] = (
        df['OpenVerandaArea'].fillna(0)+
        df['EnclosedVerandaArea'].fillna(0) + 
        df['SeasonalPorchArea'].fillna(0) + 
        df['ScreenPorchArea'].fillna(0)
    )
    df.drop(columns=['OpenVerandaArea','EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea'], 
             errors='ignore', inplace=True)
# --- END OF NEW SECTION ---
# Columns to drop (including multicollinearity removal)
columns_to_drop = [
    'Id',  'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 
    'BasementHalfBaths', 'LowQualityArea',
    # Multicollinearity removal
    'ParkingArea',  # Keep ParkingCapacity
    # 'GroundFloorArea',  # Keep UsableArea
    # 'TotalRooms',      # Keep FullBaths
    # 'UpperFloorArea',   # Captured in UsableArea
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    
# --- NEW ORDINAL PARKING MAPPING ---
    # Define maps for ordinal parking features
    quality_map_5pt = {
        'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0
    }
    parking_finish_map = {
        'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0
    }
    
    # Overwrite categorical columns with their new numerical scores
    
    # Impute and map Quality
    df['ParkingQuality'] = df['ParkingQuality'].fillna('None').map(quality_map_5pt).fillna(0)
    
    # Impute and map Condition
    df['ParkingCondition'] = df['ParkingCondition'].fillna('None').map(quality_map_5pt).fillna(0)
    
    # Impute and map Finish
    df['ParkingFinish'] = df['ParkingFinish'].fillna('None').map(parking_finish_map).fillna(0)
    

    # --- NEW PROPERTY FUNCTIONALITY MAPPING ---
    # This feature represents deductions from 'Typical'
    functionality_map = {
        'Typ': 7,  # Typical
        'Min1': 6, # Minor Deductions 1
        'Min2': 5, # Minor Deductions 2
        'Mod': 4,  # Moderate Deductions
        'Maj1': 3, # Major Deductions 1
        'Maj2': 2, # Major Deductions 2
        'Sev': 1,  # Severely Damaged
        'None': 0  # Assuming 'None' is worse than 'Sev' or not applicable
    }
    ##--- NEW EXTERIOR QUALITY/CONDITION MAPPING ---
    df['ExteriorQuality'] = df['ExteriorQuality'].fillna('None').map(quality_map_5pt).fillna(0)
    df['ExteriorCondition'] = df['ExteriorCondition'].fillna('None').map(quality_map_5pt).fillna(0)
    
    # # --- NEW BASEMENT FEATURES MAPPING ---
    # # BasementHeight (uses 5-point map)
    # df['BasementHeight'] = df['BasementHeight'].fillna('None').map(quality_map_5pt).fillna(0)
    
    # BasementCondition (uses 5-point map)
    df['BasementCondition'] = df['BasementCondition'].fillna('None').map(quality_map_5pt).fillna(0)
    
    # BasementExposure (custom map)
    exposure_map = {
        'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0
    }
    df['BasementExposure'] = df['BasementExposure'].fillna('None').map(exposure_map).fillna(0)
    

    # --- NEW KITCHEN/HEATING QUALITY MAPPING ---
    df['KitchenQuality'] = df['KitchenQuality'].fillna('None').map(quality_map_5pt).fillna(0)
    df['HeatingQuality'] = df['HeatingQuality'].fillna('None').map(quality_map_5pt).fillna(0)
    # --- END NEW KITCHEN/HEATING SECTION ---

    # Impute and map PropertyFunctionality. 
    # Use fillna('Typ') if 'None' should be treated as 'Typical'
    df['PropertyFunctionality'] = df['PropertyFunctionality'].fillna('None').map(functionality_map).fillna(0)
    # --- END NEW FUNCTIONALITY SECTION ---
    # By overwriting the columns, they will now be automatically
    # treated as 'numerical' features by the rest of the script.
    # Time-based features
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    
    # Handle RenovationYear: if 0 or missing, use ConstructionYear
    df['RenovationYear'] = df['RenovationYear'].fillna(df['ConstructionYear'])
    df.loc[df['RenovationYear'] == 0, 'RenovationYear'] = df.loc[df['RenovationYear'] == 0, 'ConstructionYear']
    
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    
    # Interaction features
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    # Bathroom quality feature
    if 'FullBaths' in df.columns and 'HalfBaths' in df.columns:
        df['TotalBathrooms'] = df['FullBaths'] + (0.5 * df['HalfBaths'])
    
    # Log transformation for skewed numerical features
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    # Drop source columns used for feature engineering
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines ---

# Numerical Transformer: Impute, Scale, and add Polynomial Features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
    # The degree=2 poly features were commented out in your previous code to speed things up. 
    # I'll keep them commented unless performance is a concern.
    # ('poly', PolynomialFeatures(degree=2, include_bias=False)) 
])


# Categorical Transformer: Impute and One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

# --- 5. Model Training: Linear Regression + Bayesian Optimization ---
print("\n--- 5A. Baseline Linear Regression ---")

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold

# Baseline Linear Regression pipeline
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train baseline model
lr_pipeline.fit(X_train_fe, y_train_log)

# Evaluate baseline model
y_train_log_pred = lr_pipeline.predict(X_train_fe)
y_train_pred = np.expm1(y_train_log_pred)
y_train_pred[y_train_pred < 0] = 0

rmse_train_lr = root_mean_squared_error(y_train, y_train_pred)
r2_train_lr = r2_score(y_train, y_train_pred)

print(f"Linear Regression RMSE (Train): {rmse_train_lr:,.2f}")
print(f"Linear Regression R² (Train): {r2_train_lr:.4f}")


# --- 5B. Bayesian Optimization for Linear Regression ---
print("\n--- 5B. Bayesian Optimization for Linear Regression ---")

from skopt import BayesSearchCV
from skopt.space import Categorical, Real
from sklearn.linear_model import Ridge

# We'll wrap Linear Regression in Ridge to allow tuning small alpha (acts as regularization)
# Because pure LinearRegression has almost no hyperparameters to tune
ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(random_state=42, max_iter=10000))
])

# Define Bayesian search space
search_space = {
    'regressor__alpha': Real(1e-6, 1e1, prior='log-uniform'),  # small regularization
    'regressor__fit_intercept': Categorical([True, False]),
    'regressor__tol': Real(1e-5, 1e-2, prior='log-uniform')
}

cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

bayes_search = BayesSearchCV(
    estimator=ridge_pipeline,
    search_spaces=search_space,
    n_iter=30,
    cv=cv_strategy,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

bayes_search.fit(X_train_fe, y_train_log)

print("\nBest Bayesian Hyperparameters:")
print(bayes_search.best_params_)

# Evaluate optimized model
best_model = bayes_search.best_estimator_

y_train_log_pred_bayes = best_model.predict(X_train_fe)
y_train_pred_bayes = np.expm1(y_train_log_pred_bayes)
y_train_pred_bayes[y_train_pred_bayes < 0] = 0

rmse_train_bayes = root_mean_squared_error(y_train, y_train_pred_bayes)
r2_train_bayes = r2_score(y_train, y_train_pred_bayes)

print(f"\nBayesian Optimized Linear Regression RMSE (Train): {rmse_train_bayes:,.2f}")
print(f"Bayesian Optimized Linear Regression R² (Train): {r2_train_bayes:.4f}")


# --- Compare and Select Best Model ---
print("\n--- Model Comparison ---")
print(f"Baseline Linear Regression RMSE: {rmse_train_lr:,.2f} | R²: {r2_train_lr:.4f}")
print(f"Bayesian Optimized Regression RMSE: {rmse_train_bayes:,.2f} | R²: {r2_train_bayes:.4f}")

if rmse_train_bayes < rmse_train_lr:
    final_model = best_model
    print("✅ Using Bayesian-optimized Linear Regression as final model.")
else:
    final_model = lr_pipeline
    print("✅ Using baseline Linear Regression as final model (performed better or equal).")


# --- NEW SECTION: Display Final Features ---
print("\n--- Final Features Considered by the Model ---")
try:
    # Access the 'preprocessor' step from the final fitted pipeline
    preprocessor_step = final_model.named_steps['preprocessor']
    
    # Get the feature names out
    final_feature_names = preprocessor_step.get_feature_names_out()
    
    print(f"Total number of features after preprocessing: {len(final_feature_names)}")
    print("List of all features fed into the regressor:")
    
    # Print all feature names
    for i, name in enumerate(final_feature_names):
        print(f"  {i+1}: {name}")

except Exception as e:
    print(f"Could not retrieve feature names: {e}")
# --- END OF NEW SECTION ---


# --- 6. Prediction and Submission File Creation ---
print("\n--- 6. Prediction & Submission ---")
y_test_log_pred = final_model.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 # Final check to ensure non-negative values

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'linear_selfMod2Parking_submission_cleaned.csv' # Changed filename to reflect cleaning
submission_df.to_csv(submission_filename, index=False)

print("Prediction process complete.")
print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6

Merging basement features...
Engineering Pool features...
Merging porch features...
Cleaned training data shape: (1194, 67)
Test data shape: (260, 67)

--- 3. Feature Engineering Complete ---
Number of numerical features: 44
Number of categorical features: 27
Final training features shape: (1194, 71)

--- 5A. Baseline Linear Regression ---
Linear Regression RMSE (Train): 16,246.52
Linear Regression R² (Train): 0.9528

--- 5B. Bayesian Optimization for Linear Regression ---
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fittin

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

# Define file paths
train_data_path = 'train.csv'
test_data_path = 'test.csv'
TARGET_COLUMN = 'HotelValue'

# --- 1. Data Loading and Initial Setup ---
print("--- 1. Data Loading ---")
try:
    # Attempt to load from a common nested folder structure
    df_train = pd.read_csv('Hotel-Property-Value-Dataset/train.csv')
    df_test = pd.read_csv('Hotel-Property-Value-Dataset/test.csv')
    print("Loaded data from 'Hotel-Property-Value-Dataset/' folder.")
except FileNotFoundError:
    # Fallback to the current directory
    try:
        df_train = pd.read_csv(train_data_path)
        df_test = pd.read_csv(test_data_path)
        print("Loaded data from current directory.")
    except FileNotFoundError as e:
        print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are in the correct location.")
        print(f"Details: {e}")
        # Exit or raise error if data can't be loaded
        raise

# Separate features and target before dropping/cleaning
X_train_raw = df_train.drop(columns=[TARGET_COLUMN])
y_train_raw = df_train[TARGET_COLUMN]
X_test = df_test.copy()
test_ids = X_test['Id']

print(f"Initial training data shape: {X_train_raw.shape}")


# --- 1.5 Outlier Removal (New Step) ---
# Remove samples based on Target Value (extremely low/high values)
# and based on large/extreme values in key predictor columns (UsableArea and OverallQuality).
initial_row_count = len(df_train)

# 1. Target-based cleaning: Remove extreme values (e.g., bottom 0.1% and top 0.1% of prices)
y_lower_bound = y_train_raw.quantile(0.001)
y_upper_bound = y_train_raw.quantile(0.999)
outlier_mask = (y_train_raw >= y_lower_bound) & (y_train_raw <= y_upper_bound)

# 2. Predictor-based cleaning (Common for this type of dataset)
# Remove properties with extremely large UsableArea (e.g., > 4000 sq ft)
if 'UsableArea' in X_train_raw.columns:
    outlier_mask &= (X_train_raw['UsableArea'] < 4000)

# Remove properties with poor OverallQuality and high UsableArea (often errors)
if 'OverallQuality' in X_train_raw.columns and 'UsableArea' in X_train_raw.columns:
    outlier_mask &= ~((X_train_raw['OverallQuality'] < 3) & (X_train_raw['UsableArea'] > 3000))

# Apply the mask to both features and target
X_train = X_train_raw[outlier_mask].copy()
y_train = y_train_raw[outlier_mask].copy()

# Sync test_ids for the remaining rows
test_ids_cleaned = X_test['Id'] # No change to test IDs as we don't drop test rows

print(f"Rows removed due to extreme outliers: {initial_row_count - len(X_train)}")


# --- 1.6 Advanced Feature Merging (Before Dropping Columns) ---
# Merge Basement Features into Weighted Quality Score
print("\nMerging basement features...")
basement_quality_map = {
    'GLQ': 5, 'ALQ': 4, 'BLQ': 3, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0
}

for df in [X_train, X_test]:
    # Fill NaN values
    df['BasementFacilitySF1'] = df['BasementFacilitySF1'].fillna(0)
    df['BasementFacilitySF2'] = df['BasementFacilitySF2'].fillna(0)
    
    # Map types to scores
    df['Type1_Score'] = df['BasementFacilityType1'].fillna('None').map(basement_quality_map).fillna(0)
    df['Type2_Score'] = df['BasementFacilityType2'].fillna('None').map(basement_quality_map).fillna(0)
    
    # Calculate weighted quality score
    df['TotalBasementScore'] = (df['Type1_Score'] * df['BasementFacilitySF1']) + (df['Type2_Score'] * df['BasementFacilitySF2'])
    df['BasementFinishedSF'] = df['BasementFacilitySF1'] + df['BasementFacilitySF2']
    
    # Drop original basement facility columns
    df.drop(columns=['BasementFacilityType1', 'BasementFacilityType2', 
                     'BasementFacilitySF1', 'BasementFacilitySF2',
                     'Type1_Score', 'Type2_Score'], errors='ignore', inplace=True)


# --- NEW SECTION: Feature Engineering for Pool ---
print("Engineering Pool features...")
# Define a quality map for PoolQuality. 
# 'None' (or NaN) = 0, 'Fa' (Fair) = 1, 'Ex' (Excellent) = 4.
# Added 'TA' (Typical) and 'Gd' (Good) as they are common.
pool_quality_map = {
    'None': 0,
    'Fa': 1,
    'Ex': 2,
}

for df in [X_train, X_test]:
    # Fill NaN values first. 'PoolArea' NaNs mean 0 area.
    df['SwimmingPoolArea'] = df['SwimmingPoolArea'].fillna(0)
    df['PoolQuality'] = df['PoolQuality'].fillna('None')
    
    # Map quality strings to numeric scores
    df['PoolQuality_Score'] = df['PoolQuality'].map(pool_quality_map).fillna(0)
    
    # Create the new feature by multiplying quality by area
    df['TotalPoolScore'] = df['PoolQuality_Score'] * df['SwimmingPoolArea']
    
    # Now drop the original columns since they are combined
    df.drop(columns=['PoolQuality', 'SwimmingPoolArea','PoolQuality_Score'],
             errors='ignore', inplace=True)
    
# Merge Porch/Veranda Features
print("Merging porch features...")
for df in [X_train, X_test]:
    df['TotalPorchArea'] = (
        df['OpenVerandaArea'].fillna(0)+
        df['EnclosedVerandaArea'].fillna(0) + 
        df['SeasonalPorchArea'].fillna(0) + 
        df['ScreenPorchArea'].fillna(0)
    )
    df.drop(columns=['OpenVerandaArea','EnclosedVerandaArea', 'SeasonalPorchArea', 'ScreenPorchArea'], 
             errors='ignore', inplace=True)
# --- END OF NEW SECTION ---
# Columns to drop (including multicollinearity removal)
columns_to_drop = [
    'Id',  'BoundaryFence', 'ExtraFacility', 'ServiceLaneType', 
    'BasementHalfBaths', 'LowQualityArea','FacadeType',
    # Multicollinearity removal
    'ParkingArea',  # Keep ParkingCapacity
    # 'GroundFloorArea',  # Keep UsableArea
    # 'TotalRooms',      # Keep FullBaths
    # 'UpperFloorArea',   # Captured in UsableArea
]
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

print(f"Cleaned training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# --- 2. Target Transformation ---
y_train_log = np.log1p(y_train)


# --- 3. Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    
# --- NEW ORDINAL PARKING MAPPING ---
    # Define maps for ordinal parking features
    quality_map_5pt = {
        'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0
    }
    parking_finish_map = {
        'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0
    }
    
    # Overwrite categorical columns with their new numerical scores
    
    # Impute and map Quality
    df['ParkingQuality'] = df['ParkingQuality'].fillna('None').map(quality_map_5pt).fillna(0)
    
    # Impute and map Condition
    df['ParkingCondition'] = df['ParkingCondition'].fillna('None').map(quality_map_5pt).fillna(0)
    
    # Impute and map Finish
    df['ParkingFinish'] = df['ParkingFinish'].fillna('None').map(parking_finish_map).fillna(0)
    

    # --- NEW PROPERTY FUNCTIONALITY MAPPING ---
    # This feature represents deductions from 'Typical'
    functionality_map = {
        'Typ': 7,  # Typical
        'Min1': 6, # Minor Deductions 1
        'Min2': 5, # Minor Deductions 2
        'Mod': 4,  # Moderate Deductions
        'Maj1': 3, # Major Deductions 1
        'Maj2': 2, # Major Deductions 2
        'Sev': 1,  # Severely Damaged
        'None': 0  # Assuming 'None' is worse than 'Sev' or not applicable
    }
    ##--- NEW EXTERIOR QUALITY/CONDITION MAPPING ---
    df['ExteriorQuality'] = df['ExteriorQuality'].fillna('None').map(quality_map_5pt).fillna(0)
    df['ExteriorCondition'] = df['ExteriorCondition'].fillna('None').map(quality_map_5pt).fillna(0)
    

    # BasementCondition (uses 5-point map)
    df['BasementCondition'] = df['BasementCondition'].fillna('None').map(quality_map_5pt).fillna(0)
    
    # BasementExposure (custom map)
    exposure_map = {
        'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0
    }
    df['BasementExposure'] = df['BasementExposure'].fillna('None').map(exposure_map).fillna(0)
    

    # --- NEW KITCHEN/HEATING QUALITY MAPPING ---
    df['KitchenQuality'] = df['KitchenQuality'].fillna('None').map(quality_map_5pt).fillna(0)
    df['HeatingQuality'] = df['HeatingQuality'].fillna('None').map(quality_map_5pt).fillna(0)
    # --- END NEW KITCHEN/HEATING SECTION ---

    # Impute and map PropertyFunctionality. 
    # Use fillna('Typ') if 'None' should be treated as 'Typical'
    df['PropertyFunctionality'] = df['PropertyFunctionality'].fillna('None').map(functionality_map).fillna(0)
    # --- END NEW FUNCTIONALITY SECTION ---
    # By overwriting the columns, they will now be automatically
    # treated as 'numerical' features by the rest of the script.
    # Time-based features
    df['HouseAge'] = df['YearSold'] - df['ConstructionYear']
    
    # Handle RenovationYear: if 0 or missing, use ConstructionYear
    df['RenovationYear'] = df['RenovationYear'].fillna(df['ConstructionYear'])
    df.loc[df['RenovationYear'] == 0, 'RenovationYear'] = df.loc[df['RenovationYear'] == 0, 'ConstructionYear']
    
    df['YearsSinceModification'] = df['YearSold'] - df[['ConstructionYear', 'RenovationYear']].max(axis=1)
    
    # Interaction features
    df['QualityArea'] = df['OverallQuality'] * df['UsableArea']
    
    # Bathroom quality feature
    if 'FullBaths' in df.columns and 'HalfBaths' in df.columns:
        df['TotalBathrooms'] = df['FullBaths'] + (0.5 * df['HalfBaths'])
    
    # Log transformation for skewed numerical features
    for col in ['RoadAccessLength', 'LandArea', 'FacadeArea', 'BasementTotalSF', 'ParkingArea']:
        if col in df.columns:
            temp_df = df[col].fillna(0)
            df[col + '_Log'] = np.log1p(temp_df)
    
    # Drop source columns used for feature engineering
    df = df.drop(columns=['ConstructionYear', 'RenovationYear', 'YearSold', 'MonthSold'], errors='ignore')
    return df

X_train_fe = engineer_features(X_train)
X_test_fe = engineer_features(X_test)

numerical_features = X_train_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

print("\n--- 3. Feature Engineering Complete ---")
print(f"Number of numerical features: {len(numerical_features)}")
print(f"Number of categorical features: {len(categorical_features)}")
print(f"Final training features shape: {X_train_fe.shape}")

# --- 4. Preprocessing Pipelines ---

# Numerical Transformer: Impute, Scale, and add Polynomial Features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
    # The degree=2 poly features were commented out in your previous code to speed things up. 
    # I'll keep them commented unless performance is a concern.
    # ('poly', PolynomialFeatures(degree=2, include_bias=False)) 
])


# Categorical Transformer: Impute and One-Hot Encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)
# --- 4.5. Train/Validation Split ---
print("\n--- 4.5. Creating Train/Validation Split ---")
from sklearn.model_selection import KFold, train_test_split
# We split the FE-applied data (X_train_fe) and both log/original y_train
# This gives us a new, smaller training set (fit) and a validation set (val)
X_train_fit, X_val, y_train_fit_log, y_val_log, y_train_fit, y_val = train_test_split(
    X_train_fe, 
    y_train_log, 
    y_train,         # Include the original y_train for scoring
    test_size=0.15,    # 20% of the data will be for validation
    random_state=42
)

print(f"Training split shape: {X_train_fit.shape}")
print(f"Validation split shape: {X_val.shape}")

# --- 5. Model Training (on 80% Split) ---
print("\n--- 5A. Baseline Linear Regression ---")

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
from skopt import BayesSearchCV
from skopt.space import Categorical, Real
from sklearn.linear_model import Ridge

# Baseline Linear Regression pipeline
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train baseline model ON THE 80% TRAINING SPLIT
print("Training baseline model on 80% training data...")
lr_pipeline.fit(X_train_fit, y_train_fit_log)

# --- Evaluate baseline model ON VALIDATION SET ---
print("Evaluating baseline on 20% validation data...")
y_val_log_pred_lr = lr_pipeline.predict(X_val)
y_val_pred_lr = np.expm1(y_val_log_pred_lr)
y_val_pred_lr[y_val_pred_lr < 0] = 0

rmse_val_lr = root_mean_squared_error(y_val, y_val_pred_lr)
r2_val_lr = r2_score(y_val, y_val_pred_lr)

print(f"Baseline Linear Regression RMSE (Validation): {rmse_val_lr:,.2f}")
print(f"Baseline Linear Regression R² (Validation): {r2_val_lr:.4f}")


# --- 5B. Bayesian Optimization for Linear Regression (on 80% Split) ---
print("\n--- 5B. Bayesian Optimization for Linear Regression ---")

ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(random_state=42, max_iter=10000))
])

# Define Bayesian search space
search_space = {
    'regressor__alpha': Real(1e-6, 1e1, prior='log-uniform'),
    'regressor__fit_intercept': Categorical([True, False]),
    'regressor__tol': Real(1e-5, 1e-2, prior='log-uniform')
}

cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

bayes_search = BayesSearchCV(
    estimator=ridge_pipeline,
    search_spaces=search_space,
    n_iter=30,
    cv=cv_strategy,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# Train bayesian search ON THE 80% TRAINING SPLIT
print("Running Bayesian Search on 80% training data...")
bayes_search.fit(X_train_fit, y_train_fit_log)

print("\nBest Bayesian Hyperparameters:")
print(bayes_search.best_params_)

# Get the best model found by the search
best_bayes_model = bayes_search.best_estimator_

# --- Evaluate bayesian model ON VALIDATION SET ---
print("Evaluating Bayesian model on 20% validation data...")
y_val_log_pred_bayes = best_bayes_model.predict(X_val)
y_val_pred_bayes = np.expm1(y_val_log_pred_bayes)
y_val_pred_bayes[y_val_pred_bayes < 0] = 0

rmse_val_bayes = root_mean_squared_error(y_val, y_val_pred_bayes)
r2_val_bayes = r2_score(y_val, y_val_pred_bayes)

print(f"Bayesian Optimized RMSE (Validation): {rmse_val_bayes:,.2f}")
print(f"Bayesian Optimized R² (Validation): {r2_val_bayes:.4f}")


# --- 6. Final Model Selection and Retraining ---
print("\n--- 6. Final Model Selection & Retraining ---")
print("--- Validation Set Performance Comparison ---")
print(f"Baseline Linear Regression RMSE (Validation): {rmse_val_lr:,.2f} | R²: {r2_val_lr:.4f}")
print(f"Bayesian Optimized Regression RMSE (Validation): {rmse_val_bayes:,.2f} | R²: {r2_val_bayes:.4f}")

# Decide which model pipeline to use based on VALIDATION score
if rmse_val_bayes < rmse_val_lr:
    print("\n✅ Bayesian-optimized model performed better. Retraining on FULL training data for submission...")
    # Get the best parameters from the bayes search
    best_params = bayes_search.best_params_
    
    # Create a new, final pipeline
    final_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', Ridge(random_state=42, max_iter=10000))
    ])
    
    # Set the best parameters found by the search
    final_model.set_params(**best_params)
    
    # Retrain on the ENTIRE (100%) training dataset
    final_model.fit(X_train_fe, y_train_log)
    
else:
    print("\n✅ Baseline Linear Regression performed better. Retraining on FULL training data for submission...")
    # The baseline pipeline is already defined, just retrain it on 100% of data
    final_model = lr_pipeline
    final_model.fit(X_train_fe, y_train_log)


# --- 7. Display Final Features ---
print("\n--- 7. Final Features Considered by the Model ---")
try:
    # Access the 'preprocessor' step from the final fitted pipeline
    preprocessor_step = final_model.named_steps['preprocessor']
    
    # Get the feature names out
    final_feature_names = preprocessor_step.get_feature_names_out()
    
    print(f"Total number of features after preprocessing: {len(final_feature_names)}")
    print("List of all features fed into the regressor:")
    
    # Print all feature names
    for i, name in enumerate(final_feature_names):
        print(f"  {i+1}: {name}")

except Exception as e:
    print(f"Could not retrieve feature names: {e}")
# --- END OF NEW SECTION ---


# --- 8. Prediction and Submission File Creation ---
print("\n--- 8. Prediction & Submission ---")
y_test_log_pred = final_model.predict(X_test_fe)

# Reverse log-transformation
y_test_pred = np.expm1(y_test_log_pred)
y_test_pred[y_test_pred < 0] = 0 # Final check

submission_df = pd.DataFrame({
    'Id': test_ids_cleaned,
    TARGET_COLUMN: y_test_pred
})

submission_filename = 'linear&Bayesian.csv' # Changed filename
submission_df.to_csv(submission_filename, index=False)

print("Prediction process complete.")
print(f"Submission file '{submission_filename}' created with {len(submission_df)} predictions.")
print("First 5 test predictions:")
print(submission_df.head())

--- 1. Data Loading ---
Loaded data from 'Hotel-Property-Value-Dataset/' folder.
Initial training data shape: (1200, 80)
Rows removed due to extreme outliers: 6

Merging basement features...
Engineering Pool features...
Merging porch features...
Cleaned training data shape: (1194, 66)
Test data shape: (260, 66)

--- 3. Feature Engineering Complete ---
Number of numerical features: 44
Number of categorical features: 26
Final training features shape: (1194, 70)

--- 4.5. Creating Train/Validation Split ---
Training split shape: (1014, 70)
Validation split shape: (180, 70)

--- 5A. Baseline Linear Regression ---
Training baseline model on 80% training data...
Evaluating baseline on 20% validation data...
Baseline Linear Regression RMSE (Validation): 22,724.25
Baseline Linear Regression R² (Validation): 0.9263

--- 5B. Bayesian Optimization for Linear Regression ---
Running Bayesian Search on 80% training data...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds fo