In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from  sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, mean_absolute_percentage_error, mean_squared_error, r2_score, mean_absolute_error
#from  typing import
import warnings
warnings.filterwarnings('ignore')
from scipy import sparse
from lightgbm import early_stopping, log_evaluation

In [3]:
df = pd.read_csv("data/nonull_cleaned_data.csv")

# Define numeric and categorical columns
numeric_columns = ["bedroomCount", "toilet_and_bath", "habitableSurface", "facedeCount", "hasTerrace", "totalParkingCount"]
categorical_columns = ["type", "subtype", "province", "locality", "postCode", "buildingCondition", "epcScore"]

# Fill missing categorical values with "nan" and convert to string
for col in categorical_columns:
    df[col] = df[col].astype(str).fillna("nan")

# One-hot encode categorical variables
encoder = OneHotEncoder(sparse_output=False, drop="first")
encoded = encoder.fit_transform(df[categorical_columns])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_columns))
# Check for duplicates:
def merge_duplicate_columns(df):
    # Group columns by name and sum their values
    df_merged = df.groupby(df.columns, axis=1).sum()
    return df_merged
# After one-hot encoding
encoded_df.columns = encoded_df.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

# Merge duplicate columns by summing
encoded_df = encoded_df.groupby(encoded_df.columns, axis=1).sum()
# Combine features
y = df["price"]

# Combine features (X only, without target)
X = pd.concat([df[numeric_columns], encoded_df], axis=1)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

# LightGBM dataset objects
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_test, label=y_test)

# LightGBM parameters
params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.1,
    "max_depth": 4,
    "subsample": 0.6,
    "colsample_bytree": 0.6,
    "verbosity": -1,
    "seed": 42
}

# Train model with early stopping
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[val_data],
    valid_names=["valid"],
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=100)
    ]
)

# Predictions
train_preds = model.predict(X_train, num_iteration=model.best_iteration)
test_preds = model.predict(X_test, num_iteration=model.best_iteration)

# Evaluation
print("\n📊 Evaluation Metrics:")
print("MSE (test):", mean_squared_error(y_test, test_preds))
print("R² (train):", r2_score(y_train, train_preds))
print("R² (test):", r2_score(y_test, test_preds))
print("MAE (train):", mean_absolute_error(y_train, train_preds))
print("MAE (test):", mean_absolute_error(y_test, test_preds))
print("MAPE (train):", mean_absolute_percentage_error(y_train, train_preds))
print("MAPE (test):", mean_absolute_percentage_error(y_test, test_preds))

Training until validation scores don't improve for 50 rounds
[100]	valid's rmse: 234106
[200]	valid's rmse: 225897
[300]	valid's rmse: 222395
[400]	valid's rmse: 220236
[500]	valid's rmse: 218864
[600]	valid's rmse: 218499
[700]	valid's rmse: 217335
[800]	valid's rmse: 216737
Early stopping, best iteration is:
[848]	valid's rmse: 216615

📊 Evaluation Metrics:
MSE (test): 46921995587.74079
R² (train): 0.8557609783870117
R² (test): 0.8205358300918923
MAE (train): 94178.061952273
MAE (test): 102439.42029653097
MAPE (train): 0.2096319497680936
MAPE (test): 0.2224225398987471
