In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [31]:
# Load the cleaned dataset, reserve the trailing rows as a hold-out test set,
# ordinal-encode the text columns and estimate generalisation error of a
# gradient-boosting regressor with shuffled 5-fold cross-validation.

N_TEST_ROWS = 5000  # trailing rows kept aside as the final hold-out set
SEED = 42           # one seed for both the fold shuffling and the model

df_full = pd.read_csv('datasets/20250623_dataset_cleaned.csv')
if len(df_full) <= N_TEST_ROWS:
    # Without this check the training frame below would silently be empty.
    raise ValueError(
        f"Dataset has {len(df_full)} rows; need more than {N_TEST_ROWS} "
        "to leave any rows for training."
    )

df_test = df_full.tail(N_TEST_ROWS).copy()  # hold-out rows, untouched by CV
df = df_full.iloc[:-N_TEST_ROWS]            # training rows (everything else)

# Features / target split for the training rows.
X = df.drop(columns=['price'])
y = df['price']

# Ordinal-encode every object-dtype column. Categories not seen during fit
# are mapped to -1 when the encoder is later applied with `transform`.
# NOTE(review): the encoder is fitted on all training rows before the folds
# are split, so each validation fold's categories are already known to it.
text_cols = X.select_dtypes(include=["object"]).columns.tolist()
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X[text_cols] = encoder.fit_transform(X[text_cols])

# random_state pins the estimator's internal randomness so that repeated
# runs of this cell (and the later refit) give the same numbers.
model = GradientBoostingRegressor(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.01,
    random_state=SEED,
)

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# greater_is_better=False makes the scorer return the negated MSE, which is
# the "higher is better" convention cross_val_score expects.
neg_mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

cv_scores = cross_val_score(model, X, y, scoring=neg_mse_scorer, cv=kf, n_jobs=-1)

# Flip the sign back to obtain ordinary (positive) per-fold MSE values.
mse_scores = -cv_scores

print("5-Fold CV MSE scores:", mse_scores)
print(f"Mean CV MSE: {mse_scores.mean():.4f}")
print(f"Std CV MSE: {mse_scores.std():.4f}")

5-Fold CV MSE scores: [2521827.30332444 2445741.90193068 2530573.197267   2509035.75475753
 2333283.10507444]
Mean CV MSE: 2468092.2525
Std CV MSE: 73675.9648


In [36]:
# Square root of the mean per-fold MSE. Note that `mse_scores` comes from the
# cross-validation run, so this is a cross-validated RMSE even though the
# printed label says "Train".
cv_rmse = np.sqrt(np.mean(mse_scores))
print(f"Train RMSE: {cv_rmse:.4f}")

Train RMSE: 1571.0163


In [33]:
# Final evaluation on the hold-out rows.
#
# `X_test` / `y_test` are built here from `df_test` because no other cell in
# this notebook defines them; without these lines a fresh
# Restart & Run All fails with NameError at `model.predict(X_test)`.
X_test = df_test.drop(columns=['price'])
y_test = df_test['price']

# Apply the encoder that was fitted on the training rows (transform only,
# never refit) so the text columns get the same codes the model is trained
# on; categories unseen during fit become -1 per the encoder's settings.
X_test[text_cols] = encoder.transform(X_test[text_cols])

# Fit on all training rows: cross_val_score works on clones of the estimator,
# so `model` itself has not been fitted yet at this point.
model.fit(X, y)

y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Test RMSE on last 5000 rows: {rmse:.4f}")

Test RMSE on last 5000 rows: 1487.8863


In [34]:
# Mean absolute error of the hold-out predictions (`y_pred`) against the
# hold-out targets (`y_test`).
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

print(f"Test MAE on last 5000 rows: {mae:.4f}")

Test MAE on last 5000 rows: 1151.0908


In [35]:
# Coefficient of determination of the hold-out predictions (`y_pred`)
# against the hold-out targets (`y_test`).
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"R² score on test set: {r2:.4f}")

R² score on test set: 0.8919
