In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression


In [74]:
# Load the cleaned housing data.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs on other machines.
df = pd.read_csv("/Users/sa2/Desktop/TKH/House-Prices-Phase-2/code/cleanedhousing.csv")

# SalePrice is the regression target; every remaining column is a feature.
target_col = "SalePrice"
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [75]:
# Linear Regression: fit on the training split, then score on the held-out split.
linreg = LinearRegression().fit(X_train, y_train)
lin_preds = linreg.predict(X_test)

# Compute each metric once, then report it with two decimals.
lin_rmse = np.sqrt(mean_squared_error(y_test, lin_preds))
lin_mae = mean_absolute_error(y_test, lin_preds)
lin_r2 = r2_score(y_test, lin_preds)

print(f"  RMSE: {lin_rmse:.2f}")
print(f"  MAE: {lin_mae:.2f}")
print(f"  R2: {lin_r2:.2f}\n")

  RMSE: 35914.39
  MAE: 22750.15
  R2: 0.83



In [76]:
# Random Forest (baseline): only the seed is set; all other hyperparameters
# are left at the library defaults.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

# One print call, one metric per line (the trailing "\n" leaves a blank line after R2).
rf_mse = mean_squared_error(y_test, rf_preds)
print(
    f"  RMSE: {np.sqrt(rf_mse):.2f}",
    f"  MAE: {mean_absolute_error(y_test, rf_preds):.2f}",
    f"  R2: {r2_score(y_test, rf_preds):.2f}\n",
    sep="\n",
)

  RMSE: 28755.61
  MAE: 18890.08
  R2: 0.89



In [77]:
# Hyperparameter search for the Random Forest.
# The candidate values below were arrived at by re-running the search a few
# times (author's note: "got luckyish with numbers chosen").
param_grid = {
    'n_estimators': [100, 113, 114, 115, 116],
    'max_depth': [None, 27, 26, 28],
}

# 5-fold cross-validation on the training split, scored by negative RMSE
# (scikit-learn maximises scores, hence the "neg_" prefix).
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
grid_search.fit(X_train, y_train)

print("Best RandomForest Params:", grid_search.best_params_)




Best RandomForest Params: {'max_depth': 26, 'n_estimators': 115}


In [79]:
# Evaluate the estimator selected by the grid search on the held-out split.
best_rf = grid_search.best_estimator_
best_preds = best_rf.predict(X_test)

# Build the three report lines first, then emit them in a single print.
report_lines = [
    f"  RMSE: {np.sqrt(mean_squared_error(y_test, best_preds)):.2f}",
    f"  MAE: {mean_absolute_error(y_test, best_preds):.2f}",
    f"  R2: {r2_score(y_test, best_preds):.2f}",
]
print("\n".join(report_lines))

  RMSE: 28841.45
  MAE: 18935.15
  R2: 0.89


Linear Regression:
  RMSE: 35914.39
  MAE: 22750.15
  R2: 0.83

RF (Baseline):
  RMSE: 28755.61
  MAE: 18890.08
  R2: 0.89

RF (Tuned):
  RMSE: 28841.45
  MAE: 18935.15
  R2: 0.89


RMSE (Root Mean Squared Error):
- Lower is better
RF (Baseline) RMSE is 7158.78 lower than Linear Regression's, so its typical prediction error is noticeably smaller

MAE (Mean Absolute Error):
- lower is better
RF (Baseline) mean absolute error is 3860.07 lower than Linear Regression's, so its predictions are, on average, closer to the true values

R² Score (Coefficient of Determination):
- closer to 1 is best
Both Random Forest models score 0.06 higher than Linear Regression (0.89 vs. 0.83), i.e. they explain about 6 percentage points more of the variance in SalePrice


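The baseline Random Forest outperforms Linear Regression across all metrics. GridSearchCV selected max_depth=26 and n_estimators=115 based on cross-validated RMSE on the training split, but on the held-out test split that tuned model performed slightly worse than the default parameters (n_estimators=100, max_depth=None): RMSE 28841.45 vs. 28755.61. The baseline Random Forest is therefore used for the final predictions.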

In [81]:
# Cleaned test set (no SalePrice column).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df_test = pd.read_csv("/Users/sa2/Desktop/TKH/House-Prices-Phase-2/code/cleanedhousingtest.csv")

# Shapes give a quick visual sanity check, but equal shapes do not prove the
# columns are the same, so the names are verified explicitly below.
print("Train shape:", X_train.shape)
print("Test shape:", df_test.shape)

# Fail early, with a readable message, if any training feature is absent.
missing_cols = [col for col in X_train.columns if col not in df_test.columns]
if missing_cols:
    raise ValueError(f"Test set is missing training features: {missing_cols}")

# Select the training features in training order: scikit-learn rejects frames
# whose feature names or order differ from what the model was fitted on.
X_submit = df_test[X_train.columns]

# Predict SalePrice with the baseline Random Forest.
test_preds = rf.predict(X_submit)

# Prefer the dataset's own identifier column when the cleaned file kept one.
if "Id" in df_test.columns:
    submission_ids = df_test["Id"].to_numpy()
else:
    # NOTE(review): falls back to the positional row index (0..n-1). If the
    # submission must carry the original house Ids, confirm how the cleaning
    # step handled them — the row index is presumably not the real Id.
    submission_ids = df_test.index

# Submission dataframe: one row per test house.
submission = pd.DataFrame({
    "Id": submission_ids,
    "SalePrice": test_preds,
})

submission.to_csv("rf_submission.csv", index=False)


Train shape: (1168, 17)
Test shape: (1459, 17)
