# House Price Prediction — End-to-End Notebook

This notebook performs an end-to-end house price prediction workflow:

- Data loading & quick EDA
- Preprocessing (categorical handling, missing values)
- Train/test split
- Train multiple models:
  - Linear Regression
  - Random Forest Regressor
  - Gradient Boosting Regressor (from sklearn)
  - Histogram-based Gradient Boosting Regressor (from sklearn)
  - XGBoost (if available)
  - Support Vector Regressor (SVR)
- Evaluation metrics: **MAE, MSE, RMSE, MRE (mean relative error, computed as MAPE in %), R²**
- Visualizations:
  - Correlation heatmap
  - Actual vs Predicted
  - Residual plot
  - Feature importance (tree-based)
  - Distribution of prediction errors

**Notes**
- Expected dataset path: `'house price prediction dataset.zip/Housing.csv'`
- If the target column is not named `price`, change the `TARGET_COL` variable below.


In [None]:

# Imports and helper functions
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

def mape(y_true, y_pred):
    """Return the Mean Absolute Percentage Error (used here as MRE), in percent.

    Both inputs are converted to float NumPy arrays before any arithmetic, so
    plain Python sequences are accepted and pandas objects are compared
    element by element (by position) instead of being re-aligned on their
    index labels, which would silently produce NaNs for mismatched indexes.

    Args:
        y_true: Array-like of actual target values.
        y_pred: Array-like of predicted values, broadcast-compatible with
            ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / y_true`` (absolute value taken of
        the whole ratio), multiplied by 100.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Avoid division by zero by substituting a tiny epsilon where y_true == 0.
    eps = 1e-8
    denominator = np.where(actual == 0, eps, actual)
    return np.mean(np.abs((actual - predicted) / denominator)) * 100

def evaluate_model(name, y_true, y_pred):
    """Score one model's predictions and return the metrics as a dict.

    Args:
        name: Label stored under the "model" key of the result.
        y_true: Actual target values.
        y_pred: Predicted target values.

    Returns:
        A dict with the keys "model", "MAE", "MSE", "RMSE", "MRE(%)" and "R2".
    """
    # RMSE is derived from MSE, so compute MSE once and reuse it.
    squared_error = mean_squared_error(y_true, y_pred)
    metrics = {
        "model": name,
        "MAE": mean_absolute_error(y_true, y_pred),
        "MSE": squared_error,
        "RMSE": np.sqrt(squared_error),
        "MRE(%)": mape(y_true, y_pred),
        "R2": r2_score(y_true, y_pred),
    }
    return metrics


In [None]:

# 1) Load dataset
DATA_PATH = 'house price prediction dataset.zip/Housing.csv'  # change if needed
TARGET_COL = 'price'  # change if your target column has a different name

# Fail early with an actionable message instead of a lower-level read error.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}. Please upload the dataset to that path.")

df = pd.read_csv(DATA_PATH)
print("Dataset shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nFirst 5 rows:")
display(df.head())

print("\nData types and non-null counts:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() would only add a stray "None" to the cell output.
df.info()

print("\nSummary statistics for numeric columns:")
display(df.describe(include=[np.number]).T)


In [None]:

# 2) Preprocessing
# If the target column isn't present, try to guess common names
if TARGET_COL not in df.columns:
    possible = [c for c in df.columns if 'price' in c.lower()]
    if len(possible) == 1:
        TARGET_COL = possible[0]
        print(f"Using discovered target column: {TARGET_COL}")
    else:
        # Zero or several candidates: refuse to guess and report what was found.
        raise ValueError(f"Target column '{TARGET_COL}' not found. Found candidates: {possible}")

# Drop rows where target is missing
df = df.dropna(subset=[TARGET_COL]).copy()

# Drop rows with any remaining NA *before* encoding. get_dummies (with the
# default dummy_na=False) turns a missing categorical value into an all-zero
# row of indicators, which with drop_first=True is indistinguishable from the
# dropped first category -- so dropping NA after encoding would silently keep
# those rows under the wrong category.
df_complete = df.dropna()

# Encode categorical variables
df_encoded = pd.get_dummies(df_complete, drop_first=True)

print("After encoding and dropping NA, shape:", df_encoded.shape)

# Features and target
X = df_encoded.drop(columns=[TARGET_COL])
y = df_encoded[TARGET_COL].astype(float)

# Save feature names for later use
FEATURE_NAMES = X.columns.tolist()


In [None]:

# 3) Correlation heatmap (numeric features + target)
corr = df_encoded.corr()

# Rank every column by the absolute value of its correlation with the target
# and keep the 20 strongest for the plot.
target_corr = corr[TARGET_COL].abs().sort_values(ascending=False)
top_features = target_corr.index.tolist()[:20]

# Sub-matrix restricted to the selected columns, in ranked order.
top_corr = corr.loc[top_features, top_features]
tick_positions = range(len(top_features))

plt.figure(figsize=(10, 8))
plt.imshow(top_corr, aspect='auto', interpolation='nearest')
plt.colorbar()
plt.xticks(tick_positions, top_features, rotation=90)
plt.yticks(tick_positions, top_features)
plt.title('Correlation matrix (top features)')
plt.tight_layout()
plt.show()


In [None]:

# 4) Train-test split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# the same from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


In [None]:

# 5) Train multiple models
# The dict's insertion order is the order in which the models are trained.
models = {
    # Linear Regression (no scaling required typically, but it is wrapped in a
    # scaling pipeline anyway)
    'LinearRegression': make_pipeline(StandardScaler(), LinearRegression()),
    # Random Forest
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    # Gradient Boosting (sklearn)
    'GradientBoosting': GradientBoostingRegressor(n_estimators=200, random_state=42),
    # HistGradientBoosting (fast alternative)
    'HistGradientBoosting': HistGradientBoostingRegressor(random_state=42),
    # SVR (scale features)
    'SVR': make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.1)),
}

# XGBoost is optional: any failure while importing or constructing it is
# reported and the notebook carries on without that model.
try:
    import xgboost as xgb
    models['XGBoost'] = xgb.XGBRegressor(n_estimators=200, random_state=42, verbosity=0)
    print("XGBoost available and will be used.")
except Exception as e:
    print("XGBoost not available in this environment - skipping XGBoost. Error:", e)

results = []        # one metrics dict per model
fitted_models = {}  # model name -> fitted estimator

for model_name, estimator in models.items():
    print(f"\nTraining {model_name} ...")
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    results.append(evaluate_model(model_name, y_test, y_pred))
    fitted_models[model_name] = estimator
    # Keep the test predictions of these two models for later analysis.
    if model_name == 'LinearRegression':
        linear_pred = y_pred
    elif model_name == 'RandomForest':
        rf_pred = y_pred


In [None]:

# 6) Results summary
# Sort so that the model with the lowest RMSE ends up in row 0.
results_df = pd.DataFrame(results).sort_values(by='RMSE')
results_df = results_df.reset_index(drop=True)

# caas_jupyter_tools is not a generally installable package (it appears to be
# specific to one hosted notebook sandbox -- confirm), so treat it as optional
# instead of letting a missing module abort the cell.
try:
    from caas_jupyter_tools import display_dataframe_to_user
except ImportError:
    # Nothing else to do: the bare `results_df` expression below is rendered
    # by the notebook as the cell's output.
    pass
else:
    display_dataframe_to_user("Model performance comparison", results_df)
results_df


In [None]:

# 7) Visualizations for the best model (lowest RMSE)
# Row label 0 of results_df is taken as the best model.
best_model_name = results_df.loc[0, 'model']
best_model = fitted_models[best_model_name]
print("Best model:", best_model_name)

y_best_pred = best_model.predict(X_test)
residuals = y_test - y_best_pred  # actual minus predicted


def _finish_plot(xlabel, ylabel, title):
    """Label the current figure, add a grid, tighten the layout and show it."""
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True)
    plt.tight_layout()
    plt.show()


# Actual vs Predicted scatter
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_best_pred)
price_span = [y_test.min(), y_test.max()]
plt.plot(price_span, price_span, linewidth=2)  # 45-degree line
_finish_plot("Actual Prices", "Predicted Prices", f"Actual vs Predicted — {best_model_name}")

# Residual plot
plt.figure(figsize=(8, 5))
plt.scatter(y_best_pred, residuals)
plt.axhline(0, linestyle='--', linewidth=2)  # zero-error reference line
_finish_plot(
    "Predicted Prices",
    "Residuals (Actual - Predicted)",
    f"Residuals vs Predicted — {best_model_name}",
)

# Distribution of residuals (errors)
plt.figure(figsize=(8, 5))
plt.hist(residuals, bins=40)
_finish_plot("Residual", "Count", "Distribution of Residuals")


In [None]:

# 8) Feature importance for tree-based models (if available)
import numpy as _np


def plot_feature_importance(model, model_name, top_n=20, feature_names=None):
    """Plot a bar chart of a fitted model's largest feature importances.

    Args:
        model: Fitted estimator. Nothing is plotted unless it has a
            ``feature_importances_`` attribute.
        model_name: Label used in the plot title and in printed messages.
        top_n: Maximum number of features to show, most important first.
        feature_names: Optional feature names in the model's column order.
            When omitted, the model's own ``feature_names_in_`` is used if it
            has one, otherwise the module-level ``FEATURE_NAMES`` list.

    Returns:
        None. If the model has no ``feature_importances_``, a message is
        printed instead of plotting.
    """
    if not hasattr(model, "feature_importances_"):
        print(f"{model_name} does not expose feature_importances_")
        return

    importances = _np.asarray(model.feature_importances_)
    if feature_names is None:
        # Prefer the names recorded on the model itself so the labels cannot
        # drift from the columns it was actually trained on.
        feature_names = getattr(model, "feature_names_in_", FEATURE_NAMES)

    # Indices of the largest importances, in descending order.
    indices = _np.argsort(importances)[::-1][:top_n]
    names = [str(feature_names[i]) for i in indices]
    vals = importances[indices]
    # There may be fewer than top_n features; the title reports the real count.
    shown = len(vals)

    plt.figure(figsize=(10, 6))
    plt.bar(range(shown), vals)
    plt.xticks(range(shown), names, rotation=90)
    plt.title(f"Feature importances — {model_name} (top {shown})")
    plt.tight_layout()
    plt.show()

# Try RandomForest, GradientBoosting, XGBoost and HistGradientBoosting,
# skipping any that were not trained.
tree_candidates = ('RandomForest', 'GradientBoosting', 'XGBoost', 'HistGradientBoosting')
for candidate in tree_candidates:
    if candidate not in fitted_models:
        continue
    print("\nFeature importance for:", candidate)
    # Best effort: a failure for one model must not stop the remaining plots.
    try:
        plot_feature_importance(fitted_models[candidate], candidate, top_n=20)
    except Exception as e:
        print("Could not plot feature importance for", candidate, ":", e)


In [None]:

# 9) Save test predictions and model comparison
# One row per test sample: the features, the actual target, and one
# prediction column per fitted model.
pred_df = X_test.copy()
pred_df['Actual'] = y_test
for name, model in fitted_models.items():
    pred_df[f'Pred_{name}'] = model.predict(X_test)

# The preferred output directory may not exist (or may not be creatable) on
# every machine; create it if possible, otherwise fall back to the current
# working directory so the results are still saved.
OUTPUT_DIR = '/mnt/data'
try:
    os.makedirs(OUTPUT_DIR, exist_ok=True)
except OSError as err:
    print(f"Cannot create {OUTPUT_DIR} ({err}); saving to the current directory instead.")
    OUTPUT_DIR = os.getcwd()

predictions_file = os.path.join(OUTPUT_DIR, 'house_price_predictions.csv')
# index=True keeps the original row labels, so predictions can be traced back
# to the rows of the source dataset.
pred_df.to_csv(predictions_file, index=True)
print("Saved predictions to:", predictions_file)

# Save results_df
results_file = os.path.join(OUTPUT_DIR, 'model_comparison_results.csv')
results_df.to_csv(results_file, index=False)
print("Saved model comparison results to:", results_file)
