In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

file_path = 'HousingData.csv'
housing_data = pd.read_csv(file_path)

# Data Preprocessing
# Fill missing values with the median of each column
housing_data_filled = housing_data.fillna(housing_data.median())

# Separate features (X) and target (y)
X = housing_data_filled.drop(columns=['MEDV'])
y = housing_data_filled['MEDV']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Selection
# Linear Regression, Decision Trees, and Gradient Boosting Regressors
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

# Train and Evaluate each model
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{name} - MSE: {mse:.2f}, R²: {r2:.2f}")

# Fine-tuning Gradient Boosting model
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Perform Grid Search for Gradient Boosting Regressor
grid_search = GridSearchCV(GradientBoostingRegressor(), param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test_scaled)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Best Gradient Boosting Model after tuning - MSE: {mse_best:.2f}, R²: {r2_best:.2f}")


Linear Regression - MSE: 25.00, R²: 0.66
Decision Tree - MSE: 14.32, R²: 0.80
Gradient Boosting - MSE: 7.24, R²: 0.90
Best Gradient Boosting Model after tuning - MSE: 7.19, R²: 0.90
