In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fetch the California Housing data; the returned object exposes the feature
# matrix (.data), the regression target (.target) and the column names
# (.feature_names) used below.
california = fetch_california_housing()

# Build a single DataFrame: feature columns first, with the target appended
# as the last column under the name 'MedHouseVal'.
df = pd.DataFrame(
    california.data,
    columns=california.feature_names,
).assign(MedHouseVal=california.target)

# Quick structural overview: dimensions, column names and the first rows.
print("Dataset Shape:", df.shape)
print("Columns:", list(df.columns))
print(df.head(5))

# Count the missing entries in every column (features and target alike).
print("\nMissing values per column:")
missing_per_column = df.isna().sum()
print(missing_per_column)

# The missing-value check above reports zero gaps, so no imputation is required.
# The feature ranges differ considerably, so the predictors are standardized
# before modelling.
features = california.feature_names  # names of the predictor columns
target = 'MedHouseVal'

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target],
                                                    test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows, so that nothing about the test set leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrames for convenience. The original row labels are
# restored explicitly: train_test_split shuffles the rows and y_train / y_test
# keep those shuffled labels, so a default 0..n-1 index here would silently
# misalign any label-based pandas operation between features and target
# (concat, residual computation, column assignment, ...).
X_train_scaled = pd.DataFrame(X_train_scaled, columns=features, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=features, index=X_test.index)

print("\nPreprocessing complete!")


Dataset Shape: (20640, 9)
Columns: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'MedHouseVal']
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  

Missing values per column:
MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

Preprocessing complete!


In [2]:
# Import regression algorithms and evaluation metrics
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Candidate regressors, keyed by the label used in the printed report and in
# `results`. The tree-based estimators receive a fixed random_state.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
    ('Support Vector Regressor', SVR()),
])

# Maps model label -> {'MSE': ..., 'MAE': ..., 'R2': ...} measured on the test split.
results = dict()

# Fit each candidate on the training split and score it on the held-out split.
for name, model in models.items():
    # All estimators are trained on the standardized features so that every
    # model sees exactly the same inputs (SVR is the one that needs scaling).
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Evaluate the three metrics in a fixed order: MSE, MAE, R2.
    mse, mae, r2 = (
        score(y_test, y_pred)
        for score in (mean_squared_error, mean_absolute_error, r2_score)
    )

    results[name] = dict(MSE=mse, MAE=mae, R2=r2)
    print(f"{name} -- MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")


Linear Regression -- MSE: 0.5559, MAE: 0.5332, R2: 0.5758
Decision Tree -- MSE: 0.4940, MAE: 0.4539, R2: 0.6230
Random Forest -- MSE: 0.2552, MAE: 0.3274, R2: 0.8053
Gradient Boosting -- MSE: 0.2940, MAE: 0.3717, R2: 0.7756
Support Vector Regressor -- MSE: 0.3570, MAE: 0.3986, R2: 0.7276


In [3]:
# Tabulate the collected metrics: one row per model, with the metric columns
# in a fixed MSE / MAE / R2 order.
results_df = pd.DataFrame(results).transpose().loc[:, ['MSE', 'MAE', 'R2']]
print("Model Evaluation Results:")
print(results_df)

# Rank the models by R2 (higher is better): the row label holding the largest
# value is the best model, the one holding the smallest value is the worst.
r2_column = results_df['R2']
best_model = r2_column.idxmax()
worst_model = r2_column.idxmin()

print(f"\nBest Performing Model: {best_model} (Highest R2)")
print(f"Worst Performing Model: {worst_model} (Lowest R2)")


Model Evaluation Results:
                               MSE       MAE        R2
Linear Regression         0.555892  0.533200  0.575788
Decision Tree             0.493969  0.453904  0.623042
Random Forest             0.255170  0.327425  0.805275
Gradient Boosting         0.293999  0.371650  0.775643
Support Vector Regressor  0.357004  0.398599  0.727563

Best Performing Model: Random Forest (Highest R2)
Worst Performing Model: Linear Regression (Lowest R2)


In [4]:
# Closing cell: emits a completion notice. Markdown cells can be added
# alongside the code cells to document the process and justify each step.
completion_message = "Regression Assignment 3 completed! Please refer to the notebook documentation for detailed explanations."
print(completion_message)

Regression Assignment 3 completed! Please refer to the notebook documentation for detailed explanations.
