In [3]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [7]:
# Load the dataset
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['MedHouseVal'] = data.target  # Target variable

# Display basic information
display(df.head())
display(df.info())

# Check for missing values
print("Missing values:\n", df.isnull().sum())

# Feature scaling
scaler = StandardScaler()
X = scaler.fit_transform(df.drop(columns=['MedHouseVal']))
y = df['MedHouseVal']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


None

Missing values:
 MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64


In [8]:
# Explanation of Preprocessing Steps:
# 1. Loading and DataFrame Conversion: The dataset was loaded using fetch_california_housing and converted to a Pandas DataFrame for easier manipulation and analysis.
# 2. Missing Value Check: We checked for missing values using df.isnull().sum(). In this dataset, no missing values were found, so no imputation was necessary.
# 3. Feature Scaling: Feature scaling (standardization) was performed using StandardScaler. This is crucial for regression algorithms that are sensitive to feature scales, such as Linear Regression and Support Vector Regressor. Standardization transforms the features to have zero mean and unit variance, which helps to improve the convergence and performance of these models.
# 4. Train-Test Split: The dataset was split into training and testing sets to evaluate the performance of the models on unseen data. A test size of 20% was used.

In [18]:
# Models dictionary
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42, n_estimators=100),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42, n_estimators=100),
    "Support Vector Regressor": SVR()
}

# Dictionary to store results
results = {}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {"MSE": mse, "MAE": mae, "R2 Score": r2}
    print(f"{name} Results:")
    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R2 Score: {r2:.4f}\n")

# Convert results to DataFrame
results_df = pd.DataFrame(results).T

Linear Regression Results:
MSE: 0.5559
MAE: 0.5332
R2 Score: 0.5758

Decision Tree Results:
MSE: 0.4943
MAE: 0.4538
R2 Score: 0.6228

Random Forest Results:
MSE: 0.2555
MAE: 0.3276
R2 Score: 0.8050

Gradient Boosting Results:
MSE: 0.2940
MAE: 0.3717
R2 Score: 0.7756

Support Vector Regressor Results:
MSE: 0.3552
MAE: 0.3978
R2 Score: 0.7289



In [None]:
# Explanation:
# This section implements five regression algorithms: Linear Regression, Decision Tree Regressor, Random Forest Regressor, Gradient Boosting Regressor, and Support Vector Regressor (SVR).   
# For each algorithm:
# An instance of the model is created.
# The model is trained using the training data (fit).
# Predictions are made on the test data (predict).

# MSE: Measures the average squared difference between predicted and actual values. Lower is better.
# MAE: Measures the average absolute difference between predicted and actual values. Lower is better.
# R²: Measures the proportion of the variance in the dependent variable that is predictable from the independent variables. Higher is better (closer to 1).   
# From the output, you can see that the Random Forest Regressor has the lowest MSE and highest R², indicating the best performance. The Decision Tree Regressor has the highest MSE, indicating the worst performance.

# Sources and related content


In [22]:
# Convert results to DataFrame
results_df = pd.DataFrame(results).T

# Display results
print("Model Performance Comparison:")
display(results_df)

# Identify best and worst models
best_model = results_df['R2 Score'].idxmax()
worst_model = results_df['R2 Score'].idxmin()
print(f"Best Performing Model: {best_model}")
print(f"Worst Performing Model: {worst_model}")


Model Performance Comparison:


Unnamed: 0,MSE,MAE,R2 Score
Linear Regression,0.555892,0.5332,0.575788
Decision Tree,0.494272,0.453784,0.622811
Random Forest,0.255498,0.327613,0.805024
Gradient Boosting,0.293999,0.37165,0.775643
Support Vector Regressor,0.355198,0.397763,0.728941


Best Performing Model: Random Forest
Worst Performing Model: Linear Regression
