### Data Exploration, Preprocessing, and Cleanup

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the California housing dataset downloaded from Kaggle.
data = pd.read_csv('housing.csv')

# Impute missing 'total_bedrooms' values with the column median.
# The result is assigned back rather than calling fillna(..., inplace=True) on
# the selected column: that form is chained assignment, which pandas 2.x warns
# about and which no longer updates `data` once Copy-on-Write is enabled.
# NOTE: the median is taken over the full dataset, i.e. before the split below.
data['total_bedrooms'] = data['total_bedrooms'].fillna(data['total_bedrooms'].median())

# One-hot encode the categorical 'ocean_proximity' column.
data_encoded = pd.get_dummies(data, columns=['ocean_proximity'])

# Separate the features from the target and hold out 20% of the rows for testing.
X = data_encoded.drop('median_house_value', axis=1)
y = data_encoded['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training rows only.
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# StandardScaler expects 2-D input, so the targets are reshaped to a single
# column. From here on y_train / y_test are (n, 1) arrays, not Series.
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

# Standardize the target as well (fitted on the training targets only) and
# flatten back to 1-D for the estimators.
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train).flatten()
y_test_scaled = scaler_y.transform(y_test).flatten()

# Candidate regressors, keyed by the display name used in the plots.
# The decision tree is seeded like the random forest: an unseeded tree breaks
# ties between equally good splits randomly, so its metrics would change from
# run to run while every other model in the comparison is reproducible.
models = {
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(alpha=0.1),
    'Ridge Regression': Ridge(alpha=1.0),
    'Decision Tree Regression': DecisionTreeRegressor(random_state=42),
    'Random Forest Regression': RandomForestRegressor(n_estimators=100, random_state=42)
}

# Per-model results: the full metric record, plus MSE and R² on their own for plotting.
model_metrics = {}
model_mse = {}
model_r2 = {}

for name, model in models.items():
    # Train and predict in the standardized feature/target space.
    model.fit(X_train_scaled, y_train_scaled)
    predictions_scaled = model.predict(X_test_scaled)

    # MSE and R² measured against the standardized test targets.
    mse_scaled = mean_squared_error(y_test_scaled, predictions_scaled)
    r2_scaled = r2_score(y_test_scaled, predictions_scaled)

    model_mse[name] = mse_scaled
    model_r2[name] = r2_scaled

    # Undo the target scaling so the error is expressed in the target's own
    # units; inverse_transform needs a 2-D column, hence the reshape.
    predictions_original = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()
    mse_original_scale = mean_squared_error(y_test, predictions_original)
    # Root of the original-scale MSE (RMSE), reported as the "error margin".
    error_margin_original_scale = math.sqrt(mse_original_scale)

    model_metrics[name] = {
        'MSE_Scaled': mse_scaled,
        'R²_Scaled': r2_scaled,
        'Error Margin ($)': error_margin_original_scale,
    }

# Bar chart of the original-scale error margin (RMSE) for each model.
names = list(model_metrics.keys())
error_margins = [model_metrics[name]['Error Margin ($)'] for name in names]

plt.figure(figsize=(10, 6))
plt.bar(names, error_margins, color='skyblue')
plt.xlabel('Model')
plt.ylabel('Error Margin ($)')
plt.title('Error Margin in Original Scale by Model')
plt.xticks(rotation=45)
plt.show()

# The labels get their own name so that `models` keeps referring to the
# estimator dict; rebinding it to a list here made re-running the training
# loop fail on `models.items()`.
model_names = list(model_mse.keys())
mse_values = list(model_mse.values())
r2_values = list(model_r2.values())

# MSE Visualization
plt.figure(figsize=(10, 6))
plt.bar(model_names, mse_values, color='skyblue')
plt.xlabel('Model')
plt.ylabel('MSE (Scaled)')
plt.title('MSE of Different Models (Scaled)')
plt.xticks(rotation=45)
plt.show()

# R² Score Visualization
plt.figure(figsize=(10, 6))
plt.bar(model_names, r2_values, color='lightgreen')
plt.xlabel('Model')
plt.ylabel('R² Score (Scaled)')
plt.title('R² Score Comparison of Different Models (Scaled)')
plt.xticks(rotation=45)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Load the dataset downloaded from Kaggle:
# https://www.kaggle.com/datasets/camnugent/california-housing-prices/data
data = pd.read_csv('housing.csv')

# Initial data exploration and understanding
print(data.head())
print(data.describe())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
data.info()

# Handling missing values
# data.isnull().sum() shows that total_bedrooms is missing 207 values
print(data.isnull().sum())

# Fill the missing 'total_bedrooms' values with the column median.
# The result is assigned back rather than calling fillna(..., inplace=True) on
# the selected column: that form is chained assignment, which pandas 2.x warns
# about and which no longer updates `data` once Copy-on-Write is enabled.
data['total_bedrooms'] = data['total_bedrooms'].fillna(data['total_bedrooms'].median())

# Visualizations for Initial Data Understanding
# Distribution of the target variable
plt.figure(figsize=(10, 6))
sns.histplot(data['median_house_value'], bins=50, kde=True)
plt.title('Distribution of Median House Values')
plt.xlabel('Median House Value')
plt.ylabel('Frequency')
plt.show()

# Disabled experiment: removing outliers in 'median_house_value' using the IQR rule
# q1 = data['median_house_value'].quantile(0.25)
# q3 = data['median_house_value'].quantile(0.75)
# iqr = q3 - q1
# lower_bound = q1 - 1.5 * iqr
# upper_bound = q3 + 1.5 * iqr
# data = data[(data['median_house_value'] >= lower_bound) & (data['median_house_value'] <= upper_bound)]

# Disabled experiment: standardizing 'median_income' to have a mean of 0 and a standard deviation of 1
# scaler = StandardScaler()
# data['median_income'] = scaler.fit_transform(data[['median_income']])


# Confirm the dataset's final structure after cleaning and preprocessing
# print(data.head())
# print(data.describe())
# print(data.info())

### Feature Engineering

In [None]:
# Feature engineering: derive ratio features from the raw count columns.
# Columns are added in this fixed order, which also fixes their order in the
# feature matrix built from `data` later on.
household_count = data['households']
room_count = data['total_rooms']

data['rooms_per_household'] = room_count.div(household_count)
data['bedrooms_per_room'] = data['total_bedrooms'].div(room_count)
data['population_per_household'] = data['population'].div(household_count)

### Model Implementation and Baseline Comparison

In [None]:
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# One-hot encode 'ocean_proximity': each category becomes its own indicator column.
data_encoded = pd.get_dummies(data, columns=['ocean_proximity'])

# Split the encoded frame into features / target, then hold out 20% for testing.
y = data_encoded['median_house_value']
X = data_encoded.drop(columns='median_house_value')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random Forest is trained on the unscaled features.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Linear Regression is trained on standardized features; the scaler is
# fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Test-set predictions: each model sees the representation it was trained on.
rf_predictions = rf_model.predict(X_test)
lr_predictions = lr_model.predict(X_test_scaled)

# Mean squared error for both models.
rf_mse, lr_mse = (
    mean_squared_error(y_test, predicted)
    for predicted in (rf_predictions, lr_predictions)
)
print(f"Random Forest MSE: {rf_mse}")
print(f"Linear Regression MSE: {lr_mse}")

# Coefficient of determination for both models.
rf_r2, lr_r2 = (
    r2_score(y_test, predicted)
    for predicted in (rf_predictions, lr_predictions)
)
print(f"Random Forest R² score: {rf_r2}")
print(f"Linear Regression R² score: {lr_r2}")

### Visualizations for Data Exploration

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Target distribution: histogram of the median house value with a KDE overlay.
# Disabled step kept for reference (mapping +/-inf to NaN before plotting):
# data['median_house_value'] = data['median_house_value'].replace([np.inf, -np.inf], np.nan)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['median_house_value'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Median House Values')
ax.set_xlabel('Median House Value')
ax.set_ylabel('Frequency')
plt.show()

# Heatmap of pairwise correlations between all columns of the encoded frame,
# each cell annotated with its coefficient to two decimals.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(data_encoded.corr(), annot=True, fmt=".2f", annot_kws={"size": 10}, ax=ax)
ax.set_title('Correlation Matrix of Housing Features')
plt.show()

# Longitude/latitude scatter, with the point colour driven by median house value.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=data_encoded,
    x='longitude',
    y='latitude',
    hue='median_house_value',
    palette='viridis',
    alpha=0.5,
    ax=ax,
)
ax.set_title('Geographical Distribution of Median House Values')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.legend()
plt.show()

# Box plot of median house value for each ocean-proximity category
# (uses the un-encoded frame, where the category is still a single column).
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='ocean_proximity', y='median_house_value', ax=ax)
ax.set_title('Median House Value by Ocean Proximity')
ax.set_xlabel('Ocean Proximity')
ax.set_ylabel('Median House Value')
plt.xticks(rotation=45)
plt.show()

# Scatter plot of actual vs. predicted values for both models; the dashed
# diagonal marks where prediction equals the actual value.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, rf_predictions, alpha=0.3, label='Random Forest', color='green')
plt.scatter(y_test, lr_predictions, alpha=0.3, label='Linear Regression', color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.title('Actual vs. Predicted Values')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.legend()
plt.show()

# Residuals (actual - predicted), computed once and reused by the plots below.
rf_residuals = y_test - rf_predictions
lr_residuals = y_test - lr_predictions

# Side-by-side histograms of the residuals, one panel per model.
fig, axs = plt.subplots(ncols=2, figsize=(16, 6))
sns.histplot(rf_residuals, bins=50, kde=True, color='red', ax=axs[0], label='RF Residuals')
axs[0].set_title('Random Forest Residuals')
axs[0].set_xlabel('Residuals')
axs[0].set_ylabel('Frequency')
axs[0].legend()

sns.histplot(lr_residuals, bins=50, kde=True, color='green', ax=axs[1], label='LR Residuals')
axs[1].set_title('Linear Regression Residuals')
axs[1].set_xlabel('Residuals')
# Label the y-axis like the left panel; without this the right panel showed
# seaborn's default "Count" next to "Frequency".
axs[1].set_ylabel('Frequency')
axs[1].legend()
plt.show()


# Both residual distributions overlaid on a single axis for direct comparison.
plt.figure(figsize=(10, 6))
sns.histplot(rf_residuals, bins=50, kde=True, color='red', label='Random Forest Residuals')
sns.histplot(lr_residuals, bins=50, kde=True, color='green', alpha=0.7, label='Linear Regression Residuals')
plt.title('Distribution of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.legend()
plt.show()

### Feature Importance Visualization

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import learning_curve

# Importance scores from the fitted Random Forest, labelled with the training
# column names and ordered from most to least important.
importance_scores = rf_model.feature_importances_
feature_importances = pd.Series(importance_scores, index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 8))
feature_importances.plot(kind='bar', ax=ax)
ax.set_title('Feature Importance from Random Forest Model')
ax.set_xlabel('Features')
ax.set_ylabel('Importance Score')
plt.show()

# Pair plot of a few selected features together with the target.
features = [
    'median_income',
    'total_rooms',
    'population_per_household',
    'median_house_value',
]
subset_data = data[features]

sns.pairplot(subset_data)
plt.show()

# np.linspace is used below, but this cell's imports do not include numpy;
# import it here so the cell does not depend on an earlier cell having done so.
import numpy as np

# Learning curve for the Random Forest: 5-fold cross-validated error at 50
# training-set sizes spaced evenly from 1% to 100% of the training data.
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=RandomForestRegressor(n_estimators=100, random_state=42),
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.01, 1.0, 50),
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

# The scorer returns negated MSE, so flip the sign after averaging over folds.
train_scores_mean = -train_scores.mean(axis=1)
validation_scores_mean = -validation_scores.mean(axis=1)

plt.figure(figsize=(12, 8))
plt.plot(train_sizes, train_scores_mean, label='Training error')
plt.plot(train_sizes, validation_scores_mean, label='Validation error')
plt.title('Learning Curve')
plt.xlabel('Training Set Size')
plt.ylabel('MSE')
plt.legend()
plt.show()

# Density of the Random Forest prediction errors (actual - predicted) on the test set.
rf_predictions = rf_model.predict(X_test)
rf_residuals = y_test - rf_predictions

fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(rf_residuals, fill=True, ax=ax)
ax.set_title('Distribution of Random Forest Prediction Errors')
ax.set_xlabel('Prediction Error')
ax.set_ylabel('Density')
plt.show()

### Actual vs. Predicted Values with Confidence Interval

In [None]:
import numpy as np
# Residuals of the Random Forest predictions (actual - predicted)
residuals = y_test - rf_predictions

# Spread of the residuals, used as a single global error scale
residual_std = np.std(residuals)

# Plot actual vs. predicted values, with the diagonal as the ideal fit
plt.figure(figsize=(10, 6))
plt.scatter(y_test, rf_predictions, alpha=0.5, label='Predictions')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2, label='Ideal Fit')

# Approximate 95% band: each prediction +/- 1.96 residual standard deviations.
# This assumes normally distributed residuals with constant spread, so it is a
# rough error band around the predictions rather than a formal confidence interval.
lower_bound = rf_predictions - 1.96 * residual_std
upper_bound = rf_predictions + 1.96 * residual_std

# fill_between connects the points in the order they are given. The test rows
# are in shuffled order, so passing them unsorted draws a self-overlapping
# polygon instead of a band; order everything by actual value first.
# A stable sort keeps the result deterministic when actual values are tied.
actual_values = np.asarray(y_test).ravel()
order = np.argsort(actual_values, kind='stable')

plt.fill_between(
    actual_values[order],
    np.asarray(lower_bound).ravel()[order],
    np.asarray(upper_bound).ravel()[order],
    color='b',
    alpha=0.1,
    label='95% Confidence Interval',
)

plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values with Approximated Confidence Interval')
plt.legend()
plt.show()