# California Housing Price Prediction
**Author:** Katie McGaughey
**Date:** 3-15-2025
**Objective:** Predict the median house value (`MedHouseVal`) for each record in scikit-learn's California housing dataset using the available housing features.

In [None]:
# Import necessary libraries
import pandas as pd  # For data manipulation
import numpy as np  # For numerical computations
import matplotlib.pyplot as plt  # For static visualizations
import seaborn as sns  # For statistical data visualization

# Import the California housing dataset from sklearn
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split  # Splitting data
from sklearn.linear_model import LinearRegression  # Linear regression model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score  # Model evaluation

In [None]:
# Fetch the California housing dataset; as_frame=True makes the pandas
# DataFrame available on the returned object's `.frame` attribute.
data = fetch_california_housing(as_frame=True)

# Work with the combined DataFrame (it includes the 'MedHouseVal' target column
# that later cells select from it).
data_frame = data.frame

# Preview the first 10 rows
data_frame.head(n=10)

In [None]:
# Column dtypes and non-null counts (info() prints its report directly)
data_frame.info()

# Summary statistics.
# Printed explicitly: a notebook cell only renders the value of its final
# expression, so a bare describe() in the middle of the cell is silently
# discarded (and nothing at all is shown when run as a plain script).
print(data_frame.describe())

# Number of missing values per column
print(data_frame.isnull().sum())

In [None]:
# Draw a 30-bin histogram for every column of the frame on a 12x8 inch figure,
# add an overall title, then render the figure.
data_frame.hist(figsize=(12, 8), bins=30)
plt.suptitle("Feature Distributions")
plt.show()

In [None]:
# Column names used by the model: two predictors and one target
features = ['MedInc', 'AveRooms']
target = 'MedHouseVal'

# Slice the frame into the predictor matrix (list of columns -> DataFrame)
# and the target vector (single column -> Series)
df_X, df_y = data_frame[features], data_frame[target]

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build the linear regression model and fit it on the training split
# (fit() returns the fitted estimator, so construction and training chain).
model = LinearRegression().fit(X_train, y_train)

# Predict target values for the held-out test rows
y_pred = model.predict(X_test)

In [None]:
# Compute evaluation metrics on the test-set predictions
r2 = r2_score(y_test, y_pred)  # R² Score
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
# Root Mean Squared Error, taken as the square root of the MSE.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6 (passing it raises TypeError there);
# np.sqrt works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print evaluation results, rounded to two decimals
print(f'R² Score: {r2:.2f}')
print(f'MAE: {mae:.2f}')
print(f'RMSE: {rmse:.2f}')