# Loading and Inspecting Data

In [1]:
import sys

# Make the project root (the parent of the notebook's working directory)
# importable so that `housing_predictor` resolves. The membership check
# keeps the cell idempotent: re-running it no longer piles duplicate ".."
# entries onto sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
# from sklearn.metrics import make_scorer, mean_squared_error

from housing_predictor.dataset import load_data
from housing_predictor.features import clean_data, engineer_features
from housing_predictor.modeling.train import train_linear_regression
from housing_predictor.modeling.train import train_random_forest
from housing_predictor.modeling.evaluate import evaluate_model

# Load the train/test frames via the project's loader
train_df, test_df = load_data()

# Each inspection step is described once as
#   (heading for train, heading for test, action applied to a frame)
# and then run for the train frame followed by the test frame, so the
# output order is: previews, info, missing-value counts, statistics.
inspection_steps = (
    (
        "Train Dataset:",
        "\nTest Dataset:",
        lambda frame: display(frame.head()),
    ),
    (
        "\nTrain Data Info:",
        "\nTest Data Info:",
        lambda frame: frame.info(),
    ),
    (
        "\nMissing values in Train Data:",
        "\nMissing values in Test Data:",
        lambda frame: print(frame.isnull().sum()),
    ),
    (
        "\nTrain Dataset Statistics:",
        "\nTest Dataset Statistics:",
        lambda frame: display(frame.describe()),
    ),
)

for train_heading, test_heading, show in inspection_steps:
    print(train_heading)
    show(train_df)

    print(test_heading)
    show(test_df)

ImportError: cannot import name 'train_linear_regression' from 'housing_predictor.modeling.train' (/Users/jaylencarrillo/Downloads/housing-price-prediction/notebooks/../housing_predictor/modeling/train.py)

# Data Dictionary

| Column Name     | Data Type | Description                                             |
|-----------------|-----------|---------------------------------------------------------|
| beds            | Integer   | Number of bedrooms                                      |
| baths           | Float     | Number of bathrooms                                     |
| size            | Float     | Size of the property (numeric)                          |
| size_units      | String    | Units for property size (sqft)                          |
| lot_size        | Float     | Size of the lot (numeric)                               |
| lot_size_units  | String    | Units for lot size (sqft, acre)                         |
| zip_code        | Integer   | Zip code of property location                           |
| price           | Float     | Property price in USD (target variable for prediction)  |

# Data Cleaning and Preprocessing

In [None]:
# Run the project's cleaning step on the training frame and rebind the name,
# so every later cell sees the cleaned data. Because `train_df` is
# overwritten, re-running this cell feeds already-cleaned data back into
# clean_data().
# NOTE(review): test_df is not passed through clean_data() in the cells shown
# here - confirm that is intentional.
train_df = clean_data(train_df)

# Feature Engineering

In [None]:
# Apply the project's feature-engineering step to the (cleaned) training
# frame and rebind the name; the plots and model cells below operate on
# this result.
# NOTE(review): test_df is not passed through engineer_features() in the
# cells shown here - confirm that is intentional.
train_df = engineer_features(train_df)

# Data Visualization

In [None]:
# Price distribution
# NOTE(review): this axis says "Normalized" while the scatter plots below
# label the same column "Price (USD)" - confirm which one matches what
# clean_data()/engineer_features() actually produce and align the labels.
plt.figure(figsize=(10,5))
plt.hist(train_df['price'], bins=50, edgecolor='black')
plt.xlabel('Price (Normalized)')
plt.ylabel('Frequency')
plt.title('Distribution of House Prices')
plt.show()

# Feature correlation heatmap
# Correlation is only defined for numeric data. Since pandas 2.0,
# DataFrame.corr() raises on non-numeric columns instead of silently
# skipping them, so any remaining string column (the data dictionary lists
# size_units / lot_size_units) would crash this cell. Restrict the input to
# numeric/bool columns explicitly; when every feature is already numeric the
# result is unchanged.
feature_df = train_df.drop(columns=['price'])
numeric_features = feature_df.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(12,8))
sns.heatmap(numeric_features.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Feature Correlation Heatmap (Excluding Price)')
plt.show()

# Scatter plot: Size vs Price
plt.figure(figsize=(10,6))
plt.scatter(train_df['size'], train_df['price'], alpha=0.5)
plt.xlabel('Size (sqft)')
plt.ylabel('Price (USD)')
plt.title('Scatter Plot of Size vs. Price')
plt.show()

# Scatter plot: Lot Size vs Price
# NOTE(review): the label assumes lot_size has been converted to sqft; the
# raw data can also be in acres per the data dictionary - confirm that
# clean_data() performs the conversion.
plt.figure(figsize=(10,6))
plt.scatter(train_df['lot_size'], train_df['price'], alpha=0.5, color='red')
plt.xlabel('Lot Size (sqft)')
plt.ylabel('Price (USD)')
plt.title('Scatter Plot of Lot Size vs. Price')
plt.show()

# Cross Validation

In [None]:
# Split the engineered frame into the feature matrix and the target column
y = train_df['price']
X = train_df.drop(columns=['price'])

# Fit both candidate models with the project's training helpers
lr = train_linear_regression(X, y)
rf = train_random_forest(X, y)

# 5-fold splitter, shuffled with a fixed seed so the folds are reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)


def _cv_report(heading, model):
    """Print `heading`, evaluate `model` on (X, y) with `kf`, print and return the metrics.

    Returns the four values produced by evaluate_model(), in order:
    mean R2, R2 std, mean MSE, MSE std.
    """
    print(heading)
    r2_mean, r2_std, mse_mean, mse_std = evaluate_model(model, X, y, kf)
    print(f"Mean R2 Score: {r2_mean:.4f} (Std: {r2_std:.4f})")
    print(f"Mean MSE: {mse_mean:.4f} (Std: {mse_std:.4f})")
    return r2_mean, r2_std, mse_mean, mse_std


# Linear Regression first, then Random Forest (same order as the bar chart)
r2_lr, r2_std_lr, mse_lr, mse_std_lr = _cv_report(
    "\nLinear Regression Performance:", lr
)
r2_rf, r2_std_rf, mse_rf, mse_std_rf = _cv_report(
    "\nRandom Forest Performance:", rf
)

# Bar chart comparing the mean R2 of the two models
models = ["Linear Regression", "Random Forest"]
r2_scores_list = [r2_lr, r2_rf]

plt.figure(figsize=(10,5))
plt.bar(models, r2_scores_list)
plt.title("Model Performance (Higher is Better)")
plt.xlabel("Model")
plt.ylabel("Mean R2 Score")
plt.show()