# 🏠 House Price Modeling with Linear & Non-Linear Models
This notebook includes EDA, a Linear Regression model, and a lightweight Random Forest model to predict house prices.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Apply seaborn's white-grid theme to every figure drawn in this notebook.
# `set_theme` is the preferred interface; `sns.set` is only a legacy alias for it.
sns.set_theme(style='whitegrid')

# Load dataset (relative path, so it resolves against the kernel's working directory)
df = pd.read_csv('../data/raw/house_data.csv')
# Preview the first rows; kept as the last expression so the cell renders it as a table.
df.head()

ModuleNotFoundError: No module named 'seab'

In [None]:
# Dataset overview
print('Dataset shape:', df.shape)

# Print the header and the per-column null counts with separate calls: passing
# both to a single print() inserts the separator space right after the '\n',
# which shifts the first row of the Series one column to the right.
print('\nMissing values:')
print(df.isnull().sum())

# Summary statistics; kept as the last expression so the cell renders it as a table.
df.describe()

In [None]:
# Visualizations: a 2x2 grid of exploratory plots built around the `price` column.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
# Unpack the grid row by row so each panel is referred to by name rather than index.
(ax_price, ax_sqft), (ax_location, ax_condition) = axes

# Price distribution
sns.histplot(df['price'], bins=20, kde=True, ax=ax_price)
ax_price.set_title('Distribution of House Prices')

# Price vs. Square Footage
sns.scatterplot(data=df, x='sqft', y='price', hue='bedrooms', ax=ax_sqft)
ax_sqft.set_title('Price vs. Square Footage')

# Average price by location (sorted ascending by mean price)
avg_price_location = df.groupby('location')['price'].mean().sort_values()
sns.barplot(x=avg_price_location.values, y=avg_price_location.index, ax=ax_location)
ax_location.set_title('Average Price by Location')
# The bar lengths are passed as a bare array, which has no name for seaborn to
# use as an axis label, so label the value axis explicitly.
ax_location.set_xlabel('Average price')

# Price by condition
sns.boxplot(data=df, x='condition', y='price', ax=ax_condition)
ax_condition.set_title('Price by House Condition')

plt.tight_layout()
plt.show()

In [None]:
# Linear Regression Modeling
# Predictors are three columns of the frame; `price` is the regression target.
feature_columns = ['sqft', 'bedrooms', 'bathrooms']
X = df[feature_columns]
y = df['price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and fitting are chained.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Pair every feature name with its fitted coefficient for a readable printout.
coefficient_pairs = list(zip(X.columns, lr_model.coef_))
print('Linear Regression Intercept:', lr_model.intercept_)
print('Linear Regression Coefficients:', coefficient_pairs)

In [None]:
# Linear Regression Evaluation
# Score the held-out predictions: R², the mean squared error, and its square root.
lr_r2 = r2_score(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)

# Build both report lines first, then emit them one per line.
lr_report = (
    f'Linear Regression RMSE: ₹{lr_rmse:,.2f}',
    f'Linear Regression R² Score: {lr_r2:.2f}',
)
print(*lr_report, sep='\n')

In [None]:
# Random Forest Regression (Lightweight, Non-linear)
# 100 trees, each limited to depth 5, with a fixed seed for the forest.
rf_params = dict(n_estimators=100, max_depth=5, random_state=42)
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)

# Predict on the same held-out rows used for the linear model.
y_pred_rf = rf_model.predict(X_test)

# Same three metrics as the linear model: R², MSE, and RMSE.
rf_r2 = r2_score(y_test, y_pred_rf)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(rf_mse)

rf_report = (
    f'Random Forest RMSE: ₹{rf_rmse:,.2f}',
    f'Random Forest R² Score: {rf_r2:.2f}',
)
print(*rf_report, sep='\n')

## ✅ Summary
- RMSE is reported instead of MSE because it is expressed in the same units as the price (₹ rather than ₹²), which makes the error easier to interpret.
- Linear Regression is easy to interpret, but Random Forest performed better here.
- **Random Forest RMSE: ~₹67k**, **R²: 0.94** — vs **Linear RMSE: ~₹81k**, **R²: 0.91**.
- Great for MLOps learners to compare models and understand trade-offs.

👉 Next steps: Add MLflow tracking, serve the model via FastAPI, or wrap it in Streamlit!