In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset as pandas objects; the `frame` entry is
# the DataFrame that later cells split into features and the MedHouseVal target.
data = fetch_california_housing(as_frame=True)
df = data["frame"]

# Preview the first five rows as this cell's output.
df.head(n=5)


In [None]:
# Plotting libraries are imported here, at first use, matching the per-cell
# import style of this notebook; without them `sns` and `plt` are undefined
# on a fresh kernel.
import matplotlib.pyplot as plt
import seaborn as sns

# A notebook only auto-renders a cell's *last* expression, so the summary table
# has to be displayed explicitly (`display` is provided by the IPython kernel).
display(df.describe())

# Annotated heatmap of the pairwise column correlations.
sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is the `MedHouseVal` column; every other column is a feature.
y = df.loc[:, "MedHouseVal"]
X = df.drop(columns=["MedHouseVal"])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions so the test set does not influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Train both regressors on the standardized training features. `fit` returns
# the estimator itself, so construction and training are chained.
lr = LinearRegression().fit(X_train_scaled, y_train)
rf = RandomForestRegressor(random_state=42, n_estimators=100).fit(
    X_train_scaled, y_train
)

# Trailing expression: keeps the fitted random forest as the cell's output.
rf


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate(model, name):
    """Print the test-set MSE and R2 of a fitted model.

    Predictions are computed on the notebook-level ``X_test_scaled`` and
    scored against the notebook-level ``y_test``.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: Label placed at the start of the printed line.

    Returns:
        None. The metrics are printed, not returned.
    """
    y_pred = model.predict(X_test_scaled)
    # Both metrics share the (y_true, y_pred) call signature.
    mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))
    print(f"{name} - MSE: {mse:.4f}, R2: {r2:.4f}")

# Report held-out metrics for each fitted model, linear baseline first.
evaluate(model=lr, name="Linear Regression")
evaluate(model=rf, name="Random Forest")


In [None]:
# Scatter of random-forest predictions against the true test targets.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, rf.predict(X_test_scaled), alpha=0.3)
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.set_title("Actual vs Predicted (Random Forest)")

# Red y = x reference line spanning the observed target range; points on it
# are perfect predictions.
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r')
plt.show()


In [None]:
from pathlib import Path

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (a no-op when it is already there).
MODEL_DIR = Path("../models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Persist the forest together with the scaler it was trained behind: the model
# was fit on scaled features, so inference must apply the same fitted scaler.
joblib.dump(rf, MODEL_DIR / "house_price_model.pkl")
joblib.dump(scaler, MODEL_DIR / "scaler.pkl")
