In [None]:
import numpy as np
import pandas as pd
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict

#### Load Data

In [None]:
# Read the raw house-price table from the CSV that sits next to the notebook.
df = pd.read_csv("House-Price.csv")

print("📊 Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the summary.
df.info()
print(df.head())

#### EDA

In [None]:
# Keep only the numeric columns, then draw their pairwise correlation matrix
# as an annotated heatmap.
numeric_df = df.select_dtypes(include="number")

plt.figure(figsize=(12, 10))
sns.heatmap(
    data=numeric_df.corr(),
    cmap="coolwarm",
    annot=True,
    linewidths=0.5,
)
plt.title("Heatmap - Correlation Between Numeric Features in House Price Data")
plt.show()

# Feature Selection
# Predictor columns fed to every model below; the target is the `price` column.
features = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_above",
    "floors",
]
# Plain arrays: X has one column per entry of `features`, in the same order.
X = df[features].values
y = df["price"].values

#### Visualization

In [None]:
# One scatter plot per selected feature against the target.
# X.T yields the columns of X one at a time, in the same order as `features`.
for feat_name, feat_values in zip(features, X.T):
    plt.scatter(feat_values, y, alpha=0.5)
    plt.title(f"Price vs {feat_name}")
    plt.xlabel(feat_name)
    plt.ylabel("Price")
    plt.show()

#### Train/Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")

##### Evaluation

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test, name="Model"):
    """Score a fitted regressor on both splits and plot its test predictions.

    Prints RMSE, MAE and R² for the train and test sets, then shows an
    actual-vs-predicted scatter for the test set with a dashed red identity
    line spanning the range of ``y_test``.

    Parameters
    ----------
    model : object
        Already-fitted estimator exposing ``predict(X)``.
    X_train, y_train : array-like
        Training features and targets.
    X_test, y_test : array-like
        Held-out features and targets.
    name : str, default "Model"
        Label used in the printed header and in the plot title.

    Returns
    -------
    dict
        Maps each metric label ("Train RMSE", "Test RMSE", "Train MAE",
        "Test MAE", "Train R²", "Test R²") to its value.
    """
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    metrics = {
        # RMSE is the square root of the MSE, so it is in the target's units.
        "Train RMSE": np.sqrt(mean_squared_error(y_train, y_train_pred)),
        "Test RMSE": np.sqrt(mean_squared_error(y_test, y_test_pred)),
        "Train MAE": mean_absolute_error(y_train, y_train_pred),
        "Test MAE": mean_absolute_error(y_test, y_test_pred),
        "Train R²": r2_score(y_train, y_train_pred),
        "Test R²": r2_score(y_test, y_test_pred),
    }

    print(f"\n📊 {name} Evaluation:")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    # Plot
    # Compute the identity-line endpoints once with NumPy reductions instead
    # of running the built-in min()/max() twice each over the array.
    y_low, y_high = np.min(y_test), np.max(y_test)

    plt.figure(figsize=(6,6))
    plt.scatter(y_test, y_test_pred, alpha=0.6, color='blue')
    # Points on this dashed line are perfect predictions (predicted == actual).
    plt.plot([y_low, y_high], [y_low, y_high], 'r--')
    plt.xlabel("Actual Price")
    plt.ylabel("Predicted Price")
    plt.title(f"Actual vs Predicted - {name}")
    plt.show()

    return metrics

##### Fit Model

In [None]:
# Baseline: ordinary least squares on the raw features (fit() returns the estimator itself).
lr = LinearRegression().fit(X_train, y_train)
metrics_lr = evaluate_model(
    lr,
    X_train, y_train,
    X_test, y_test,
    name="Linear Regression",
)

In [None]:
# Same features with an L2 penalty of strength alpha=1.0 (fit() returns the estimator itself).
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
metrics_ridge = evaluate_model(
    ridge,
    X_train, y_train,
    X_test, y_test,
    name="Ridge Regression",
)

##### Hyperparameter Search with Optuna

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold CV MSE of a polynomial Ridge pipeline.

    Samples the polynomial degree (integer in [1, 5]) and the Ridge penalty
    ``alpha`` (log-uniform in [1e-4, 10]) from ``trial``, builds a
    poly -> scaler -> ridge pipeline, and cross-validates it on the
    notebook-level ``X_train`` / ``y_train``.

    Returns the mean squared error averaged over the folds (lower is better).
    """
    poly_degree = trial.suggest_int("degree", 1, 5)
    ridge_alpha = trial.suggest_float("alpha", 1e-4, 10.0, log=True)

    steps = [
        ("poly", PolynomialFeatures(degree=poly_degree, include_bias=False)),
        ("scaler", StandardScaler()),
        ("reg", Ridge(alpha=ridge_alpha)),
    ]
    pipeline = Pipeline(steps)

    # scikit-learn scorers follow "higher is better", so the MSE comes back
    # negated; flip the sign to hand Optuna a plain MSE.
    neg_mse_per_fold = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=5,
        scoring="neg_mean_squared_error",
    )
    return -neg_mse_per_fold.mean()

# Seed the sampler so that Restart & Run All proposes the same trials and
# reports the same best parameters; an unseeded sampler draws different
# hyperparameters on every run. TPE is Optuna's default single-objective
# sampler, so only the seed changes. 42 matches the train/test split seed.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("\n🏆 Best Params:", study.best_params)
print("Best CV MSE:", study.best_value)

In [None]:
# Rebuild the poly -> scaler -> ridge pipeline with the winning trial's hyperparameters.
best_params = study.best_params
best_model = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(include_bias=False, degree=best_params["degree"])),
        ("scaler", StandardScaler()),
        ("reg", Ridge(alpha=best_params["alpha"])),
    ]
)

In [None]:
# Fit the tuned pipeline on the full training split, then score it on both splits.
best_model.fit(X_train, y_train)
metrics_best = evaluate_model(
    best_model,
    X_train, y_train,
    X_test, y_test,
    name=f"Ridge Poly (deg={best_params['degree']}, alpha={best_params['alpha']:.4f})",
)
