# === 1. Imports ===
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from src.load_data import load_train_data, save_processed_data
from src.preprocess.preprocessing_pipeline import PreprocessingPipeline

In [None]:
# === 2. Load data ===
# Fetch the training DataFrame through the project loader (src.load_data).
# Later cells expect it to contain a "price" column, which is used as the target.
df = load_train_data()

In [None]:
# === 3. Separate the raw target from the raw features (no preprocessing yet) ===
# "price" is the regression target; every other column is a candidate feature.
y_raw = df["price"]
X_raw = df.drop("price", axis=1)

In [None]:
# === 4. Hold out 20% of the raw rows for testing ===
# Splitting happens before any preprocessing so the pipeline is fitted on
# the training portion only; the fixed seed keeps the split reproducible.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw,
    y_raw,
    random_state=42,
    test_size=0.2,
)

In [None]:
# === 4a. Fit the preprocessing pipeline on the training split ===
# The pipeline receives features and target in one frame, so re-attach
# "price" to a fresh copy of the training features.
train_df = X_train_raw.assign(price=y_train_raw)

pipeline = PreprocessingPipeline(
    encode_data=True,
    drop_low_importance=False,
    use_log_target=True,
)
train_processed = pipeline.fit_transform(train_df)

# The model is trained against "log_price"; neither target column may
# remain among the features.
y_train = train_processed["log_price"]
X_train = train_processed.drop(columns=["price", "log_price"], errors="ignore")

In [None]:
# === 4b. Preprocess test data (using fitted pipeline) ===
# Re-attach the target so the test frame carries the same "price" column
# the training frame had when the pipeline was fitted.
test_df = X_test_raw.copy()
test_df["price"] = y_test_raw
# Call transform (not fit_transform) so the pipeline object fitted on the
# training split is reused as-is.
test_processed = pipeline.transform(test_df)
X_test = test_processed.drop(columns=["price", "log_price"], errors='ignore')

# y_test must always be on the log scale: the evaluation cell applies
# np.expm1 to it unconditionally. Falling back to the raw price here would
# make that step exponentiate real prices and produce meaningless metrics.
if "log_price" in test_processed.columns:
    y_test = test_processed["log_price"]
else:
    # NOTE(review): assumes the pipeline's log target is log1p(price), which
    # matches the expm1 inverse used at evaluation time -- confirm against
    # PreprocessingPipeline. Also assumes transform() did not drop rows.
    y_test = np.log1p(y_test_raw)


In [None]:
# === 5. Fit an ordinary least-squares model on the log-price target ===
# fit() returns the estimator itself, so construction and training chain.
lr_model = LinearRegression().fit(X_train, y_train)

In [None]:
# === 6. Predict on the held-out split and undo the log transform ===
# The model outputs log-scale values; expm1 maps both the predictions and
# the reference targets back to the original scale for evaluation.
y_pred_log = lr_model.predict(X_test)
y_pred, y_true = np.expm1(y_pred_log), np.expm1(y_test)

In [None]:
# Score the back-transformed predictions against the back-transformed targets.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises
# TypeError. The explicit square root works on every version.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print(f"Linear Regression RMSE: ${rmse:.2f}")
# Label previously printed as mojibake ("RÂ²") due to a mis-decoded "²".
print(f"Linear Regression R² Score: {r2:.3f}")

# === 7. Save model (optional) ===
# import joblib
# joblib.dump(lr_model, "models/linear_regression.joblib")