In [3]:
# Imports
import os  # Added to handle directory operations
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# --- Output location -------------------------------------------------------
# Fitted models are pickled into this directory. It is created up front
# (no error if it already exists) so the later writes cannot fail on a
# missing folder.
pickle_dir = '../model_pickles/'
os.makedirs(pickle_dir, exist_ok=True)

# --- Data ------------------------------------------------------------------
# Load the encoded dataset; it is assumed to be fully preprocessed already.
df_encoded = pd.read_csv('../data/processed/encoded_data.csv')

# 'price' is the regression target; every other column is used as a feature.
y = df_encoded['price']
X = df_encoded.drop(columns=['price'])

# Hold out 20% of the rows for testing. The fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Evaluation configuration ----------------------------------------------
# Number of folds used for cross-validation, and the per-model metrics table
# (keyed by model display name) that the training loop fills in.
cv_folds = 5
model_performance = {}

# --- Candidate models ------------------------------------------------------
# Display name -> unfitted estimator. Insertion order determines the order in
# which models are trained and reported.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(
        n_estimators=100,
        random_state=42,
    ),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=100,
        random_state=42,
    ),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'K-Nearest Neighbors': KNeighborsRegressor(n_neighbors=5),
}

# Train, cross-validate, evaluate, and save each model.
#
# For every (name, estimator) pair in `models` this loop:
#   1. fits the estimator on the training split and scores it on the test split,
#   2. computes k-fold cross-validated MSE / MAE / R^2,
#   3. records all six metrics in `model_performance[name]`,
#   4. pickles the fitted estimator into `pickle_dir`,
#   5. prints a short report.

# Metrics computed during cross-validation. scikit-learn reports error metrics
# negated ("neg_*") so that higher is always better; they are flipped back to
# positive values below.
cv_scoring = ('neg_mean_squared_error', 'neg_mean_absolute_error', 'r2')

for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate using test set
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Cross-validation.
    # A single cross_validate call scores all three metrics from the same fold
    # fits, instead of re-fitting every fold once per metric with three
    # separate cross_val_score calls. The folds and resulting numbers are the
    # same; only the redundant fitting work is removed.
    # cross_validate works on clones of the estimator, so the `model` fitted
    # on X_train above is left untouched and is what gets pickled below.
    # NOTE(review): CV runs on the full X, y (test rows included), so the
    # CV-based model selection is not independent of the test split. Confirm
    # whether CV should be restricted to X_train / y_train.
    cv_results = cross_validate(model, X, y, cv=cv_folds, scoring=cv_scoring)
    cv_mse = -cv_results['test_neg_mean_squared_error'].mean()
    cv_mae = -cv_results['test_neg_mean_absolute_error'].mean()
    cv_r2 = cv_results['test_r2'].mean()

    # Store performance
    model_performance[name] = {
        'Test MSE': mse, 'CV MSE': cv_mse,
        'Test MAE': mae, 'CV MAE': cv_mae,
        'Test R^2': r2, 'CV R^2': cv_r2
    }

    # Define the file path for saving the model, e.g.
    # 'Random Forest' -> '<pickle_dir>/random_forest_model.pkl'
    model_filename = f"{name.replace(' ', '_').lower()}_model.pkl"
    model_filepath = os.path.join(pickle_dir, model_filename)

    # Save the trained model
    with open(model_filepath, 'wb') as file:
        pickle.dump(model, file)

    # Print results
    print(f"Model: {name}")
    print(f"Test Mean Squared Error: {mse:.2f}")
    print(f"Cross-Validated Mean Squared Error: {cv_mse:.2f}")
    print(f"Test Mean Absolute Error: {mae:.2f}")
    print(f"Cross-Validated Mean Absolute Error: {cv_mae:.2f}")
    print(f"Test R^2 Score: {r2:.2f}")
    print(f"Cross-Validated R^2 Score: {cv_r2:.2f}\n")

# Select the best model, reload it from disk, and run an example prediction.

# Find the best-performing model based on cross-validated R^2
# (on a tie, max() keeps the first model in insertion order).
best_model_name = max(model_performance, key=lambda x: model_performance[x]['CV R^2'])
print(f"Best Model: {best_model_name} with Cross-Validated R^2 Score: {model_performance[best_model_name]['CV R^2']:.2f}")

# Define the file path for loading the best model; this must mirror the
# naming scheme used when the models were saved.
best_model_filename = f"{best_model_name.replace(' ', '_').lower()}_model.pkl"
best_model_filepath = os.path.join(pickle_dir, best_model_filename)

# Load the best model for deployment.
# SECURITY: pickle.load executes arbitrary code embedded in the file. This is
# fine for a file this script has just written, but never point it at a
# pickle from an untrusted source.
with open(best_model_filepath, 'rb') as file:
    best_model = pickle.load(file)

# Example prediction using the best model.
# Select the first test row as a one-row DataFrame (note the double brackets)
# rather than a bare NumPy array: the models were fitted on a DataFrame, so
# keeping the column names avoids scikit-learn's "X does not have valid
# feature names" warning, and the per-column dtypes are preserved instead of
# being collapsed into a single array dtype.
example_features = X_test.iloc[[0]]
predicted_price = best_model.predict(example_features)
print(f"Predicted price for example features: {predicted_price[0]:.2f}")

Model: Linear Regression
Test Mean Squared Error: 1560.91
Cross-Validated Mean Squared Error: 1289.29
Test Mean Absolute Error: 4.26
Cross-Validated Mean Absolute Error: 4.04
Test R^2 Score: 0.99
Cross-Validated R^2 Score: 0.99

Model: Random Forest
Test Mean Squared Error: 1178.58
Cross-Validated Mean Squared Error: 1292.51
Test Mean Absolute Error: 4.15
Cross-Validated Mean Absolute Error: 4.03
Test R^2 Score: 0.99
Cross-Validated R^2 Score: 0.99

Model: Gradient Boosting
Test Mean Squared Error: 1323.61
Cross-Validated Mean Squared Error: 1171.38
Test Mean Absolute Error: 5.38
Cross-Validated Mean Absolute Error: 4.98
Test R^2 Score: 0.99
Cross-Validated R^2 Score: 0.99

Model: Decision Tree
Test Mean Squared Error: 1866.12
Cross-Validated Mean Squared Error: 1811.60
Test Mean Absolute Error: 4.14
Cross-Validated Mean Absolute Error: 3.92
Test R^2 Score: 0.98
Cross-Validated R^2 Score: 0.98

Model: K-Nearest Neighbors
Test Mean Squared Error: 129426.01
Cross-Validated Mean Squared E

