In [None]:
import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

# Suppress every warning emitted from this point on.
warnings.filterwarnings('ignore')

# Read the preprocessed training set from disk.
train_data = pd.read_csv('../data/Processed/processed_train_data.csv').copy()

# --- 1. Features and target -------------------------------------------------
# Target is SalePrice; predictors are all float64 / int64 columns except the target.
y = train_data['SalePrice']
X = (
    train_data
    .select_dtypes(include=['float64', 'int64'])
    .drop(columns=['SalePrice'])
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- 2. Baseline model ------------------------------------------------------
# Random forest with 100 trees, fitted on the training split (fit() returns the estimator).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# --- 3. SHAP values ---------------------------------------------------------
# Per-feature contributions of the fitted forest, computed on the training rows.
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_train)

# --- 4. Visualisation -------------------------------------------------------
# Summary plot of the SHAP values against the training features.
shap.summary_plot(shap_values, X_train)

In [None]:
# Retrain a random forest on a fixed subset of 25 features and report its test error.
#
# NOTE(review): the list below is hard-coded; presumably it was transcribed from the
# SHAP summary of the full model -- confirm it is still current if the data changes.
top_25_features = [
    'Overall_Quality_Impact', 'Overall_Quality', 'Neighborhood_avg_price', 'TotalBsmtSF',
    'BsmtQual_to_BsmtFinSF', 'TotalOutdoorArea', 'OverallQual', '2ndFlrSF', '1stFlrSF',
    'BsmtFinSF1', 'YearBuilt', 'LotArea', 'Garage_Capacity_per_Square_Meter',
    'FireplaceQu_OverallQuality_Interaction', 'GrLivArea', 'GarageCars', 'OpenPorchSF',
    'PavedDrive_LotFrontage_Interaction', 'BsmtUnfSF', 'YearRemodAdd', 'BsmtFinSF_to_TotalArea',
    'Garage_Feature', 'GarageYrBlt', 'LotFrontage', 'Functional_OverallQuality_Interaction'
]

# Build the reduced train/test matrices from `train_data`, addressed by the row labels
# of the current split. Selecting through the row index (rather than slicing the columns
# of X_train / X_test directly) keeps this cell re-runnable after later cells rebind
# X_train / X_test to a narrower set of columns, and keeps the rows aligned with
# y_train / y_test.
X_train_25 = train_data.loc[X_train.index, top_25_features]
X_test_25 = train_data.loc[X_test.index, top_25_features]

# Same hyper-parameters and seed as the full model, fitted on the 25-feature matrix.
rf_model_25 = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_25.fit(X_train_25, y_train)

# Predict on the held-out rows.
y_pred_25 = rf_model_25.predict(X_test_25)

# Error metrics on the test split: MSE, RMSE (square root of MSE) and MAE.
# mean_absolute_error comes from the notebook's top import cell.
mse_25 = mean_squared_error(y_test, y_pred_25)
rmse_25 = np.sqrt(mse_25)
mae_25 = mean_absolute_error(y_test, y_pred_25)

print(f"Mean Squared Error (MSE): {mse_25}")
print(f"Root Mean Squared Error (RMSE): {rmse_25}")
print(f"Mean Absolute Error (MAE): {mae_25}")

# Feature importances of the retrained forest. The names are taken from the frame the
# model was fitted on, so each name is guaranteed to line up with its importance value.
importances_25 = rf_model_25.feature_importances_
importances_25_df = pd.DataFrame({
    'Feature': list(X_train_25.columns),
    'Importance': importances_25
})

# Most important features first.
importances_25_sorted = importances_25_df.sort_values(by='Importance', ascending=False)

print("\nTop 25 Important Features:\n", importances_25_sorted)

In [None]:
# Run a TPOT pipeline search on a reduced feature set, evaluate the best pipeline on
# the held-out split and persist it to disk.
#
# NOTE: despite the variable name, this list holds 21 column names (the first 21
# entries of the 25-feature list). The name is kept unchanged because the cell that
# reloads the saved model refers to `top_20_features`.
top_20_features = [
    'Overall_Quality_Impact', 'Overall_Quality', 'Neighborhood_avg_price', 'TotalBsmtSF',
    'BsmtQual_to_BsmtFinSF', 'TotalOutdoorArea', 'OverallQual', '2ndFlrSF', '1stFlrSF',
    'BsmtFinSF1', 'YearBuilt', 'LotArea', 'Garage_Capacity_per_Square_Meter',
    'FireplaceQu_OverallQuality_Interaction', 'GrLivArea', 'GarageCars', 'OpenPorchSF',
    'PavedDrive_LotFrontage_Interaction', 'BsmtUnfSF', 'YearRemodAdd', 'BsmtFinSF_to_TotalArea'
]

# Select features / target and make the same 80-20 split (same seed) as the other cells.
X = train_data[top_20_features]  # Input features
y = train_data['SalePrice']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Make sure the output directory exists BEFORE starting the search, so a missing
# folder cannot make joblib.dump fail only after the multi-hour optimisation has run.
# NOTE(review): the file name is fixed and does not depend on which pipeline TPOT
# actually selects.
model_path = "../models/stacked_extra_trees_regressor_model.pkl"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

# TPOTRegressor setup for automatic model optimization
tpot = TPOTRegressor(
    generations=20,
    population_size=100,
    random_state=42,
    max_time_mins=180,  # Overall search budget: 3 hours
    max_eval_time_mins=20,  # Upper bound (minutes) on evaluating a single pipeline
    n_jobs=-1,  # Use all processor cores
    verbosity=3,  # Detailed output
    crossover_rate=0.95,  # High crossover rate
    mutation_rate=0.05,  # Low mutation rate
    subsample=0.9,  # Fraction of the training rows used during the search
    early_stop=10  # Stop after 10 generations without improvement
)

# Run the search on the training split.
tpot.fit(X_train, y_train)

# Predict on the held-out rows with the best pipeline found.
y_pred = tpot.predict(X_test)

# Error metrics on the test split: MSE, RMSE (square root of MSE) and MAE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")

# Show the best pipeline found by TPOT.
print(tpot.fitted_pipeline_)

# Persist the fitted pipeline.
joblib.dump(tpot.fitted_pipeline_, model_path)
print(f"Model saved to {model_path}")

In [None]:
# Restore the pipeline that was written to disk with joblib.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
model = joblib.load('../models/stacked_extra_trees_regressor_model.pkl')

# Re-read the processed training set; .copy() returns an independent frame.
train_data = pd.read_csv('../data/Processed/processed_train_data.csv').copy()

# Predictors are the columns named in `top_20_features`; the target is SalePrice.
X = train_data[top_20_features]
y = train_data['SalePrice']

# 80% of the rows for training, 20% held out; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# The loaded pipeline is fitted again here on the training split before it is scored.
model.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

# R² on the test split: the share of the target's variance explained by the model
# (higher is better).
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"R² Score: {r2}")