In [None]:
# notebooks/model_training/model_training.ipynb

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Presentation defaults for the notebook: seaborn's "whitegrid" plot theme and
# no limit on the number of columns shown when pandas prints a DataFrame.
sns.set(style="whitegrid")
pd.set_option('display.max_columns', None)

# Location of the preprocessed CSV (adjust the path if needed).
data_path = 'data/processed/cleaned_data.csv'

# Read the cleaned dataset into memory and print a summary of its columns.
df = pd.read_csv(filepath_or_buffer=data_path)
df.info()

# Name of the column to predict (adjust according to your dataset); every
# other column is treated as an input feature.
target_column = 'target'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler's statistics are learned from the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regressors, keyed by the name used in the printed reports.
models = {}
models["RandomForestRegressor"] = RandomForestRegressor(n_estimators=100, random_state=42)
models["LinearRegression"] = LinearRegression()

# Fit every candidate on the scaled training data and score it on the test set.
results = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")

    # scikit-learn's fit() returns the fitted estimator, so prediction can be
    # chained directly onto it.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Error metrics on the held-out split.
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    # Keep the scores so the models can be compared once the loop finishes.
    results[model_name] = dict(zip(
        ("Mean Squared Error", "Mean Absolute Error", "R^2 Score"),
        (mse, mae, r2),
    ))

    # One print call; the trailing "\n" argument leaves the same blank lines
    # between models as a separate print("\n") would.
    print(
        f"{model_name} Results:",
        f"  MSE: {mse:.4f}",
        f"  MAE: {mae:.4f}",
        f"  R^2: {r2:.4f}",
        "\n",
        sep="\n",
    )

# Tabulate the scores with one row per model and one column per metric.
results_df = pd.DataFrame(results).transpose()
print(results_df)

# Optional: exhaustive hyperparameter search for the random forest.
print("Tuning RandomForestRegressor...")

# Candidate values for each hyperparameter; every combination is evaluated.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validation on the scaled training data, using all CPU cores.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination.
print(f"Best parameters: {grid_search.best_params_}")

# best_estimator_ is the estimator the grid search refit with those parameters
# (GridSearchCV's default refit behaviour).
best_rf_model = grid_search.best_estimator_

# Score the tuned model on the held-out test set.
y_pred_tuned = best_rf_model.predict(X_test_scaled)
mse_tuned, mae_tuned, r2_tuned = (
    metric(y_test, y_pred_tuned)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    "\nTuned RandomForestRegressor Results:",
    f"  MSE: {mse_tuned:.4f}",
    f"  MAE: {mae_tuned:.4f}",
    f"  R^2: {r2_tuned:.4f}",
    sep="\n",
)

# Save the best model
import os

import joblib

model_save_path = 'outputs/model/best_rf_model.pkl'

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists; otherwise the whole (expensive) grid search above is
# lost to a FileNotFoundError at the very last step.
model_dir = os.path.dirname(model_save_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# NOTE(review): the model was trained on StandardScaler-transformed features;
# the fitted scaler is not persisted here, so it must be saved or recreated
# separately before this model is used on new data.
joblib.dump(best_rf_model, model_save_path)
print(f"Best model saved to {model_save_path}")

# Scatter of predicted against actual test values for the tuned model; points
# on the dashed red diagonal are perfect predictions.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred_tuned, color='blue')

# Diagonal reference line spanning the range of the actual values.
actual_range = [min(y_test), max(y_test)]
plt.plot(actual_range, actual_range, color='red', linestyle='--')

plt.xlabel('Actual values')
plt.ylabel('Predicted values')
plt.title('Tuned Model Predictions vs Actual')
plt.show()
