In [None]:
# Train a Random Forest classifier on the initial version of the data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Define the path to the dataset
# NOTE: this is an absolute path into one user's home directory; update it
# before running the script on another machine.
DATA_FILE = "/Users/idrissdjiofack/Desktop/CU Boulder Projects/Spring2025/Big-Data-Analytics/machine_learning/data_warehouse/traffic_cleaned_data.csv"

def preprocess_data(data_file):
    """
    Preprocess the dataset for training.
    - Drops rows whose target (Congestion_Level) is missing.
    - Fills missing values in the known numeric columns.
    - Encodes the target variable as integers.
    - Standardizes every remaining feature column.

    Args:
        data_file (str): Path to the dataset file (CSV).

    Returns:
        X (DataFrame): Standardized features, sharing its index with ``y``.
        y (Series): Integer-encoded target variable.
        label_encoder (LabelEncoder): Encoder fitted on the target; its
            ``classes_`` attribute maps each integer code back to the
            original label.

    Raises:
        ValueError: If no rows with a target value are available.
        KeyError: If the "Congestion_Level" or "Location" column is absent.
    """
    print(f"📂 Loading data from: {data_file}")
    df = pd.read_csv(data_file)
    print(f"✅ Data loaded successfully with {len(df)} rows and {len(df.columns)} columns.")

    # A row without a target cannot be used for supervised training, and a
    # NaN label would otherwise be encoded as a class of its own.
    rows_before = len(df)
    df = df.dropna(subset=["Congestion_Level"])
    dropped = rows_before - len(df)
    if dropped:
        print(f"⚠️ Dropped {dropped} rows with a missing Congestion_Level.")

    if df.empty:
        raise ValueError(f"No usable rows found in {data_file}")

    # Handle missing values (assignment instead of inplace=True so we never
    # mutate a frame that may be a view of another one)
    df = df.fillna({
        "v_Vel": 0.0,
        "v_Acc": 0.0,
        "Lane_ID": -1,
        "Section_ID": -1,
        "Space_Headway": 0.0,
        "Time_Headway": 0.0,
    })

    # Encode the target variable (Congestion_Level)
    label_encoder = LabelEncoder()
    y = pd.Series(
        label_encoder.fit_transform(df["Congestion_Level"]),
        index=df.index,
        name="Congestion_Level",
    )

    # Define features (X): drop the target and the non-numeric Location column
    X = df.drop(columns=["Congestion_Level", "Location"])

    # Scale numerical features.
    # NOTE: the scaler is fitted on the whole dataset (before any train/test
    # split) and is not returned, so it cannot be reused on new data from here.
    scaler = StandardScaler()
    # Keep the original index so X and y stay aligned label-wise.
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    return X, y, label_encoder

def train_random_forest(X, y, class_names=None):
    """
    Train a Random Forest Classifier and evaluate it.

    The data is split 80/20 into train and test sets, a 100-tree forest is
    fitted, and a classification report, accuracy score and confusion matrix
    (saved to ``confusion_matrix.png``) are produced from the test set.

    Args:
        X (DataFrame): Features.
        y (Series): Target variable.
        class_names (sequence, optional): Display name for each class, in the
            order of the sorted unique values of ``y``. For a target encoded
            with ``LabelEncoder`` this is ``label_encoder.classes_``. When
            omitted, the string form of each value in ``y`` is used.

    Returns:
        model (RandomForestClassifier): Trained model.

    Raises:
        ValueError: If ``class_names`` does not contain exactly one name per
            distinct value in ``y``.
    """
    # Resolve the class labels up front so a bad class_names fails before
    # the (potentially slow) training step.
    labels = np.unique(y)
    if class_names is None:
        display_names = [str(label) for label in labels]
    else:
        display_names = [str(name) for name in class_names]
        if len(display_names) != len(labels):
            raise ValueError(
                f"class_names has {len(display_names)} entries but y contains "
                f"{len(labels)} distinct classes"
            )

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the Random Forest Classifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate the model. Passing labels= keeps the report rows and the
    # matrix axes in the same order as display_names, even if a class is
    # missing from the test split.
    y_pred = model.predict(X_test)
    print("✅ Model Evaluation:")
    print(classification_report(y_test, y_pred, labels=labels, target_names=display_names))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

    # Generate and display the confusion matrix
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=display_names,
        yticklabels=display_names,
    )
    plt.title("Confusion Matrix", fontsize=16)
    plt.xlabel("Predicted Label", fontsize=12)
    plt.ylabel("True Label", fontsize=12)
    plt.tight_layout()
    plt.savefig("confusion_matrix.png")
    print("📊 Confusion matrix saved as confusion_matrix.png")
    plt.show()

    return model

if __name__ == "__main__":
    # Preprocess the data
    X, y, label_encoder = preprocess_data(DATA_FILE)

    # Train the Random Forest model. LabelEncoder assigns codes in the sorted
    # order of the original labels, so classes_[i] is the name of class i;
    # using it keeps the report and heatmap labels correct.
    model = train_random_forest(X, y, class_names=label_encoder.classes_)

    # Save the trained model and label encoder for future use
    import joblib
    joblib.dump(model, "./random_forest_model.pkl")
    joblib.dump(label_encoder, "./label_encoder.pkl")
    print("✅ Model and label encoder saved successfully.")
