In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import time
import joblib
import pickle
import os 

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score,
    confusion_matrix, 
    classification_report
)

In [17]:
# Load the ordinal-encoded dataset. The path is assembled with os.path.join so
# the notebook also runs on non-Windows systems, where a backslash is not a
# path separator.
df = pd.read_csv(os.path.join('..', 'data', 'clean', 'ordinal_encoded.csv'))

# Features are every column except the target column 'class_value'.
X = df.drop('class_value', axis=1)
y = df['class_value']

# 80/20 split with a fixed seed, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Display names for the target classes.
# NOTE(review): assumes these follow the sorted order of the encoded
# class_value codes - confirm against the encoding step.
class_names = ['Unacceptable', 'Acceptable', 'Good', 'Very Good']
feature_names = X.columns.tolist()

# Fit the scaler on the training split only, then apply the same transform to
# the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Output folder for figures. The empty last component keeps a trailing path
# separator so the value can be used directly as a prefix.
graph_directory = os.path.join('..', 'graphs', '')

In [18]:
# Candidate classifiers, each paired with the hyperparameter grid that is
# searched when that classifier is tuned. Entries are iterated in insertion
# order by the evaluation helpers.
model_configs = {
    'Logistic Regression': dict(
        model=LogisticRegression(random_state=42),
        param_grid=dict(
            solver=['lbfgs', 'liblinear'],
            C=[0.1, 1, 10],
            max_iter=[500, 1000, 1500],
        ),
    ),
    'Naive Bayes': dict(
        model=GaussianNB(),
        # var_smoothing is the only GaussianNB setting searched here.
        param_grid=dict(
            var_smoothing=[1e-9, 1e-8, 1e-7],
        ),
    ),
    'SVC': dict(
        model=SVC(random_state=42),
        param_grid=dict(
            C=[0.1, 1, 10],
            kernel=['rbf', 'linear'],
            gamma=['scale', 'auto'],
        ),
    ),
    'KNN': dict(
        model=KNeighborsClassifier(),
        param_grid=dict(
            n_neighbors=[3, 5, 7],
            weights=['uniform', 'distance'],
            algorithm=['auto', 'ball_tree', 'kd_tree'],
        ),
    ),
    'Decision Tree': dict(
        model=DecisionTreeClassifier(random_state=42),
        param_grid=dict(
            criterion=['gini', 'entropy'],
            max_depth=[None, 10, 20, 30],
            min_samples_split=[2, 5, 10],
        ),
    ),
    'Random Forest': dict(
        model=RandomForestClassifier(random_state=42),
        param_grid=dict(
            n_estimators=[50, 100, 200],
            max_depth=[None, 10, 20],
            min_samples_split=[2, 5, 10],
        ),
    ),
}

In [15]:
def evaluate_model_metrics(model, X_train, X_test, y_train, y_test, model_name, class_names):
    """
    Fit a model on the training split and score its predictions on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place by this function.
    X_train, y_train : training features and labels passed to ``model.fit``.
    X_test, y_test : held-out features and labels used for scoring.
    model_name : str
        Label used in the printed output and stored in the returned metrics.
    class_names : sequence of str
        Display names passed as ``target_names`` to ``classification_report``.

    Returns
    -------
    tuple
        ``(metrics, y_pred)`` where ``metrics`` is a dict with the keys
        ``model_name``, ``training_time`` (seconds spent in ``fit``),
        ``accuracy``, ``precision``, ``recall`` and ``f1_score`` (the last
        three macro-averaged with ``zero_division=0``), and ``y_pred`` is the
        output of ``model.predict(X_test)``.
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # short durations; time.time() can be coarse and may jump if the system
    # clock is adjusted, which distorts the timing of fast-fitting models.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    y_pred = model.predict(X_test)

    # Calculate metrics
    metrics = {
        'model_name': model_name,
        'training_time': training_time,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='macro', zero_division=0),
        'recall': recall_score(y_test, y_pred, average='macro', zero_division=0),
        'f1_score': f1_score(y_test, y_pred, average='macro', zero_division=0)
    }

    # Print results: every entry except the name, with the timing shown in seconds
    print(f"\n{model_name} Model Performance:")
    for metric, value in metrics.items():
        if metric == 'model_name':
            continue
        if metric == 'training_time':
            print(f"{metric.capitalize()}: {value:.2f} seconds")
        else:
            print(f"{metric.capitalize()}: {value:.4f}")

    print("\nDetailed Classification Report:")
    print(classification_report(y_test, y_pred, target_names=class_names, zero_division=0))

    return metrics, y_pred

def plot_confusion_matrix(y_test, y_pred, model_name, class_names, graph_directory):
    """
    Draw the confusion matrix of one model as an annotated heatmap and show it.

    Rows are the actual labels and columns the predicted labels; both axes are
    ticked with ``class_names``. Saving to disk is currently disabled (the
    ``savefig`` call below is commented out), so ``graph_directory`` is unused.
    """
    plt.figure(figsize=(10, 7))

    matrix = confusion_matrix(y_test, y_pred)
    heatmap_options = dict(
        annot=True,      # write the count inside every cell
        fmt='d',         # counts are integers
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    sns.heatmap(matrix, **heatmap_options)

    plt.title(f'{model_name} - Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    # Disabled export of the figure:
    #plt.savefig(f"{graph_directory}confusion_matrices\\{model_name.lower()} confusion matrix")
    plt.show()

def plot_feature_importance(model, model_name, feature_names, graph_directory):
    """
    Plot per-feature importance for a fitted model, if the model exposes one.

    Importance is taken from ``model.coef_`` (mean absolute coefficient per
    feature across all rows of ``coef_``) or, failing that, from
    ``model.feature_importances_``. Saving to disk is currently disabled (the
    ``savefig`` call below is commented out), so ``graph_directory`` is unused.

    Parameters
    ----------
    model : fitted estimator.
    model_name : str
        Used in the plot title and in printed messages.
    feature_names : sequence of str
        One name per feature, in the model's feature order.
    graph_directory : str
        Unused while saving is disabled.

    Returns
    -------
    pandas.DataFrame or None
        Frame with ``feature`` and ``importance`` columns sorted by descending
        importance; ``None`` when the model exposes neither attribute or when
        computing/plotting fails (the error is printed, not raised).
    """
    try:
        if hasattr(model, 'coef_'):
            coefficients = model.coef_
            # Densify sparse coefficient matrices before doing array maths.
            if hasattr(coefficients, 'toarray'):
                coefficients = coefficients.toarray()
            # coef_ can hold several rows (e.g. one per class). Using only row 0
            # would describe a single class, so average the magnitudes over all
            # rows; for a single-row coef_ this equals np.abs(coef_[0]).
            feature_importance = np.abs(np.atleast_2d(coefficients)).mean(axis=0)
        elif hasattr(model, 'feature_importances_'):
            feature_importance = model.feature_importances_
        else:
            print(f"Model {model_name} doesn't support feature importance visualization")
            return None

        plt.figure(figsize=(10, 6))
        feature_importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': feature_importance
        }).sort_values('importance', ascending=False)

        sns.barplot(x='importance', y='feature', data=feature_importance_df)
        plt.title(f'{model_name} - Feature Importance')
        plt.xlabel('Importance')
        plt.ylabel('Features')
        plt.tight_layout()
        #plt.savefig(f"{graph_directory}feature_importance\\{model_name.lower()} feature importance")
        plt.show()

        return feature_importance_df
    except Exception as e:
        # Best-effort visualisation: report the problem and keep the notebook running.
        print(f"Could not plot feature importance: {e}")
        return None

def optimize_model(model, param_grid, X_train, y_train, cv_splits=5):
    """
    Tune a model's hyperparameters with an exhaustive grid search.

    Every combination in ``param_grid`` is cross-validated with shuffled,
    stratified folds (fixed seed 42) and scored by macro F1, using all
    available cores (``n_jobs=-1``).

    Parameters
    ----------
    model : estimator passed to ``GridSearchCV``.
    param_grid : dict
        Mapping of parameter name to the list of values to try.
    X_train, y_train : training features and labels used for the search.
    cv_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    tuple
        ``(best_estimator, best_params, best_score)`` taken from the fitted
        ``GridSearchCV`` object.
    """
    cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        scoring='f1_macro',
        n_jobs=-1
    )

    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # elapsed time; time.time() may jump if the system clock is adjusted.
    start_time = time.perf_counter()
    grid_search.fit(X_train, y_train)
    optimization_time = time.perf_counter() - start_time

    print("\nBest parameters:", grid_search.best_params_)
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")
    print(f"Optimization time: {optimization_time:.2f} seconds")

    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_

def evaluate_all_models(model_configs, X_train, X_test, y_train, y_test, class_names, feature_names, graph_directory):
    """
    Fit and evaluate every configured model and collect their metrics.

    For each entry of ``model_configs`` the model stored under ``'model'`` is
    passed to ``evaluate_model_metrics`` (which fits it in place), then its
    confusion matrix and, where supported, its feature importance are plotted.

    Parameters
    ----------
    model_configs : dict
        Maps a model name to a dict holding at least a ``'model'`` entry.
    X_train, X_test, y_train, y_test : train/test features and labels.
    class_names : sequence of str
        Display names for the target classes.
    feature_names : sequence of str
        Feature names used by the importance plot.
    graph_directory : str
        Forwarded to the plotting helpers.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by ``model_name``, with one column per
        metric. An empty ``model_configs`` yields an empty frame whose index is
        still named ``model_name`` instead of raising ``KeyError``.
    """
    all_metrics = []

    for model_name, config in model_configs.items():
        print(f"\nEvaluating {model_name}...")

        # Get the model from config
        model = config['model']

        # Evaluate model
        metrics, y_pred = evaluate_model_metrics(
            model, X_train, X_test, y_train, y_test, model_name, class_names
        )

        # Plot confusion matrix
        plot_confusion_matrix(
            y_test, y_pred, model_name, class_names, graph_directory
        )

        # Plot feature importance
        plot_feature_importance(
            model, model_name, feature_names, graph_directory
        )

        all_metrics.append(metrics)

    # With nothing evaluated there is no 'model_name' column to index by, so
    # build the empty result explicitly.
    if not all_metrics:
        return pd.DataFrame(index=pd.Index([], name='model_name'))

    # Create DataFrame with all metrics (no in-place mutation)
    return pd.DataFrame(all_metrics).set_index('model_name')

def optimize_best_model(best_model_name, model_configs, X_train, y_train):
    """
    Run the grid search for one configured model, looked up by name.

    The estimator and its parameter grid are read from
    ``model_configs[best_model_name]`` and handed to ``optimize_model``, whose
    result is returned unchanged.

    Raises
    ------
    ValueError
        If ``best_model_name`` is not a key of ``model_configs``.
    """
    is_known = best_model_name in model_configs
    if not is_known:
        raise ValueError(f"Model {best_model_name} not found in configurations")

    entry = model_configs[best_model_name]
    estimator, search_space = entry['model'], entry['param_grid']

    print(f"\nOptimizing {best_model_name}...")
    return optimize_model(estimator, search_space, X_train, y_train)

In [None]:
# Step 1: fit and score every configured model on the scaled splits.
metrics_df = evaluate_all_models(
    model_configs=model_configs,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    class_names=class_names,
    feature_names=feature_names,
    graph_directory=graph_directory,
)

# Step 2: the model with the highest macro F1 on the test split is the winner.
best_model_name = metrics_df['f1_score'].idxmax()
print(f"\nBest performing model: {best_model_name}")
print("\nAll models performance summary:")
print(metrics_df)

# Step 3: grid-search the winner's hyperparameters on the training split.
best_model, best_params, best_score = optimize_best_model(
    best_model_name=best_model_name,
    model_configs=model_configs,
    X_train=X_train_scaled,
    y_train=y_train,
)

# Step 4: score the tuned estimator (this fits it again on the training split).
final_metrics, final_predictions = evaluate_model_metrics(
    model=best_model,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name=f"Optimized {best_model_name}",
    class_names=class_names,
)

# Step 5: confusion matrix and feature importance of the tuned estimator.
plot_confusion_matrix(
    y_test=y_test,
    y_pred=final_predictions,
    model_name=f"Optimized {best_model_name}",
    class_names=class_names,
    graph_directory=graph_directory,
)

plot_feature_importance(
    model=best_model,
    model_name=f"Optimized {best_model_name}",
    feature_names=feature_names,
    graph_directory=graph_directory,
)