In [None]:
!pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install -q pandas matplotlib seaborn scikit-learn xgboost joblib tqdm opencv-python-headless pillow timm

In [None]:
!pip install numpy==1.26.4

In [None]:
import os
# Disable albumentations update warning
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as img
from PIL import Image
import cv2
from pathlib import Path
from typing import Dict, List, Tuple
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tqdm import tqdm

# [MỚI] Thêm thư viện cho benchmark
import time
import pandas as pd

# Import các mô hình Machine Learning của SKLEARN
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import joblib  # Thêm thư viện để lưu/tải mô hình

# [MỚI] Thử import thư viện CUML (RAPIDS) và CUPY
# Chúng sẽ chỉ được sử dụng nếu import thành công
try:
    import cuml
    import cupy as cp
    cuml_available = True
    print("Tìm thấy thư viện CUML và CUPY. Sẽ chạy benchmark trên GPU.")
except ImportError:
    cuml_available = False
    print("Không tìm thấy CUML. Sẽ chỉ chạy các mô hình Sklearn trên CPU.")

class Config:
    """Configuration for the model and training."""
    def __init__(self):
        # Basic configurations
        self.seed = 42
        self.image_size = 448
        self.batch_size = 16 
        self.num_workers = 2
        self.model_name = 'tradition_features' 
        
        # [MỚI] Tự động phát hiện thiết bị (mặc dù không dùng cho CNN nữa,
        # nhưng tốt để biết có GPU hay không)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
        # Paths
        self.data_dir = Path("/kaggle/input/orange-dataset/Processed_data")
        self.output_dir = Path("/kaggle/working")
        
        # Categories
        self.categories = ['citrus canker', 'healthy', 'melanose']
        self.num_classes = len(self.categories)
        
        # Create output directory
        os.makedirs(self.output_dir, exist_ok=True)

# ==============================================================================
# CÁC HÀM TIỆN ÍCH (Giữ nguyên)
# ==============================================================================
def plot_confusion_matrix(y_true: List[int], y_pred: List[int], class_names: List[str], save_path: str, model_name: str):
    """Plots and saves the confusion matrix for a specific model."""
    plt.rcParams.update({'font.size': 18})
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, annot_kws={"size": 18})
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()
    print(f"Confusion matrix for {model_name} saved to {save_path}")

def prepare_data(config: Config) -> Tuple[List, List]:
    """Prepare data paths and labels."""
    images, labels = [], []
    label_dict = {cat: idx for idx, cat in enumerate(config.categories)}
    
    print("Loading data paths...")
    for category in config.categories:
        category_path = config.data_dir / category
        image_paths = list(category_path.glob('*.jpg'))
        print(f"{category}: {len(image_paths)} images")
        images.extend(image_paths)
        labels.extend([label_dict[category]] * len(image_paths))
    
    return images, labels

# ==============================================================================
# HÀM TRÍCH XUẤT ĐẶC TRƯNG (Giữ nguyên)
# ==============================================================================
def get_lbp_features(image: np.ndarray, num_points: int = 24, radius: int = 8) -> np.ndarray:
    """Calculates Local Binary Patterns (LBP) features."""
    try:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        lbp = cv2.LBP(gray, radius, num_points, method=cv2.LBP_DEFAULT)
        hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, num_points + 3), range=(0, num_points + 2))
        hist = hist.astype("float")
        hist /= (hist.sum() + 1e-6)
        return hist
    except:
        return np.zeros(num_points + 2)

def get_hog_features(image: np.ndarray) -> np.ndarray:
    """Calculates Histogram of Oriented Gradients (HOG) features."""
    try:
        resized_image = cv2.resize(image, (64, 128)) 
        hog = cv2.HOGDescriptor()
        h = hog.compute(resized_image)
        return h.flatten()
    except:
        return np.zeros(3780) 

def get_color_histogram(image: np.ndarray) -> np.ndarray:
    """Calculates color histogram features (3 channels)."""
    try:
        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        histograms = []
        for i in range(3):
            hist = cv2.calcHist([hsv], [i], None, [256], [0, 256])
            hist = cv2.normalize(hist, hist).flatten()
            histograms.append(hist)
        return np.concatenate(histograms)
    except:
        return np.zeros(256 * 3)

def extract_traditional_features(images: List[Path]) -> np.ndarray:
    """
    Extracts HOG, LBP, and Color Histogram features for a list of images.
    """
    features = []
    print("\nExtracting traditional features...")
    for image_path in tqdm(images):
        try:
            image = cv2.imread(str(image_path))
            if image is None:
                continue

            hog_features = get_hog_features(image)
            lbp_features = get_lbp_features(image)
            color_hist = get_color_histogram(image)

            combined_feature = np.concatenate([hog_features, lbp_features, color_hist])
            features.append(combined_feature)
        except Exception as e:
            print(f"Error processing image {image_path}: {e}")
            
    # [MỚI] Đảm bảo trả về kiểu float32 để tương thích tốt với cả sklearn và cuml
    return np.array(features, dtype=np.float32)

# [MỚI] Hàm để tạo biểu đồ benchmark
def plot_benchmark_results(df: pd.DataFrame, output_dir: Path):
    """Plots and saves benchmark results for time and accuracy."""
    
    # 1. Biểu đồ thời gian (Fit & Predict)
    try:
        df_melted = df.melt(id_vars=['model', 'library'], value_vars=['fit_time', 'pred_time'],
                            var_name='metric', value_name='time_seconds')
        
        plt.figure(figsize=(18, 10))
        sns.catplot(
            data=df_melted, 
            x='model', 
            y='time_seconds', 
            hue='library', 
            col='metric', 
            kind='bar', 
            sharey=False
        )
        plt.suptitle('Benchmark Thời Gian (Fit vs Predict)', y=1.05)
        time_plot_path = output_dir / "benchmark_times.png"
        plt.savefig(time_plot_path)
        plt.close()
        print(f"Biểu đồ benchmark thời gian đã lưu tại: {time_plot_path}")

        # 2. Biểu đồ độ chính xác
        plt.figure(figsize=(14, 7))
        sns.barplot(data=df, x='model', y='accuracy', hue='library')
        plt.title('Benchmark Độ Chính Xác (Accuracy)')
        plt.ylabel('Accuracy')
        plt.xlabel('Model')
        plt.legend(title='Library')
        plt.tight_layout()
        acc_plot_path = output_dir / "benchmark_accuracy.png"
        plt.savefig(acc_plot_path)
        plt.close()
        print(f"Biểu đồ benchmark độ chính xác đã lưu tại: {acc_plot_path}")
        
    except Exception as e:
        print(f"Lỗi khi vẽ biểu đồ benchmark: {e}")


def main():
    global cuml_available
    """Main function to execute the feature extraction and ML model training pipeline."""
    config = Config()
    
    # [MỚI] Khởi tạo danh sách lưu kết quả benchmark
    benchmark_results = []
    
    # Set random seeds for reproducibility
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    
    # 1. Chuẩn bị dữ liệu
    images, labels = prepare_data(config)
    
    # Chia dữ liệu
    X_train, X_temp, y_train, y_temp = train_test_split(
        images, labels, test_size=0.3, random_state=config.seed, stratify=labels
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=config.seed, stratify=y_temp
    )
    
    print(f"\nDataset splits:")
    print(f"Train: {len(X_train)} images")
    print(f"Val:   {len(X_val)} images")
    print(f"Test:  {len(X_test)} images")

    # Kết hợp train và val để có tập huấn luyện lớn hơn
    train_val_images = X_train + X_val
    train_val_labels = y_train + y_val
    
    # 2. Trích xuất đặc trưng truyền thống
    X_train_features = extract_traditional_features(train_val_images)
    X_test_features = extract_traditional_features(X_test)
    
    # [MỚI] Chuyển y_train và y_test sang numpy array để dùng chung
    y_train_cpu = np.array(train_val_labels, dtype=np.int32)
    y_test_cpu = np.array(y_test, dtype=np.int32)
    
    print(f"\nFeature extraction complete.")
    print(f"Train features shape: {X_train_features.shape}")
    print(f"Test features shape:  {X_test_features.shape}")
    
    # 3. [MỚI] Định nghĩa các mô hình
    
    # SKLEARN (CPU)
    sklearn_models = {
        'SVM_Linear': SVC(kernel='linear', random_state=config.seed, probability=True),
        'SVM_RBF': SVC(kernel='rbf', random_state=config.seed, probability=True),
        'Random_Forest': RandomForestClassifier(n_estimators=100, random_state=config.seed),
        'XGBoost': xgb.XGBClassifier(random_state=config.seed, use_label_encoder=False, eval_metric='mlogloss'),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'Logistic_Regression': LogisticRegression(max_iter=1000, random_state=config.seed)
    }
    
    # CUML (GPU) - [MỚI]
    cuml_models = {}
    if cuml_available:
        # [MỚI] Chuyển dữ liệu sang GPU (CUPY)
        print("\nĐang chuyển dữ liệu huấn luyện sang GPU cho CUML...")
        try:
            X_train_gpu = cp.asarray(X_train_features)
            y_train_gpu = cp.asarray(y_train_cpu)
            X_test_gpu = cp.asarray(X_test_features)
            # y_test_gpu không cần thiết vì predict trả về cupy array, 
            # chúng ta sẽ chuyển nó về CPU để so sánh với y_test_cpu
            
            # [MỚI] Định nghĩa các mô hình CUML tương ứng
            cuml_models = {
                'SVM_Linear': cuml.svm.SVC(kernel='linear', random_state=config.seed, probability=True),
                'SVM_RBF': cuml.svm.SVC(kernel='rbf', random_state=config.seed, probability=True),
                'Random_Forest': cuml.ensemble.RandomForestClassifier(n_estimators=100, random_state=config.seed),
                'KNN': cuml.neighbors.KNeighborsClassifier(n_neighbors=5),
                'Logistic_Regression': cuml.linear_model.LogisticRegression(max_iter=1000, random_state=config.seed)
                # Lưu ý: XGBoost không phải là một phần của CUML, 
                # nó là một thư viện riêng biệt có thể chạy trên GPU (tree_method='gpu_hist').
                # Chúng ta giữ XGBoost trong sklearn_models để so sánh với các mô hình CUML.
            }
        except Exception as e:
            print(f"Lỗi khi chuyển dữ liệu sang GPU hoặc khởi tạo mô hình CUML: {e}")
            cuml_available = False # Tắt cờ nếu có lỗi
    
    # 4. [MỚI] Vòng lặp huấn luyện SKLEARN (CPU)
    print("\n--- Training and Evaluating SKLEARN (CPU) Models ---")
    for model_name, model in sklearn_models.items():
        print(f"\n--- [SKLEARN] {model_name.replace('_', ' ')} ---")
        
        # Huấn luyện và đo thời gian
        print("Training...")
        start_fit = time.time()
        model.fit(X_train_features, y_train_cpu)
        fit_time = time.time() - start_fit
        
        # Dự đoán và đo thời gian
        print("Predicting...")
        start_pred = time.time()
        y_pred = model.predict(X_test_features)
        pred_time = time.time() - start_pred
        
        # Đánh giá
        accuracy = accuracy_score(y_test_cpu, y_pred)
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Fit time: {fit_time:.4f}s")
        print(f"Predict time: {pred_time:.4f}s")
        
        # [MỚI] Lưu kết quả benchmark
        benchmark_results.append({
            'library': 'sklearn (CPU)',
            'model': model_name,
            'fit_time': fit_time,
            'pred_time': pred_time,
            'accuracy': accuracy
        })
        
        report = classification_report(
            y_test_cpu, 
            y_pred, 
            target_names=config.categories,
            digits=4
        )
        print("\nClassification Report:")
        print(report)
        
        # Lưu classification report
        report_path = config.output_dir / f"classification_report_SKLEARN_{model_name}.txt"
        with open(report_path, 'w') as f:
            f.write(f"Model: {model_name} (SKLEARN)\n")
            f.write(f"Accuracy: {accuracy:.4f}\n")
            f.write(f"Fit time: {fit_time:.4f}s\n")
            f.write(f"Predict time: {pred_time:.4f}s\n\n")
            f.write(report)
        print(f"Classification report saved to {report_path}")

        # Vẽ và lưu confusion matrix
        cm_path = config.output_dir / f"confusion_matrix_SKLEARN_{model_name}.png"
        plot_confusion_matrix(
            y_true=y_test_cpu,
            y_pred=y_pred,
            class_names=config.categories,
            save_path=cm_path,
            model_name=f"SKLEARN_{model_name}"
        )

        # Lưu model đã huấn luyện
        model_save_path = config.output_dir / f"SKLEARN_{model_name}.joblib"
        joblib.dump(model, model_save_path)
        print(f"Model saved to {model_save_path}")

        # [ĐÃ BỎ] Phần ONNX đã được loại bỏ

    # 5. [MỚI] Vòng lặp huấn luyện CUML (GPU)
    if cuml_available:
        print("\n--- Training and Evaluating CUML (GPU) Models ---")
        for model_name, model in cuml_models.items():
            print(f"\n--- [CUML] {model_name.replace('_', ' ')} ---")
            
            try:
                # Huấn luyện và đo thời gian
                print("Training on GPU...")
                start_fit = time.time()
                model.fit(X_train_gpu, y_train_gpu)
                fit_time = time.time() - start_fit
                
                # Dự đoán và đo thời gian
                print("Predicting on GPU...")
                start_pred = time.time()
                y_pred_gpu = model.predict(X_test_gpu)
                pred_time = time.time() - start_pred
                
                # [MỚI] Chuyển kết quả dự đoán từ GPU về CPU để đánh giá
                y_pred = y_pred_gpu.get() # .get() chuyển cupy array về numpy array
                
                # Đánh giá
                accuracy = accuracy_score(y_test_cpu, y_pred)
                print(f"Accuracy: {accuracy:.4f}")
                print(f"Fit time: {fit_time:.4f}s")
                print(f"Predict time: {pred_time:.4f}s")
                
                # [MỚI] Lưu kết quả benchmark
                benchmark_results.append({
                    'library': 'cuml (GPU)',
                    'model': model_name,
                    'fit_time': fit_time,
                    'pred_time': pred_time,
                    'accuracy': accuracy
                })
                
                report = classification_report(
                    y_test_cpu, 
                    y_pred, 
                    target_names=config.categories,
                    digits=4
                )
                print("\nClassification Report:")
                print(report)
                
                # Lưu classification report
                report_path = config.output_dir / f"classification_report_CUML_{model_name}.txt"
                with open(report_path, 'w') as f:
                    f.write(f"Model: {model_name} (CUML)\n")
                    f.write(f"Accuracy: {accuracy:.4f}\n")
                    f.write(f"Fit time: {fit_time:.4f}s\n")
                    f.write(f"Predict time: {pred_time:.4f}s\n\n")
                    f.write(report)
                print(f"Classification report saved to {report_path}")

                # Vẽ và lưu confusion matrix
                cm_path = config.output_dir / f"confusion_matrix_CUML_{model_name}.png"
                plot_confusion_matrix(
                    y_true=y_test_cpu,
                    y_pred=y_pred,
                    class_names=config.categories,
                    save_path=cm_path,
                    model_name=f"CUML_{model_name}"
                )

                # Lưu model đã huấn luyện
                model_save_path = config.output_dir / f"CUML_{model_name}.joblib"
                joblib.dump(model, model_save_path)
                print(f"Model saved to {model_save_path}")

            except Exception as e:
                print(f"Lỗi khi huấn luyện/đánh giá mô hình CUML {model_name}: {e}")

    # 6. [MỚI] Tổng kết, lưu và vẽ benchmark
    print("\n--- Benchmark Results Summary ---")
    if not benchmark_results:
        print("Không có kết quả benchmark nào được ghi lại.")
    else:
        df_benchmark = pd.DataFrame(benchmark_results)
        df_benchmark = df_benchmark.sort_values(by=['model', 'library'])
        
        print(df_benchmark.to_string())
        
        # Lưu vào CSV
        csv_path = config.output_dir / "benchmark_results.csv"
        df_benchmark.to_csv(csv_path, index=False)
        print(f"\nBenchmark results saved to {csv_path}")
        
        # Vẽ biểu đồ
        plot_benchmark_results(df_benchmark, config.output_dir)

    print("\nAll models trained and evaluated successfully!")

if __name__ == '__main__':
    main()global cuml_available

In [13]:
import os
# Disable albumentations update warning
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as img
from PIL import Image
import cv2
from pathlib import Path
from typing import Dict, List, Tuple
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tqdm import tqdm

# Import benchmarking and timing libraries
import time
import pandas as pd
import joblib # For saving/loading models

# Import specific ML model
from sklearn.neighbors import KNeighborsClassifier

# Import RAPIDS (CUML/CUPY) libraries
# These will only be used if the import is successful
try:
    import cuml
    import cupy as cp
    cuml_available = True
    print("Found CUML and CUPY libraries. GPU benchmarking will be enabled.")
except ImportError:
    cuml_available = False
    print("CUML not found. Will run Sklearn models on CPU only.")

class Config:
    """Configuration class for data, models, and training parameters."""
    def __init__(self):
        # Basic configurations
        self.seed = 42
        self.image_size = 448
        self.batch_size = 16 
        self.num_workers = 2
        self.model_name = 'tradition_features' 
        
        # Auto-detect device (cuda or cpu)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
        # Paths
        self.data_dir = Path("/kaggle/input/orange-dataset/Processed_data")
        self.output_dir = Path("/kaggle/working")
        
        # Categories
        self.categories = ['citrus canker', 'healthy', 'melanose']
        self.num_classes = len(self.categories)
        
        # Create output directory
        os.makedirs(self.output_dir, exist_ok=True)

# ==============================================================================
# UTILITY FUNCTIONS
# ==============================================================================

def plot_confusion_matrix(y_true: List[int], y_pred: List[int], class_names: List[str], save_path: str, model_name: str):
    """Plots and saves the confusion matrix for a specific model."""
    plt.rcParams.update({'font.size': 18})
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, annot_kws={"size": 18})
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()
    print(f"Confusion matrix for {model_name} saved to {save_path}")

def prepare_data(config: Config) -> Tuple[List, List]:
    """Prepare data paths and labels."""
    images, labels = [], []
    label_dict = {cat: idx for idx, cat in enumerate(config.categories)}
    
    print("Loading data paths...")
    for category in config.categories:
        category_path = config.data_dir / category
        image_paths = list(category_path.glob('*.jpg'))
        print(f"{category}: {len(image_paths)} images")
        images.extend(image_paths)
        labels.extend([label_dict[category]] * len(image_paths))
    
    return images, labels

# ==============================================================================
# TRADITIONAL FEATURE EXTRACTION
# ==============================================================================

def get_lbp_features(image: np.ndarray, num_points: int = 24, radius: int = 8) -> np.ndarray:
    """Calculates Local Binary Patterns (LBP) features."""
    try:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        lbp = cv2.LBP(gray, radius, num_points, method=cv2.LBP_DEFAULT)
        hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, num_points + 3), range=(0, num_points + 2))
        hist = hist.astype("float")
        hist /= (hist.sum() + 1e-6)
        return hist
    except:
        return np.zeros(num_points + 2)

def get_hog_features(image: np.ndarray) -> np.ndarray:
    """Calculates Histogram of Oriented Gradients (HOG) features."""
    try:
        resized_image = cv2.resize(image, (64, 128)) 
        hog = cv2.HOGDescriptor()
        h = hog.compute(resized_image)
        return h.flatten()
    except:
        return np.zeros(3780) 

def get_color_histogram(image: np.ndarray) -> np.ndarray:
    """Calculates color histogram features (3 channels)."""
    try:
        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        histograms = []
        for i in range(3):
            hist = cv2.calcHist([hsv], [i], None, [256], [0, 256])
            hist = cv2.normalize(hist, hist).flatten()
            histograms.append(hist)
        return np.concatenate(histograms)
    except:
        return np.zeros(256 * 3)

def extract_traditional_features(images: List[Path]) -> np.ndarray:
    """
    Extracts HOG, LBP, and Color Histogram features for a list of images.
    """
    features = []
    print("\nExtracting traditional features...")
    for image_path in tqdm(images):
        try:
            image = cv2.imread(str(image_path))
            if image is None:
                continue

            hog_features = get_hog_features(image)
            lbp_features = get_lbp_features(image)
            color_hist = get_color_histogram(image)

            combined_feature = np.concatenate([hog_features, lbp_features, color_hist])
            features.append(combined_feature)
        except Exception as e:
            print(f"Error processing image {image_path}: {e}")
            
    # Ensure float32 for compatibility with sklearn and cuml
    return np.array(features, dtype=np.float32)

# ==============================================================================
# BENCHMARK PLOTTING
# ==============================================================================

def plot_benchmark_results(df_benchmark: pd.DataFrame, output_dir: Path):
    """
    Plots and saves benchmark results for time and accuracy using the specified
    DataFrame. This function is based on the provided plotting script.
    """
    print("\nGenerating benchmark plots...")
    
    try:
        # ==============================================================
        # 1. PLOT TIME (FIT & PREDICT)
        # ==============================================================
        print("Generating Time Plot...")
        
        # Prepare data (melt)
        df_melted = df_benchmark.melt(id_vars=['model', 'library'], 
                                      value_vars=['fit_time', 'pred_time'],
                                      var_name='metric', 
                                      value_name='time_seconds')
        
        # Rename metrics for readability
        df_melted['metric'] = df_melted['metric'].replace({
            'fit_time': 'Training Time (Fit)',
            'pred_time': 'Prediction Time (Predict)'
        })

        # Use sns.catplot (FacetGrid)
        g = sns.catplot(
            data=df_melted, 
            x='model', 
            y='time_seconds', 
            hue='library', 
            col='metric', 
            kind='bar', 
            sharey=False, # Important, as fit and predict times differ significantly
            height=6, 
            aspect=1.1,
            legend=False # Disable the default legend to place it manually
        )
        
        # Set a common title
        g.fig.suptitle('Benchmark: Time Comparison', y=1.05, fontsize=16)
        g.set_axis_labels("Model", "Time (seconds)")
        g.set_titles("{col_name}")

        # Iterate through the axes of the grid and rotate labels
        for ax in g.axes.flat:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

        # Manually add and position the legend
        g.add_legend(title='Library', bbox_to_anchor=(1.02, 0.5), loc='center left', borderaxespad=0.)

        # Use tight_layout to ensure everything fits
        plt.tight_layout(rect=[0, 0, 0.9, 1]) # Adjust rect to make space for the legend

        # Save the new file
        time_plot_path = output_dir / "benchmark_times.png"
        g.savefig(time_plot_path)
        plt.close('all') # Close all plots
        print(f"Time plot saved at: {time_plot_path}")

        # ==============================================================
        # 2. PLOT ACCURACY
        # ==============================================================
        print("Generating Accuracy Plot...")
        
        # Slightly larger figure size to accommodate labels
        plt.figure(figsize=(14, 8)) 
        
        sns.barplot(data=df_benchmark, x='model', y='accuracy', hue='library')
        
        plt.title('Benchmark: Accuracy Comparison', fontsize=16)
        plt.ylabel('Accuracy')
        plt.xlabel('Model')
        plt.legend(title='Library', bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0.) # Position legend

        # Rotate X-axis labels
        plt.xticks(rotation=45, ha='right')

        # Use tight_layout to ensure labels are not cut off
        plt.tight_layout(rect=[0, 0, 0.9, 1]) # Adjust rect to make space for the legend
        
        # Save the new file
        acc_plot_path = output_dir / "benchmark_accuracy.png"
        plt.savefig(acc_plot_path)
        plt.close()
        print(f"Accuracy plot saved at: {acc_plot_path}")
        
        print("\nPlot generation complete! Check the .png files.")

    except Exception as e:
        print(f"An error occurred while plotting: {e}")


# ==============================================================================
# MAIN EXECUTION
# ==============================================================================

def main():
    """Main function to execute the feature extraction and ML model training pipeline."""
    global cuml_available 

    config = Config()
    
    # Initialize a list to store benchmark results
    benchmark_results = []
    
    # Set random seeds for reproducibility
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    
    # 1. Prepare data
    images, labels = prepare_data(config)
    
    # Split data
    X_train, X_temp, y_train, y_temp = train_test_split(
        images, labels, test_size=0.3, random_state=config.seed, stratify=labels
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=config.seed, stratify=y_temp
    )
    
    print(f"\nDataset splits:")
    print(f"Train: {len(X_train)} images")
    print(f"Val:   {len(X_val)} images")
    print(f"Test:  {len(X_test)} images")

    # Combine train and val for a larger training set
    train_val_images = X_train + X_val
    train_val_labels = y_train + y_val
    
    # 2. Extract traditional features
    X_train_features = extract_traditional_features(train_val_images)
    X_test_features = extract_traditional_features(X_test)
    
    # Convert labels to numpy arrays for compatibility
    y_train_cpu = np.array(train_val_labels, dtype=np.int32)
    y_test_cpu = np.array(y_test, dtype=np.int32)
    
    num_test_images = len(y_test_cpu)
    
    print(f"\nFeature extraction complete.")
    print(f"Train features shape: {X_train_features.shape}")
    print(f"Test features shape:  {X_test_features.shape}")
    
    # 3. Define models
    
    # SKLEARN (CPU)
    sklearn_models = {
        'KNN': KNeighborsClassifier(n_neighbors=5),
    }
    
    # CUML (GPU)
    cuml_models = {}
    if cuml_available:
        # Move data to GPU (CUPY)
        print("\nMoving training data to GPU for CUML...")
        try:
            X_train_gpu = cp.asarray(X_train_features)
            y_train_gpu = cp.asarray(y_train_cpu)
            X_test_gpu = cp.asarray(X_test_features)
            
            # Define CUML models
            cuml_models = {
                'KNN': cuml.neighbors.KNeighborsClassifier(n_neighbors=5),
            }
        except Exception as e:
            print(f"Error moving data to GPU or initializing CUML models: {e}")
            cuml_available = False # Disable if error
    
    # 4. SKLEARN (CPU) Training Loop
    print("\n--- Training and Evaluating SKLEARN (CPU) Models ---")
    for model_name, model in sklearn_models.items():
        print(f"\n--- [SKLEARN] {model_name} ---")
        
        # Train and measure time
        print("Training...")
        start_fit = time.time()
        model.fit(X_train_features, y_train_cpu)
        fit_time = time.time() - start_fit
        
        # Predict and measure time
        print(f"Predicting on {num_test_images} images...")
        start_pred = time.time()
        y_pred = model.predict(X_test_features)
        pred_time = time.time() - start_pred
        
        # Evaluate
        accuracy = accuracy_score(y_test_cpu, y_pred)
        time_per_image = pred_time / num_test_images
        
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Fit time: {fit_time:.4f}s")
        print(f"Predict time: {pred_time:.4f}s")
        print(f"Time per Image: {time_per_image:.6f}s")
        
        # Store benchmark results
        benchmark_results.append({
            'library': 'sklearn (CPU)',
            'model': model_name,
            'fit_time': fit_time,
            'pred_time': pred_time,
            'time_per_image': time_per_image,
            'accuracy': accuracy
        })
        
        report = classification_report(
            y_test_cpu, 
            y_pred, 
            target_names=config.categories,
            digits=4
        )
        print("\nClassification Report:")
        print(report)
        
        # Save classification report
        report_path = config.output_dir / f"classification_report_SKLEARN_{model_name}.txt"
        with open(report_path, 'w') as f:
            f.write(f"Model: {model_name} (SKLEARN)\n")
            f.write(f"Accuracy: {accuracy:.4f}\n")
            f.write(f"Fit time: {fit_time:.4f}s\n")
            f.write(f"Predict time: {pred_time:.4f}s\n")
            f.write(f"Time per Image: {time_per_image:.6f}s\n\n")
            f.write(report)
        print(f"Classification report saved to {report_path}")

        # Plot and save confusion matrix
        cm_path = config.output_dir / f"confusion_matrix_SKLEARN_{model_name}.png"
        plot_confusion_matrix(
            y_true=y_test_cpu,
            y_pred=y_pred,
            class_names=config.categories,
            save_path=cm_path,
            model_name=f"SKLEARN_{model_name}"
        )

        # Save trained model
        model_save_path = config.output_dir / f"SKLEARN_{model_name}.joblib"
        joblib.dump(model, model_save_path)
        print(f"Model saved to {model_save_path}")

    # 5. CUML (GPU) Training Loop
    if cuml_available:
        print("\n--- Training and Evaluating CUML (GPU) Models ---")
        for model_name, model in cuml_models.items():
            print(f"\n--- [CUML] {model_name} ---")
            
            try:
                # Train and measure time
                print("Training on GPU...")
                start_fit = time.time()
                model.fit(X_train_gpu, y_train_gpu)
                fit_time = time.time() - start_fit
                
                # Predict and measure time
                print(f"Predicting on {num_test_images} images (GPU)...")
                start_pred = time.time()
                y_pred_gpu = model.predict(X_test_gpu)
                pred_time = time.time() - start_pred
                
                # Move predictions from GPU to CPU for evaluation
                y_pred = y_pred_gpu.get() 
                
                # Evaluate
                accuracy = accuracy_score(y_test_cpu, y_pred)
                time_per_image = pred_time / num_test_images

                print(f"Accuracy: {accuracy:.4f}")
                print(f"Fit time: {fit_time:.4f}s")
                print(f"Predict time: {pred_time:.4f}s")
                print(f"Time per Image: {time_per_image:.6f}s")
                
                # Store benchmark results
                benchmark_results.append({
                    'library': 'cuml (GPU)',
                    'model': model_name,
                    'fit_time': fit_time,
                    'pred_time': pred_time,
                    'time_per_image': time_per_image,
                    'accuracy': accuracy
                })
                
                report = classification_report(
                    y_test_cpu, 
                    y_pred, 
                    target_names=config.categories,
                    digits=4
                )
                print("\nClassification Report:")
                print(report)
                
                # Save classification report
                report_path = config.output_dir / f"classification_report_CUML_{model_name}.txt"
                with open(report_path, 'w') as f:
                    f.write(f"Model: {model_name} (CUML)\n")
                    f.write(f"Accuracy: {accuracy:.4f}\n")
                    f.write(f"Fit time: {fit_time:.4f}s\n")
                    f.write(f"Predict time: {pred_time:.4f}s\n")
                    f.write(f"Time per Image: {time_per_image:.6f}s\n\n")
                    f.write(report)
                print(f"Classification report saved to {report_path}")

                # Plot and save confusion matrix
                cm_path = config.output_dir / f"confusion_matrix_CUML_{model_name}.png"
                plot_confusion_matrix(
                    y_true=y_test_cpu,
                    y_pred=y_pred,
                    class_names=config.categories,
                    save_path=cm_path,
                    model_name=f"CUML_{model_name}"
                )

                # Save trained model
                model_save_path = config.output_dir / f"CUML_{model_name}.joblib"
                joblib.dump(model, model_save_path)
                print(f"Model saved to {model_save_path}")

            except Exception as e:
                print(f"Error training/evaluating CUML model {model_name}: {e}")

    # 6. Finalize, Save, and Plot Benchmark
    print("\n--- Benchmark Results Summary ---")
    if not benchmark_results:
        print("No benchmark results were recorded.")
    else:
        df_benchmark = pd.DataFrame(benchmark_results)
        df_benchmark = df_benchmark.sort_values(by=['model', 'library'])
        
        # Display summary in console
        print(df_benchmark.to_string(columns=[
            'library', 'model', 'fit_time', 'pred_time', 'time_per_image', 'accuracy'
        ]))
        
        # Save to CSV
        csv_path = config.output_dir / "benchmark_results.csv"
        df_benchmark.to_csv(csv_path, index=False)
        print(f"\nBenchmark results saved to {csv_path}")
        
        # Generate plots
        plot_benchmark_results(df_benchmark, config.output_dir)

    print("\nAll models trained and evaluated successfully!")

if __name__ == '__main__':
    main()

Found CUML and CUPY libraries. GPU benchmarking will be enabled.
Loading data paths...
citrus canker: 2600 images
healthy: 2600 images
melanose: 2600 images

Dataset splits:
Train: 5460 images
Val:   1170 images
Test:  1170 images

Extracting traditional features...


100%|██████████| 6630/6630 [04:48<00:00, 22.96it/s]



Extracting traditional features...


100%|██████████| 1170/1170 [00:57<00:00, 20.44it/s]



Feature extraction complete.
Train features shape: (6630, 4574)
Test features shape:  (1170, 4574)

Moving training data to GPU for CUML...

--- Training and Evaluating SKLEARN (CPU) Models ---

--- [SKLEARN] KNN ---
Training...
Predicting on 1170 images...
Accuracy: 0.8282
Fit time: 0.0129s
Predict time: 0.8916s
Time per Image: 0.000762s

Classification Report:
               precision    recall  f1-score   support

citrus canker     0.9686    0.5538    0.7047       390
      healthy     0.7727    0.9590    0.8558       390
     melanose     0.8186    0.9718    0.8886       390

     accuracy                         0.8282      1170
    macro avg     0.8533    0.8282    0.8164      1170
 weighted avg     0.8533    0.8282    0.8164      1170

Classification report saved to /kaggle/working/classification_report_SKLEARN_KNN.txt
Confusion matrix for SKLEARN_KNN saved to /kaggle/working/confusion_matrix_SKLEARN_KNN.png
Model saved to /kaggle/working/SKLEARN_KNN.joblib

--- Training and Ev

  self._figure.tight_layout(*args, **kwargs)
  plt.tight_layout(rect=[0, 0, 0.9, 1]) # Adjust rect to make space for the legend


Time plot saved at: /kaggle/working/benchmark_times.png
Generating Accuracy Plot...
Accuracy plot saved at: /kaggle/working/benchmark_accuracy.png

Plot generation complete! Check the .png files.

All models trained and evaluated successfully!
