<a href="https://colab.research.google.com/github/joaosMart/fish-species-class-siglip/blob/update-readme-comprehensive/Code/species-classification/Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Evaluation

This notebook was used to perform the evaluation analysis of all the strategies that use extracted features. This includes the evaluation of temporal pooling, temporal voting, single frame, and ResNet-50 as feature extractor.

This code saves the data required for the learning curves and plots all the representations used in the evaluation.

# Evaluation - Temporal Pooling

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.preprocessing import LabelEncoder
import json
import os
import logging
from typing import Dict, List, Tuple, Any
import glob
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that turns NumPy scalars and arrays into native Python values."""

    # (NumPy type, converter) pairs, checked in this order.
    _CONVERTERS = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, np.ndarray.tolist),
        (np.bool_, bool),
    )

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        NumPy integers, floats and booleans become ``int``, ``float`` and
        ``bool``; arrays become (nested) lists. Every other object is passed
        to the base class, which raises ``TypeError``.
        """
        for numpy_type, convert in self._CONVERTERS:
            if isinstance(obj, numpy_type):
                return convert(obj)
        return super().default(obj)

class DataLoader:
    """Load NPZ feature files from a directory and assemble feature/label arrays."""

    def __init__(self, data_dir: str):
        # Directory scanned (non-recursively) for ``*.npz`` files.
        self.data_dir = data_dir

    def load_npz_files(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Load every NPZ file in ``data_dir`` and collect its ``averaged_features``
        entry together with its ``fish_species`` label.

        Files are visited in sorted path order so that the row order of the
        returned arrays (and therefore any seeded train/test split made from
        them) does not depend on the operating system's directory listing.
        Files that cannot be read, or that lack one of the two keys, are
        logged and skipped.

        Returns:
            features: numpy array built from the ``averaged_features`` entries
            labels: numpy array of fish species labels (strings), aligned with features
        """
        features_list = []
        labels_list = []

        # glob() returns paths in arbitrary order; sort for reproducibility.
        npz_files = sorted(glob.glob(os.path.join(self.data_dir, "*.npz")))

        logging.info(f"Found {len(npz_files)} NPZ files")

        for npz_file in npz_files:
            try:
                # NOTE(security): allow_pickle=True can execute arbitrary code
                # while unpickling; only point this loader at trusted files.
                # The context manager closes the underlying zip handle, which
                # the lazily-loading NpzFile otherwise keeps open.
                with np.load(npz_file, allow_pickle=True) as data:
                    features = data['averaged_features']
                    fish_species = str(data['fish_species'].item())  # 0-d array -> str
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole load.
                logging.error(f"Error processing file {npz_file}: {str(e)}")
                continue

            if features is not None:
                features_list.append(features)
                labels_list.append(fish_species)

        # Convert lists to numpy arrays
        features_array = np.array(features_list)
        labels_array = np.array(labels_list)

        # Log data distribution
        unique_labels, counts = np.unique(labels_array, return_counts=True)
        for label, count in zip(unique_labels, counts):
            percentage = (count / len(labels_array)) * 100
            logging.info(f"Class {label}: {count} samples ({percentage:.2f}%)")

        return features_array, labels_array


class FishClassifier:
    """Baseline pipeline: label encoding, stratified split, default models, metrics and plots."""

    def __init__(self, random_state: int = 42):
        # Seed shared by the train/test split and both baseline models.
        self.random_state = random_state
        # Fitted in prepare_data(); its classes_ name the rows/columns of every metric.
        self.le = LabelEncoder()
        self.setup_logging()

    def setup_logging(self):
        """Send INFO-level log records to ``fish_classifier.log`` in the working directory."""
        # NOTE: basicConfig() does nothing if the root logger already has handlers.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            filename='fish_classifier.log'
        )

    def prepare_data(self, features: np.ndarray, labels: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        Encode the labels and split the data 80/20 into train and test sets,
        stratified by class.

        Returns:
            X_train, X_test, y_train, y_test (labels as integer class ids)
        """
        # Encode labels
        y = self.le.fit_transform(labels)

        # Create stratified train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            y,
            test_size=0.2,
            random_state=self.random_state,
            stratify=y
        )

        logging.info(f"Training set size: {X_train.shape[0]}")
        logging.info(f"Test set size: {X_test.shape[0]}")

        return X_train, X_test, y_train, y_test

    def create_baseline_models(self) -> Dict:
        """Create the two class-weight-balanced baseline models, keyed 'svm' and 'logistic'."""
        models = {
            'svm': LinearSVC(
                random_state=self.random_state,
                class_weight='balanced',
                max_iter=2000  # Increased to ensure convergence
            ),
            'logistic': LogisticRegression(
                random_state=self.random_state,
                class_weight='balanced',
                max_iter=2000
            )
        }
        return models

    def evaluate_model(self, model, X: np.ndarray, y: np.ndarray, model_name: str) -> Dict:
        """
        Evaluate a fitted model on (X, y).

        Requires prepare_data() to have been called so that ``self.le`` is fitted.

        Returns:
            Dict with 'accuracy', 'macro_f1', 'confusion_matrix' and, for each
            class name, '<class>_precision', '<class>_recall' and '<class>_f1'.
        """
        # Imported locally: the import cell that accompanies this class only
        # pulls confusion_matrix from sklearn.metrics, so these names would
        # otherwise be undefined unless a later cell had already been run.
        from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support

        # Get predictions
        y_pred = model.predict(X)

        # Pin the label set to the encoder's classes. Without it scikit-learn
        # only reports classes present in y or y_pred, so position i of the
        # per-class arrays would no longer correspond to self.le.classes_[i].
        class_ids = np.arange(len(self.le.classes_))

        # Calculate metrics
        metrics = {
            'accuracy': accuracy_score(y, y_pred),
            'macro_f1': f1_score(y, y_pred, average='macro'),
            'confusion_matrix': confusion_matrix(y, y_pred, labels=class_ids),
        }

        # Calculate per-class metrics
        precision, recall, f1, _ = precision_recall_fscore_support(y, y_pred, labels=class_ids)

        # Add per-class metrics
        for i, class_name in enumerate(self.le.classes_):
            metrics[f'{class_name}_precision'] = precision[i]
            metrics[f'{class_name}_recall'] = recall[i]
            metrics[f'{class_name}_f1'] = f1[i]

        # Log results
        logging.info(f"\nResults for {model_name}:")
        logging.info(f"Accuracy: {metrics['accuracy']:.4f}")
        logging.info(f"Macro F1: {metrics['macro_f1']:.4f}")

        return metrics

    def plot_confusion_matrix(self, confusion_mat: np.ndarray, model_name: str):
        """
        Draw the confusion matrix as an annotated heatmap and save it as
        ``confusion_matrix_<model_name>.png`` in the working directory.
        """
        plt.figure(figsize=(10, 8))
        sns.heatmap(
            confusion_mat,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=self.le.classes_,
            yticklabels=self.le.classes_
        )
        plt.title(f'Confusion Matrix - {model_name}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.savefig(f'confusion_matrix_{model_name}.png')
        plt.close()



In [None]:
# NOTE(review): `features` is not assigned at notebook scope in any cell shown
# above, so this inspection cell presumably relied on state from an earlier
# interactive session -- confirm, or remove the cell.
features

In [None]:
# Unpack a multi-seed grid-search archive from the Colab session storage
# (IPython `!` shell syntax; the listing below shows one seed_<n>/ folder per seed).
!unzip "/content/model_optimization_20241212_124830_multiseed.zip"

Archive:  /content/model_optimization_20241212_124830_multiseed.zip
   creating: model_optimization_20241212_124830_multiseed/
   creating: model_optimization_20241212_124830_multiseed/LogisticRegression/
   creating: model_optimization_20241212_124830_multiseed/SVM/
   creating: model_optimization_20241212_124830_multiseed/seed_1/
   creating: model_optimization_20241212_124830_multiseed/seed_11/
   creating: model_optimization_20241212_124830_multiseed/seed_23/
   creating: model_optimization_20241212_124830_multiseed/seed_40/
   creating: model_optimization_20241212_124830_multiseed/seed_45/
   creating: model_optimization_20241212_124830_multiseed/seed_46/
   creating: model_optimization_20241212_124830_multiseed/seed_54/
   creating: model_optimization_20241212_124830_multiseed/seed_71/
   creating: model_optimization_20241212_124830_multiseed/seed_81/
   creating: model_optimization_20241212_124830_multiseed/seed_84/
  inflating: model_optimization_20241212_124830_multiseed/Logis

## More Robust Learning Curve

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import json
import os
import logging
from typing import Dict, List, Tuple, Any
import glob
import pandas as pd
from datetime import datetime

class ModelEvaluator:
    """Handles multi-seed evaluation of models using grid search results.

    Constructing an evaluator creates a timestamped output directory and
    points logging at ``evaluation.log`` inside it; the plotting methods save
    their figures into that directory.
    """

    def __init__(self, random_state: int = 42):
        self.random_state = random_state
        # Re-fitted on every prepare_data() call; its classes_ label plots and metric keys.
        self.le = LabelEncoder()
        self.setup_output_dir()
        self.setup_logging()

    def setup_output_dir(self):
        """Create output directory with timestamp"""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.output_dir = f'model_evaluation_{timestamp}'
        os.makedirs(self.output_dir, exist_ok=True)

    def setup_logging(self):
        """Configure logging to ``<output_dir>/evaluation.log`` and to the console."""
        # basicConfig() is a silent no-op once the root logger has handlers
        # (e.g. after FishClassifier.setup_logging() or a previous
        # ModelEvaluator ran in the same session), which would leave the new
        # output directory without a log file. Drop stale handlers first.
        root_logger = logging.getLogger()
        for stale_handler in list(root_logger.handlers):
            root_logger.removeHandler(stale_handler)
            stale_handler.close()

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(os.path.join(self.output_dir, 'evaluation.log')),
                logging.StreamHandler()
            ]
        )

    def load_grid_search_results(self, grid_search_dir: str) -> Tuple[Dict, List[int]]:
        """
        Load results from multi-seed grid search.

        Reads ``random_seeds.json`` (key ``generated_seeds``) and, for every
        seed, ``seed_<seed>/SVM_metrics.json`` and
        ``seed_<seed>/LogisticRegression_metrics.json``.

        Returns:
            (results, seeds) where ``results[seed][model_name]`` is the parsed
            metrics file and ``seeds`` is the list read from ``random_seeds.json``.
        """
        results = {}

        # Load random seeds
        with open(os.path.join(grid_search_dir, 'random_seeds.json'), 'r') as f:
            seeds = json.load(f)['generated_seeds']

        # Process each seed directory
        for seed in seeds:
            seed_dir = os.path.join(grid_search_dir, f'seed_{seed}')
            results[seed] = {}

            # Load results for each model type
            for model_name in ['SVM', 'LogisticRegression']:
                metrics_file = os.path.join(seed_dir, f'{model_name}_metrics.json')
                with open(metrics_file, 'r') as f:
                    results[seed][model_name] = json.load(f)

        return results, seeds

    def prepare_data(self, features: np.ndarray, labels: np.ndarray, seed: int) -> Tuple:
        """Encode labels and return a stratified 80/20 split (X_train, X_test, y_train, y_test) for ``seed``."""
        y = self.le.fit_transform(labels)
        return train_test_split(features, y, test_size=0.2, random_state=seed, stratify=y)

    def create_model(self, model_type: str, params: Dict, seed: int) -> Any:
        """
        Create an unfitted model with the specified parameters.

        Raises:
            ValueError: if ``model_type`` is neither 'SVM' nor 'LogisticRegression'.
        """
        if model_type == 'SVM':
            return LinearSVC(random_state=seed, max_iter=2000, **params)
        elif model_type == 'LogisticRegression':
            return LogisticRegression(random_state=seed, max_iter=2000, **params)
        else:
            raise ValueError(f"Unknown model type: {model_type}")

    def evaluate_model(self, model, X: np.ndarray, y: np.ndarray) -> Dict:
        """
        Evaluate a fitted model on (X, y).

        Requires prepare_data() to have been called so that ``self.le`` is fitted.

        Returns:
            Dict with 'weighted_f1', 'macro_f1', 'accuracy', 'confusion_matrix'
            and, for each class name, '<class>_precision', '<class>_recall'
            and '<class>_f1'.
        """
        y_pred = model.predict(X)

        # Pin the label set to the encoder's classes so that row/column i of
        # the confusion matrix and entry i of the per-class arrays always
        # refer to self.le.classes_[i], even if a class is missing from both
        # y and y_pred.
        class_ids = np.arange(len(self.le.classes_))

        metrics = {
            'weighted_f1': f1_score(y, y_pred, average='weighted'),
            'macro_f1': f1_score(y, y_pred, average='macro'),
            'accuracy': accuracy_score(y, y_pred),
            'confusion_matrix': confusion_matrix(y, y_pred, labels=class_ids)
        }

        # Calculate per-class metrics
        precision, recall, f1, _support = precision_recall_fscore_support(
            y, y_pred, labels=class_ids
        )

        for i, class_name in enumerate(self.le.classes_):
            metrics[f'{class_name}_precision'] = precision[i]
            metrics[f'{class_name}_recall'] = recall[i]
            metrics[f'{class_name}_f1'] = f1[i]

        return metrics

    def plot_confusion_matrices(self, confusion_matrices: List[np.ndarray],
                              model_name: str, seed: int):
        """
        Plot the training (index 0) and test (index 1) confusion matrices for
        one seed side by side and save the figure to the output directory.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Plot training confusion matrix
        sns.heatmap(confusion_matrices[0], annot=True, fmt='d', ax=ax1, cmap='Blues',
                   xticklabels=self.le.classes_, yticklabels=self.le.classes_)
        ax1.set_title(f'Training Confusion Matrix\nSeed: {seed}')
        ax1.set_xlabel('Predicted Label')
        ax1.set_ylabel('True Label')

        # Plot test confusion matrix
        sns.heatmap(confusion_matrices[1], annot=True, fmt='d', ax=ax2, cmap='Blues',
                   xticklabels=self.le.classes_, yticklabels=self.le.classes_)
        ax2.set_title(f'Test Confusion Matrix\nSeed: {seed}')
        ax2.set_xlabel('Predicted Label')
        ax2.set_ylabel('True Label')

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, f'{model_name}_confusion_matrices_seed_{seed}.png'))
        plt.close()

    def plot_aggregated_results(self, all_results: Dict, model_name: str):
        """
        Box-plot the test weighted F1, macro F1 and accuracy across seeds.

        Args:
            all_results: mapping seed -> result dict containing 'test_metrics'
            model_name: used in the title and the output file name
        """
        test_scores = {
            'weighted_f1': [],
            'macro_f1': [],
            'accuracy': []
        }

        for seed_results in all_results.values():
            metrics = seed_results['test_metrics']
            for metric, scores in test_scores.items():
                scores.append(metrics[metric])

        # Create box plots
        metric_names = list(test_scores)
        plt.figure(figsize=(10, 6))
        plt.boxplot([test_scores[name] for name in metric_names])
        plt.title(f'{model_name}: Performance Distribution Across Seeds')
        plt.ylabel('Score')
        # Label the boxes through xticks (boxes sit at positions 1..n) rather
        # than boxplot's `labels=` keyword, which newer Matplotlib deprecates.
        plt.xticks(range(1, len(metric_names) + 1), metric_names, rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, f'{model_name}_performance_distribution.png'))
        plt.close()

    def calculate_learning_curve(self, features: np.ndarray, labels: np.ndarray,
                           grid_results: Dict, seeds: List[int],
                           train_sizes: np.ndarray = None) -> Dict:
        """
        Calculate learning curves across multiple seeds using incremental training data
        and evaluating on the holdout test set.

        For every seed the data is split once; each model is then refitted on
        the first ``int(fraction * n_train)`` rows of the training split and
        scored (macro F1) on the full test split.

        Args:
            features: Input features
            labels: Target labels
            grid_results: Grid search results containing best parameters for each seed
            seeds: List of random seeds (must not be empty)
            train_sizes: Training set sizes to evaluate, as proportions in (0, 1];
                any array-like is accepted. Defaults to 15 values from 0.05 to 1.0.

        Returns:
            Dictionary keyed by model name; each value holds 'train_sizes'
            (numpy array with the number of training samples actually used at
            each step), 'test_scores_mean' and 'test_scores_std' (lists).

        Raises:
            ValueError: if ``seeds`` is empty.
        """
        if train_sizes is None:
            train_sizes = np.linspace(0.05, 1.0, 15)
        # Accept plain lists too: `list * int` would repeat instead of scale.
        train_sizes = np.asarray(train_sizes, dtype=float)

        if len(seeds) == 0:
            raise ValueError("At least one seed is required to compute learning curves")

        model_names = ['SVM', 'LogisticRegression']
        fractions = train_sizes.tolist()

        # Initialize storage for learning curve data
        curve_data = {
            model_name: {fraction: [] for fraction in fractions}
            for model_name in model_names
        }

        subset_sizes = None
        for seed in seeds:
            # Split data into train and test sets
            X_train, X_test, y_train, y_test = self.prepare_data(features, labels, seed)

            # Number of training rows used at each step (truncated, as before).
            subset_sizes = (train_sizes * len(y_train)).astype(int)

            for model_name in model_names:
                # Get best parameters for this seed
                params = grid_results[seed][model_name]['best_params']

                for fraction, n_train in zip(fractions, subset_sizes):
                    # Create and train model on subset
                    model = self.create_model(model_name, params, seed)
                    model.fit(X_train[:n_train], y_train[:n_train])

                    # Evaluate on test set
                    metrics = self.evaluate_model(model, X_test, y_test)
                    curve_data[model_name][fraction].append(metrics['macro_f1'])

        # Calculate mean and std for each size
        learning_curves = {
            model_name: {
                'train_sizes': subset_sizes.copy(),
                'test_scores_mean': [np.mean(curve_data[model_name][fraction])
                                  for fraction in fractions],
                'test_scores_std': [np.std(curve_data[model_name][fraction])
                                  for fraction in fractions]
            }
            for model_name in model_names
        }

        return learning_curves

    def plot_averaged_confusion_matrices(self, all_results: Dict[str, Dict]):
        """
        Plot averaged confusion matrices across all seeds for both models.

        Each cell is annotated with ``mean ± std`` over the seeds; the figure
        is saved as ``averaged_confusion_matrices.png`` in the output directory.

        Args:
            all_results: Dictionary containing results for all models and seeds
        """
        fig, axes = plt.subplots(1, 2, figsize=(20, 8))

        for ax, model_name in zip(axes, ['SVM', 'LogisticRegression']):
            # Get all test confusion matrices for this model
            confusion_matrices = [
                seed_results['test_metrics']['confusion_matrix']
                for seed_results in all_results[model_name].values()
            ]

            # Calculate average confusion matrix
            avg_cm = np.mean(confusion_matrices, axis=0)

            # Calculate standard deviation for annotations
            std_cm = np.std(confusion_matrices, axis=0)

            # Create annotations with mean ± std
            annotations = np.array([
                [f'{avg:.1f} ± {std:.1f}'
                for avg, std in zip(row_avg, row_std)]
                for row_avg, row_std in zip(avg_cm, std_cm)
            ])

            # Plot heatmap
            sns.heatmap(
                avg_cm,
                annot=annotations,
                fmt='',
                cmap='Blues',
                xticklabels=self.le.classes_,
                yticklabels=self.le.classes_,
                ax=ax
            )

            ax.set_xlabel('Predicted Label')
            ax.set_ylabel('True Label')

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'averaged_confusion_matrices.png'),
                    dpi=300, bbox_inches='tight')
        plt.close()

    def plot_learning_curves(self, learning_curves: Dict):
        """
        Plot the mean macro-F1 learning curve of both models with a ±1 std
        band, plus a text box (top left) showing the largest training size.

        The figure is saved as ``learning_curves_mean_ensemble.png`` in the
        output directory.

        Args:
            learning_curves: Dictionary containing learning curve data for both models
        """
        plt.figure(figsize=(12, 8))

        # Get the current axes
        ax = plt.gca()

        # Hide all four spines; the grid alone frames the plot
        for side in ('top', 'right', 'left', 'bottom'):
            ax.spines[side].set_visible(False)

        colors = {
            'SVM': 'blue',
            'LogisticRegression': 'red'
        }

        for model_name in ['SVM', 'LogisticRegression']:
            data = learning_curves[model_name]
            mean_scores = np.array(data['test_scores_mean'])
            std_scores = np.array(data['test_scores_std'])

            # Plot mean test scores
            plt.plot(data['train_sizes'], mean_scores,
                    '-', color=colors[model_name], label=f'{model_name}',
                    linewidth=2)

            # Plot standard deviation bands
            plt.fill_between(data['train_sizes'],
                            mean_scores - std_scores,
                            mean_scores + std_scores,
                            alpha=0.1, color=colors[model_name])

        plt.xlabel('Number of Training Samples')
        plt.ylabel('Macro F1 Score')

        plt.legend(loc='lower right', frameon=True)

        # Show the default grid
        plt.grid(True)

        # Largest training-set size on the curve (i.e. the full training split)
        total_samples = float(max(learning_curves['SVM']['train_sizes']))

        # Add text box with total samples at top left
        plt.text(0.02, 0.98, f'Total samples: {total_samples:.0f}',
                transform=plt.gca().transAxes,
                bbox=dict(facecolor='white',
                          alpha=0.8,
                          boxstyle='round,pad=0.5'),
                verticalalignment='top',
                horizontalalignment='left',
                fontsize=10)

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'learning_curves_mean_ensemble.png'),
                    dpi=300,
                    bbox_inches='tight')
        plt.close()

    def save_learning_curves_data(self, learning_curves: Dict, output_dir: str):
        """
        Save learning curves data to ``learning_curves_data.json`` for later plotting.

        Args:
            learning_curves: Dictionary containing learning curve data for both models
            output_dir: Directory to save the output file
        """
        curves_data = {}

        for model_name, data in learning_curves.items():
            curves_data[model_name] = {
                # asarray() lets plain lists through as well as numpy arrays
                'train_sizes': np.asarray(data['train_sizes']).tolist(),
                # float() so that any NumPy scalar type is JSON-serialisable
                'test_scores_mean': [float(score) for score in data['test_scores_mean']],
                'test_scores_std': [float(score) for score in data['test_scores_std']]
            }

        output_file = os.path.join(output_dir, 'learning_curves_data.json')
        with open(output_file, 'w') as f:
            json.dump(curves_data, f, indent=2)


def run_evaluation(data_dir: str, grid_search_dir: str) -> Dict:
    """
    Run the complete evaluation pipeline.

    Loads the features, refits each model per seed with that seed's best
    grid-search parameters, evaluates on the train and test splits, then
    writes the averaged confusion matrices, the learning-curve data and plot,
    and one ``<model>_evaluation.json`` per model to the evaluator's output
    directory.

    Args:
        data_dir: directory containing the NPZ feature files
        grid_search_dir: directory containing the multi-seed grid search results

    Returns:
        Dict with 'model_results' (model name -> seed -> train/test metrics
        and params) and 'learning_curves'.

    Raises:
        Any exception from the pipeline is logged with its traceback and re-raised.
    """
    try:
        # Initialize evaluator
        evaluator = ModelEvaluator()

        # Load data
        features, labels = DataLoader(data_dir).load_npz_files()

        # Load grid search results
        grid_results, seeds = evaluator.load_grid_search_results(grid_search_dir)

        model_names = ['SVM', 'LogisticRegression']
        all_results = {model_name: {} for model_name in model_names}

        for seed in seeds:
            # The split depends only on the seed, so compute it once and
            # share it between both models instead of once per model.
            X_train, X_test, y_train, y_test = evaluator.prepare_data(features, labels, seed)

            for model_name in model_names:
                # Get best parameters for this seed
                params = grid_results[seed][model_name]['best_params']

                # Create and train model
                model = evaluator.create_model(model_name, params, seed)
                model.fit(X_train, y_train)

                # Evaluate and store results
                all_results[model_name][seed] = {
                    'train_metrics': evaluator.evaluate_model(model, X_train, y_train),
                    'test_metrics': evaluator.evaluate_model(model, X_test, y_test),
                    'params': params
                }

        # Plot averaged confusion matrices
        evaluator.plot_averaged_confusion_matrices(all_results)

        # Calculate and save learning curves
        learning_curves = evaluator.calculate_learning_curve(
            features, labels, grid_results, seeds
        )
        evaluator.save_learning_curves_data(learning_curves, evaluator.output_dir)

        # Plot learning curves
        evaluator.plot_learning_curves(learning_curves)

        # Save all results (NumpyEncoder handles the arrays and NumPy scalars)
        for model_name, model_results in all_results.items():
            with open(os.path.join(evaluator.output_dir, f'{model_name}_evaluation.json'), 'w') as f:
                json.dump(model_results, f, cls=NumpyEncoder)

        return {
            'model_results': all_results,
            'learning_curves': learning_curves
        }

    except Exception as e:
        # logging.exception keeps the traceback in the log before re-raising.
        logging.exception(f"Error in evaluation pipeline: {str(e)}")
        raise



In [None]:
# Unpack a second multi-seed grid-search archive from the Colab session storage
# (IPython `!` shell syntax; the listing below shows one seed_<n>/ folder per seed).
!unzip '/content/model_optimization_20241212_191723_multiseed.zip'

Archive:  /content/model_optimization_20241212_191723_multiseed.zip
   creating: model_optimization_20241212_191723_multiseed/
   creating: model_optimization_20241212_191723_multiseed/LogisticRegression/
   creating: model_optimization_20241212_191723_multiseed/SVM/
   creating: model_optimization_20241212_191723_multiseed/seed_15/
   creating: model_optimization_20241212_191723_multiseed/seed_29/
   creating: model_optimization_20241212_191723_multiseed/seed_30/
   creating: model_optimization_20241212_191723_multiseed/seed_32/
   creating: model_optimization_20241212_191723_multiseed/seed_37/
   creating: model_optimization_20241212_191723_multiseed/seed_38/
   creating: model_optimization_20241212_191723_multiseed/seed_4/
   creating: model_optimization_20241212_191723_multiseed/seed_65/
   creating: model_optimization_20241212_191723_multiseed/seed_88/
   creating: model_optimization_20241212_191723_multiseed/seed_91/
  inflating: model_optimization_20241212_191723_multiseed/SVM_s

In [None]:
# Example usage: run the temporal-pooling evaluation end to end.
# Both paths are placeholders and must be edited before running.
if __name__ == "__main__":
    # Directory containing your NPZ files (one file per sample, read by DataLoader)
    data_dir = "/path/to/Feature Extraction/ViT-SO400M-14-SigLIP"

    # Directory containing grid search results; must hold random_seeds.json
    # and one seed_<n>/ folder per seed.
    # NOTE(review): this folder name differs from the archives unzipped in the
    # cells above -- confirm which grid search run is intended.
    grid_search_dir = "/path/to/Temporal Pooling/model_optimization_20250206_130548_multiseed"

    # Run evaluation; `results` holds 'model_results' and 'learning_curves'
    results = run_evaluation(data_dir, grid_search_dir)



In [None]:
import shutil
import os

def zip_folder(folder_path, output_zip_path):
    """
    Create a zip file from a folder in Google Colab.

    The archive contains the folder itself as its single top-level entry.

    Args:
        folder_path (str): Path to the folder you want to zip (a trailing
            slash and relative paths are accepted)
        output_zip_path (str): Path where you want to save the zip file; a
            ``.zip`` suffix is appended if missing

    Returns:
        str: Path of the archive that was written.

    Raises:
        ValueError: if ``folder_path`` does not exist.
    """
    # Make sure the folder exists
    if not os.path.exists(folder_path):
        raise ValueError(f"Folder {folder_path} does not exist")

    # abspath() drops a trailing separator and resolves bare relative names,
    # so dirname/basename below are never empty for a normal folder.
    folder = os.path.abspath(folder_path)

    # make_archive() appends ".zip" itself, so strip it -- but only as a
    # suffix; str.replace() would also remove ".zip" from directory names.
    base_name, extension = os.path.splitext(output_zip_path)
    if extension != '.zip':
        base_name = output_zip_path

    # Create the zip file
    return shutil.make_archive(
        base_name=base_name,
        format='zip',
        root_dir=os.path.dirname(folder),
        base_dir=os.path.basename(folder)
    )

In [None]:
# Example usage
# The timestamped folder name is hard-coded; replace it with the
# model_evaluation_<timestamp> directory created by your own run.
folder_to_zip = '/content/model_evaluation_20241212_194724'  # Path to your folder
output_zip = '/content/model_evaluation_20241212_194724.zip'  # Where to save the zip file

zip_folder(folder_to_zip, output_zip)

# Evaluation - Single Frame

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.preprocessing import LabelEncoder
import json
import os
import logging
from typing import Dict, List, Tuple, Any
import glob
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression


class NumpyEncoder(json.JSONEncoder):
    """Serialise NumPy scalars and arrays as plain JSON values."""

    def default(self, obj):
        """Map NumPy values to built-in types; defer everything else to the base encoder."""
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.bool_):
            return bool(obj)
        # Unknown type: the base implementation raises TypeError.
        return super().default(obj)

class DataLoader:
    """Load NPZ feature files and keep only each sample's middle-frame feature vector."""

    def __init__(self, data_dir: str):
        # Directory scanned (non-recursively) for ``*.npz`` files.
        self.data_dir = data_dir

    def load_npz_files(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Load every NPZ file in ``data_dir`` and extract the features of its
        middle frame together with its ``fish_species`` label.

        Each file is expected to hold ``middle_frame`` (a key), ``features``
        (a pickled mapping indexed by that key) and ``fish_species``.
        Files are visited in sorted path order so the row order of the
        returned arrays (and any seeded split made from them) is
        reproducible. Files that cannot be read, or that lack a key, are
        logged and skipped.

        Returns:
            features: numpy array of middle-frame features
            labels: numpy array of fish species labels (strings), aligned with features
        """
        features_list = []
        labels_list = []

        # glob() returns paths in arbitrary order; sort for reproducibility.
        npz_files = sorted(glob.glob(os.path.join(self.data_dir, "*.npz")))

        logging.info(f"Found {len(npz_files)} NPZ files")

        for npz_file in npz_files:
            try:
                # NOTE(security): allow_pickle=True can execute arbitrary code
                # while unpickling; only point this loader at trusted files.
                # The context manager closes the underlying zip handle.
                with np.load(npz_file, allow_pickle=True) as data:
                    middle_frame = data['middle_frame'].item()
                    # 'features' is a 0-d object array wrapping a mapping; .item() unwraps it
                    frame_features = data['features'].item()[middle_frame]
                    fish_species = str(data['fish_species'].item())  # Convert to string
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole load.
                logging.error(f"Error processing file {npz_file}: {str(e)}")
                continue

            if frame_features is not None:
                features_list.append(frame_features)
                labels_list.append(fish_species)

        # Convert lists to numpy arrays
        features_array = np.array(features_list)
        labels_array = np.array(labels_list)

        # Log data distribution
        unique_labels, counts = np.unique(labels_array, return_counts=True)
        for label, count in zip(unique_labels, counts):
            percentage = (count / len(labels_array)) * 100
            logging.info(f"Class {label}: {count} samples ({percentage:.2f}%)")

        return features_array, labels_array


class FishClassifier:
    """Baseline pipeline: label encoding, stratified split, default models, metrics and plots."""

    def __init__(self, random_state: int = 42):
        # Seed shared by the train/test split and both baseline models.
        self.random_state = random_state
        # Fitted in prepare_data(); its classes_ name the rows/columns of every metric.
        self.le = LabelEncoder()
        self.setup_logging()

    def setup_logging(self):
        """Send INFO-level log records to ``fish_classifier.log`` in the working directory."""
        # NOTE: basicConfig() does nothing if the root logger already has handlers.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            filename='fish_classifier.log'
        )

    def prepare_data(self, features: np.ndarray, labels: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        Encode the labels and split the data 80/20 into train and test sets,
        stratified by class.

        Returns:
            X_train, X_test, y_train, y_test (labels as integer class ids)
        """
        # Encode labels
        y = self.le.fit_transform(labels)

        # Create stratified train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            y,
            test_size=0.2,
            random_state=self.random_state,
            stratify=y
        )

        logging.info(f"Training set size: {X_train.shape[0]}")
        logging.info(f"Test set size: {X_test.shape[0]}")

        return X_train, X_test, y_train, y_test

    def create_baseline_models(self) -> Dict:
        """Create the two class-weight-balanced baseline models, keyed 'svm' and 'logistic'."""
        models = {
            'svm': LinearSVC(
                random_state=self.random_state,
                class_weight='balanced',
                max_iter=2000
            ),
            'logistic': LogisticRegression(
                random_state=self.random_state,
                class_weight='balanced',
                max_iter=2000
            )
        }
        return models

    def evaluate_model(self, model, X: np.ndarray, y: np.ndarray, model_name: str) -> Dict:
        """
        Evaluate a fitted model on (X, y).

        Requires prepare_data() to have been called so that ``self.le`` is fitted.

        Returns:
            Dict with 'accuracy', 'macro_f1', 'confusion_matrix' and, for each
            class name, '<class>_precision', '<class>_recall' and '<class>_f1'.
        """
        # Imported locally: the import cell that accompanies this class only
        # pulls confusion_matrix from sklearn.metrics, so these names would
        # otherwise be undefined unless another cell had already imported them.
        from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support

        # Get predictions
        y_pred = model.predict(X)

        # Pin the label set to the encoder's classes. Without it scikit-learn
        # only reports classes present in y or y_pred, so position i of the
        # per-class arrays would no longer correspond to self.le.classes_[i].
        class_ids = np.arange(len(self.le.classes_))

        # Calculate metrics
        metrics = {
            'accuracy': accuracy_score(y, y_pred),
            'macro_f1': f1_score(y, y_pred, average='macro'),
            'confusion_matrix': confusion_matrix(y, y_pred, labels=class_ids),
        }

        # Calculate per-class metrics
        precision, recall, f1, _ = precision_recall_fscore_support(y, y_pred, labels=class_ids)

        # Add per-class metrics
        for i, class_name in enumerate(self.le.classes_):
            metrics[f'{class_name}_precision'] = precision[i]
            metrics[f'{class_name}_recall'] = recall[i]
            metrics[f'{class_name}_f1'] = f1[i]

        # Log results
        logging.info(f"\nResults for {model_name}:")
        logging.info(f"Accuracy: {metrics['accuracy']:.4f}")
        logging.info(f"Macro F1: {metrics['macro_f1']:.4f}")

        return metrics

    def plot_confusion_matrix(self, confusion_mat: np.ndarray, model_name: str):
        """
        Draw the confusion matrix as an annotated heatmap and save it as
        ``confusion_matrix_<model_name>.png`` in the working directory.
        """
        plt.figure(figsize=(10, 8))
        sns.heatmap(
            confusion_mat,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=self.le.classes_,
            yticklabels=self.le.classes_
        )
        plt.title(f'Confusion Matrix - {model_name}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.savefig(f'confusion_matrix_{model_name}.png')
        plt.close()



## Robust Learning Curve

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import json
import os
import logging
from typing import Dict, List, Tuple, Any
import glob
import pandas as pd
from datetime import datetime

class ModelEvaluator:
    """Handles multi-seed evaluation of models using grid search results

    Creating an instance makes a timestamped output directory and points the
    root logger at a log file inside it. The methods refit the per-seed best
    models, score them, and plot/save the results into that directory.
    """

    def __init__(self, random_state: int = 42):
        """
        Args:
            random_state: Seed stored on the instance (the methods of this
                class receive the per-split seed as an explicit argument)
        """
        self.random_state = random_state
        self.le = LabelEncoder()
        # The log file is created inside the output directory, so the
        # directory has to exist before logging is configured
        self.setup_output_dir()
        self.setup_logging()

    def setup_output_dir(self):
        """Create output directory with timestamp"""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.output_dir = f'model_evaluation_{timestamp}'
        os.makedirs(self.output_dir, exist_ok=True)

    def setup_logging(self):
        """Configure logging

        Sends root-logger output to ``<output_dir>/evaluation.log`` and to the
        console.
        """
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(os.path.join(self.output_dir, 'evaluation.log')),
                logging.StreamHandler()
            ],
            # basicConfig does nothing when the root logger already has
            # handlers (a previous evaluator, another class of this notebook,
            # or the notebook host). force=True replaces them so the log file
            # really ends up in this instance's output directory.
            force=True
        )

    def load_grid_search_results(self, grid_search_dir: str) -> Tuple[Dict, List]:
        """
        Load results from multi-seed grid search

        Reads ``random_seeds.json`` (key ``generated_seeds``) and, for every
        seed, ``seed_<seed>/SVM_metrics.json`` and
        ``seed_<seed>/LogisticRegression_metrics.json``.

        Args:
            grid_search_dir: Root directory of the grid search output

        Returns:
            (results, seeds): ``results[seed][model_name]`` is the parsed
            metrics file, ``seeds`` is the list read from ``random_seeds.json``
        """
        results = {}

        # Load random seeds
        with open(os.path.join(grid_search_dir, 'random_seeds.json'), 'r') as f:
            seeds = json.load(f)['generated_seeds']

        # Process each seed directory
        for seed in seeds:
            seed_dir = os.path.join(grid_search_dir, f'seed_{seed}')
            results[seed] = {}

            # Load results for each model type
            for model_name in ['SVM', 'LogisticRegression']:
                metrics_file = os.path.join(seed_dir, f'{model_name}_metrics.json')
                with open(metrics_file, 'r') as f:
                    results[seed][model_name] = json.load(f)

        return results, seeds

    def prepare_data(self, features: np.ndarray, labels: np.ndarray, seed: int) -> Tuple:
        """Prepare train-test split using specific seed

        The label encoder is refit on ``labels`` at every call.

        Returns:
            The result of ``train_test_split``: X_train, X_test, y_train,
            y_test (stratified, 20% test)
        """
        y = self.le.fit_transform(labels)
        return train_test_split(features, y, test_size=0.2, random_state=seed, stratify=y)

    def create_model(self, model_type: str, params: Dict, seed: int) -> Any:
        """Create model with specified parameters

        Args:
            model_type: 'SVM' or 'LogisticRegression'
            params: Extra keyword arguments for the estimator constructor
            seed: Passed to the estimator as ``random_state``

        Raises:
            ValueError: If ``model_type`` is not one of the two known names
        """
        if model_type == 'SVM':
            return LinearSVC(random_state=seed, max_iter=2000, **params)
        elif model_type == 'LogisticRegression':
            return LogisticRegression(random_state=seed, max_iter=2000, **params)
        else:
            raise ValueError(f"Unknown model type: {model_type}")

    def evaluate_model(self, model, X: np.ndarray, y: np.ndarray) -> Dict:
        """Comprehensive model evaluation

        Args:
            model: Fitted estimator exposing ``predict``
            X: Feature matrix to predict on
            y: True labels; expected to be the integer codes produced by
                ``self.le`` (per-class entries are indexed by position in
                ``self.le.classes_``)

        Returns:
            Dictionary with 'weighted_f1', 'macro_f1', 'accuracy',
            'confusion_matrix' and one '<class>_precision' /
            '<class>_recall' / '<class>_f1' entry per known class
        """
        y_pred = model.predict(X)

        # Encoded class ids in the order of self.le.classes_. Passing them
        # explicitly keeps the per-class arrays and the confusion matrix the
        # same size for every split, even when a class is absent from both y
        # and y_pred (otherwise names shift and matrices cannot be averaged).
        class_ids = np.arange(len(self.le.classes_))

        metrics = {
            'weighted_f1': f1_score(y, y_pred, average='weighted'),
            'macro_f1': f1_score(y, y_pred, average='macro'),
            'accuracy': accuracy_score(y, y_pred),
            'confusion_matrix': confusion_matrix(y, y_pred, labels=class_ids)
        }

        # Calculate per-class metrics (a class without samples scores 0)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y, y_pred, labels=class_ids, zero_division=0
        )

        for i, class_name in enumerate(self.le.classes_):
            metrics[f'{class_name}_precision'] = precision[i]
            metrics[f'{class_name}_recall'] = recall[i]
            metrics[f'{class_name}_f1'] = f1[i]

        return metrics

    def plot_confusion_matrices(self, confusion_matrices: List[np.ndarray],
                              model_name: str, seed: int):
        """Plot confusion matrices for a specific seed

        Args:
            confusion_matrices: Index 0 is drawn as the training matrix,
                index 1 as the test matrix
            model_name: Used in the output file name
            seed: Shown in the titles and used in the output file name
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Plot training confusion matrix
        sns.heatmap(confusion_matrices[0], annot=True, fmt='d', ax=ax1, cmap='Blues',
                   xticklabels=self.le.classes_, yticklabels=self.le.classes_)
        ax1.set_title(f'Training Confusion Matrix\nSeed: {seed}')
        ax1.set_xlabel('Predicted Label')
        ax1.set_ylabel('True Label')

        # Plot test confusion matrix
        sns.heatmap(confusion_matrices[1], annot=True, fmt='d', ax=ax2, cmap='Blues',
                   xticklabels=self.le.classes_, yticklabels=self.le.classes_)
        ax2.set_title(f'Test Confusion Matrix\nSeed: {seed}')
        ax2.set_xlabel('Predicted Label')
        ax2.set_ylabel('True Label')

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, f'{model_name}_confusion_matrices_seed_{seed}.png'))
        plt.close()

    def plot_aggregated_results(self, all_results: Dict, model_name: str):
        """Plot aggregated results across seeds

        Draws one box per test metric (weighted F1, macro F1, accuracy) from
        the per-seed values and saves the figure in the output directory.

        Args:
            all_results: Mapping seed -> results dict holding a
                'test_metrics' entry
            model_name: Used in the title and in the output file name
        """
        metric_names = ['weighted_f1', 'macro_f1', 'accuracy']
        test_scores = {name: [] for name in metric_names}

        for seed_results in all_results.values():
            metrics = seed_results['test_metrics']
            for name in metric_names:
                test_scores[name].append(metrics[name])

        # Create box plots
        plt.figure(figsize=(10, 6))
        plt.boxplot([test_scores[name] for name in metric_names])
        plt.title(f'{model_name}: Performance Distribution Across Seeds')
        plt.ylabel('Score')
        # Boxes sit at x = 1..N. The names are set through xticks rather than
        # boxplot(labels=...), which newer Matplotlib releases deprecate.
        plt.xticks(range(1, len(metric_names) + 1), metric_names, rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, f'{model_name}_performance_distribution.png'))
        plt.close()

    def calculate_learning_curve(self, features: np.ndarray, labels: np.ndarray,
                           grid_results: Dict, seeds: List[int],
                           train_sizes: np.ndarray = None) -> Dict:
        """
        Calculate learning curves across multiple seeds using incremental training data
        and evaluating on the holdout test set.

        For every seed the data is split with ``prepare_data``; each model is
        then refit on the first ``int(n_train * fraction)`` rows of the
        training split and scored (macro F1) on the full test split.

        Args:
            features: Input features
            labels: Target labels
            grid_results: Grid search results containing best parameters for each seed
            seeds: List of random seeds
            train_sizes: Training set sizes to evaluate (proportions from 0 to 1);
                defaults to 15 values from 0.05 to 1.0

        Returns:
            Dictionary containing learning curve data for both models:
            'train_sizes' (proportions scaled by the training-split size),
            'test_scores_mean' and 'test_scores_std'

        Raises:
            ValueError: If ``seeds`` is empty
        """
        if len(seeds) == 0:
            raise ValueError("seeds must contain at least one seed")

        if train_sizes is None:
            train_sizes = np.linspace(0.05, 1.0, 15)
        # A plain list would be repeated, not scaled, by the multiplication below
        train_sizes = np.asarray(train_sizes, dtype=float)

        model_names = ['SVM', 'LogisticRegression']

        # scores[model][k] collects one macro-F1 value per seed for the k-th
        # training size (indexing by position avoids float dictionary keys)
        scores = {name: [[] for _ in train_sizes] for name in model_names}
        n_samples = 0

        # For each seed
        for seed in seeds:
            # Split data into train and test sets
            X_train, X_test, y_train, y_test = self.prepare_data(features, labels, seed)
            # Same for every seed: the split proportion is fixed
            n_samples = len(y_train)

            # For each model type
            for model_name in model_names:
                # Get best parameters for this seed
                params = grid_results[seed][model_name]['best_params']

                # For each training set size
                for k, fraction in enumerate(train_sizes):
                    # Calculate number of samples for this training size
                    n_train = int(n_samples * fraction)

                    # Create and train model on subset
                    model = self.create_model(model_name, params, seed)
                    model.fit(X_train[:n_train], y_train[:n_train])

                    # Evaluate on test set
                    metrics = self.evaluate_model(model, X_test, y_test)
                    scores[model_name][k].append(metrics['macro_f1'])

        # Calculate mean and std for each size
        learning_curves = {
            model_name: {
                'train_sizes': train_sizes * n_samples,
                'test_scores_mean': [np.mean(values) for values in scores[model_name]],
                'test_scores_std': [np.std(values) for values in scores[model_name]]
            }
            for model_name in model_names
        }

        return learning_curves

    def plot_averaged_confusion_matrices(self, all_results: Dict[str, Dict]):
        """
        Plot averaged confusion matrices across all seeds for both models.

        The SVM matrix is drawn on the left axes and the LogisticRegression
        matrix on the right; every cell is annotated with 'mean ± std'.

        Args:
            all_results: Dictionary containing results for all models and seeds
        """
        fig, axes = plt.subplots(1, 2, figsize=(20, 8))

        for ax, model_name in zip(axes, ['SVM', 'LogisticRegression']):
            # Get all test confusion matrices for this model
            confusion_matrices = [
                seed_results['test_metrics']['confusion_matrix']
                for seed_results in all_results[model_name].values()
            ]

            # Calculate average confusion matrix
            avg_cm = np.mean(confusion_matrices, axis=0)

            # Calculate standard deviation for annotations
            std_cm = np.std(confusion_matrices, axis=0)

            # Create annotations with mean ± std
            annotations = np.array([
                [f'{avg:.1f} ± {std:.1f}'
                for avg, std in zip(row_avg, row_std)]
                for row_avg, row_std in zip(avg_cm, std_cm)
            ])

            # Plot heatmap (fmt='' because the annotations are ready-made strings)
            sns.heatmap(
                avg_cm,
                annot=annotations,
                fmt='',
                cmap='Blues',
                xticklabels=self.le.classes_,
                yticklabels=self.le.classes_,
                ax=ax
            )

            ax.set_xlabel('Predicted Label')
            ax.set_ylabel('True Label')

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'averaged_confusion_matrices.png'),
                    dpi=300, bbox_inches='tight')
        plt.close()

    def plot_learning_curves(self, learning_curves: Dict):
        """
        Plot learning curves with a text box in the top left showing the
        largest training-set size.

        For each model the mean test score is drawn as a line with a
        +/- one standard deviation band around it.

        Args:
            learning_curves: Dictionary containing learning curve data for both models
        """
        plt.figure(figsize=(12, 8))

        # Get the current axes
        ax = plt.gca()

        # Hide all four spines
        for side in ('top', 'right', 'left', 'bottom'):
            ax.spines[side].set_visible(False)

        colors = {
            'SVM': 'blue',
            'LogisticRegression': 'red'
        }

        for model_name in ['SVM', 'LogisticRegression']:
            data = learning_curves[model_name]
            mean_scores = np.array(data['test_scores_mean'])
            std_scores = np.array(data['test_scores_std'])

            # Plot mean test scores
            plt.plot(data['train_sizes'], data['test_scores_mean'],
                    '-', color=colors[model_name], label=f'{model_name}',
                    linewidth=2)

            # Plot standard deviation bands
            plt.fill_between(data['train_sizes'],
                            mean_scores - std_scores,
                            mean_scores + std_scores,
                            alpha=0.1, color=colors[model_name])

        plt.xlabel('Number of Training Samples')
        plt.ylabel('Macro F1 Score')

        plt.legend(loc='lower right', frameon=True)

        plt.grid(True)

        # Largest training size on the curve (taken from the SVM entry)
        total_samples = max(learning_curves['SVM']['train_sizes'])

        # Add text box with total samples at top left
        plt.text(0.02, 0.98, f'Total samples: {total_samples:.0f}',
                transform=plt.gca().transAxes,
                bbox=dict(facecolor='white',
                          alpha=0.8,
                          boxstyle='round,pad=0.5'),
                verticalalignment='top',
                horizontalalignment='left',
                fontsize=10)

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'learning_curves_mean_ensemble.png'),
                    dpi=300,
                    bbox_inches='tight')
        plt.close()

    def save_learning_curves_data(self, learning_curves: Dict, output_dir: str):
        """
        Save learning curves data to a JSON file for later plotting

        The file is written as ``<output_dir>/learning_curves_data.json``.

        Args:
            learning_curves: Dictionary containing learning curve data for both models
            output_dir: Directory to save the output file
        """
        curves_data = {}

        for model_name, data in learning_curves.items():
            # np.asarray(...).tolist() accepts numpy arrays as well as plain
            # lists and yields built-in numbers that json can serialize
            curves_data[model_name] = {
                'train_sizes': np.asarray(data['train_sizes']).tolist(),
                'test_scores_mean': np.asarray(data['test_scores_mean']).tolist(),
                'test_scores_std': np.asarray(data['test_scores_std']).tolist()
            }

        output_file = os.path.join(output_dir, 'learning_curves_data.json')
        with open(output_file, 'w') as f:
            json.dump(curves_data, f, indent=2)


def run_evaluation(data_dir: str, grid_search_dir: str) -> Dict:
    """Run complete evaluation pipeline

    Loads the features and the grid search results, refits every model with
    its per-seed best parameters, scores it on the train and test splits,
    then writes the averaged confusion matrices, the learning curves and one
    ``<model>_evaluation.json`` file per model into the evaluator's output
    directory.

    Args:
        data_dir: Directory handed to ``DataLoader``
        grid_search_dir: Directory handed to ``load_grid_search_results``

    Returns:
        Dictionary with 'model_results' (model -> seed -> train/test metrics
        and parameters) and 'learning_curves'

    Raises:
        Exception: Any error from the pipeline is logged and re-raised
    """
    model_names = ['SVM', 'LogisticRegression']

    try:
        evaluator = ModelEvaluator()

        # Features/labels and the per-seed grid search outcome
        loader = DataLoader(data_dir)
        features, labels = loader.load_npz_files()
        grid_results, seeds = evaluator.load_grid_search_results(grid_search_dir)

        all_results = {}
        for model_name in model_names:
            per_seed = {}
            all_results[model_name] = per_seed

            for seed in seeds:
                # The split is driven by the same seed as the grid search entry
                X_train, X_test, y_train, y_test = evaluator.prepare_data(features, labels, seed)
                best_params = grid_results[seed][model_name]['best_params']

                fitted = evaluator.create_model(model_name, best_params, seed)
                fitted.fit(X_train, y_train)

                scores_on_train = evaluator.evaluate_model(fitted, X_train, y_train)
                scores_on_test = evaluator.evaluate_model(fitted, X_test, y_test)

                per_seed[seed] = {
                    'train_metrics': scores_on_train,
                    'test_metrics': scores_on_test,
                    'params': best_params
                }

        # Figures and learning-curve data
        evaluator.plot_averaged_confusion_matrices(all_results)
        learning_curves = evaluator.calculate_learning_curve(
            features, labels, grid_results, seeds
        )
        evaluator.save_learning_curves_data(learning_curves, evaluator.output_dir)
        evaluator.plot_learning_curves(learning_curves)

        # One JSON file per model; NumpyEncoder handles the numpy values
        for model_name, per_seed in all_results.items():
            target = os.path.join(evaluator.output_dir, f'{model_name}_evaluation.json')
            with open(target, 'w') as f:
                json.dump(per_seed, f, cls=NumpyEncoder)

        return {
            'model_results': all_results,
            'learning_curves': learning_curves
        }

    except Exception as e:
        logging.error(f"Error in evaluation pipeline: {str(e)}")
        raise



In [None]:
!unzip '/content/model_optimization_20241212_201232_multiseed.zip'

Archive:  /content/model_optimization_20241212_201232_multiseed.zip
   creating: model_optimization_20241212_201232_multiseed/
   creating: model_optimization_20241212_201232_multiseed/LogisticRegression/
   creating: model_optimization_20241212_201232_multiseed/SVM/
   creating: model_optimization_20241212_201232_multiseed/seed_15/
   creating: model_optimization_20241212_201232_multiseed/seed_29/
   creating: model_optimization_20241212_201232_multiseed/seed_30/
   creating: model_optimization_20241212_201232_multiseed/seed_32/
   creating: model_optimization_20241212_201232_multiseed/seed_37/
   creating: model_optimization_20241212_201232_multiseed/seed_38/
   creating: model_optimization_20241212_201232_multiseed/seed_4/
   creating: model_optimization_20241212_201232_multiseed/seed_65/
   creating: model_optimization_20241212_201232_multiseed/seed_88/
   creating: model_optimization_20241212_201232_multiseed/seed_91/
  inflating: model_optimization_20241212_201232_multiseed/SVM_s

In [None]:
# Example usage: both paths below are placeholders and must be replaced
# with real locations before running this cell
if __name__ == "__main__":
    # Directory containing your NPZ files
    data_dir = "/path/to/Feature Extraction/ViT-SO400M-14-SigLIP"

    # Directory containing grid search results
    grid_search_dir = "/path/to/Single Frame/model_optimization_20250206_115650_multiseed"

    # Run evaluation; the returned dictionary is kept in `results`
    results = run_evaluation(data_dir, grid_search_dir)



# Evaluation - ResNet

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, learning_curve
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.preprocessing import LabelEncoder
from scipy.stats import loguniform
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import pandas as pd
from typing import Dict, Tuple, List
import json
import os
import logging
import glob
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.metrics import balanced_accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from typing import Dict, Tuple, List
import logging
import pandas as pd


# Custom scorer for macro-averaged F1
macro_f1_scorer = make_scorer(f1_score, average='macro')

class DataLoader:
    """Handle loading and processing of NPZ files"""
    def __init__(self, data_dir: str):
        """
        Args:
            data_dir: Directory that is searched for ``*.npz`` files
        """
        self.data_dir = data_dir

    def load_npz_files(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Load all NPZ files from directory and extract features and labels

        Every ``*.npz`` file directly inside ``self.data_dir`` is read for its
        ``features`` entry and its scalar ``fish_species`` entry. Files that
        cannot be processed are logged and skipped.

        Returns:
            features: numpy array built from the ``features`` entry of every file
            labels: numpy array of fish species labels (strings), in the same order
        """
        features_list = []
        labels_list = []

        # Get all NPZ files in directory. glob returns them in a
        # filesystem-dependent order; sorting fixes the sample order so that
        # seeded train/test splits are reproducible across runs and machines.
        npz_files = sorted(glob.glob(os.path.join(self.data_dir, "*.npz")))

        logging.info(f"Found {len(npz_files)} NPZ files")
        if not npz_files:
            logging.warning(f"No NPZ files found in {self.data_dir}")

        for npz_file in npz_files:
            try:
                # The context manager closes the underlying file handle.
                # NOTE(review): allow_pickle=True can execute code embedded in
                # a file; only load feature files from a trusted source.
                with np.load(npz_file, allow_pickle=True) as data:
                    features = data['features']
                    fish_species = str(data['fish_species'].item())  # Convert to string
            except Exception as e:
                # Best effort: one unreadable file must not stop the whole load
                logging.error(f"Error processing file {npz_file}: {str(e)}")
                continue

            if features is not None:
                features_list.append(features)
                labels_list.append(fish_species)

        # Convert lists to numpy arrays
        features_array = np.array(features_list)
        labels_array = np.array(labels_list)

        # Log data distribution
        unique_labels, counts = np.unique(labels_array, return_counts=True)
        for label, count in zip(unique_labels, counts):
            percentage = (count / len(labels_array)) * 100
            logging.info(f"Class {label}: {count} samples ({percentage:.2f}%)")

        return features_array, labels_array


class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts numpy scalars and arrays to built-in types"""
    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``

        numpy integers, floats, arrays and booleans become ``int``, ``float``,
        ``list`` and ``bool``; anything else is left to the base encoder.
        """
        # Checked in order; the first matching numpy type wins
        conversions = (
            (np.integer, int),
            (np.floating, float),
            (np.ndarray, lambda array: array.tolist()),
            (np.bool_, bool),
        )
        for numpy_type, to_builtin in conversions:
            if isinstance(obj, numpy_type):
                return to_builtin(obj)
        return super().default(obj)


class FishClassifier:
    """Baseline fish species classifier helpers: split, model creation, scoring, plotting"""
    def __init__(self, random_state: int = 42):
        """
        Args:
            random_state: Seed used for the train-test split and the baseline models
        """
        self.random_state = random_state
        self.le = LabelEncoder()
        self.setup_logging()

    def setup_logging(self):
        """Setup logging configuration"""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            filename='fish_classifier.log'
        )

    def prepare_data(self, features: np.ndarray, labels: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        Prepare data by splitting into train and test sets with stratification

        The labels are encoded with ``self.le`` (refit at every call) before
        the split; 20% of the samples go to the test set.

        Returns:
            X_train, X_test, y_train, y_test
        """
        # Encode labels
        y = self.le.fit_transform(labels)

        # Create stratified train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            y,
            test_size=0.2,
            random_state=self.random_state,
            stratify=y
        )

        logging.info(f"Training set size: {X_train.shape[0]}")
        logging.info(f"Test set size: {X_test.shape[0]}")

        return X_train, X_test, y_train, y_test

    def create_baseline_models(self) -> Dict:
        """Create baseline models with default parameters

        Returns:
            Dictionary with an unfitted 'svm' (LinearSVC) and 'logistic'
            (LogisticRegression) estimator, both with balanced class weights
        """
        models = {
            'svm': LinearSVC(
                random_state=self.random_state,
                class_weight='balanced',
                max_iter=2000  # Increased to ensure convergence
            ),
            'logistic': LogisticRegression(
                random_state=self.random_state,
                class_weight='balanced',
                max_iter=2000
            )
        }
        return models

    def evaluate_model(self, model, X: np.ndarray, y: np.ndarray, model_name: str) -> Dict:
        """
        Evaluate model performance with multiple metrics

        Args:
            model: Fitted estimator exposing ``predict``
            X: Feature matrix to predict on
            y: True labels; expected to be the integer codes produced by
                ``self.le`` (the per-class entries below are indexed by
                position in ``self.le.classes_``)
            model_name: Name used in the log messages

        Returns:
            Dictionary with 'accuracy', 'macro_f1', 'confusion_matrix' and one
            '<class>_precision' / '<class>_recall' / '<class>_f1' entry for
            every class in ``self.le.classes_``
        """
        # The import block of this cell does not bring accuracy_score into
        # scope, so import it here to keep the method self-contained
        from sklearn.metrics import accuracy_score

        # Get predictions
        y_pred = model.predict(X)

        # Encoded class ids, in the same order as self.le.classes_. Passing
        # them explicitly keeps the per-class arrays and the confusion matrix
        # aligned with the class names even when a class is missing from both
        # y and y_pred (otherwise the arrays shrink and the names shift).
        class_ids = np.arange(len(self.le.classes_))

        # Calculate metrics
        metrics = {
            'accuracy': accuracy_score(y, y_pred),
            'macro_f1': f1_score(y, y_pred, average='macro'),
            'confusion_matrix': confusion_matrix(y, y_pred, labels=class_ids),
        }

        # Calculate per-class metrics (a class without samples scores 0)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y, y_pred, labels=class_ids, zero_division=0
        )

        # Add per-class metrics
        for i, class_name in enumerate(self.le.classes_):
            metrics[f'{class_name}_precision'] = precision[i]
            metrics[f'{class_name}_recall'] = recall[i]
            metrics[f'{class_name}_f1'] = f1[i]

        # Log results
        logging.info(f"\nResults for {model_name}:")
        logging.info(f"Accuracy: {metrics['accuracy']:.4f}")
        logging.info(f"Macro F1: {metrics['macro_f1']:.4f}")

        return metrics

    def plot_confusion_matrix(self, confusion_mat: np.ndarray, model_name: str):
        """
        Plot confusion matrix heatmap

        The figure is saved as ``confusion_matrix_<model_name>.png`` (relative
        path) and then closed.

        Args:
            confusion_mat: Matrix of counts to draw; cells are annotated with
                integer formatting
            model_name: Used in the figure title and in the output file name
        """
        plt.figure(figsize=(10, 8))
        sns.heatmap(
            confusion_mat,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=self.le.classes_,
            yticklabels=self.le.classes_
        )
        plt.title(f'Confusion Matrix - {model_name}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.savefig(f'confusion_matrix_{model_name}.png')
        plt.close()


## Robust Learning Curve

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import json
import os
import logging
from typing import Dict, List, Tuple, Any
import glob
import pandas as pd
from datetime import datetime

class ModelEvaluator:
    """Handles multi-seed evaluation of models using grid search results"""

    def __init__(self, random_state: int = 42):
        """Create the label encoder, the output directory and the log setup

        Args:
            random_state: Seed kept on the instance
        """
        self.le = LabelEncoder()
        self.random_state = random_state

        # Order matters: setup_logging() opens its log file inside
        # self.output_dir, which setup_output_dir() creates
        self.setup_output_dir()
        self.setup_logging()

    def setup_output_dir(self):
        """Create output directory with timestamp"""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.output_dir = f'model_evaluation_{timestamp}'
        os.makedirs(self.output_dir, exist_ok=True)

    def setup_logging(self):
        """Configure logging"""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(os.path.join(self.output_dir, 'evaluation.log')),
                logging.StreamHandler()
            ]
        )

    def load_grid_search_results(self, grid_search_dir: str) -> Dict:
        """
        Load results from multi-seed grid search
        """
        results = {}

        # Load random seeds
        with open(os.path.join(grid_search_dir, 'random_seeds.json'), 'r') as f:
            seeds_info = json.load(f)
            seeds = seeds_info['generated_seeds']

        # Process each seed directory
        for seed in seeds:
            seed_dir = os.path.join(grid_search_dir, f'seed_{seed}')
            results[seed] = {}

            # Load results for each model type
            for model_name in ['SVM', 'LogisticRegression']:
                metrics_file = os.path.join(seed_dir, f'{model_name}_metrics.json')
                with open(metrics_file, 'r') as f:
                    results[seed][model_name] = json.load(f)

        return results, seeds

    def prepare_data(self, features: np.ndarray, labels: np.ndarray, seed: int) -> Tuple:
        """Encode the labels and make a stratified train-test split for one seed

        The label encoder is refit on ``labels`` at every call and 20% of the
        samples are held out for testing.

        Returns:
            The result of ``train_test_split``: X_train, X_test, y_train, y_test
        """
        encoded = self.le.fit_transform(labels)
        split = train_test_split(
            features,
            encoded,
            test_size=0.2,
            random_state=seed,
            stratify=encoded,
        )
        return split

    def create_model(self, model_type: str, params: Dict, seed: int) -> Any:
        """Create model with specified parameters"""
        if model_type == 'SVM':
            return LinearSVC(random_state=seed, max_iter=2000, **params)
        elif model_type == 'LogisticRegression':
            return LogisticRegression(random_state=seed, max_iter=2000, **params)
        else:
            raise ValueError(f"Unknown model type: {model_type}")

    def evaluate_model(self, model, X: np.ndarray, y: np.ndarray) -> Dict:
        """Comprehensive model evaluation

        Args:
            model: Fitted estimator exposing ``predict``
            X: Feature matrix to predict on
            y: True labels; expected to be the integer codes produced by
                ``self.le`` (per-class entries are indexed by position in
                ``self.le.classes_``)

        Returns:
            Dictionary with 'weighted_f1', 'macro_f1', 'accuracy',
            'confusion_matrix' and one '<class>_precision' /
            '<class>_recall' / '<class>_f1' entry per known class
        """
        y_pred = model.predict(X)

        # Encoded class ids in the order of self.le.classes_. Passing them
        # explicitly keeps the per-class arrays and the confusion matrix the
        # same size for every split, even when a class is absent from both y
        # and y_pred (otherwise names shift and matrices cannot be averaged).
        class_ids = np.arange(len(self.le.classes_))

        metrics = {
            'weighted_f1': f1_score(y, y_pred, average='weighted'),
            'macro_f1': f1_score(y, y_pred, average='macro'),
            'accuracy': accuracy_score(y, y_pred),
            'confusion_matrix': confusion_matrix(y, y_pred, labels=class_ids)
        }

        # Calculate per-class metrics (a class without samples scores 0)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y, y_pred, labels=class_ids, zero_division=0
        )

        for i, class_name in enumerate(self.le.classes_):
            metrics[f'{class_name}_precision'] = precision[i]
            metrics[f'{class_name}_recall'] = recall[i]
            metrics[f'{class_name}_f1'] = f1[i]

        return metrics

    def plot_confusion_matrices(self, confusion_matrices: List[np.ndarray],
                              model_name: str, seed: int):
        """Draw the training and test confusion matrices of one seed side by side

        Args:
            confusion_matrices: Index 0 is drawn as the training matrix,
                index 1 as the test matrix
            model_name: Used in the output file name
            seed: Shown in the titles and used in the output file name
        """
        class_names = self.le.classes_
        titles = (
            f'Training Confusion Matrix\nSeed: {seed}',
            f'Test Confusion Matrix\nSeed: {seed}',
        )

        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        # Left panel: training split, right panel: test split
        for position, title in enumerate(titles):
            panel = axes[position]
            sns.heatmap(confusion_matrices[position], annot=True, fmt='d', ax=panel,
                        cmap='Blues', xticklabels=class_names, yticklabels=class_names)
            panel.set_title(title)
            panel.set_xlabel('Predicted Label')
            panel.set_ylabel('True Label')

        plt.tight_layout()
        target = os.path.join(self.output_dir, f'{model_name}_confusion_matrices_seed_{seed}.png')
        plt.savefig(target)
        plt.close()

    def plot_aggregated_results(self, all_results: Dict, model_name: str):
        """Plot aggregated results across seeds

        Draws one box per test metric (weighted F1, macro F1, accuracy) from
        the per-seed values and saves the figure in the output directory.

        Args:
            all_results: Mapping seed -> results dict holding a
                'test_metrics' entry
            model_name: Used in the title and in the output file name
        """
        metric_names = ['weighted_f1', 'macro_f1', 'accuracy']
        test_scores = {name: [] for name in metric_names}

        for seed_results in all_results.values():
            metrics = seed_results['test_metrics']
            for name in metric_names:
                test_scores[name].append(metrics[name])

        # Create box plots
        plt.figure(figsize=(10, 6))
        plt.boxplot([test_scores[name] for name in metric_names])
        plt.title(f'{model_name}: Performance Distribution Across Seeds')
        plt.ylabel('Score')
        # Boxes sit at x = 1..N. The names are set through xticks rather than
        # boxplot(labels=...), which newer Matplotlib releases deprecate.
        plt.xticks(range(1, len(metric_names) + 1), metric_names, rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, f'{model_name}_performance_distribution.png'))
        plt.close()

    def calculate_learning_curve(self, features: np.ndarray, labels: np.ndarray,
                           grid_results: Dict, seeds: List[int],
                           train_sizes: np.ndarray = None) -> Dict:
        """
        Calculate learning curves across multiple seeds using incremental training data
        and evaluating on the holdout test set.

        For every seed the data is split with ``self.prepare_data``. Each model is
        then refit on the first ``int(n_train * fraction)`` training rows for every
        requested fraction and its macro F1 on that seed's test split is recorded.

        Args:
            features: Input features
            labels: Target labels
            grid_results: Grid search results; ``grid_results[seed][model]['best_params']``
                is read for every seed/model pair
            seeds: List of random seeds (must not be empty)
            train_sizes: Training set proportions to evaluate (array or list,
                values from 0 to 1). Defaults to 15 points between 0.05 and 1.0.

        Returns:
            Dictionary keyed by model name with 'train_sizes' (absolute sample
            counts, numpy array), 'test_scores_mean' and 'test_scores_std' (lists,
            one entry per training size, aggregated over seeds).

        Raises:
            ValueError: If ``seeds`` is empty.
        """
        model_names = ['SVM', 'LogisticRegression']

        # Without a seed there is no split, hence no training-set size to report.
        if not seeds:
            raise ValueError("seeds must contain at least one random seed")

        if train_sizes is None:
            train_sizes = np.linspace(0.05, 1.0, 15)
        # Accept plain lists too: list * int would repeat the list instead of scaling it.
        train_sizes = np.asarray(train_sizes, dtype=float)

        # One score list per (model, size position); positional storage avoids
        # float dictionary keys and keeps duplicate sizes separate.
        scores = {name: [[] for _ in train_sizes] for name in model_names}
        n_samples = 0

        for seed in seeds:
            # Split data into train and test sets
            X_train, X_test, y_train, y_test = self.prepare_data(features, labels, seed)
            n_samples = len(y_train)

            for model_name in model_names:
                # Best hyper-parameters found for this seed
                params = grid_results[seed][model_name]['best_params']

                for size_idx, train_size in enumerate(train_sizes):
                    n_train = int(n_samples * train_size)

                    # Fresh model fitted on the leading n_train training rows
                    model = self.create_model(model_name, params, seed)
                    model.fit(X_train[:n_train], y_train[:n_train])

                    # Score on the untouched test split
                    metrics = self.evaluate_model(model, X_test, y_test)
                    scores[model_name][size_idx].append(metrics['macro_f1'])

        # Absolute sizes use the training-set length of the last seed
        # (presumably identical for every seed - verify against prepare_data).
        learning_curves = {
            model_name: {
                'train_sizes': train_sizes * n_samples,
                'test_scores_mean': [np.mean(per_size) for per_size in scores[model_name]],
                'test_scores_std': [np.std(per_size) for per_size in scores[model_name]]
            }
            for model_name in model_names
        }

        return learning_curves

    def plot_averaged_confusion_matrices(self, all_results: Dict[str, Dict]):
        """
        Draw the seed-averaged test confusion matrix of each model side by side.

        Every cell is annotated with ``mean ± std`` over the seeds and the figure
        is written to ``averaged_confusion_matrices.png`` in ``self.output_dir``.

        Args:
            all_results: ``all_results[model][seed]['test_metrics']['confusion_matrix']``
                is read for both models and every seed
        """
        _, axes = plt.subplots(1, 2, figsize=(20, 8))

        for axis, model_name in zip(axes, ('SVM', 'LogisticRegression')):
            # Collect the test confusion matrix of every seed
            per_seed_cms = [
                seed_results['test_metrics']['confusion_matrix']
                for seed_results in all_results[model_name].values()
            ]

            # Element-wise mean and spread across seeds
            mean_cm = np.mean(per_seed_cms, axis=0)
            spread_cm = np.std(per_seed_cms, axis=0)

            # Pre-formatted cell labels (hence fmt='' below)
            cell_text = np.array([
                [f'{mean:.1f} ± {spread:.1f}' for mean, spread in zip(mean_row, spread_row)]
                for mean_row, spread_row in zip(mean_cm, spread_cm)
            ])

            sns.heatmap(
                mean_cm,
                annot=cell_text,
                fmt='',
                cmap='Blues',
                xticklabels=self.le.classes_,
                yticklabels=self.le.classes_,
                ax=axis
            )
            axis.set_xlabel('Predicted Label')
            axis.set_ylabel('True Label')

        plt.tight_layout()
        target_path = os.path.join(self.output_dir, 'averaged_confusion_matrices.png')
        plt.savefig(target_path, dpi=300, bbox_inches='tight')
        plt.close()


    def plot_learning_curves(self, learning_curves: Dict):
        """
        Plot the mean test macro F1 of both models against the training-set size,
        with a shaded ±1 std band, and annotate the total sample count in the
        top-left corner. The figure is saved as
        ``learning_curves_mean_ensemble.png`` in ``self.output_dir``.

        Args:
            learning_curves: Dictionary containing learning curve data for both models
        """
        plt.figure(figsize=(12, 8))
        ax = plt.gca()

        # Hide the whole frame (all four spines)
        for side in ('top', 'right', 'left', 'bottom'):
            ax.spines[side].set_visible(False)

        palette = {
            'SVM': 'blue',
            'LogisticRegression': 'red'
        }

        for model_name, line_color in palette.items():
            curve = learning_curves[model_name]
            sizes = curve['train_sizes']
            mean_scores = np.array(curve['test_scores_mean'])
            std_scores = np.array(curve['test_scores_std'])

            # Mean test score
            ax.plot(sizes, curve['test_scores_mean'], '-',
                    color=line_color, label=model_name, linewidth=2)

            # ±1 standard deviation band
            ax.fill_between(sizes,
                            mean_scores - std_scores,
                            mean_scores + std_scores,
                            alpha=0.1, color=line_color)

        ax.set_xlabel('Number of Training Samples')
        ax.set_ylabel('Macro F1 Score')
        ax.legend(loc='lower right', frameon=True)
        ax.grid(True)

        # The largest training size equals the full training set
        total_samples = max(learning_curves['SVM']['train_sizes'])

        # Info box in the top-left corner (axes coordinates)
        box_style = dict(facecolor='white', alpha=0.8, boxstyle='round,pad=0.5')
        ax.text(0.02, 0.98, f'Total samples: {total_samples:.0f}',
                transform=ax.transAxes,
                bbox=box_style,
                verticalalignment='top',
                horizontalalignment='left',
                fontsize=10)

        plt.tight_layout()
        target_path = os.path.join(self.output_dir, 'learning_curves_mean_ensemble.png')
        plt.savefig(target_path, dpi=300, bbox_inches='tight')
        plt.close()

    def save_learning_curves_data(self, learning_curves: Dict, output_dir: str):
        """
        Save learning curves data to a JSON file for later plotting

        The file is written to ``<output_dir>/learning_curves_data.json``; the
        directory is created if it does not exist yet.

        Args:
            learning_curves: Dictionary containing learning curve data for both models.
                Each entry must provide 'train_sizes', 'test_scores_mean' and
                'test_scores_std', given either as numpy arrays or as sequences.
            output_dir: Directory to save the output file
        """
        fields = ('train_sizes', 'test_scores_mean', 'test_scores_std')

        # np.asarray(...).tolist() turns arrays, lists and numpy scalars alike
        # into plain Python numbers that the json module can serialize.
        curves_data = {
            model_name: {field: np.asarray(data[field]).tolist() for field in fields}
            for model_name, data in learning_curves.items()
        }

        os.makedirs(output_dir, exist_ok=True)
        output_file = os.path.join(output_dir, 'learning_curves_data.json')
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(curves_data, f, indent=2)


def run_evaluation(data_dir: str, grid_search_dir: str) -> Dict:
    """
    Run complete evaluation pipeline.

    Loads the features and the grid-search results, refits both models with
    the best parameters of every seed, scores them on the train and test
    splits, then produces the averaged confusion matrices, the learning-curve
    data/plot and one ``<model>_evaluation.json`` per model.

    Args:
        data_dir: Directory handed to ``DataLoader``
        grid_search_dir: Directory handed to ``load_grid_search_results``

    Returns:
        Dictionary with 'model_results' and 'learning_curves'.

    Raises:
        Exception: Any error is logged and re-raised unchanged.
    """
    try:
        evaluator = ModelEvaluator()

        # Features/labels and the per-seed best hyper-parameters
        features, labels = DataLoader(data_dir).load_npz_files()
        grid_results, seeds = evaluator.load_grid_search_results(grid_search_dir)

        all_results = {}
        for model_name in ['SVM', 'LogisticRegression']:
            per_seed = {}
            all_results[model_name] = per_seed

            for seed in seeds:
                # Seed-specific train/test split
                X_train, X_test, y_train, y_test = evaluator.prepare_data(features, labels, seed)

                # Refit with the best parameters found for this seed
                best_params = grid_results[seed][model_name]['best_params']
                fitted = evaluator.create_model(model_name, best_params, seed)
                fitted.fit(X_train, y_train)

                # Score on both splits (train first, then test)
                train_scores = evaluator.evaluate_model(fitted, X_train, y_train)
                test_scores = evaluator.evaluate_model(fitted, X_test, y_test)

                per_seed[seed] = {
                    'train_metrics': train_scores,
                    'test_metrics': test_scores,
                    'params': best_params
                }

        # Averaged confusion matrices
        evaluator.plot_averaged_confusion_matrices(all_results)

        # Learning curves: compute, persist, plot
        learning_curves = evaluator.calculate_learning_curve(
            features, labels, grid_results, seeds
        )
        evaluator.save_learning_curves_data(learning_curves, evaluator.output_dir)
        evaluator.plot_learning_curves(learning_curves)

        # One JSON file per model with all per-seed results
        for model_name, model_results in all_results.items():
            results_path = os.path.join(evaluator.output_dir, f'{model_name}_evaluation.json')
            with open(results_path, 'w') as f:
                json.dump(model_results, f, cls=NumpyEncoder)

        return {
            'model_results': all_results,
            'learning_curves': learning_curves
        }

    except Exception as exc:
        logging.error(f"Error in evaluation pipeline: {str(exc)}")
        raise



In [None]:
!mkdir "/content/model_optimization_20241211_152718_multiseed"
!unzip "/content/model_optimization_20241211_152718_multiseed.zip" -d "/content/model_optimization_20241211_152718_multiseedy"

Archive:  /content/model_optimization_20241211_152718_multiseed.zip
replace averaged_validation_curves.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: averaged_validation_curves.png  
replace SVM_seed_summary.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: SVM_seed_summary.json   
replace SVM_seed_comparison.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: SVM_seed_comparison.png  
replace LogisticRegression_seed_summary.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: LogisticRegression_seed_summary.json  
replace LogisticRegression_seed_comparison.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: LogisticRegression_seed_comparison.png  
replace seed_37/SVM_metrics.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 
error:  invalid response [{ENTER}]
replace seed_37/SVM_metrics.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: yy
  inflating: seed_37/SVM_metrics.json  
replace seed_37/LogisticRegression_results.csv? [y]es, [n]o, [A]ll, [N]one, [r]ena

In [None]:
import shutil
import os

def create_local_copy(drive_path: str, local_path: str = '/content/local_features') -> str:
    """
    Copy a directory tree (e.g. from a mounted Drive) to local storage.

    Any directory that already exists at ``local_path`` is removed first so the
    result is an exact copy of ``drive_path`` without stale files.

    Args:
        drive_path: Source directory to copy
        local_path: Destination directory (defaults to '/content/local_features')

    Returns:
        The destination path.

    Raises:
        FileNotFoundError: If ``drive_path`` is not an existing directory.
        ValueError: If source and destination are the same directory.
    """
    # Validate the source BEFORE deleting anything, so a wrong path cannot
    # wipe a previously created local copy.
    if not os.path.isdir(drive_path):
        raise FileNotFoundError(f"Source directory not found: {drive_path}")
    if os.path.realpath(drive_path) == os.path.realpath(local_path):
        raise ValueError("drive_path and local_path must be different directories")

    # Remove if already exists
    if os.path.exists(local_path):
        shutil.rmtree(local_path)

    # Copy data from Drive to local
    print(f"Copying data to: {local_path}")
    shutil.copytree(drive_path, local_path)

    return local_path

# Example usage: copy the extracted-feature directory to local storage and
# report where the copy lives.
# NOTE(review): '/path/to/...' looks like a placeholder - replace it with the
# real location of the ResNet-50 feature directory before running this cell.
drive_path = '/path/to/Feature Extraction/ResNet-50'
local_path = create_local_copy(drive_path)
print(f"Data copied to: {local_path}")

Copying data to: /content/local_features
Data copied to: /content/local_features


In [None]:
# Example usage: run the full evaluation on the locally copied features.
if __name__ == "__main__":
    # Directory containing your NPZ files
    data_dir = "/content/local_features"

    # Directory containing grid search results
    # NOTE(review): '/path/to/...' looks like a placeholder - point it at the
    # real multiseed grid-search output directory.
    grid_search_dir = "/path/to/ResNet_Benchmark/model_optimization_20250203_223027_multiseed"

    # Run evaluation; `results` holds the dictionary returned by run_evaluation
    results = run_evaluation(data_dir, grid_search_dir)



# Evaluation - Temporal Voting

In [None]:
import glob
import json
import logging
import os
from datetime import datetime
from typing import Dict, List, Tuple, Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score, f1_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also understands numpy scalars and arrays."""

    def default(self, obj):
        """
        Convert numpy values to their built-in equivalents.

        Arrays become (nested) lists, numpy booleans/integers/floats become
        ``bool``/``int``/``float``. Anything else is passed to the base
        encoder, which raises ``TypeError`` for unsupported objects.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)

class TemporalVotingEvaluator:
    """Evaluates models using temporal voting across video frames"""

    def __init__(self, random_state: int = 42):
        """
        Set up the evaluator: store the random state, create a label encoder,
        then create the output directory and configure logging.

        Args:
            random_state: Value stored on ``self.random_state``
        """
        self.le = LabelEncoder()
        self.random_state = random_state
        # The log file is placed inside the output directory, so the directory
        # has to exist before logging is configured.
        self.setup_output_dir()
        self.setup_logging()

    def setup_output_dir(self):
        """
        Create a timestamped output directory (relative to the current working
        directory) and remember its name in ``self.output_dir``.
        """
        run_stamp = f'{datetime.now():%Y%m%d_%H%M%S}'
        self.output_dir = 'temporal_voting_evaluation_' + run_stamp
        os.makedirs(self.output_dir, exist_ok=True)

    def setup_logging(self):
        """
        Configure root logging to write INFO+ records both to
        ``temporal_voting.log`` inside ``self.output_dir`` and to the console.

        ``force=True`` replaces any handlers already attached to the root
        logger. Without it ``logging.basicConfig`` silently does nothing when
        the root logger is already configured (typical in notebooks, or when a
        second evaluator is created), and the new log file would stay empty.
        """
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(os.path.join(self.output_dir, 'temporal_voting.log')),
                logging.StreamHandler()
            ],
            force=True
        )

    def load_video_features(self, data_dir: str) -> Tuple[Dict[str, Dict], List[str]]:
        """
        Load features for all frames from all videos

        Every ``*.npz`` file in ``data_dir`` must provide the entries
        'features' (a pickled dictionary of per-frame features),
        'fish_species' and 'middle_frame'. Files that cannot be read are
        logged and skipped.

        Args:
            data_dir: Directory containing the NPZ files

        Returns:
            Dictionary mapping video IDs to their frame features and labels
            List of all unique labels (sorted)
        """
        video_data = {}
        all_labels = set()

        # NOTE: glob order is filesystem dependent and determines the order of
        # the returned dictionary.
        npz_files = glob.glob(os.path.join(data_dir, "*.npz"))
        logging.info(f"Found {len(npz_files)} NPZ files")

        for npz_file in npz_files:
            try:
                # Extract video ID from filename
                video_id = os.path.basename(npz_file).split('_features.npz')[0]

                # NOTE(review): allow_pickle=True executes pickled payloads;
                # only load NPZ files from a trusted source.
                # The context manager closes the underlying archive, which the
                # bare np.load() call would leave open for every file.
                with np.load(npz_file, allow_pickle=True) as data:
                    frame_features = data['features'].item()  # Dictionary of frame features
                    fish_species = str(data['fish_species'].item())
                    middle_frame = data['middle_frame'].item()

                video_data[video_id] = {
                    'features': frame_features,
                    'label': fish_species,
                    'middle_frame': middle_frame
                }
                all_labels.add(fish_species)

            except Exception as e:
                # Best effort: a broken file must not abort the whole load
                logging.error(f"Error processing file {npz_file}: {str(e)}")
                continue

        return video_data, sorted(all_labels)

    def prepare_data_split(self, video_data: Dict, seed: int) -> Tuple[List[str], List[str]]:
        """
        Split the video IDs 80/20 into train and test sets, stratified by label,
        and log the per-class counts of both parts.

        Args:
            video_data: Dictionary of video data (each entry provides 'label')
            seed: Random seed for reproducibility

        Returns:
            Lists of video IDs for training and testing
        """
        all_ids = list(video_data)
        all_labels = [video_data[vid]['label'] for vid in all_ids]

        # Stratified split on the video level
        train_ids, test_ids = train_test_split(
            all_ids,
            test_size=0.2,
            random_state=seed,
            stratify=all_labels
        )

        # Report how each class is distributed over the two parts
        labels_in_train = [video_data[vid]['label'] for vid in train_ids]
        labels_in_test = [video_data[vid]['label'] for vid in test_ids]

        for label in set(all_labels):
            n_train = labels_in_train.count(label)
            n_test = labels_in_test.count(label)
            n_total = n_train + n_test
            logging.info(f"Class {label}:")
            logging.info(f"  Train: {n_train} ({n_train/n_total*100:.2f}%)")
            logging.info(f"  Test: {n_test} ({n_test/n_total*100:.2f}%)")

        return train_ids, test_ids

    def prepare_central_frame_data(self, video_data: Dict) -> Tuple[np.ndarray, np.ndarray]:
        """
        Collect, for every video, the features of its middle frame and its label.

        Returns:
            Array of middle-frame features and array of labels, both in the
            iteration order of ``video_data``.
        """
        entries = list(video_data.values())
        central_features = [entry['features'][entry['middle_frame']] for entry in entries]
        central_labels = [entry['label'] for entry in entries]
        return np.array(central_features), np.array(central_labels)

    def train_model(self, model_type: str, params: Dict,
                   X_train: np.ndarray, y_train: np.ndarray) -> Any:
        """
        Build the requested classifier (``max_iter=2000`` plus ``params``) and
        fit it on the given training data.

        Args:
            model_type: 'SVM' (LinearSVC) or 'LogisticRegression'
            params: Keyword arguments forwarded to the estimator constructor
            X_train: Training features
            y_train: Training labels

        Returns:
            The fitted model.

        Raises:
            ValueError: If ``model_type`` is not one of the two supported names.
        """
        if model_type not in ('SVM', 'LogisticRegression'):
            raise ValueError(f"Unknown model type: {model_type}")

        estimator_cls = LinearSVC if model_type == 'SVM' else LogisticRegression
        fitted = estimator_cls(max_iter=2000, **params)
        fitted.fit(X_train, y_train)
        return fitted

    def get_prediction_probabilities(self, model, X: np.ndarray) -> np.ndarray:
        """
        Return per-class scores that sum to one for every row of ``X``.

        Logistic regression uses ``predict_proba``. Any other model (the SVM)
        has its ``decision_function`` output pushed through a softmax; a 1-D
        (binary) decision output is first expanded to two columns
        ``[-d, d]``.
        """
        if not isinstance(model, LogisticRegression):
            margins = model.decision_function(X)
            if margins.ndim == 1:  # Binary classification
                margins = np.column_stack([-margins, margins])
            return self._softmax(margins)

        return model.predict_proba(X)

    def _softmax(self, X: np.ndarray) -> np.ndarray:
        """Apply softmax to array"""
        exp_X = np.exp(X - np.max(X, axis=1, keepdims=True))
        return exp_X / np.sum(exp_X, axis=1, keepdims=True)

    def temporal_voting_predict(self, model, video_features: Dict) -> Tuple[int, np.ndarray]:
        """
        Perform temporal voting on all frames in a video

        All frames are stacked into one matrix and scored with a single call to
        ``get_prediction_probabilities``; the per-frame probabilities are then
        averaged and the highest-scoring column wins.

        Args:
            model: Fitted classifier
            video_features: Dictionary mapping frame numbers to feature vectors

        Returns:
            Predicted class index (column of the probability matrix) and the
            aggregated probabilities with shape (1, n_classes)

        Raises:
            ValueError: If ``video_features`` contains no frames (averaging
                nothing would silently yield NaN and a class-0 prediction).
        """
        if not video_features:
            raise ValueError("video_features must contain at least one frame")

        # One row per frame; batching avoids one model call per frame
        frame_matrix = np.vstack([
            np.asarray(features).reshape(1, -1)
            for features in video_features.values()
        ])
        frame_probs = self.get_prediction_probabilities(model, frame_matrix)

        # Average probabilities across frames (keep the leading axis)
        avg_probs = np.mean(frame_probs, axis=0, keepdims=True)
        return np.argmax(avg_probs), avg_probs

    def evaluate_temporal_voting(self, model, video_data: Dict,
                           video_indices: List[str]) -> Dict:
        """
        Evaluate model using temporal voting on specified videos

        Args:
            model: Fitted classifier
            video_data: Dictionary of video data ('label' and 'features' are read)
            video_indices: IDs of the videos to evaluate

        Returns:
            Dictionary with 'weighted_f1', 'macro_f1', 'accuracy',
            'confusion_matrix' (nested list) and, for every class known to
            ``self.le``, '<class>_precision', '<class>_recall', '<class>_f1'
            and '<class>_support'.
        """
        # Encode all true labels in one call
        y_true = self.le.transform([video_data[vid]['label'] for vid in video_indices])

        # Predict every video by temporal voting.
        # NOTE(review): the predicted value is a column index of the model's
        # probability output; it equals the encoded label only when the model
        # was trained on every class known to self.le - confirm for callers.
        y_pred = np.array([
            self.temporal_voting_predict(model, video_data[vid]['features'])[0]
            for vid in video_indices
        ])

        # Fix the class axis explicitly so the confusion matrix always has one
        # row/column per known class and per-class arrays line up with
        # self.le.classes_ even if a class is absent from these videos.
        class_ids = np.arange(len(self.le.classes_))

        # Calculate all metrics
        metrics = {
            'weighted_f1': f1_score(y_true, y_pred, average='weighted'),
            'macro_f1': f1_score(y_true, y_pred, average='macro'),
            'accuracy': accuracy_score(y_true, y_pred),
            # Convert to list for JSON serialization
            'confusion_matrix': confusion_matrix(y_true, y_pred, labels=class_ids).tolist()
        }

        # Calculate per-class metrics
        precision, recall, f1, support = precision_recall_fscore_support(
            y_true, y_pred, labels=class_ids
        )

        for i, class_name in enumerate(self.le.classes_):
            metrics[f'{class_name}_precision'] = precision[i]
            metrics[f'{class_name}_recall'] = recall[i]
            metrics[f'{class_name}_f1'] = f1[i]
            metrics[f'{class_name}_support'] = int(support[i])

        return metrics

    def plot_confusion_matrix(self, cm: np.ndarray, model_name: str, seed: int):
        """
        Draw one confusion matrix as an annotated heatmap and save it as
        ``<model_name>_confusion_matrix_seed_<seed>.png`` in ``self.output_dir``.

        Args:
            cm: Confusion matrix with integer counts (cells are formatted with 'd')
            model_name: Model name used in the title and the file name
            seed: Seed used in the title and the file name
        """
        class_names = self.le.classes_

        _, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names,
                    yticklabels=class_names,
                    ax=ax)
        ax.set_title(f'Temporal Voting Confusion Matrix\n{model_name} - Seed {seed}')
        ax.set_ylabel('True Label')
        ax.set_xlabel('Predicted Label')

        file_name = f'{model_name}_confusion_matrix_seed_{seed}.png'
        plt.savefig(os.path.join(self.output_dir, file_name))
        plt.close()

    def plot_averaged_confusion_matrices(self, results: Dict):
        """
        Plot the seed-averaged confusion matrix of each model side by side,
        annotating every cell with ``mean±std``. A model without any stored
        confusion matrix is skipped with a warning. The figure is saved as
        ``averaged_confusion_matrices.png`` in ``self.output_dir``.

        Args:
            results: ``results[model]`` maps seeds to metric dictionaries; only
                dictionary entries containing 'confusion_matrix' are used
        """
        plt.figure(figsize=(20, 8))

        for position, model_name in enumerate(['SVM', 'LogisticRegression'], start=1):
            # Keep only entries that actually carry a confusion matrix
            per_seed_cms = [
                np.array(entry['confusion_matrix'])
                for entry in results[model_name].values()
                if isinstance(entry, dict) and 'confusion_matrix' in entry
            ]

            if not per_seed_cms:
                logging.warning(f"No confusion matrices found for {model_name}")
                continue

            # Element-wise mean and spread across seeds
            mean_cm = np.mean(per_seed_cms, axis=0)
            spread_cm = np.std(per_seed_cms, axis=0)

            ax = plt.subplot(1, 2, position)

            # Pre-formatted cell labels (hence fmt='' below)
            cell_text = np.array([
                [f'{mean:.1f}±{spread:.1f}' for mean, spread in zip(mean_row, spread_row)]
                for mean_row, spread_row in zip(mean_cm, spread_cm)
            ])

            sns.heatmap(
                mean_cm,
                annot=cell_text,
                fmt='',
                cmap='Blues',
                xticklabels=self.le.classes_,
                yticklabels=self.le.classes_,
                ax=ax
            )

            ax.set_title(f'Average Confusion Matrix - {model_name}\n(across {len(per_seed_cms)} seeds)')
            ax.set_ylabel('True Label')
            ax.set_xlabel('Predicted Label')

        plt.tight_layout()
        target_path = os.path.join(self.output_dir, 'averaged_confusion_matrices.png')
        plt.savefig(target_path, dpi=300, bbox_inches='tight')
        plt.close()


    def calculate_learning_curves(self, video_data: Dict, seeds: List[int],
                            model_type: str, params_by_seed: Dict,
                            train_sizes: np.ndarray = None) -> Dict:
        """
        Calculate learning curves for temporal voting across multiple seeds

        For every seed the videos are split with ``prepare_data_split``. For each
        requested fraction a class-proportional subset of the training videos
        (middle-frame features) is used to fit a model, which is then scored
        with temporal voting (macro F1) on the seed's test videos. Every class
        keeps at least one training sample so that the fitted model always
        knows all classes.

        Args:
            video_data: Dictionary of video data
            seeds: List of random seeds (must not be empty)
            model_type: Type of model ('SVM' or 'LogisticRegression')
            params_by_seed: Dictionary of best parameters for each seed
            train_sizes: Training set proportions (array or list, 0 to 1).
                Defaults to 15 points between 0.05 and 1.0.

        Returns:
            Dictionary with 'train_sizes' (absolute video counts, numpy array),
            'test_scores_mean' and 'test_scores_std' (lists, one entry per size)

        Raises:
            ValueError: If ``seeds`` is empty.
        """
        # Without a seed there is no split, hence nothing to aggregate.
        if not seeds:
            raise ValueError("seeds must contain at least one random seed")

        if train_sizes is None:
            train_sizes = np.linspace(0.05, 1.0, 15)
        # Accept plain lists too: list * int would repeat the list instead of scaling it.
        train_sizes = np.asarray(train_sizes, dtype=float)

        # One score list per size position (avoids float dictionary keys)
        scores_by_size = [[] for _ in train_sizes]

        # Middle-frame features, rows ordered like video_data
        X, _ = self.prepare_central_frame_data(video_data)
        # Row lookup built once instead of list.index() per video (O(n) vs O(n^2))
        row_of_video = {vid: row for row, vid in enumerate(video_data)}

        n_train_total = 0
        for seed in seeds:
            logging.info(f"Calculating learning curve for seed {seed}")

            # Split data consistently for this seed
            train_videos, test_videos = self.prepare_data_split(video_data, seed)

            X_train_full = X[[row_of_video[vid] for vid in train_videos]]
            y_train_full = np.asarray(
                self.le.transform([video_data[vid]['label'] for vid in train_videos])
            )
            n_train_total = len(y_train_full)

            # Class statistics do not depend on the training size
            unique_labels = np.unique(y_train_full)
            total_per_class = {label: np.sum(y_train_full == label) for label in unique_labels}

            for size_idx, train_size in enumerate(train_sizes):
                n_samples = int(len(train_videos) * train_size)

                # Proportional share per class, but never below one sample:
                # a class missing from the training subset would shift the
                # column indices that temporal voting returns as predictions.
                target_per_class = {
                    label: max(1, int(n_samples * (count / n_train_total)))
                    for label, count in total_per_class.items()
                }

                # Hand samples lost to rounding to the largest classes first
                remaining = n_samples - sum(target_per_class.values())
                if remaining > 0:
                    for label in sorted(unique_labels,
                                        key=lambda x: total_per_class[x],
                                        reverse=True):
                        if remaining <= 0:
                            break
                        target_per_class[label] += 1
                        remaining -= 1

                # Take the first target_per_class[label] rows of every class
                subset_indices = []
                for label in unique_labels:
                    label_indices = np.where(y_train_full == label)[0]
                    subset_indices.extend(label_indices[:target_per_class[label]])

                # Sorted to keep the original row order; explicit int dtype so
                # an empty selection is still a valid index array
                subset_indices = np.array(sorted(subset_indices), dtype=int)

                model = self.train_model(
                    model_type,
                    params_by_seed[seed],
                    X_train_full[subset_indices],
                    y_train_full[subset_indices]
                )

                # Evaluate on test set using temporal voting
                metrics = self.evaluate_temporal_voting(model, video_data, test_videos)
                scores_by_size[size_idx].append(metrics['macro_f1'])

        # Aggregate once, after all seeds have been processed
        learning_curve_data = {
            'train_sizes': train_sizes * n_train_total,
            'test_scores_mean': [np.mean(per_size) for per_size in scores_by_size],
            'test_scores_std': [np.std(per_size) for per_size in scores_by_size]
        }

        return learning_curve_data


    def plot_learning_curves(self, learning_curves: Dict):
        """
        Plot the mean test macro F1 of both models against the training-set
        size with a shaded ±1 std band, annotate the largest training size as
        'Total videos' and save the figure as ``learning_curves.png`` in
        ``self.output_dir``.

        Args:
            learning_curves: Dictionary containing learning curve data for both models
        """
        palette = {
            'SVM': 'blue',
            'LogisticRegression': 'red'
        }

        plt.figure(figsize=(12, 8))
        ax = plt.gca()

        for model_name, line_color in palette.items():
            curve = learning_curves[model_name]
            sizes = curve['train_sizes']
            mean_scores = np.array(curve['test_scores_mean'])
            std_scores = np.array(curve['test_scores_std'])

            # Mean test score
            ax.plot(sizes, curve['test_scores_mean'],
                    label=model_name,
                    color=line_color,
                    linewidth=2)

            # ±1 standard deviation band
            ax.fill_between(sizes,
                            mean_scores - std_scores,
                            mean_scores + std_scores,
                            alpha=0.1, color=line_color)

        # Hide the whole frame (all four spines)
        for side in ('top', 'right', 'bottom', 'left'):
            ax.spines[side].set_visible(False)

        ax.set_xlabel('Training Set Size')
        ax.set_ylabel('Macro F1 Score')
        ax.grid(True, alpha=0.3)
        ax.legend(loc='lower right')

        # Largest training size of the last plotted model
        total_samples = int(np.max(sizes))
        ax.text(0.02, 0.98, f'Total videos: {total_samples}',
                transform=ax.transAxes,
                bbox=dict(facecolor='white', alpha=0.8),
                verticalalignment='top')

        plt.tight_layout()
        target_path = os.path.join(self.output_dir, 'learning_curves.png')
        plt.savefig(target_path, dpi=300, bbox_inches='tight')
        plt.close()

    def save_learning_curves_data(self, learning_curves: Dict):
        """
        Write the learning-curve data of all models to
        ``learning_curves_data.json`` inside ``self.output_dir`` so the curves
        can be re-plotted later.

        Args:
            learning_curves: Dictionary containing learning curve data for both
                models; 'train_sizes' must be a numpy array
        """
        serializable = {
            model_name: {
                # numpy array -> list for JSON serialization
                'train_sizes': curve['train_sizes'].tolist(),
                'test_scores_mean': curve['test_scores_mean'],
                'test_scores_std': curve['test_scores_std']
            }
            for model_name, curve in learning_curves.items()
        }

        target_path = os.path.join(self.output_dir, 'learning_curves_data.json')
        with open(target_path, 'w', encoding='utf-8') as f:
            # NumpyEncoder handles any remaining numpy scalars/arrays
            json.dump(serializable, f, indent=2, cls=NumpyEncoder)

def _average_seed_metrics(seed_metrics: list, class_names) -> Dict:
    """Return the mean and standard deviation of every F1 metric across seeds.

    Args:
        seed_metrics: Per-seed metric dictionaries; each must contain
            'weighted_f1', 'macro_f1' and one '<class>_f1' entry per class name.
        class_names: Class names used to build the per-class '<class>_f1' keys.

    Returns:
        Dictionary holding '<metric>' (mean) and '<metric>_std' (standard
        deviation) for the weighted, macro and per-class F1 scores.
    """
    metric_keys = ['weighted_f1', 'macro_f1']
    metric_keys.extend(f'{class_name}_f1' for class_name in class_names)

    avg_metrics = {}
    for key in metric_keys:
        values = [m[key] for m in seed_metrics]
        avg_metrics[key] = np.mean(values)
        avg_metrics[f'{key}_std'] = np.std(values)
    return avg_metrics


def run_temporal_voting_evaluation(data_dir: str, random_search_dir: str) -> Dict:
    """Run complete temporal voting evaluation pipeline

    For every model ('SVM', 'LogisticRegression') and every seed listed in
    'random_seeds.json', the best parameters of that seed are loaded, a model is
    trained on the central frames of the training videos and evaluated with
    temporal voting on the test videos. Per-seed metrics are then averaged,
    confusion matrices and learning curves are produced through the evaluator,
    and the results are written as JSON into ``evaluator.output_dir``.

    Args:
        data_dir: Directory passed to ``evaluator.load_video_features``.
        random_search_dir: Directory containing 'random_seeds.json' and one
            'seed_<seed>/<model>_metrics.json' file per seed and model.

    Returns:
        Dictionary with 'evaluation_results' (per-model, per-seed metrics plus
        'average_metrics') and 'learning_curves' (per-model curve data).

    Raises:
        Exception: Any error outside the per-seed processing is logged and
            re-raised. Errors while processing a single seed are logged and
            that seed is skipped.
    """
    model_names = ['SVM', 'LogisticRegression']

    try:
        # Initialize evaluator
        evaluator = TemporalVotingEvaluator()

        # Load video data
        logging.info("Loading video data...")
        video_data, unique_labels = evaluator.load_video_features(data_dir)
        evaluator.le.fit(unique_labels)

        # Load random seeds information
        seeds_file = os.path.join(random_search_dir, 'random_seeds.json')
        with open(seeds_file, 'r', encoding='utf-8') as f:
            seeds = json.load(f)['generated_seeds']

        logging.info(f"Found {len(seeds)} seeds for evaluation")

        # Prepare central frame data for initial model training
        X, _ = evaluator.prepare_central_frame_data(video_data)

        # Position of every video in video_data, built once instead of scanning
        # the key list for each training video.
        # NOTE(review): like the original index lookup, this assumes the rows of
        # X follow the iteration order of video_data - confirm against
        # prepare_central_frame_data.
        video_index = {vid: idx for idx, vid in enumerate(video_data)}

        # Initialize results containers
        results = {model_name: {} for model_name in model_names}

        # Store parameters by seed for learning curves
        params_by_seed = {model_name: {} for model_name in model_names}

        # First pass: Model evaluation
        for model_name in model_names:
            logging.info(f"\nEvaluating {model_name} across {len(seeds)} seeds")

            for seed in seeds:
                logging.info(f"\nProcessing {model_name} with seed {seed}")

                try:
                    # Load best parameters for this seed
                    metrics_file = os.path.join(random_search_dir, f'seed_{seed}',
                                                f'{model_name}_metrics.json')
                    with open(metrics_file, 'r', encoding='utf-8') as f:
                        params = json.load(f)['best_params']
                    params_by_seed[model_name][seed] = params

                    # Split data using stratified split
                    train_videos, test_videos = evaluator.prepare_data_split(video_data, seed)

                    # Get training data
                    train_indices = [video_index[vid] for vid in train_videos]
                    X_train = X[train_indices]
                    y_train = evaluator.le.transform(
                        [video_data[vid]['label'] for vid in train_videos]
                    )

                    # Train model on central frames
                    model = evaluator.train_model(model_name, params, X_train, y_train)

                    # Evaluate using temporal voting
                    metrics = evaluator.evaluate_temporal_voting(
                        model, video_data, test_videos
                    )

                    # Store results for this seed
                    results[model_name][seed] = metrics

                    # Log seed-specific results
                    logging.info(f"Results for {model_name} (seed {seed}):")
                    logging.info(f"Weighted F1: {metrics['weighted_f1']:.4f}")
                    logging.info(f"Macro F1: {metrics['macro_f1']:.4f}")
                    logging.info("Per-class F1 scores:")
                    for class_name in evaluator.le.classes_:
                        logging.info(f"  {class_name}: {metrics[f'{class_name}_f1']:.4f}")

                except Exception as e:
                    # Best effort per seed: a failing seed must not abort the others
                    logging.error(f"Error processing seed {seed} for {model_name}: {str(e)}")
                    continue

            # Calculate average metrics over the seeds that completed
            seed_metrics = [metrics for metrics in results[model_name].values()
                            if isinstance(metrics, dict)]

            if seed_metrics:
                avg_metrics = _average_seed_metrics(seed_metrics, evaluator.le.classes_)
                results[model_name]['average_metrics'] = avg_metrics

                # Log average results
                logging.info(f"\nAverage metrics for {model_name} across "
                             f"{len(seed_metrics)} seeds:")
                logging.info(f"Weighted F1: {avg_metrics['weighted_f1']:.4f} ± "
                             f"{avg_metrics['weighted_f1_std']:.4f}")
                logging.info(f"Macro F1: {avg_metrics['macro_f1']:.4f} ± "
                             f"{avg_metrics['macro_f1_std']:.4f}")
                logging.info("Per-class F1 scores:")
                for class_name in evaluator.le.classes_:
                    logging.info(f"  {class_name}: {avg_metrics[f'{class_name}_f1']:.4f} ± "
                                 f"{avg_metrics[f'{class_name}_f1_std']:.4f}")

        # Create averaged confusion matrices plot
        evaluator.plot_averaged_confusion_matrices(results)

        # Calculate and plot learning curves
        learning_curves = {}
        for model_name in model_names:
            if params_by_seed[model_name]:  # Only if we have parameters for this model
                learning_curves[model_name] = evaluator.calculate_learning_curves(
                    video_data,
                    seeds,
                    model_name,
                    params_by_seed[model_name]
                )

        if learning_curves:
            evaluator.save_learning_curves_data(learning_curves)
            evaluator.plot_learning_curves(learning_curves)

        # Save all results
        output_files = {
            'temporal_voting_results.json': results,
            'learning_curves.json': learning_curves
        }

        for filename, data in output_files.items():
            filepath = os.path.join(evaluator.output_dir, filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, cls=NumpyEncoder, indent=4)

        logging.info(f"All results saved to {evaluator.output_dir}")

        return {
            'evaluation_results': results,
            'learning_curves': learning_curves
        }

    except Exception as e:
        logging.error(f"Error in temporal voting evaluation: {str(e)}")
        raise

# Example usage
if __name__ == "__main__":
    # Folder holding the NPZ feature files
    data_dir = "/path/to/Feature Extraction/ViT-SO400M-14-SigLIP"

    # Folder holding the random search results
    grid_search_dir = "/path/to/Single Frame/model_optimization_20250206_115650_multiseed"

    # Launch the temporal voting evaluation
    results = run_temporal_voting_evaluation(
        data_dir=data_dir,
        random_search_dir=grid_search_dir,
    )

