# In [None]:
import os
import numpy as np
import pandas as pd
import time
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, top_k_accuracy_score
import json
from sklearn.model_selection import StratifiedShuffleSplit
import argparse

def save_csv_for_iteration(temp_dir, model_dir, split_percentage, iteration, df):
    """Write the results table for one iteration into the temp folder.

    The file is named ``<model_dir>_<split_percentage>_<iteration>.csv`` and is
    written without the DataFrame index.

    Args:
        temp_dir: Directory that receives the checkpoint file.
        model_dir: Model name used as the file-name prefix.
        split_percentage: Training split percentage embedded in the file name.
        iteration: Iteration number embedded in the file name.
        df: DataFrame to serialise.
    """
    checkpoint_name = f'{model_dir}_{split_percentage}_{iteration}.csv'
    df.to_csv(os.path.join(temp_dir, checkpoint_name), index=False)

def find_last_iteration(temp_dir, model_dir, split_percentage, max_iterations=10):
    """Locate the newest per-iteration checkpoint for a model/split pair.

    Checkpoints are files named ``<model_dir>_<split_percentage>_<iteration>.csv``
    inside ``temp_dir``. Iteration numbers are probed from ``max_iterations``
    down to 1 so the highest existing one wins.

    Args:
        temp_dir: Directory holding the checkpoint files.
        model_dir: Model name used as the file-name prefix.
        split_percentage: Training split percentage embedded in the file name.
        max_iterations: Highest iteration number to probe (defaults to 10).

    Returns:
        A tuple ``(iteration, dataframe)``: the highest iteration number whose
        checkpoint exists together with its parsed contents, or
        ``(0, empty DataFrame)`` when no checkpoint is found.
    """
    for iteration in range(max_iterations, 0, -1):
        csv_file = os.path.join(temp_dir, f'{model_dir}_{split_percentage}_{iteration}.csv')
        # isfile (not exists) so a same-named directory is never handed to read_csv.
        if os.path.isfile(csv_file):
            return iteration, pd.read_csv(csv_file)
    return 0, pd.DataFrame()

def calculate_mean_std(df):
    """Summarise per-iteration metrics as a single row of means and std-devs.

    For every known metric column present in ``df`` the result carries a
    ``Mean <metric>`` and a ``Std <metric>`` column, each rounded to two
    decimals. Metric columns missing from ``df`` are skipped.

    Args:
        df: DataFrame with one row per iteration.

    Returns:
        A one-row DataFrame holding the aggregated values.
    """
    tracked_metrics = (
        'Accuracy', 'Top-1%', 'Top-3%', 'Top-5%',
        'Precision', 'Recall', 'F1 Score', 'Time Taken (s)',
    )
    summary = {}
    for metric in tracked_metrics:
        if metric not in df:
            continue
        values = df[metric]
        summary[f'Mean {metric}'] = round(values.mean(), 2)
        summary[f'Std {metric}'] = round(values.std(), 2)
    return pd.DataFrame([summary])

def save_final_csv_with_aggregates(results_dir, model_dir, split_percentage, final_results_df):
    """Aggregate the per-iteration results and store them as the final CSV.

    The mean/std summary produced by ``calculate_mean_std`` is written to
    ``<results_dir>/<model_dir>_<split_percentage>_10.csv`` without an index,
    and the destination path is printed.

    Args:
        results_dir: Directory that receives the final CSV.
        model_dir: Model name used as the file-name prefix.
        split_percentage: Training split percentage embedded in the file name.
        final_results_df: DataFrame with one row per completed iteration.
    """
    destination = os.path.join(results_dir, f'{model_dir}_{split_percentage}_10.csv')
    calculate_mean_std(final_results_df).to_csv(destination, index=False)
    print(f"Final aggregated results saved to {destination}")

def classify_and_save_results(encodings_dir, temp_dir, results_dir, model_dir, split_percentage):
    """Evaluate a linear SVM on repeated stratified splits of one model's encodings.

    Loads ``encoded_images.npy`` and ``labels.npy`` from
    ``<encodings_dir>/<model_dir>``, runs up to three stratified shuffle splits
    (fixed ``random_state=42``), and records accuracy, weighted
    precision/recall/F1, fit-and-predict time and, for more than two classes,
    top-k accuracy. The accumulated table is checkpointed to ``temp_dir``
    after every split; once all splits are done the mean/std summary is
    written to ``results_dir``.

    If checkpoints from an interrupted run exist, the splits they already
    cover are skipped so the run continues where it stopped.

    Args:
        encodings_dir: Root directory containing one sub-directory per model.
        temp_dir: Directory for per-iteration checkpoint CSVs.
        results_dir: Directory for the final aggregated CSV.
        model_dir: Name of the model sub-directory to evaluate.
        split_percentage: Percentage of samples used for training (e.g. 80).
    """
    n_splits = 3

    features_path = os.path.join(encodings_dir, model_dir, 'encoded_images.npy')
    labels_path = os.path.join(encodings_dir, model_dir, 'labels.npy')
    # NOTE(review): allow_pickle=True will unpickle arbitrary objects stored in
    # the .npy files; only load encodings produced by a trusted pipeline.
    features = np.load(features_path, allow_pickle=True)
    labels = np.load(labels_path, allow_pickle=True)

    # Loop-invariant: the label set does not change between splits.
    class_labels = np.unique(labels)
    n_classes = len(class_labels)

    start_iteration, final_results_df = find_last_iteration(temp_dir, model_dir, split_percentage)

    # Derive the test fraction from integers first: 1 - 70 / 100 evaluates to
    # 0.30000000000000004, which can push the rounded-up test size one sample
    # too high.
    stratified_split = StratifiedShuffleSplit(
        n_splits=n_splits,
        test_size=(100 - split_percentage) / 100,
        random_state=42,
    )

    for split_index, (train_idx, test_idx) in enumerate(stratified_split.split(features, labels)):
        # The seeded splitter always yields the same sequence, so splits that
        # a previous (interrupted) run already checkpointed are skipped rather
        # than evaluated again under a new iteration number.
        if split_index < start_iteration:
            continue
        iteration = split_index + 1
        print(f"Iteration {iteration}/{n_splits}")

        # Fit the scaler on the training fold only so that test-set statistics
        # never leak into the features the classifier is trained on.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(features[train_idx])
        X_test = scaler.transform(features[test_idx])
        y_train, y_test = labels[train_idx], labels[test_idx]

        svm = SVC(kernel='linear', probability=True)
        start_time = time.perf_counter()
        svm.fit(X_train, y_train)
        y_pred = svm.predict(X_test)
        y_prob = svm.predict_proba(X_test)
        # Stop the clock before computing metrics so only fit + predict is timed.
        elapsed = time.perf_counter() - start_time

        results = {
            'Iteration': iteration,
            'Model': model_dir,
            'Split Percentage': split_percentage,
            'Accuracy': round(accuracy_score(y_test, y_pred) * 100, 2),
            'Precision': round(precision_score(y_test, y_pred, average='weighted') * 100, 2),
            'Recall': round(recall_score(y_test, y_pred, average='weighted') * 100, 2),
            'F1 Score': round(f1_score(y_test, y_pred, average='weighted') * 100, 2),
            'Time Taken (s)': elapsed,
        }

        # Add Top-K metrics only for multi-class classification; a top-k score
        # is reported only when there are more than k classes.
        if n_classes > 2:
            results['Top-1%'] = round(
                top_k_accuracy_score(y_test, y_prob, k=1, labels=class_labels) * 100, 2)
            results['Top-3%'] = round(
                top_k_accuracy_score(y_test, y_prob, k=3, labels=class_labels) * 100, 2
            ) if n_classes > 3 else None
            results['Top-5%'] = round(
                top_k_accuracy_score(y_test, y_prob, k=5, labels=class_labels) * 100, 2
            ) if n_classes > 5 else None

        row_df = pd.DataFrame([results])
        # Avoid concatenating onto an empty frame for the very first row.
        if final_results_df.empty:
            final_results_df = row_df
        else:
            final_results_df = pd.concat([final_results_df, row_df], ignore_index=True)
        save_csv_for_iteration(temp_dir, model_dir, split_percentage, iteration, final_results_df)

    save_final_csv_with_aggregates(results_dir, model_dir, split_percentage, final_results_df)

def cleanup_temp_folder(temp_dir):
    """Remove every entry in ``temp_dir`` whose name ends with ``.csv``.

    Other entries are left untouched. A confirmation message is printed once
    the removals are done.

    Args:
        temp_dir: Directory to clean.
    """
    stale_csvs = [name for name in os.listdir(temp_dir) if name.endswith('.csv')]
    for name in stale_csvs:
        os.remove(os.path.join(temp_dir, name))
    print("All temporary CSV files deleted from temp folder.")

def combine_csv_files_for_model(results_dir, model_dir):
    """Merge a model's per-split summary CSVs into one ``<model_dir>.csv``.

    For each split percentage in 70, 80, 90 the file
    ``<results_dir>/<model_dir>_<split>_10.csv`` is read if it exists, a
    leading ``Split`` column holding that percentage is inserted, and the rows
    are stacked in that order. The combined table is written to
    ``<results_dir>/<model_dir>.csv`` (an empty table when no summary file
    exists) and the output path is printed.

    Args:
        results_dir: Directory containing the per-split summaries; also
            receives the combined file.
        model_dir: Model name used as the file-name prefix.
    """
    frames = []
    for split_percentage in [70, 80, 90]:
        csv_file = os.path.join(results_dir, f'{model_dir}_{split_percentage}_10.csv')
        if not os.path.exists(csv_file):
            continue
        df = pd.read_csv(csv_file)
        df.insert(0, 'Split', split_percentage)
        frames.append(df)

    # Concatenate once at the end instead of growing a frame inside the loop,
    # which also avoids concatenating onto an empty seed DataFrame.
    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    combined_csv_file = os.path.join(results_dir, f'{model_dir}.csv')
    combined_df.to_csv(combined_csv_file, index=False)
    print(f"Combined CSV saved as {combined_csv_file}")

def main(proj_dir, encodings_dir, results_dir):
    """Run the SVM evaluation for every model directory and every split.

    Each sub-directory of ``encodings_dir`` is treated as one model. For the
    90/80/70 training splits, a model/split pair is evaluated unless its final
    CSV already exists in ``results_dir``. After all models are processed the
    checkpoint CSVs under ``<proj_dir>/temp_results`` are deleted and one
    combined CSV per model is written.

    Args:
        proj_dir: Project directory; ``temp_results`` is created inside it.
        encodings_dir: Directory with one sub-directory of encodings per model.
        results_dir: Directory for final and combined result CSVs (created if
            missing).
    """
    temp_dir = os.path.join(proj_dir, 'temp_results')
    os.makedirs(temp_dir, exist_ok=True)
    # The final CSVs are written here; create it up front so the first write
    # does not fail on a fresh results location.
    os.makedirs(results_dir, exist_ok=True)

    # Sorted so models are processed in a reproducible order (os.listdir
    # returns entries in arbitrary order).
    model_dirs = sorted(
        d for d in os.listdir(encodings_dir)
        if os.path.isdir(os.path.join(encodings_dir, d))
    )
    for model_dir in model_dirs:
        print(f"Processing model: {model_dir}")
        for split_percentage in [90, 80, 70]:
            final_csv_file = os.path.join(results_dir, f'{model_dir}_{split_percentage}_10.csv')
            if os.path.exists(final_csv_file):
                print(f"Results for {model_dir} with {split_percentage}% split already exist. Skipping.")
                continue
            print(f"Evaluating {split_percentage}-{100 - split_percentage} split")
            classify_and_save_results(encodings_dir, temp_dir, results_dir, model_dir, split_percentage)

    # Checkpoints are removed only after every model has finished, so an
    # interrupted run leaves them in place for the next invocation.
    cleanup_temp_folder(temp_dir)
    for model_dir in model_dirs:
        combine_csv_files_for_model(results_dir, model_dir)
    print("Processing completed.")

if __name__ == "__main__":
    # Command-line entry point: the three directory options are all mandatory
    # string arguments and are forwarded to main() unchanged.
    parser = argparse.ArgumentParser(description='Classify images and save results using SVM.')
    for flag, help_text in (
        ('--proj_dir', 'Project directory path'),
        ('--encodings_dir', 'Directory path for encoded images'),
        ('--results_dir', 'Directory path to save results'),
    ):
        parser.add_argument(flag, type=str, required=True, help=help_text)
    args = parser.parse_args()
    main(args.proj_dir, args.encodings_dir, args.results_dir)

# In [None]:
# Direct invocation without the command line: replace the three placeholder
# paths below with real directories before running.
proj_dir = "path/to/project/directory"
encodings_dir = "path/to/encodings/directory"
results_dir = "path/to/results/directory"

# Run the whole evaluation pipeline on those directories.
main(proj_dir=proj_dir, encodings_dir=encodings_dir, results_dir=results_dir)