In [None]:
import numpy as np
import warnings
import sys
from pathlib import Path
from PIL import Image
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
from tqdm import tqdm
from typing import Tuple, List, Dict

# --- EXPERIMENT CONFIGURATION ---
# Seed passed as `random_state` to both classifiers (RandomForestClassifier and SVC).
RANDOM_STATE = 42

# PATH CONFIGURATION (Relative Path)
# Root folder holding one sub-folder per carrier. Because the path is relative,
# it is resolved against the current working directory at run time.
# NOTE(review): the stated assumption is that the notebook runs from the project
# root or a 'notebooks' folder -- confirm the path actually resolves from both.
BASE_DATA_DIR = Path("data/experimento_baselines") 

# Carrier sub-folders of BASE_DATA_DIR, processed in this order. The text after
# the first underscore (e.g. "C1_675") is used as the column header in the
# final results table.
CARRIER_NAMES = [
    "Carrier_C1_675",
    "Carrier_C2_2825",
    "Carrier_C3_2975",
    "Carrier_C4_9435"
]

# Names of the per-carrier sub-folders holding the training and test images.
# NOTE(review): the "70" / "15" suffixes presumably denote split percentages -- confirm.
TRAIN_FOLDER_NAME = "train_set_70"
TEST_FOLDER_NAME = "test_set_15"
# Target size handed to PIL's Image.resize; flattening a resized image yields
# 100 * 100 = 10000 features.
IMAGE_SIZE = (100, 100)

# Suppress convergence warnings for cleaner output
warnings.filterwarnings("ignore", category=ConvergenceWarning)

def load_and_flatten_images(base_dir: Path) -> Tuple[np.ndarray, np.ndarray, List[str]]:
    """
    Load a class-per-folder image dataset as flat grayscale feature vectors.

    Every immediate sub-directory of ``base_dir`` is treated as one class,
    except a folder named ``SIN_CLASIFICAR`` (compared case-insensitively).
    Each ``.png`` / ``.jpg`` / ``.jpeg`` file (extension compared
    case-insensitively) is converted to grayscale, resized to ``IMAGE_SIZE``
    and flattened to a 1D vector.

    Files are visited in sorted path order so the sample order, and therefore
    any model trained on it with a fixed seed, is reproducible across
    filesystems.

    Args:
        base_dir: Directory containing one sub-directory per class.

    Returns:
        A tuple ``(X, y, classes)``:

        * ``X`` -- array with one flattened image per row, or an empty 1D
          array when nothing could be loaded.
        * ``y`` -- array of class names aligned with the rows of ``X``.
        * ``classes`` -- sorted list of detected class folder names (classes
          without images are still listed); empty when ``base_dir`` is not a
          directory or has no class folders.

    Unreadable images are reported on stdout and skipped; they never abort
    the load.
    """
    X_data, y_data = [], []

    # is_dir() rather than exists(): a regular file passed by mistake must take
    # the same reported-error path instead of crashing in iterdir() below.
    if not base_dir.is_dir():
        print(f"[ERROR] Path not found: {base_dir}")
        return np.array([]), np.array([]), []

    # Get class names, ignoring specific non-class folders
    classes = sorted(
        d.name for d in base_dir.iterdir()
        if d.is_dir() and d.name.upper() != "SIN_CLASIFICAR"
    )

    if not classes:
        print(f"[ERROR] No class directories found in {base_dir}")
        return np.array([]), np.array([]), []

    print(f"   > Loading data from: {base_dir.name}")
    print(f"   > Detected classes: {classes}")

    image_suffixes = {".png", ".jpg", ".jpeg"}

    for class_name in classes:
        class_dir = base_dir / class_name
        # Filter on the lower-cased suffix so "IMG.PNG" is picked up on
        # case-sensitive filesystems too, and sort because directory listing
        # order is arbitrary.
        image_paths = sorted(
            p for p in class_dir.iterdir()
            if p.is_file() and p.suffix.lower() in image_suffixes
        )

        if not image_paths:
            print(f"[WARN] Class '{class_name}' is empty.")
            continue

        for img_path in tqdm(image_paths, desc=f"Reading {class_name}", leave=False, ncols=80):
            try:
                with Image.open(img_path) as img:
                    # Grayscale ('L') -> fixed size -> 1D vector
                    gray = img.convert('L').resize(IMAGE_SIZE)
                    img_vector = np.asarray(gray).flatten()
            except Exception as e:
                # Deliberate best-effort: one corrupt file must not abort the run.
                print(f"[ERROR] Failed to read {img_path.name}: {e}")
            else:
                X_data.append(img_vector)
                y_data.append(class_name)

    return np.array(X_data), np.array(y_data), classes

def run_single_carrier(train_dir: Path, test_dir: Path) -> Dict[str, float]:
    """
    Benchmark a Random Forest and an RBF-kernel SVM on one carrier.

    Both image sets are loaded with ``load_and_flatten_images``, the features
    are standardized with statistics fitted on the training set only, and
    each model is scored on the test set with macro-averaged F1.

    Args:
        train_dir: Folder with the training images (one sub-folder per class).
        test_dir: Folder with the test images, in the same layout.

    Returns:
        ``{"rf_f1": <float>, "svm_f1": <float>}``. If either set loads empty,
        both scores are reported as ``0.0`` and no model is trained.
    """
    train_features, train_labels, _ = load_and_flatten_images(train_dir)
    test_features, test_labels, _ = load_and_flatten_images(test_dir)

    # Guard clause: nothing to train on or nothing to evaluate against.
    if not (len(train_features) and len(test_features)):
        print("[ERROR] Missing training or testing data. Skipping carrier.")
        return {"rf_f1": 0.0, "svm_f1": 0.0}

    print("   > Standardizing features (StandardScaler)...")
    scaler = StandardScaler()
    # Fit on the training set only; the test set reuses those statistics.
    train_scaled = scaler.fit_transform(train_features)
    test_scaled = scaler.transform(test_features)

    def macro_f1(model) -> float:
        """Fit ``model`` on the scaled training set and return test macro-F1."""
        model.fit(train_scaled, train_labels)
        predictions = model.predict(test_scaled)
        return f1_score(test_labels, predictions, average='macro', zero_division=0)

    print("   > Training Random Forest...")
    forest_score = macro_f1(
        RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
    )
    print(f"     Result: RF F1-Macro: {forest_score:.4f}")

    print("   > Training SVM (RBF Kernel)...")
    svm_score = macro_f1(
        SVC(kernel='rbf', C=1.0, gamma='scale', random_state=RANDOM_STATE)
    )
    print(f"     Result: SVM F1-Macro: {svm_score:.4f}")

    return {"rf_f1": forest_score, "svm_f1": svm_score}

def main_experiment_loop():
    """
    Run the baseline benchmark for every carrier and print a Markdown table.

    For each entry of ``CARRIER_NAMES`` the train/test folders under
    ``BASE_DATA_DIR`` are passed to ``run_single_carrier``. The resulting
    F1-Macro scores are printed as one table row per model, with one column
    per carrier plus a final average column. Returns early, after printing a
    message, when ``BASE_DATA_DIR`` does not exist.

    Note: a carrier skipped by ``run_single_carrier`` contributes its ``0.0``
    placeholder scores to the table and to the averages.
    """
    thin_rule = "-" * 60
    print(thin_rule)
    print("STARTING EXPERIMENT 3.1: CLASSICAL BASELINES (SVM & RF)")
    print(f"Base Data Path: {BASE_DATA_DIR}")
    print(thin_rule)

    if not BASE_DATA_DIR.exists():
        print(f"[CRITICAL] Data directory not found at: {BASE_DATA_DIR}")
        print("Please ensure your data folders are correctly placed.")
        return

    svm_label = "SVM (RBF Kernel)"
    rf_label = "Random Forest (100 est.)"
    # Insertion order (SVM first, then RF) is also the row order of the table.
    scores_by_model = {svm_label: [], rf_label: []}

    for carrier_name in CARRIER_NAMES:
        print(f"\nProcessing: {carrier_name}")

        carrier_root = BASE_DATA_DIR / carrier_name
        carrier_scores = run_single_carrier(
            carrier_root / TRAIN_FOLDER_NAME,
            carrier_root / TEST_FOLDER_NAME,
        )

        scores_by_model[svm_label].append(carrier_scores["svm_f1"])
        scores_by_model[rf_label].append(carrier_scores["rf_f1"])

    # --- Generate Final Table ---
    thick_rule = "=" * 60
    print("\n" + thick_rule)
    print("EXPERIMENTS COMPLETED. GENERATING TABLE 10...")
    print(thick_rule)

    averages = {
        label: np.mean(values) for label, values in scores_by_model.items()
    }

    # Column titles: carrier name without its leading "<prefix>_" part
    # (e.g. "Carrier_C1_675" -> "C1_675").
    column_titles = [name.split('_', 1)[1] for name in CARRIER_NAMES]
    header = "| Model |" + "".join(f" {title} |" for title in column_titles) + " Average |"
    # One centered column per carrier plus one for the average.
    divider = "| :--- |" + " :---: |" * (len(column_titles) + 1)
    print(header)
    print(divider)

    def table_row(label, values, average):
        """Render one Markdown row: model label, per-carrier scores, bold average."""
        cells = "".join(f" {value:.2f} |" for value in values)
        return f"| {label} |" + cells + f" **{average:.2f}** |"

    for label in (svm_label, rf_label):
        print(table_row(label, scores_by_model[label], averages[label]))

# Run the full benchmark only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main_experiment_loop()