In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix
import os
import zipfile
from datetime import datetime
import time

################### 1 SHOT #########
# Dataset class
class EmbeddingDataset(Dataset):
    """Dataset of ``(embedding, class index)`` pairs loaded from a CSV file.

    The CSV is read with a header row; every row must hold exactly two
    columns, in this order: the class name and the embedding serialised as
    text (for example ``[0.12 -0.5 1.3]``). Values may be separated by
    whitespace (including line breaks) and/or commas.

    Args:
        csv_file: Path or buffer accepted by ``pandas.read_csv``.
        num_classes: How many leading entries of ``classes`` receive an
            index in ``class_to_idx``.

    Raises:
        ValueError: If ``num_classes`` is larger than the number of known
            class names.
    """

    def __init__(self, csv_file, num_classes=3):
        self.data = pd.read_csv(csv_file, header=0)
        self.num_classes = num_classes
        self.classes = ['Pastos', 'planta_daninha', 'planta_toxicas']
        if num_classes > len(self.classes):
            # Fail with an explicit message instead of a bare IndexError
            # coming out of the comprehension below.
            raise ValueError(
                f"num_classes={num_classes} exceeds the {len(self.classes)} "
                f"known classes: {self.classes}"
            )
        self.class_to_idx = {self.classes[i]: i for i in range(num_classes)}

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(embedding, label)`` for row ``idx``.

        Returns:
            A 1-D ``torch.float32`` tensor and the integer class index.

        Raises:
            KeyError: If the row's class name has no entry in
                ``class_to_idx``.
            ValueError: If the embedding text contains a token that is not
                a number.
        """
        label_str, embedding_str = self.data.iloc[idx]
        try:
            label = self.class_to_idx[label_str]
        except KeyError:
            raise KeyError(
                f"Unknown class label {label_str!r} at row {idx}; "
                f"expected one of {list(self.class_to_idx)}"
            ) from None

        # Parse strictly, token by token: a lenient text parser can stop at
        # the first unparseable token (e.g. the "..." of an elided array
        # repr) and hand back a silently shortened vector.
        tokens = embedding_str.strip().strip('[]').replace(',', ' ').split()
        try:
            values = [float(token) for token in tokens]
        except ValueError as err:
            raise ValueError(
                f"Row {idx}: embedding text is not a list of numbers ({err})"
            ) from err

        embedding = torch.tensor(values, dtype=torch.float32)
        return embedding, label

# Model definition
class EmbeddingClassifier(nn.Module):
    """Fully connected classifier mapping an embedding to class logits.

    Layer widths are ``input_dim -> 256 -> 128 -> 64 -> num_classes``.
    ReLU follows each of the first three linear layers; dropout (p=0.3 and
    p=0.2) follows the first and second activations. The output is raw
    logits, with no softmax applied.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        # Sub-modules keep their original names and registration order so
        # parameter initialisation and ``state_dict`` keys stay the same.
        self.fc1 = nn.Linear(input_dim, 256)
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class logits for the batch ``x``."""
        hidden = self.dropout1(self.relu(self.fc1(x)))
        hidden = self.dropout2(self.relu(self.fc2(hidden)))
        hidden = self.relu(self.fc3(hidden))
        return self.fc4(hidden)

def plot_fold_history(fold_histories, output_dir):
    """Draw per-fold loss and accuracy curves and save them as a PNG.

    ``fold_histories`` is a sequence of dicts with the keys ``train_loss``,
    ``val_loss``, ``train_acc`` and ``val_acc``. The figure has two
    side-by-side panels (loss on the left, accuracy on the right) and is
    written to ``<output_dir>/training_history.png``.
    """
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # (axes, train key, validation key, title, y-axis label) for each panel
    panels = (
        (loss_ax, 'train_loss', 'val_loss', 'Model Loss by Fold', 'Loss'),
        (acc_ax, 'train_acc', 'val_acc', 'Model Accuracy by Fold', 'Accuracy'),
    )

    for ax, train_key, val_key, title, y_label in panels:
        for fold_number, history in enumerate(fold_histories, start=1):
            ax.plot(history[train_key], label=f'Train Fold {fold_number}')
            # Validation curves are dashed to tell them apart from training
            ax.plot(history[val_key], label=f'Val Fold {fold_number}', linestyle='--')
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()

    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, 'training_history.png'))
    plt.close(fig)

def plot_fold_metrics(fold_scores, output_dir):
    """Save a bar chart of the final accuracy of each fold.

    Bars are numbered from 1, each is annotated with its value to four
    decimals, and a dashed red line marks the mean. The chart is written to
    ``<output_dir>/fold_metrics.png``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    fold_numbers = range(1, len(fold_scores) + 1)

    ax.bar(fold_numbers, fold_scores)
    ax.axhline(y=np.mean(fold_scores), color='r', linestyle='--', label='Mean')

    ax.set_title('Accuracy by Fold')
    ax.set_xlabel('Fold Number')
    ax.set_ylabel('Accuracy')
    ax.set_ylim([0, 1])

    # Write each score just above the top of its bar
    for fold_number, score in zip(fold_numbers, fold_scores):
        ax.text(fold_number, score + 0.01, f'{score:.4f}', ha='center')

    ax.legend()
    fig.savefig(os.path.join(output_dir, 'fold_metrics.png'))
    plt.close(fig)

def plot_confusion_matrix(y_true, y_pred, classes, output_dir):
    """Plot the confusion matrix as a heatmap and save it as a PNG.

    Args:
        y_true: True labels, one per sample.
        y_pred: Predicted labels, aligned with ``y_true``.
        classes: Class names used as tick labels on both axes.
        output_dir: Directory that receives ``confusion_matrix.png``.
    """
    # When the labels are class indices (0 .. len(classes) - 1), pin the
    # label set explicitly. Otherwise a class that never occurs in either
    # list is dropped from the matrix and the tick labels no longer line up
    # with its rows and columns. Other label types keep the inferred set.
    class_ids = list(range(len(classes)))
    observed = set(y_true) | set(y_pred)
    labels = class_ids if observed <= set(class_ids) else None

    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=classes, yticklabels=classes)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
    plt.close()

def save_results(fold_scores, classification_rep, fold_times, total_time, output_dir):
    """Write the cross-validation summary and classification report to disk.

    Two files are created in ``output_dir``:

    * ``fold_accuracies.txt``: accuracy and training time of each fold,
      followed by the mean and standard deviation of the accuracies, the
      average time per fold and the total training time.
    * ``classification_report.txt``: ``classification_rep`` verbatim.
    """
    accuracies_path = os.path.join(output_dir, 'fold_accuracies.txt')
    report_path = os.path.join(output_dir, 'classification_report.txt')

    with open(accuracies_path, 'w') as out:
        out.write("Results by Fold:\n")
        per_fold = zip(fold_scores, fold_times)
        for fold_number, (score, time_taken) in enumerate(per_fold, start=1):
            out.write(
                f"Fold {fold_number}:\n"
                f"  Accuracy: {score:.4f}\n"
                f"  Training time: {time_taken:.2f} seconds ({time_taken/60:.2f} minutes)\n"
            )

        mean_time = np.mean(fold_times)
        out.write(
            f"\nMean Accuracy: {np.mean(fold_scores):.4f}"
            f"\nAccuracy Standard Deviation: {np.std(fold_scores):.4f}"
            f"\n\nAverage time per fold: {mean_time:.2f} seconds ({mean_time/60:.2f} minutes)"
            f"\nTotal training time: {total_time:.2f} seconds ({total_time/60:.2f} minutes)"
        )

    with open(report_path, 'w') as out:
        out.write(classification_rep)

def create_zip_archive(output_dir, zip_filename):
    """Bundle every file under ``output_dir`` into the zip ``zip_filename``.

    The directory tree is walked recursively. Archive member names are the
    file paths relative to ``output_dir``, so the archive does not contain
    the ``output_dir`` prefix itself.
    """
    # Lazy generator: the directory walk only starts once the archive is open.
    file_paths = (
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(output_dir)
        for filename in filenames
    )
    with zipfile.ZipFile(zip_filename, 'w') as archive:
        for path in file_paths:
            archive.write(path, os.path.relpath(path, output_dir))

def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``: train if ``optimizer`` is given, else evaluate.

    Returns:
        ``(mean_batch_loss, accuracy)``. The loss is the mean of the
        per-batch loss values; the accuracy is correct predictions divided
        by samples seen. Either value is NaN when the loader yields no
        batches.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    batch_count = 0
    correct = 0
    seen = 0

    # Gradients are only needed when the weights are being updated.
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            batch_count += 1
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            seen += labels.size(0)

    mean_loss = loss_sum / batch_count if batch_count else float('nan')
    accuracy = correct / seen if seen else float('nan')
    return mean_loss, accuracy


def train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=100):
    """Train the model and return training history.

    Each epoch runs one training pass over ``train_loader`` followed by one
    evaluation pass over ``val_loader``. A progress line is printed every
    10th epoch. The model is left in eval mode when the function returns.

    Args:
        model: Module to optimise in place.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device every batch is moved to before the forward pass.
        epochs: Number of epochs to run.

    Returns:
        Dict with the keys ``train_loss``, ``train_acc``, ``val_loss`` and
        ``val_acc``, each a list with one float per epoch. A loader that
        yields no batches contributes NaN entries instead of raising
        ``ZeroDivisionError``.
    """
    history = {
        'train_loss': [], 'train_acc': [],
        'val_loss': [], 'val_acc': []
    }

    for epoch in range(epochs):
        train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
        val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

        # Record history
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], '
                  f'Train Loss: {train_loss:.4f}, '
                  f'Train Acc: {train_acc:.4f}, '
                  f'Val Loss: {val_loss:.4f}, '
                  f'Val Acc: {val_acc:.4f}')

    return history

def _collect_predictions(model, loader, device):
    """Return ``(predictions, true_labels)`` for every sample served by ``loader``.

    The model is switched to eval mode and run without gradient tracking.
    Both results are flat lists of Python ints in the order the loader
    produced the samples.
    """
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for inputs, labels in loader:
            outputs = model(inputs.to(device))
            predictions.extend(outputs.argmax(dim=1).cpu().tolist())
            true_labels.extend(labels.tolist())
    return predictions, true_labels


def main():
    """Run k-fold cross-validation of ``EmbeddingClassifier`` and save the results.

    For each fold a fresh model is trained with ``train_model`` and then
    evaluated on the held-out split. Predictions from all folds are pooled
    for the classification report and confusion matrix. Plots and text
    summaries are written to a timestamped directory, which is finally
    packed into a zip archive of the same name in the working directory.
    """
    # Parameters
    csv_file = '../../embeddings_with_augmentation.csv'
    k_folds = 10
    epochs = 100
    batch_size = 32
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Create output directory (timestamped to keep separate runs apart)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f'resultados_kfold_{timestamp}'
    os.makedirs(output_dir, exist_ok=True)

    # Load dataset; the input width is taken from the first embedding
    print("Loading data...")
    dataset = EmbeddingDataset(csv_file)
    input_dim = dataset[0][0].shape[0]

    # Initialize K-Fold (fixed seed so the split itself is reproducible)
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

    # Storage for metrics
    fold_histories = []
    fold_scores = []
    fold_times = []
    all_predictions = []
    all_true_labels = []

    # Record total training time
    total_start_time = time.time()

    # K-Fold Cross Validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(range(len(dataset)))):
        print(f"\nFold {fold + 1}/{k_folds}")
        print("-" * 20)

        # Create data loaders for this fold
        train_sampler = SubsetRandomSampler(train_idx)
        val_sampler = SubsetRandomSampler(val_idx)

        train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
        val_loader = DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)

        # Initialize model, criterion, and optimizer from scratch so no
        # weights or optimizer state carry over between folds
        model = EmbeddingClassifier(input_dim=input_dim, num_classes=len(dataset.classes)).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        # Train the model (only the training call is timed)
        fold_start_time = time.time()
        history = train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs)
        fold_time = time.time() - fold_start_time

        # Evaluate the final-epoch model on the held-out split
        fold_predictions, fold_true_labels = _collect_predictions(model, val_loader, device)
        matches = sum(pred == true for pred, true in zip(fold_predictions, fold_true_labels))

        # Record metrics
        fold_acc = matches / len(fold_true_labels)
        print(f"Fold {fold + 1} - Accuracy: {fold_acc:.4f}")
        print(f"Training time: {fold_time:.2f} seconds ({fold_time/60:.2f} minutes)")

        fold_histories.append(history)
        fold_scores.append(fold_acc)
        fold_times.append(fold_time)
        all_predictions.extend(fold_predictions)
        all_true_labels.extend(fold_true_labels)

    # Calculate total training time
    total_time = time.time() - total_start_time

    # Generate and save final results
    print("\nGenerating reports and visualizations...")

    # Classification Report. The label ids are passed explicitly: without
    # them the report raises when a class is absent from both the pooled
    # labels and predictions, because the inferred label set no longer
    # matches the length of ``target_names``.
    class_ids = list(range(len(dataset.classes)))
    report = classification_report(all_true_labels, all_predictions,
                                   labels=class_ids,
                                   target_names=dataset.classes, digits=4)
    print("\nClassification Report:")
    print(report)

    # Generate all visualizations and save results
    plot_fold_history(fold_histories, output_dir)
    plot_fold_metrics(fold_scores, output_dir)
    plot_confusion_matrix(all_true_labels, all_predictions, dataset.classes, output_dir)
    save_results(fold_scores, report, fold_times, total_time, output_dir)

    # Create zip file with all results
    zip_filename = f'resultados_kfold_{timestamp}.zip'
    create_zip_archive(output_dir, zip_filename)

    print("\nCross Validation Results:")
    print(f"Mean accuracy: {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})")
    print(f"Average time per fold: {np.mean(fold_times):.2f} seconds ({np.mean(fold_times)/60:.2f} minutes)")
    print(f"Total training time: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")
    print(f"\nAll results have been saved to '{zip_filename}'")

# Run the cross-validation pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()