<a href="https://colab.research.google.com/github/leonardp315/Aula2/blob/main/LossFunctions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install torch transformers sentence-transformers datasets pandas numpy matplotlib tqdm scikit-learn GPUtil psutil

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (fr

In [None]:
"""
Comparative Evaluation of Sentence-Transformer Models with Different Loss Functions
for Semantic Similarity and Paraphrase Tasks

This script performs a systematic evaluation of different Sentence-Transformer models
combined with various loss functions on textual similarity (STS-B) and paraphrase
detection (MRPC) datasets.
"""

import torch
import torch.nn.functional as F
import random
import numpy as np
import pandas as pd
import time
import os
import json
from datetime import datetime
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from scipy.stats import spearmanr, pearsonr
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Directory configuration for results
RESULTS_DIR = Path("results")
FIGURES_DIR = RESULTS_DIR / "figures"
MODELS_DIR = RESULTS_DIR / "models"

for directory in [RESULTS_DIR, FIGURES_DIR, MODELS_DIR]:
    directory.mkdir(exist_ok=True, parents=True)

# Configuration for reproducibility
def set_seed(seed_value=42):
    """Sets seeds for reproducibility across multiple frameworks."""
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

    # Additional settings for determinism in PyTorch
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    return seed_value

SEED = set_seed(42)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Configuration: Seed={SEED}, Device={DEVICE}")

# Experiment settings
SAMPLE_SIZE = None  # Use None for full dataset or a number for sampling
NUM_EPOCHS = 3
BATCH_SIZE = 16
SAVE_MODELS = True  # Save trained models

# Load and prepare datasets
class DatasetLoader:
    """Manager for loading and preparing textual similarity datasets."""

    def __init__(self, cache_dir=None):
        self.cache_dir = cache_dir

    def load_dataset(self, name, split='train', sample_size=None, random_state=42):
        """
        Loads and prepares popular textual similarity datasets.

        Args:
            name: Dataset name ('stsb' or 'mrpc')
            split: Dataset partition ('train', 'validation', 'test')
            sample_size: Number of examples for sampling (None to use all)
            random_state: Seed for reproducible sampling

        Returns:
            DataFrame with processed data
        """
        if name.lower() == 'stsb':
            return self._load_stsb(split, sample_size, random_state)
        elif name.lower() == 'mrpc':
            return self._load_mrpc(split, sample_size, random_state)
        else:
            raise ValueError(f"Unsupported dataset: {name}. Use 'stsb' or 'mrpc'")

    def _load_stsb(self, split, sample_size, random_state):
        """Loads the STS-B (Semantic Textual Similarity Benchmark) dataset."""
        ds = load_dataset('glue', 'stsb', cache_dir=self.cache_dir)[split]
        df = pd.DataFrame(ds)

        # Label processing
        df['label'] = pd.to_numeric(df['label'], errors='coerce')
        df = df.dropna(subset=['label'])

        # Normalization to [0, 1]
        df['label'] = df['label'] / 5.0
        df['label'] = df['label'].clip(lower=0.0, upper=1.0)

        # Binary label for classification
        df['label_bin'] = (df['label'] > 0.5).astype(int)

        # Dataset statistics
        print(f"\n[STS-B - {split}] Statistics:")
        print(f"- Examples: {len(df)}")
        print(f"- Similarity range: [{df['label'].min():.2f}, {df['label'].max():.2f}]")
        print(f"- Binary distribution: {df['label_bin'].value_counts().to_dict()}")

        # Apply sampling if requested
        if sample_size is not None:
            sample_size = min(sample_size, len(df))
            df = df.sample(n=sample_size, random_state=random_state)
            print(f"- Sample used: {sample_size} examples")

        return df

    def _load_mrpc(self, split, sample_size, random_state):
        """Loads the MRPC (Microsoft Research Paraphrase Corpus) dataset."""
        ds = load_dataset('glue', 'mrpc', cache_dir=self.cache_dir)[split]
        df = pd.DataFrame(ds)

        # Ensure labels are integers
        df['label'] = df['label'].astype(int)
        df['label_bin'] = df['label']

        # Dataset statistics
        print(f"\n[MRPC - {split}] Statistics:")
        print(f"- Examples: {len(df)}")
        print(f"- Distribution: {df['label'].value_counts().to_dict()}")

        # Apply sampling if requested
        if sample_size is not None:
            sample_size = min(sample_size, len(df))
            df = df.sample(n=sample_size, random_state=random_state)
            print(f"- Sample used: {sample_size} examples")

        return df

    def visualize_dataset_distribution(self, df, dataset_name):
        """Generates visualization of data distribution."""
        plt.figure(figsize=(10, 6))

        if dataset_name.lower() == 'stsb':
            sns.histplot(df['label'], bins=20, kde=True)
            plt.title('Similarity Distribution in STS-B')
            plt.xlabel('Normalized Similarity [0,1]')
        else:  # MRPC
            counts = df['label'].value_counts().sort_index()
            sns.barplot(x=counts.index, y=counts.values)
            plt.title('Class Distribution in MRPC')
            plt.xlabel('Class (0=Not Paraphrase, 1=Paraphrase)')
            plt.xticks([0, 1], ['Not Paraphrase', 'Paraphrase'])

        plt.ylabel('Count')
        plt.tight_layout()

        fig_path = FIGURES_DIR / f"{dataset_name}_distribution.png"
        plt.savefig(fig_path, dpi=300, bbox_inches='tight')
        plt.close()

        return fig_path

# Classes for triplet learning
class TripletGenerator:
    """Generator of triplets (anchor, positive, negative) for Triplet Loss."""

    def __init__(self, dataset, fixed_negative=None, hard_negatives=False):
        """
        Initializes the triplet generator.

        Args:
            dataset: DataFrame with sentence pairs
            fixed_negative: Fixed negative sentence (optional)
            hard_negatives: If True, selects hard negatives from dataset
        """
        self.dataset = dataset
        self.fixed_negative = fixed_negative
        self.hard_negatives = hard_negatives

    def generate_triplets(self, n_triplets=None):
        """
        Generates sentence triplets for training.

        Args:
            n_triplets: Number of triplets to generate (default: dataset size)

        Returns:
            List of triplets (anchor, positive, negative)
        """
        if n_triplets is None:
            n_triplets = len(self.dataset)

        triplets = []
        indices = random.sample(range(len(self.dataset)), k=min(n_triplets, len(self.dataset)))

        for i in indices:
            anchor = self.dataset.iloc[i]['sentence1']
            positive = self.dataset.iloc[i]['sentence2']

            if self.fixed_negative:
                negative = self.fixed_negative
            elif self.hard_negatives:
                # Select a different sentence as negative
                neg_idx = random.choice([j for j in range(len(self.dataset)) if j != i])
                negative = random.choice([self.dataset.iloc[neg_idx]['sentence1'],
                                         self.dataset.iloc[neg_idx]['sentence2']])
            else:
                # Use a random sentence as negative
                negative = "This is a negative sentence for the triplet."

            triplets.append((anchor, positive, negative))

        return triplets

class TripletDataset(Dataset):
    """Triplet dataset compatible with PyTorch DataLoader."""

    def __init__(self, triplets):
        self.triplets = triplets

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, idx):
        a, p, n = self.triplets[idx]
        return InputExample(texts=[a, p, n])

# Custom loss functions
class TripletLoss(torch.nn.Module):
    def __init__(self, model, margin=1.0): super().__init__(); self.model = model; self.margin = margin
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        return torch.mean(F.relu(torch.norm(e[0]-e[1], p=2, dim=1) - torch.norm(e[0]-e[2], p=2, dim=1) + self.margin))

class OnlineTripletLoss(TripletLoss): pass
class BatchHardTripletLoss(TripletLoss): pass
class BatchSemiHardTripletLoss(TripletLoss): pass
class BatchAllTripletLoss(TripletLoss): pass

class MSELoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [self.model(f)['sentence_embedding'] for f in sf]; return F.mse_loss(e[0], e[1])

class EuclideanLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        return torch.mean(torch.norm(e[0] - e[1], p=2, dim=1))

class NormalizedEuclideanLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        distance = torch.norm(e[0] - e[1], p=2, dim=1)
        return torch.mean(distance)

class AngularMarginLoss(torch.nn.Module):
    def __init__(self, model, margin=0.5): super().__init__(); self.model = model; self.margin = margin
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        cosine = torch.sum(e[0] * e[1], dim=1)
        theta = torch.acos(torch.clamp(cosine, -1.0 + 1e-7, 1.0 - 1e-7))
        return torch.mean((theta + self.margin * (1.0 - lbl.float())) ** 2)

class CircleLoss(torch.nn.Module):
    def __init__(self, model, m=0.25, gamma=256): super().__init__(); self.model = model; self.m = m; self.gamma = gamma
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        alpha_p = torch.clamp_min(1 + self.m - sim, min=0)
        alpha_n = torch.clamp_min(sim + self.m, min=0)
        delta_p = 1 - self.m
        delta_n = self.m
        logits_p = (-self.gamma) * alpha_p * (sim - delta_p)
        logits_n = self.gamma * alpha_n * (sim - delta_n)
        loss = torch.log1p(torch.exp(logits_n)) + torch.log1p(torch.exp(logits_p))
        return loss.mean()

class SphereLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        return torch.mean(1 - torch.sum(e[0] * e[1], dim=1))

class HistogramLoss(torch.nn.Module):
    def __init__(self, model, num_bins=10): super().__init__(); self.model = model; self.num_bins = num_bins
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        hist_pos = torch.histc(sim[lbl == 1], bins=self.num_bins, min=-1, max=1)
        hist_neg = torch.histc(sim[lbl == 0], bins=self.num_bins, min=-1, max=1)
        hist_pos /= (torch.sum(hist_pos) + 1e-10)
        hist_neg /= (torch.sum(hist_neg) + 1e-10)
        return torch.sum((hist_pos - hist_neg) ** 2)

class CentroidLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        pos_mask = (lbl == 1).unsqueeze(1)
        neg_mask = (lbl == 0).unsqueeze(1)
        pos_centroid = (e[0] * pos_mask).sum(0) / (pos_mask.sum() + 1e-10)
        neg_centroid = (e[0] * neg_mask).sum(0) / (neg_mask.sum() + 1e-10)
        return F.mse_loss(pos_centroid, neg_centroid)

class HyperSphereLoss(torch.nn.Module):
    def __init__(self, model, radius=1.0): super().__init__(); self.model = model; self.radius = radius
    def forward(self, sf, lbl):
        e = [self.model(f)['sentence_embedding'] for f in sf]
        norms = [torch.norm(emb, p=2, dim=1) for emb in e]
        return torch.mean((norms[0] - self.radius) ** 2 + (norms[1] - self.radius) ** 2)

class ProbabilisticLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        prob = torch.sigmoid(sim)
        return F.binary_cross_entropy(prob, lbl.float())

class LiftedStructuredLoss(torch.nn.Module):
    def __init__(self, model, margin=1.0): super().__init__(); self.model = model; self.margin = margin
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        dist_matrix = torch.cdist(e[0], e[1], p=2)
        pos_mask = (lbl == 1).float()
        neg_mask = (lbl == 0).float()
        pos_term = torch.log(torch.exp(dist_matrix * pos_mask).sum() + 1)
        neg_term = torch.log(torch.exp(-dist_matrix * neg_mask + self.margin).sum() + 1)
        return pos_term + neg_term

class GeneralPairLoss(torch.nn.Module):
    def __init__(self, model, pos_weight=1.0, neg_weight=1.0): super().__init__(); self.model = model; self.pos_weight = pos_weight; self.neg_weight = neg_weight
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        pos_pairs = sim[lbl == 1]
        neg_pairs = sim[lbl == 0]
        pos_loss = self.pos_weight * torch.mean((1 - pos_pairs) ** 2)
        neg_loss = self.neg_weight * torch.mean(neg_pairs ** 2)
        return pos_loss + neg_loss

class AngularLoss(torch.nn.Module):
    def __init__(self, model, angle_bound=1.0): super().__init__(); self.model = model; self.angle_bound = angle_bound
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        cos_theta = torch.sum(e[0] * e[1], dim=1)
        theta = torch.acos(torch.clamp(cos_theta, -1.0 + 1e-7, 1.0 - 1e-7))
        target = lbl.float()
        return torch.mean(target * theta + (1 - target) * torch.clamp(self.angle_bound - theta, min=0.0))

class MarginRankingLoss(torch.nn.Module):
    def __init__(self, model, margin=0.5): super().__init__(); self.model = model; self.margin = margin
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        target = 2 * lbl.float() - 1
        return torch.mean(torch.clamp(self.margin - target * sim, min=0.0))

# Dictionary with loss functions
loss_functions = {
    'MSE': MSELoss,
    'Cosine': losses.CosineSimilarityLoss,
    'Contrastive': losses.ContrastiveLoss,
    'InfoNCE': losses.MultipleNegativesRankingLoss,
    'Euclidean': EuclideanLoss,
    'NormaEuc': NormalizedEuclideanLoss,
    'NPairs': losses.BatchAllTripletLoss,
    'MultiSimilarity': losses.MultipleNegativesRankingLoss,
    'AngularMargin': AngularMarginLoss,
    'Sphere': SphereLoss,
    'HyperSphere': HyperSphereLoss,
    'Probabilistic': ProbabilisticLoss,
    'LiftedStructured': LiftedStructuredLoss,
    'GeneralPair': GeneralPairLoss,
    'Angular': AngularLoss,
    'MarginRanking': MarginRankingLoss,
    'Triplet': TripletLoss,
    'OnlineTriplet': OnlineTripletLoss,
    'BatchHardTriplet': BatchHardTripletLoss,
    'BatchSemiHardTriplet': BatchSemiHardTripletLoss,
    'BatchAllTriplet': BatchAllTripletLoss
}

def generate_examples(df, loss_name, fixed_negative=None):
    """
    Generates training examples compatible with different loss functions.

    Args:
        df: DataFrame with data
        loss_name: Name of loss function to use
        fixed_negative: Fixed negative sentence for Triplet Loss

    Returns:
        Dataset with examples formatted for the specified loss function
    """
    if 'Triplet' in loss_name:
        triplets = TripletGenerator(df, fixed_negative, hard_negatives=True).generate_triplets()
        return TripletDataset(triplets)
    elif loss_name == 'Contrastive':
        # For Contrastive Loss, we use binary labels
        examples = [InputExample(texts=[r['sentence1'], r['sentence2']], label=float(r['label_bin']))
                    for _, r in df.iterrows()]
        return examples
    else:
        # For other loss functions, we use continuous similarity
        examples = [InputExample(texts=[r['sentence1'], r['sentence2']], label=float(r['label']))
                    for _, r in df.iterrows()]
        return examples

# Evaluation functions
def evaluate_model(model, test_df, dataset_name):
    """
    Evaluates a model on a test dataset.

    Args:
        model: Trained SentenceTransformer model
        test_df: DataFrame with test data
        dataset_name: Dataset name ('stsb' or 'mrpc')

    Returns:
        Dictionary with evaluation metrics
    """
    # Prepare data
    sent1 = test_df['sentence1'].tolist()
    sent2 = test_df['sentence2'].tolist()
    labels = test_df['label'].tolist()

    # Calculate embeddings and similarities
    embeddings = model.encode(sent1 + sent2, batch_size=32, show_progress_bar=False)
    embeddings1 = embeddings[:len(sent1)]
    embeddings2 = embeddings[len(sent1):]

    # Calculate cosine similarities
    similarities = []
    for e1, e2 in zip(embeddings1, embeddings2):
        similarities.append(cosine_similarity([e1], [e2])[0][0])

    # Basic metrics
    mean_sim = np.mean(similarities)
    std_sim = np.std(similarities)
    results = {
        'mean_similarity': mean_sim,
        'std_similarity': std_sim
    }

    # Dataset-specific metrics
    if dataset_name.lower() == 'stsb':
        # Correlation for similarity tasks
        if len(set(labels)) > 1 and len(set(similarities)) > 1:
            results['pearson'] = pearsonr(labels, similarities)[0]
            results['spearman'] = spearmanr(labels, similarities)[0]
        else:
            results['pearson'] = float('nan')
            results['spearman'] = float('nan')

        # Example for debugging
        print("\n[STS-B] Evaluation example:")
        for i in range(min(3, len(labels))):
            print(f"  Label: {labels[i]:.2f} | Similarity: {similarities[i]:.2f}")

    elif dataset_name.lower() == 'mrpc':
        # Classification metrics
        binary_preds = [1 if s >= 0.5 else 0 for s in similarities]
        results['accuracy'] = accuracy_score(labels, binary_preds)
        results['f1'] = f1_score(labels, binary_preds)
        results['precision'] = precision_score(labels, binary_preds)
        results['recall'] = recall_score(labels, binary_preds)

        # Example for debugging
        print("\n[MRPC] Evaluation example:")
        for i in range(min(3, len(labels))):
            print(f"  Label: {labels[i]} | Predicted: {binary_preds[i]} | Similarity: {similarities[i]:.2f}")

    return results

def plot_results(results_df, metric, dataset_name):
    """
    Generates comparative result plots.

    Args:
        results_df: DataFrame with results
        metric: Metric to visualize
        dataset_name: Dataset name

    Returns:
        Path to saved figure file
    """
    plt.figure(figsize=(12, 8))

    # Filter data for specific dataset
    df = results_df[results_df['Dataset'] == dataset_name].copy()

    # Prepare grouped bar plot
    pivot_df = df.pivot(index='Model', columns='Loss Function', values=metric)

    ax = pivot_df.plot(kind='bar', figsize=(12, 8))

    # Graph settings
    plt.title(f'{metric} by Model and Loss Function - {dataset_name.upper()}', fontsize=14)
    plt.xlabel('Model', fontsize=12)
    plt.ylabel(metric, fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(title='Loss Function', fontsize=10)

    # Add values on bars
    for container in ax.containers:
        ax.bar_label(container, fmt='%.3f', fontsize=8)

    plt.tight_layout()

    # Save figure
    filename = f"{dataset_name}_{metric}_comparison.png"
    filepath = FIGURES_DIR / filename
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    return filepath

def plot_training_curve(history, model_name, loss_name, dataset_name):
    """
    Plots training curve.

    Args:
        history: Training history
        model_name: Model name
        loss_name: Loss function name
        dataset_name: Dataset name

    Returns:
        Path to saved figure file
    """
    plt.figure(figsize=(10, 6))

    # Extract history data
    epochs = range(1, len(history['train_loss']) + 1)

    # Plot losses
    plt.plot(epochs, history['train_loss'], 'b-', label='Training Loss')

    # Add graph information
    plt.title(f'Training Curve: {model_name}\n{loss_name} on {dataset_name}', fontsize=14)
    plt.xlabel('Epoch', fontsize=12)
    plt.ylabel('Loss', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()

    # Save figure
    model_short = model_name.split('/')[-1] if '/' in model_name else model_name
    filename = f"{dataset_name}_{model_short}_{loss_name}_training.png"
    filepath = FIGURES_DIR / filename
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    return filepath

# Main training and evaluation function
def train_and_evaluate(model_name, dataset_name, loss_name, train_df, test_df,
                      epochs=3, batch_size=16, save_model=False):
    """
    Trains and evaluates a model with a specific loss function.

    Args:
        model_name: Sentence-Transformer model name
        dataset_name: Dataset name ('stsb' or 'mrpc')
        loss_name: Loss function name
        train_df: DataFrame with training data
        test_df: DataFrame with test data
        epochs: Number of training epochs
        batch_size: Batch size
        save_model: If True, saves the trained model

    Returns:
        Dictionary with results and metrics
    """
    try:
        # Initialize model
        model = SentenceTransformer(model_name).to(DEVICE)
        model_identifier = model_name.split('/')[-1] if '/' in model_name else model_name

        # Configure training
        fixed_negative = "This is an example negative sentence for training triplets."
        dataset = generate_examples(train_df, loss_name, fixed_negative)
        dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
        loss_fn = loss_functions[loss_name](model)

        # Record training history
        history = {'train_loss': []}

        class LogCallback:
            def __init__(self, history):
                self.history = history

            def on_epoch_end(self, epoch, loss, *args, **kwargs):
                self.history['train_loss'].append(loss)

        # Execute training
        start_time = time.time()
        model.fit(
            train_objectives=[(dataloader, loss_fn)],
            epochs=epochs,
            warmup_steps=int(len(dataloader) * 0.1),
            show_progress_bar=True,
            output_path=None,
            callback=LogCallback(history)
        )
        training_time = time.time() - start_time

        # Evaluate model
        evaluation_results = evaluate_model(model, test_df, dataset_name)

        # Save model if requested
        model_path = None
        if save_model:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_path = MODELS_DIR / f"{dataset_name}_{model_identifier}_{loss_name}_{timestamp}"
            model.save(str(model_path))

        # Plot training curve
        training_plot = plot_training_curve(history, model_name, loss_name, dataset_name)

        # Consolidate results
        results = {
            'Dataset': dataset_name,
            'Model': model_name,
            'Loss Function': loss_name,
            'Training Time (s)': round(training_time, 2),
            'Mean Similarity': round(evaluation_results['mean_similarity'], 4),
            'STD Similarity': round(evaluation_results['std_similarity'], 4),
            'Epochs': epochs,
            'Batch Size': batch_size,
            'Training Plot': str(training_plot),
            'Model Path': str(model_path) if model_path else None
        }

        # Add specific metrics
        if dataset_name.lower() == 'stsb':
            results['Pearson'] = round(evaluation_results['pearson'], 4) if 'pearson' in evaluation_results else None
            results['Spearman'] = round(evaluation_results['spearman'], 4) if 'spearman' in evaluation_results else None
        elif dataset_name.lower() == 'mrpc':
            results['Accuracy'] = round(evaluation_results['accuracy'], 4) if 'accuracy' in evaluation_results else None
            results['F1 Score'] = round(evaluation_results['f1'], 4) if 'f1' in evaluation_results else None
            results['Precision'] = round(evaluation_results['precision'], 4) if 'precision' in evaluation_results else None
            results['Recall'] = round(evaluation_results['recall'], 4) if 'recall' in evaluation_results else None

        return results

    except Exception as e:
        print(f"Error in train_and_evaluate({model_name}, {dataset_name}, {loss_name}): {e}")
        import traceback
        traceback.print_exc()
        return {
            'Dataset': dataset_name,
            'Model': model_name,
            'Loss Function': loss_name,
            'Error': str(e)
        }

# Main function
def main():
    # List of models to evaluate
    model_names = [
        'sentence-transformers/all-mpnet-base-v2',
        'sentence-transformers/bert-base-nli-mean-tokens',
        'sentence-transformers/paraphrase-MiniLM-L6-v2'
    ]

    # Datasets to evaluate
    datasets = ['stsb', 'mrpc']

    # Experimental configurations
    experiment_config = {
        'seed': SEED,
        'device': str(DEVICE),
        'epochs': NUM_EPOCHS,
        'batch_size': BATCH_SIZE,
        'sample_size': SAMPLE_SIZE,
        'save_models': SAVE_MODELS,
        'models': model_names,
        'datasets': datasets,
        'loss_functions': list(loss_functions.keys()),
        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    # Save experiment configuration
    with open(RESULTS_DIR / "experiment_config.json", 'w') as f:
        json.dump(experiment_config, f, indent=2)

    # Initialize dataset loader
    loader = DatasetLoader()

    # Results stored here
    all_results = []
    dataset_figures = {}

    # Main loop
    for dataset_name in datasets:
        print(f"\n\n{'='*60}")
        print(f"Dataset: {dataset_name.upper()}")
        print(f"{'='*60}")

        # Load datasets
        train_df = loader.load_dataset(dataset_name, 'train', sample_size=SAMPLE_SIZE)
        test_df = loader.load_dataset(dataset_name, 'validation', sample_size=min(408, SAMPLE_SIZE if SAMPLE_SIZE else 1000))

        # Distribution visualization
        dist_fig = loader.visualize_dataset_distribution(train_df, dataset_name)
        dataset_figures[dataset_name] = str(dist_fig)

        # Data sample
        print(f"\nData sample ({dataset_name.upper()}):")
        print(train_df[['sentence1', 'sentence2', 'label']].head(3).to_string())

        # Run evaluation for each combination
        results_dataset = []

        for model_name in model_names:
            model_short = model_name.split('/')[-1]
            print(f"\n{'-'*40}")
            print(f"Model: {model_short}")
            print(f"{'-'*40}")

            for loss_name in loss_functions.keys():
                print(f"\nEvaluating {model_short} with {loss_name} on {dataset_name.upper()}...")

                result = train_and_evaluate(
                    model_name=model_name,
                    dataset_name=dataset_name,
                    loss_name=loss_name,
                    train_df=train_df,
                    test_df=test_df,
                    epochs=NUM_EPOCHS,
                    batch_size=BATCH_SIZE,
                    save_model=SAVE_MODELS
                )

                results_dataset.append(result)
                all_results.append(result)

                # Immediate result logging
                if 'Error' in result:
                    print(f"❌ Error: {result['Error']}")
                else:
                    print(f"✅ Completed: Mean Sim = {result['Mean Similarity']}")
                    if dataset_name.lower() == 'stsb':
                        print(f"   Pearson = {result['Pearson']}")
                    else:
                        print(f"   Accuracy = {result['Accuracy']}, F1 = {result['F1 Score']}")

        # Save results per dataset
        results_df = pd.DataFrame(results_dataset)
        results_df.to_csv(RESULTS_DIR / f"results_{dataset_name}.csv", index=False)

        # Generate visualizations
        if dataset_name.lower() == 'stsb':
            plot_results(results_df, 'Pearson', dataset_name)
        else:
            plot_results(results_df, 'F1 Score', dataset_name)
            plot_results(results_df, 'Accuracy', dataset_name)

    # Consolidate all results
    all_results_df = pd.DataFrame(all_results)
    all_results_df.to_csv(RESULTS_DIR / "complete_results.csv", index=False)

    # Generate HTML report
    generate_html_report(all_results_df, experiment_config, dataset_figures)

    print("\n\nExperiment completed. Results available in:", RESULTS_DIR)
    return all_results_df

def generate_html_report(results_df, config, dataset_figures):

    """
    Generates an HTML report with experiment results.

    Args:
        results_df: DataFrame with all results
        config: Experiment configuration
        dataset_figures: Dictionary with paths to dataset figures
    """
    report_path = RESULTS_DIR / "experiment_report.html"

    # Prepare result tables
    stsb_df = results_df[results_df['Dataset'] == 'stsb'].copy()
    mrpc_df = results_df[results_df['Dataset'] == 'mrpc'].copy()

    # Select relevant columns
    stsb_cols = ['Model', 'Loss Function', 'Training Time (s)', 'Mean Similarity', 'Pearson', 'Spearman']
    mrpc_cols = ['Model', 'Loss Function', 'Training Time (s)', 'Mean Similarity', 'Accuracy', 'F1 Score', 'Precision', 'Recall']

    # Clean model names for display
    for df in [stsb_df, mrpc_df]:
        df['Model'] = df['Model'].apply(lambda x: x.split('/')[-1] if '/' in x else x)

    # Generate HTML
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Sentence-Transformers Evaluation Report</title>
        <style>
            body {{ font-family: Arial, sans-serif; line-height: 1.6; margin: 0; padding: 20px; color: #333; }}
            h1, h2, h3 {{ color: #2c3e50; }}
            table {{ border-collapse: collapse; width: 100%; margin-bottom: 20px; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            th {{ background-color: #f2f2f2; color: #333; font-weight: bold; }}
            tr:nth-child(even) {{ background-color: #f9f9f9; }}
            tr:hover {{ background-color: #f5f5f5; }}
            .container {{ max-width: 1200px; margin: 0 auto; padding: 20px; }}
            .section {{ margin-bottom: 30px; }}
            .best-result {{ font-weight: bold; color: #27ae60; }}
            img {{ max-width: 100%; height: auto; margin: 10px 0; border: 1px solid #ddd; }}
            .config {{ background-color: #f8f9fa; padding: 15px; border-radius: 4px; margin-bottom: 20px; }}
            footer {{ margin-top: 30px; padding-top: 10px; border-top: 1px solid #eee; color: #7f8c8d; font-size: 0.9em; }}
        </style>
    </head>
    <body>
        <div class="container">
            <header>
                <h1>Comparative Evaluation of Sentence-Transformer Models</h1>
                <p>Report generated at: {config['timestamp']}</p>
            </header>

            <div class="section">
                <h2>Experiment Configuration</h2>
                <div class="config">
                    <p><strong>Device:</strong> {config['device']}</p>
                    <p><strong>Seed:</strong> {config['seed']}</p>
                    <p><strong>Epochs:</strong> {config['epochs']}</p>
                    <p><strong>Batch Size:</strong> {config['batch_size']}</p>
                    <p><strong>Sample:</strong> {config['sample_size'] if config['sample_size'] else 'Full Dataset'}</p>
                    <p><strong>Models:</strong> {', '.join([m.split('/')[-1] if '/' in m else m for m in config['models']])}</p>
                    <p><strong>Loss Functions:</strong> {', '.join(config['loss_functions'])}</p>
                </div>
            </div>

            <div class="section">
                <h2>Results - STS-B (Semantic Similarity)</h2>
                <p>Training data distribution:</p>
                <img src="{dataset_figures['stsb']}" alt="STS-B Distribution">

                <h3>Performance Metrics</h3>
                <table>
                    <tr>
                        <th>Model</th>
                        <th>Loss Function</th>
                        <th>Time (s)</th>
                        <th>Mean Similarity</th>
                        <th>Pearson Correlation</th>
                        <th>Spearman Correlation</th>
                    </tr>
                    {stsb_df[stsb_cols].sort_values('Pearson', ascending=False).to_html(index=False, header=False, classes='results-table')}
                </table>

                <h3>Results Visualization</h3>
                <img src="{FIGURES_DIR / 'stsb_Pearson_comparison.png'}" alt="Pearson Comparison STS-B">
            </div>

            <div class="section">
                <h2>Results - MRPC (Paraphrase Detection)</h2>
                <p>Training data distribution:</p>
                <img src="{dataset_figures['mrpc']}" alt="MRPC Distribution">

                <h3>Performance Metrics</h3>
                <table>
                    <tr>
                        <th>Model</th>
                        <th>Loss Function</th>
                        <th>Time (s)</th>
                        <th>Mean Similarity</th>
                        <th>Accuracy</th>
                        <th>F1 Score</th>
                        <th>Precision</th>
                        <th>Recall</th>
                    </tr>
                    {mrpc_df[mrpc_cols].sort_values('F1 Score', ascending=False).to_html(index=False, header=False, classes='results-table')}
                </table>

                <h3>Results Visualization</h3>
                <img src="{FIGURES_DIR / 'mrpc_F1 Score_comparison.png'}" alt="F1 Comparison MRPC">
                <img src="{FIGURES_DIR / 'mrpc_Accuracy_comparison.png'}" alt="Accuracy Comparison MRPC">
            </div>

            <div class="section">
                <h2>Training Curve Analysis</h2>
                <p>Examples of training curves for the best models:</p>

                <h3>STS-B (Best model)</h3>
                <img src="{stsb_df.sort_values('Pearson', ascending=False).iloc[0]['Training Plot']}" alt="Best Curve STS-B">

                <h3>MRPC (Best model)</h3>
                <img src="{mrpc_df.sort_values('F1 Score', ascending=False).iloc[0]['Training Plot']}" alt="Best Curve MRPC">
            </div>

            <div class="section">
                <h2>Conclusions</h2>
                <p><strong>Best configuration for STS-B:</strong> {stsb_df.sort_values('Pearson', ascending=False).iloc[0]['Model']} with {stsb_df.sort_values('Pearson', ascending=False).iloc[0]['Loss Function']} (Pearson: {stsb_df.sort_values('Pearson', ascending=False).iloc[0]['Pearson']})</p>
                <p><strong>Best configuration for MRPC:</strong> {mrpc_df.sort_values('F1 Score', ascending=False).iloc[0]['Model']} with {mrpc_df.sort_values('F1 Score', ascending=False).iloc[0]['Loss Function']} (F1: {mrpc_df.sort_values('F1 Score', ascending=False).iloc[0]['F1 Score']})</p>

                <p>General observations:</p>
                <ul>
                    <li>Loss functions have significant impact on model performance.</li>
                    <li>Models specialized in paraphrase tend to perform better in evaluated tasks.</li>
                    <li>Training time varies considerably between models.</li>
                </ul>
            </div>

            <footer>
                <p>Report automatically generated by Sentence-Transformers evaluation script.</p>
                <p>All models and results are available in directory: {RESULTS_DIR}</p>
            </footer>
        </div>
    </body>
    </html>
    """

    # Save report
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"HTML report generated at: {report_path}")
    return report_path

# Additional functions for advanced analysis

def analyze_similarity_metrics_correlation(results_df, dataset_name):
    """
    Analyzes correlation between mean similarity and performance metrics.

    Args:
        results_df: DataFrame with results
        dataset_name: Dataset name to analyze

    Returns:
        Figure with correlation matrix
    """
    # Filter data for specific dataset
    df = results_df[results_df['Dataset'] == dataset_name].copy()

    # Columns to analyze
    if dataset_name.lower() == 'stsb':
        cols = ['Mean Similarity', 'STD Similarity', 'Pearson', 'Spearman', 'Training Time (s)']
    else:  # MRPC
        cols = ['Mean Similarity', 'STD Similarity', 'Accuracy', 'F1 Score',
                'Precision', 'Recall', 'Training Time (s)']

    # Calculate correlation matrix
    corr_matrix = df[cols].corr()

    # Visualize correlation matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title(f'Metrics Correlation - {dataset_name.upper()}')
    plt.tight_layout()

    # Save figure
    filepath = FIGURES_DIR / f"{dataset_name}_metric_correlation.png"
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    return filepath

def analyze_time_vs_performance(results_df):
    """
    Analyzes relationship between training time and performance metrics.

    Args:
        results_df: DataFrame with results

    Returns:
        Figure with scatter plots
    """
    plt.figure(figsize=(12, 10))

    # Split into subplots
    fig, axes = plt.subplots(2, 1, figsize=(12, 12))

    # Data for STS-B
    stsb_df = results_df[results_df['Dataset'] == 'stsb'].copy()
    stsb_df['Model'] = stsb_df['Model'].apply(lambda x: x.split('/')[-1] if '/' in x else x)

    # Data for MRPC
    mrpc_df = results_df[results_df['Dataset'] == 'mrpc'].copy()
    mrpc_df['Model'] = mrpc_df['Model'].apply(lambda x: x.split('/')[-1] if '/' in x else x)

    # Plot for STS-B
    ax = axes[0]
    for model in stsb_df['Model'].unique():
        model_df = stsb_df[stsb_df['Model'] == model]
        ax.scatter(model_df['Training Time (s)'], model_df['Pearson'],
                  label=model, alpha=0.7, s=80)

        # Add labels for each point
        for _, row in model_df.iterrows():
            ax.annotate(row['Loss Function'],
                       (row['Training Time (s)'], row['Pearson']),
                       fontsize=8, alpha=0.8)

    ax.set_title('STS-B: Pearson Correlation vs. Training Time')
    ax.set_xlabel('Training Time (seconds)')
    ax.set_ylabel('Pearson Correlation')
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend()

    # Plot for MRPC
    ax = axes[1]
    for model in mrpc_df['Model'].unique():
        model_df = mrpc_df[mrpc_df['Model'] == model]
        ax.scatter(model_df['Training Time (s)'], model_df['F1 Score'],
                  label=model, alpha=0.7, s=80)

        # Add labels for each point
        for _, row in model_df.iterrows():
            ax.annotate(row['Loss Function'],
                       (row['Training Time (s)'], row['F1 Score']),
                       fontsize=8, alpha=0.8)

    ax.set_title('MRPC: F1 Score vs. Training Time')
    ax.set_xlabel('Training Time (seconds)')
    ax.set_ylabel('F1 Score')
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend()

    plt.tight_layout()

    # Save figure
    filepath = FIGURES_DIR / "time_vs_performance.png"
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    return filepath

def analyze_loss_function_impact(results_df):
    """
    Analyzes impact of different loss functions on performance.

    Args:
        results_df: DataFrame with results

    Returns:
        DataFrame with impact statistics
    """
    # Statistics per loss function
    impact = []

    # Analysis for STS-B
    stsb_df = results_df[results_df['Dataset'] == 'stsb'].copy()
    stsb_metrics = ['Pearson', 'Spearman']

    for loss_fn in stsb_df['Loss Function'].unique():
        loss_stats = {
            'Dataset': 'STS-B',
            'Loss Function': loss_fn,
            'Count': len(stsb_df[stsb_df['Loss Function'] == loss_fn])
        }

        for metric in stsb_metrics:
            loss_stats[f'Mean {metric}'] = stsb_df[stsb_df['Loss Function'] == loss_fn][metric].mean()
            loss_stats[f'Std {metric}'] = stsb_df[stsb_df['Loss Function'] == loss_fn][metric].std()
            loss_stats[f'Max {metric}'] = stsb_df[stsb_df['Loss Function'] == loss_fn][metric].max()
            loss_stats[f'Min {metric}'] = stsb_df[stsb_df['Loss Function'] == loss_fn][metric].min()

        impact.append(loss_stats)

    # Analysis for MRPC
    mrpc_df = results_df[results_df['Dataset'] == 'mrpc'].copy()
    mrpc_metrics = ['Accuracy', 'F1 Score']

    for loss_fn in mrpc_df['Loss Function'].unique():
        loss_stats = {
            'Dataset': 'MRPC',
            'Loss Function': loss_fn,
            'Count': len(mrpc_df[mrpc_df['Loss Function'] == loss_fn])
        }

        for metric in mrpc_metrics:
            loss_stats[f'Mean {metric}'] = mrpc_df[mrpc_df['Loss Function'] == loss_fn][metric].mean()
            loss_stats[f'Std {metric}'] = mrpc_df[mrpc_df['Loss Function'] == loss_fn][metric].std()
            loss_stats[f'Max {metric}'] = mrpc_df[mrpc_df['Loss Function'] == loss_fn][metric].max()
            loss_stats[f'Min {metric}'] = mrpc_df[mrpc_df['Loss Function'] == loss_fn][metric].min()

        impact.append(loss_stats)

    # Create DataFrame with statistics
    impact_df = pd.DataFrame(impact)

    # Save analysis
    impact_df.to_csv(RESULTS_DIR / "loss_functions_impact.csv", index=False)

    return impact_df

def extended_experiment():
    """
    Main function that executes the experiment and additional analyses.
    """
    try:
        # Run main experiment
        results_df = main()

        # Validate we have results for analyses
        if results_df is None or len(results_df) == 0:
            print("❌ No results for additional analyses.")
            return

        print("\n\n" + "="*60)
        print("Additional Analyses")
        print("="*60)

        # Correlation analysis between metrics
        print("\nAnalyzing correlation between metrics...")
        analyze_similarity_metrics_correlation(results_df, 'stsb')
        analyze_similarity_metrics_correlation(results_df, 'mrpc')

        # Time vs. performance analysis
        print("\nAnalyzing time vs. performance relationship...")
        analyze_time_vs_performance(results_df)

        # Loss function impact analysis
        print("\nAnalyzing loss functions impact...")
        impact_df = analyze_loss_function_impact(results_df)

        # Analysis summary
        print("\nAnalysis Summary:")
        print(f"- {len(results_df)} model-loss combinations tested")

        for dataset in ['STS-B', 'MRPC']:
            print(f"\n{dataset}:")
            dataset_impact = impact_df[impact_df['Dataset'] == dataset]

            if dataset == 'STS-B':
                best_loss = dataset_impact.sort_values('Mean Pearson', ascending=False).iloc[0]
                print(f"- Best loss function: {best_loss['Loss Function']} (Mean Pearson: {best_loss['Mean Pearson']:.4f})")
            else:
                best_loss = dataset_impact.sort_values('Mean F1 Score', ascending=False).iloc[0]
                print(f"- Best loss function: {best_loss['Loss Function']} (Mean F1: {best_loss['Mean F1 Score']:.4f})")

        print("\n✅ Additional analyses completed and saved in:", RESULTS_DIR)

    except Exception as e:
        print(f"❌ Error in additional analyses: {e}")
        import traceback
        traceback.print_exc()

# Execute if main script
if __name__ == "__main__":
    print("="*80)
    print("Evaluation of Sentence-Transformer Models for Semantic Similarity")
    print("="*80)
    print("Settings:")
    print(f"- Seed: {SEED}")
    print(f"- Device: {DEVICE}")
    print(f"- Epochs: {NUM_EPOCHS}")
    print(f"- Batch Size: {BATCH_SIZE}")
    print(f"- Sample size: {SAMPLE_SIZE if SAMPLE_SIZE else 'Full dataset'}")
    print(f"- Results directory: {RESULTS_DIR}")
    print("="*80)

    # Run complete experiment with additional analyses
    extended_experiment()

Configuration: Seed=42, Device=cuda
Evaluation of Sentence-Transformer Models for Semantic Similarity
Settings:
- Seed: 42
- Device: cuda
- Epochs: 3
- Batch Size: 16
- Sample size: Full dataset
- Results directory: results


Dataset: STSB

[STS-B - train] Statistics:
- Examples: 5749
- Similarity range: [0.00, 1.00]
- Binary distribution: {1: 3385, 0: 2364}

[STS-B - validation] Statistics:
- Examples: 1500
- Similarity range: [0.00, 1.00]
- Binary distribution: {1: 750, 0: 750}
- Sample used: 408 examples

Data sample (STSB):
                                       sentence1                                                 sentence2  label
0                         A plane is taking off.                               An air plane is taking off.   1.00
1                A man is playing a large flute.                                 A man is playing a flute.   0.76
2  A man is spreading shreded cheese on a pizza.  A man is spreading shredded cheese on an uncooked pizza.   0.76

---------

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0001
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998000264167786
   Pearson = 0.3574

Evaluating all-mpnet-base-v2 with Cosine on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0166
1000,0.008



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.57
  Label: 0.48 | Similarity: 0.69
  Label: 0.00 | Similarity: -0.03
✅ Completed: Mean Sim = 0.5242000222206116
   Pearson = 0.911

Evaluating all-mpnet-base-v2 with Contrastive on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0121
1000,0.0074



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.87
  Label: 0.48 | Similarity: 0.80
  Label: 0.00 | Similarity: 0.09
✅ Completed: Mean Sim = 0.6940000057220459
   Pearson = 0.8633

Evaluating all-mpnet-base-v2 with InfoNCE on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2268
1000,0.1176



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.79
  Label: 0.48 | Similarity: 0.87
  Label: 0.00 | Similarity: 0.22
✅ Completed: Mean Sim = 0.6141999959945679
   Pearson = 0.8494

Evaluating all-mpnet-base-v2 with Euclidean on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1309
1000,0.0592



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998999834060669
   Pearson = 0.2817

Evaluating all-mpnet-base-v2 with NormaEuc on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1309
1000,0.0592



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998999834060669
   Pearson = 0.2817

Evaluating all-mpnet-base-v2 with NPairs on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,4.8908
1000,4.8922



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 0.89
✅ Completed: Mean Sim = 0.7820000052452087
   Pearson = 0.4499

Evaluating all-mpnet-base-v2 with MultiSimilarity on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2268
1000,0.1176



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.79
  Label: 0.48 | Similarity: 0.87
  Label: 0.00 | Similarity: 0.22
✅ Completed: Mean Sim = 0.6141999959945679
   Pearson = 0.8494

Evaluating all-mpnet-base-v2 with AngularMargin on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2027
1000,0.1065



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998999834060669
   Pearson = 0.3245

Evaluating all-mpnet-base-v2 with Sphere on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0287
1000,0.0036



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998000264167786
   Pearson = 0.3692

Evaluating all-mpnet-base-v2 with HyperSphere on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.79
  Label: 0.48 | Similarity: 0.75
  Label: 0.00 | Similarity: -0.05
✅ Completed: Mean Sim = 0.5774000287055969
   Pearson = 0.8839

Evaluating all-mpnet-base-v2 with Probabilistic on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.6373
1000,0.6196



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.30
  Label: 0.48 | Similarity: 0.05
  Label: 0.00 | Similarity: 0.09
✅ Completed: Mean Sim = 0.44839999079704285
   Pearson = 0.8192

Evaluating all-mpnet-base-v2 with LiftedStructured on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,12.0874
1000,12.0808



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 0.59
✅ Completed: Mean Sim = 0.9527999758720398
   Pearson = 0.3482

Evaluating all-mpnet-base-v2 with GeneralPair on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.027
1000,0.0119



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.65
  Label: 0.48 | Similarity: 0.96
  Label: 0.00 | Similarity: 0.01
✅ Completed: Mean Sim = 0.6464999914169312
   Pearson = 0.8552

Evaluating all-mpnet-base-v2 with Angular on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3723
1000,0.3441



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.99
  Label: 0.48 | Similarity: 0.95
  Label: 0.00 | Similarity: 0.38
✅ Completed: Mean Sim = 0.8458999991416931
   Pearson = 0.7021

Evaluating all-mpnet-base-v2 with MarginRanking on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3358
1000,0.2974



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.44
  Label: 0.48 | Similarity: -0.19
  Label: 0.00 | Similarity: 0.11
✅ Completed: Mean Sim = 0.5404000282287598
   Pearson = 0.7764

Evaluating all-mpnet-base-v2 with Triplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0195
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.98
  Label: 0.48 | Similarity: 0.93
  Label: 0.00 | Similarity: 0.93
✅ Completed: Mean Sim = 0.9283999800682068
   Pearson = 0.5589

Evaluating all-mpnet-base-v2 with OnlineTriplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0195
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.98
  Label: 0.48 | Similarity: 0.93
  Label: 0.00 | Similarity: 0.93
✅ Completed: Mean Sim = 0.9283999800682068
   Pearson = 0.5589

Evaluating all-mpnet-base-v2 with BatchHardTriplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0195
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.98
  Label: 0.48 | Similarity: 0.93
  Label: 0.00 | Similarity: 0.93
✅ Completed: Mean Sim = 0.9283999800682068
   Pearson = 0.5589

Evaluating all-mpnet-base-v2 with BatchSemiHardTriplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0195
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.98
  Label: 0.48 | Similarity: 0.93
  Label: 0.00 | Similarity: 0.93
✅ Completed: Mean Sim = 0.9283999800682068
   Pearson = 0.5589

Evaluating all-mpnet-base-v2 with BatchAllTriplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0195
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.98
  Label: 0.48 | Similarity: 0.93
  Label: 0.00 | Similarity: 0.93
✅ Completed: Mean Sim = 0.9283999800682068
   Pearson = 0.5589

----------------------------------------
Model: bert-base-nli-mean-tokens
----------------------------------------

Evaluating bert-base-nli-mean-tokens with MSE on STSB...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0126
1000,0.0009



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998000264167786
   Pearson = 0.4999

Evaluating bert-base-nli-mean-tokens with Cosine on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0258
1000,0.0101



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.65
  Label: 0.48 | Similarity: 0.70
  Label: 0.00 | Similarity: 0.05
✅ Completed: Mean Sim = 0.5306000113487244
   Pearson = 0.8626

Evaluating bert-base-nli-mean-tokens with Contrastive on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.013
1000,0.0075



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.92
  Label: 0.48 | Similarity: 0.77
  Label: 0.00 | Similarity: 0.15
✅ Completed: Mean Sim = 0.713100016117096
   Pearson = 0.809

Evaluating bert-base-nli-mean-tokens with InfoNCE on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2994
1000,0.1324



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.79
  Label: 0.48 | Similarity: 0.92
  Label: 0.00 | Similarity: 0.40
✅ Completed: Mean Sim = 0.6855999827384949
   Pearson = 0.7891

Evaluating bert-base-nli-mean-tokens with Euclidean on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1234
1000,0.0409



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Pearson = 0.5056

Evaluating bert-base-nli-mean-tokens with NormaEuc on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1234
1000,0.0409



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Pearson = 0.5056

Evaluating bert-base-nli-mean-tokens with NPairs on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,4.9782
1000,4.9889



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.90
  Label: 0.48 | Similarity: 0.97
  Label: 0.00 | Similarity: 0.65
✅ Completed: Mean Sim = 0.8416000008583069
   Pearson = 0.7398

Evaluating bert-base-nli-mean-tokens with MultiSimilarity on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2994
1000,0.1324



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.79
  Label: 0.48 | Similarity: 0.92
  Label: 0.00 | Similarity: 0.40
✅ Completed: Mean Sim = 0.6855999827384949
   Pearson = 0.7891

Evaluating bert-base-nli-mean-tokens with AngularMargin on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1835
1000,0.1017



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Pearson = 0.4823

Evaluating bert-base-nli-mean-tokens with Sphere on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0224
1000,0.0037



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998999834060669
   Pearson = 0.5569

Evaluating bert-base-nli-mean-tokens with HyperSphere on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,34.5225
1000,2.0748



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.83
  Label: 0.48 | Similarity: -0.10
  Label: 0.00 | Similarity: 0.97
✅ Completed: Mean Sim = 0.5934000015258789
   Pearson = 0.1156

Evaluating bert-base-nli-mean-tokens with Probabilistic on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.6421
1000,0.6129



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.53
  Label: 0.48 | Similarity: -0.13
  Label: 0.00 | Similarity: -0.00
✅ Completed: Mean Sim = 0.43849998712539673
   Pearson = 0.7076

Evaluating bert-base-nli-mean-tokens with LiftedStructured on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,12.0851
1000,12.0791



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 0.48
✅ Completed: Mean Sim = 0.953499972820282
   Pearson = 0.3141

Evaluating bert-base-nli-mean-tokens with GeneralPair on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0627
1000,0.012



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.80
  Label: 0.48 | Similarity: 0.97
  Label: 0.00 | Similarity: -0.01
✅ Completed: Mean Sim = 0.6524999737739563
   Pearson = 0.7904

Evaluating bert-base-nli-mean-tokens with Angular on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3781
1000,0.3476



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.99
  Label: 0.48 | Similarity: 0.93
  Label: 0.00 | Similarity: 0.39
✅ Completed: Mean Sim = 0.847100019454956
   Pearson = 0.668

Evaluating bert-base-nli-mean-tokens with MarginRanking on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3391
1000,0.279



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.97
  Label: 0.48 | Similarity: -0.44
  Label: 0.00 | Similarity: 0.17
✅ Completed: Mean Sim = 0.5098000168800354
   Pearson = 0.6316

Evaluating bert-base-nli-mean-tokens with Triplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0207
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.99
  Label: 0.48 | Similarity: 0.99
  Label: 0.00 | Similarity: 0.94
✅ Completed: Mean Sim = 0.968999981880188
   Pearson = 0.6354

Evaluating bert-base-nli-mean-tokens with OnlineTriplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0207
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.99
  Label: 0.48 | Similarity: 0.99
  Label: 0.00 | Similarity: 0.94
✅ Completed: Mean Sim = 0.968999981880188
   Pearson = 0.6354

Evaluating bert-base-nli-mean-tokens with BatchHardTriplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0207
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.99
  Label: 0.48 | Similarity: 0.99
  Label: 0.00 | Similarity: 0.94
✅ Completed: Mean Sim = 0.968999981880188
   Pearson = 0.6354

Evaluating bert-base-nli-mean-tokens with BatchSemiHardTriplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0207
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.99
  Label: 0.48 | Similarity: 0.99
  Label: 0.00 | Similarity: 0.94
✅ Completed: Mean Sim = 0.968999981880188
   Pearson = 0.6354

Evaluating bert-base-nli-mean-tokens with BatchAllTriplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0207
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.99
  Label: 0.48 | Similarity: 0.99
  Label: 0.00 | Similarity: 0.94
✅ Completed: Mean Sim = 0.968999981880188
   Pearson = 0.6354

----------------------------------------
Model: paraphrase-MiniLM-L6-v2
----------------------------------------

Evaluating paraphrase-MiniLM-L6-v2 with MSE on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0099
1000,0.0018



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.99
  Label: 0.48 | Similarity: 0.98
  Label: 0.00 | Similarity: 0.94
✅ Completed: Mean Sim = 0.9577000141143799
   Pearson = 0.2489

Evaluating paraphrase-MiniLM-L6-v2 with Cosine on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0208
1000,0.0142



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.78
  Label: 0.48 | Similarity: 0.74
  Label: 0.00 | Similarity: -0.15
✅ Completed: Mean Sim = 0.5343000292778015
   Pearson = 0.8995

Evaluating paraphrase-MiniLM-L6-v2 with Contrastive on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0142
1000,0.0111



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.91
  Label: 0.48 | Similarity: 0.87
  Label: 0.00 | Similarity: 0.11
✅ Completed: Mean Sim = 0.7311999797821045
   Pearson = 0.8385

Evaluating paraphrase-MiniLM-L6-v2 with InfoNCE on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2827
1000,0.1646



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.78
  Label: 0.48 | Similarity: 0.88
  Label: 0.00 | Similarity: 0.05
✅ Completed: Mean Sim = 0.6039000153541565
   Pearson = 0.7978

Evaluating paraphrase-MiniLM-L6-v2 with Euclidean on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1532
1000,0.0615



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998999834060669
   Pearson = 0.3628

Evaluating paraphrase-MiniLM-L6-v2 with NormaEuc on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1532
1000,0.0615



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998999834060669
   Pearson = 0.3628

Evaluating paraphrase-MiniLM-L6-v2 with NPairs on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,4.8396
1000,4.9089



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.95
  Label: 0.48 | Similarity: 0.86
  Label: 0.00 | Similarity: 0.33
✅ Completed: Mean Sim = 0.7206000089645386
   Pearson = 0.7898

Evaluating paraphrase-MiniLM-L6-v2 with MultiSimilarity on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2827
1000,0.1646



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.78
  Label: 0.48 | Similarity: 0.88
  Label: 0.00 | Similarity: 0.05
✅ Completed: Mean Sim = 0.6039000153541565
   Pearson = 0.7978

Evaluating paraphrase-MiniLM-L6-v2 with AngularMargin on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2366
1000,0.1104



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998999834060669
   Pearson = 0.4305

Evaluating paraphrase-MiniLM-L6-v2 with Sphere on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0387
1000,0.0037



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9997000098228455
   Pearson = 0.6107

Evaluating paraphrase-MiniLM-L6-v2 with HyperSphere on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,3.647
1000,0.014



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.92
  Label: 0.48 | Similarity: 0.89
  Label: 0.00 | Similarity: 0.84
✅ Completed: Mean Sim = 0.8342999815940857
   Pearson = 0.2321

Evaluating paraphrase-MiniLM-L6-v2 with Probabilistic on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.6433
1000,0.63



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.90
  Label: 0.48 | Similarity: 0.37
  Label: 0.00 | Similarity: -0.11
✅ Completed: Mean Sim = 0.46790000796318054
   Pearson = 0.8313

Evaluating paraphrase-MiniLM-L6-v2 with LiftedStructured on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,12.09
1000,12.0825



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 1.00
  Label: 0.48 | Similarity: 1.00
  Label: 0.00 | Similarity: 0.68
✅ Completed: Mean Sim = 0.9717000126838684
   Pearson = 0.3534

Evaluating paraphrase-MiniLM-L6-v2 with GeneralPair on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0435
1000,0.0234



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.76
  Label: 0.48 | Similarity: 0.92
  Label: 0.00 | Similarity: -0.12
✅ Completed: Mean Sim = 0.6251000165939331
   Pearson = 0.8581

Evaluating paraphrase-MiniLM-L6-v2 with Angular on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3934
1000,0.371



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.99
  Label: 0.48 | Similarity: 0.96
  Label: 0.00 | Similarity: 0.26
✅ Completed: Mean Sim = 0.8575000166893005
   Pearson = 0.6462

Evaluating paraphrase-MiniLM-L6-v2 with MarginRanking on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.347
1000,0.3215



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.86
  Label: 0.48 | Similarity: 0.68
  Label: 0.00 | Similarity: -0.07
✅ Completed: Mean Sim = 0.5532000064849854
   Pearson = 0.8186

Evaluating paraphrase-MiniLM-L6-v2 with Triplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0248
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.98
  Label: 0.48 | Similarity: 0.97
  Label: 0.00 | Similarity: 0.85
✅ Completed: Mean Sim = 0.9319999814033508
   Pearson = 0.7461

Evaluating paraphrase-MiniLM-L6-v2 with OnlineTriplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0248
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.98
  Label: 0.48 | Similarity: 0.97
  Label: 0.00 | Similarity: 0.85
✅ Completed: Mean Sim = 0.9319999814033508
   Pearson = 0.7461

Evaluating paraphrase-MiniLM-L6-v2 with BatchHardTriplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0248
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.98
  Label: 0.48 | Similarity: 0.97
  Label: 0.00 | Similarity: 0.85
✅ Completed: Mean Sim = 0.9319999814033508
   Pearson = 0.7461

Evaluating paraphrase-MiniLM-L6-v2 with BatchSemiHardTriplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0248
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.98
  Label: 0.48 | Similarity: 0.97
  Label: 0.00 | Similarity: 0.85
✅ Completed: Mean Sim = 0.9319999814033508
   Pearson = 0.7461

Evaluating paraphrase-MiniLM-L6-v2 with BatchAllTriplet on STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0248
1000,0.0



[STS-B] Evaluation example:
  Label: 0.65 | Similarity: 0.98
  Label: 0.48 | Similarity: 0.97
  Label: 0.00 | Similarity: 0.85
✅ Completed: Mean Sim = 0.9319999814033508
   Pearson = 0.7461


Dataset: MRPC


train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]


[MRPC - train] Statistics:
- Examples: 3668
- Distribution: {1: 2474, 0: 1194}

[MRPC - validation] Statistics:
- Examples: 408
- Distribution: {1: 279, 0: 129}
- Sample used: 408 examples

Data sample (MRPC):
                                                                                                   sentence1                                                                                                            sentence2  label
0    Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .      Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .      1
1                  Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .              Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .      0
2  They had published an advertisement on the Internet on June 10 , offering the cargo for

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998999834060669
   Accuracy = 0.6838, F1 = 0.8122

Evaluating all-mpnet-base-v2 with Cosine on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1276



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.96
  Label: 0 | Predicted: 0 | Similarity: 0.14
  Label: 1 | Predicted: 1 | Similarity: 0.88
✅ Completed: Mean Sim = 0.7278000116348267
   Accuracy = 0.8015, F1 = 0.867

Evaluating all-mpnet-base-v2 with Contrastive on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.015



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.97
  Label: 0 | Predicted: 0 | Similarity: 0.49
  Label: 1 | Predicted: 1 | Similarity: 0.91
✅ Completed: Mean Sim = 0.8342999815940857
   Accuracy = 0.7426, F1 = 0.8416

Evaluating all-mpnet-base-v2 with InfoNCE on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0128



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.84
  Label: 0 | Predicted: 1 | Similarity: 0.60
  Label: 1 | Predicted: 1 | Similarity: 0.78
✅ Completed: Mean Sim = 0.8264999985694885
   Accuracy = 0.6887, F1 = 0.8141

Evaluating all-mpnet-base-v2 with Euclidean on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0858



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Accuracy = 0.6838, F1 = 0.8122

Evaluating all-mpnet-base-v2 with NormaEuc on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0858



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Accuracy = 0.6838, F1 = 0.8122

Evaluating all-mpnet-base-v2 with NPairs on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,4.8481



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 0 | Similarity: -0.90
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 0.8438000082969666
   Accuracy = 0.701, F1 = 0.8129

Evaluating all-mpnet-base-v2 with MultiSimilarity on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0128



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.84
  Label: 0 | Predicted: 1 | Similarity: 0.60
  Label: 1 | Predicted: 1 | Similarity: 0.78
✅ Completed: Mean Sim = 0.8264999985694885
   Accuracy = 0.6887, F1 = 0.8141

Evaluating all-mpnet-base-v2 with AngularMargin on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1379



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Accuracy = 0.6838, F1 = 0.8122

Evaluating all-mpnet-base-v2 with Sphere on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0127



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998999834060669
   Accuracy = 0.6838, F1 = 0.8122

Evaluating all-mpnet-base-v2 with HyperSphere on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.79
  Label: 0 | Predicted: 1 | Similarity: 0.73
  Label: 1 | Predicted: 1 | Similarity: 0.79
✅ Completed: Mean Sim = 0.807699978351593
   Accuracy = 0.7059, F1 = 0.8209

Evaluating all-mpnet-base-v2 with Probabilistic on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.5558



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.98
  Label: 0 | Predicted: 0 | Similarity: 0.01
  Label: 1 | Predicted: 1 | Similarity: 0.99
✅ Completed: Mean Sim = 0.7099999785423279
   Accuracy = 0.75, F1 = 0.8283

Evaluating all-mpnet-base-v2 with LiftedStructured on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,12.1282



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Accuracy = 0.6838, F1 = 0.8122

Evaluating all-mpnet-base-v2 with GeneralPair on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3169



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.96
  Label: 0 | Predicted: 0 | Similarity: 0.17
  Label: 1 | Predicted: 1 | Similarity: 0.73
✅ Completed: Mean Sim = 0.679099977016449
   Accuracy = 0.7868, F1 = 0.8518

Evaluating all-mpnet-base-v2 with Angular on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3191



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 0.88
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9383999705314636
   Accuracy = 0.7108, F1 = 0.8239

Evaluating all-mpnet-base-v2 with MarginRanking on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2892



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.85
  Label: 0 | Predicted: 0 | Similarity: 0.05
  Label: 1 | Predicted: 1 | Similarity: 0.55
✅ Completed: Mean Sim = 0.6000000238418579
   Accuracy = 0.777, F1 = 0.8342

Evaluating all-mpnet-base-v2 with Triplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0078



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.98
  Label: 0 | Predicted: 1 | Similarity: 0.92
  Label: 1 | Predicted: 1 | Similarity: 0.94
✅ Completed: Mean Sim = 0.9610999822616577
   Accuracy = 0.6838, F1 = 0.8122

Evaluating all-mpnet-base-v2 with OnlineTriplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0078



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.98
  Label: 0 | Predicted: 1 | Similarity: 0.92
  Label: 1 | Predicted: 1 | Similarity: 0.94
✅ Completed: Mean Sim = 0.9610999822616577
   Accuracy = 0.6838, F1 = 0.8122

Evaluating all-mpnet-base-v2 with BatchHardTriplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0078



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.98
  Label: 0 | Predicted: 1 | Similarity: 0.92
  Label: 1 | Predicted: 1 | Similarity: 0.94
✅ Completed: Mean Sim = 0.9610999822616577
   Accuracy = 0.6838, F1 = 0.8122

Evaluating all-mpnet-base-v2 with BatchSemiHardTriplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0078



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.98
  Label: 0 | Predicted: 1 | Similarity: 0.92
  Label: 1 | Predicted: 1 | Similarity: 0.94
✅ Completed: Mean Sim = 0.9610999822616577
   Accuracy = 0.6838, F1 = 0.8122

Evaluating all-mpnet-base-v2 with BatchAllTriplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0078



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.98
  Label: 0 | Predicted: 1 | Similarity: 0.92
  Label: 1 | Predicted: 1 | Similarity: 0.94
✅ Completed: Mean Sim = 0.9610999822616577
   Accuracy = 0.6838, F1 = 0.8122

----------------------------------------
Model: bert-base-nli-mean-tokens
----------------------------------------

Evaluating bert-base-nli-mean-tokens with MSE on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0049



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998999834060669
   Accuracy = 0.6838, F1 = 0.8122

Evaluating bert-base-nli-mean-tokens with Cosine on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1301



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.94
  Label: 0 | Predicted: 1 | Similarity: 0.60
  Label: 1 | Predicted: 1 | Similarity: 0.82
✅ Completed: Mean Sim = 0.7340999841690063
   Accuracy = 0.7966, F1 = 0.8659

Evaluating bert-base-nli-mean-tokens with Contrastive on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0148



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.96
  Label: 0 | Predicted: 1 | Similarity: 0.77
  Label: 1 | Predicted: 1 | Similarity: 0.85
✅ Completed: Mean Sim = 0.833299994468689
   Accuracy = 0.7426, F1 = 0.8416

Evaluating bert-base-nli-mean-tokens with InfoNCE on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.023



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.86
  Label: 0 | Predicted: 1 | Similarity: 0.75
  Label: 1 | Predicted: 1 | Similarity: 0.70
✅ Completed: Mean Sim = 0.8435999751091003
   Accuracy = 0.6936, F1 = 0.817

Evaluating bert-base-nli-mean-tokens with Euclidean on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0817



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Accuracy = 0.6838, F1 = 0.8122

Evaluating bert-base-nli-mean-tokens with NormaEuc on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0817



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Accuracy = 0.6838, F1 = 0.8122

Evaluating bert-base-nli-mean-tokens with NPairs on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,4.8549



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.98
  Label: 0 | Predicted: 1 | Similarity: 0.81
  Label: 1 | Predicted: 1 | Similarity: 0.90
✅ Completed: Mean Sim = 0.8938000202178955
   Accuracy = 0.6936, F1 = 0.817

Evaluating bert-base-nli-mean-tokens with MultiSimilarity on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.023



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.86
  Label: 0 | Predicted: 1 | Similarity: 0.75
  Label: 1 | Predicted: 1 | Similarity: 0.70
✅ Completed: Mean Sim = 0.8435999751091003
   Accuracy = 0.6936, F1 = 0.817

Evaluating bert-base-nli-mean-tokens with AngularMargin on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1272



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Accuracy = 0.6838, F1 = 0.8122

Evaluating bert-base-nli-mean-tokens with Sphere on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0087



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998999834060669
   Accuracy = 0.6838, F1 = 0.8122

Evaluating bert-base-nli-mean-tokens with HyperSphere on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,21.0196



[MRPC] Evaluation example:
  Label: 1 | Predicted: 0 | Similarity: 0.36
  Label: 0 | Predicted: 1 | Similarity: 0.85
  Label: 1 | Predicted: 1 | Similarity: 0.67
✅ Completed: Mean Sim = 0.7340999841690063
   Accuracy = 0.6422, F1 = 0.7653

Evaluating bert-base-nli-mean-tokens with Probabilistic on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.5482



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.99
  Label: 0 | Predicted: 1 | Similarity: 0.94
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 0.7354999780654907
   Accuracy = 0.7426, F1 = 0.8293

Evaluating bert-base-nli-mean-tokens with LiftedStructured on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,12.1186



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Accuracy = 0.6838, F1 = 0.8122

Evaluating bert-base-nli-mean-tokens with GeneralPair on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3256



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.90
  Label: 0 | Predicted: 0 | Similarity: 0.47
  Label: 1 | Predicted: 1 | Similarity: 0.81
✅ Completed: Mean Sim = 0.6930999755859375
   Accuracy = 0.8137, F1 = 0.8738

Evaluating bert-base-nli-mean-tokens with Angular on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3147



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.99
  Label: 0 | Predicted: 1 | Similarity: 0.93
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9372000098228455
   Accuracy = 0.7083, F1 = 0.8221

Evaluating bert-base-nli-mean-tokens with MarginRanking on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2868



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.78
  Label: 0 | Predicted: 0 | Similarity: 0.08
  Label: 1 | Predicted: 1 | Similarity: 0.76
✅ Completed: Mean Sim = 0.6216999888420105
   Accuracy = 0.7794, F1 = 0.841

Evaluating bert-base-nli-mean-tokens with Triplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.008



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.99
  Label: 0 | Predicted: 1 | Similarity: 0.98
  Label: 1 | Predicted: 1 | Similarity: 0.96
✅ Completed: Mean Sim = 0.9811000227928162
   Accuracy = 0.6838, F1 = 0.8122

Evaluating bert-base-nli-mean-tokens with OnlineTriplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.008



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.99
  Label: 0 | Predicted: 1 | Similarity: 0.98
  Label: 1 | Predicted: 1 | Similarity: 0.96
✅ Completed: Mean Sim = 0.9811000227928162
   Accuracy = 0.6838, F1 = 0.8122

Evaluating bert-base-nli-mean-tokens with BatchHardTriplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.008



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.99
  Label: 0 | Predicted: 1 | Similarity: 0.98
  Label: 1 | Predicted: 1 | Similarity: 0.96
✅ Completed: Mean Sim = 0.9811000227928162
   Accuracy = 0.6838, F1 = 0.8122

Evaluating bert-base-nli-mean-tokens with BatchSemiHardTriplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.008



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.99
  Label: 0 | Predicted: 1 | Similarity: 0.98
  Label: 1 | Predicted: 1 | Similarity: 0.96
✅ Completed: Mean Sim = 0.9811000227928162
   Accuracy = 0.6838, F1 = 0.8122

Evaluating bert-base-nli-mean-tokens with BatchAllTriplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.008



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.99
  Label: 0 | Predicted: 1 | Similarity: 0.98
  Label: 1 | Predicted: 1 | Similarity: 0.96
✅ Completed: Mean Sim = 0.9811000227928162
   Accuracy = 0.6838, F1 = 0.8122

----------------------------------------
Model: paraphrase-MiniLM-L6-v2
----------------------------------------

Evaluating paraphrase-MiniLM-L6-v2 with MSE on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0034



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.99
  Label: 0 | Predicted: 1 | Similarity: 0.99
  Label: 1 | Predicted: 1 | Similarity: 0.99
✅ Completed: Mean Sim = 0.9919999837875366
   Accuracy = 0.6838, F1 = 0.8122

Evaluating paraphrase-MiniLM-L6-v2 with Cosine on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1517



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.94
  Label: 0 | Predicted: 0 | Similarity: 0.36
  Label: 1 | Predicted: 1 | Similarity: 0.91
✅ Completed: Mean Sim = 0.7529000043869019
   Accuracy = 0.7672, F1 = 0.8509

Evaluating paraphrase-MiniLM-L6-v2 with Contrastive on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0187



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.95
  Label: 0 | Predicted: 1 | Similarity: 0.67
  Label: 1 | Predicted: 1 | Similarity: 0.92
✅ Completed: Mean Sim = 0.8546000123023987
   Accuracy = 0.7083, F1 = 0.8242

Evaluating paraphrase-MiniLM-L6-v2 with InfoNCE on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0154



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.75
  Label: 0 | Predicted: 1 | Similarity: 0.58
  Label: 1 | Predicted: 1 | Similarity: 0.79
✅ Completed: Mean Sim = 0.8109999895095825
   Accuracy = 0.6887, F1 = 0.8135

Evaluating paraphrase-MiniLM-L6-v2 with Euclidean on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0991



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Accuracy = 0.6838, F1 = 0.8122

Evaluating paraphrase-MiniLM-L6-v2 with NormaEuc on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0991



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Accuracy = 0.6838, F1 = 0.8122

Evaluating paraphrase-MiniLM-L6-v2 with NPairs on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,4.7943



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.83
  Label: 0 | Predicted: 1 | Similarity: 0.55
  Label: 1 | Predicted: 1 | Similarity: 0.87
✅ Completed: Mean Sim = 0.8070999979972839
   Accuracy = 0.723, F1 = 0.8301

Evaluating paraphrase-MiniLM-L6-v2 with MultiSimilarity on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0154



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.75
  Label: 0 | Predicted: 1 | Similarity: 0.58
  Label: 1 | Predicted: 1 | Similarity: 0.79
✅ Completed: Mean Sim = 0.8109999895095825
   Accuracy = 0.6887, F1 = 0.8135

Evaluating paraphrase-MiniLM-L6-v2 with AngularMargin on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1525



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Accuracy = 0.6838, F1 = 0.8122

Evaluating paraphrase-MiniLM-L6-v2 with Sphere on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0176



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9998000264167786
   Accuracy = 0.6838, F1 = 0.8122

Evaluating paraphrase-MiniLM-L6-v2 with HyperSphere on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,1.3297



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.89
  Label: 0 | Predicted: 1 | Similarity: 0.88
  Label: 1 | Predicted: 1 | Similarity: 0.65
✅ Completed: Mean Sim = 0.8862000107765198
   Accuracy = 0.6863, F1 = 0.8134

Evaluating paraphrase-MiniLM-L6-v2 with Probabilistic on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.5747



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.97
  Label: 0 | Predicted: 0 | Similarity: 0.28
  Label: 1 | Predicted: 1 | Similarity: 0.97
✅ Completed: Mean Sim = 0.7366999983787537
   Accuracy = 0.7402, F1 = 0.829

Evaluating paraphrase-MiniLM-L6-v2 with LiftedStructured on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,12.1398



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 1.0
   Accuracy = 0.6838, F1 = 0.8122

Evaluating paraphrase-MiniLM-L6-v2 with GeneralPair on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3629



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.91
  Label: 0 | Predicted: 0 | Similarity: 0.48
  Label: 1 | Predicted: 1 | Similarity: 0.87
✅ Completed: Mean Sim = 0.6912000179290771
   Accuracy = 0.7672, F1 = 0.8445

Evaluating paraphrase-MiniLM-L6-v2 with Angular on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3524



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 1.00
  Label: 0 | Predicted: 1 | Similarity: 1.00
  Label: 1 | Predicted: 1 | Similarity: 1.00
✅ Completed: Mean Sim = 0.9732000231742859
   Accuracy = 0.6985, F1 = 0.8194

Evaluating paraphrase-MiniLM-L6-v2 with MarginRanking on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3053



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.84
  Label: 0 | Predicted: 0 | Similarity: 0.44
  Label: 1 | Predicted: 1 | Similarity: 0.77
✅ Completed: Mean Sim = 0.6383000016212463
   Accuracy = 0.7426, F1 = 0.8199

Evaluating paraphrase-MiniLM-L6-v2 with Triplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0107



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.98
  Label: 0 | Predicted: 1 | Similarity: 0.93
  Label: 1 | Predicted: 1 | Similarity: 0.93
✅ Completed: Mean Sim = 0.9606000185012817
   Accuracy = 0.6838, F1 = 0.8122

Evaluating paraphrase-MiniLM-L6-v2 with OnlineTriplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0107



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.98
  Label: 0 | Predicted: 1 | Similarity: 0.93
  Label: 1 | Predicted: 1 | Similarity: 0.93
✅ Completed: Mean Sim = 0.9606000185012817
   Accuracy = 0.6838, F1 = 0.8122

Evaluating paraphrase-MiniLM-L6-v2 with BatchHardTriplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0107



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.98
  Label: 0 | Predicted: 1 | Similarity: 0.93
  Label: 1 | Predicted: 1 | Similarity: 0.93
✅ Completed: Mean Sim = 0.9606000185012817
   Accuracy = 0.6838, F1 = 0.8122

Evaluating paraphrase-MiniLM-L6-v2 with BatchSemiHardTriplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0107



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.98
  Label: 0 | Predicted: 1 | Similarity: 0.93
  Label: 1 | Predicted: 1 | Similarity: 0.93
✅ Completed: Mean Sim = 0.9606000185012817
   Accuracy = 0.6838, F1 = 0.8122

Evaluating paraphrase-MiniLM-L6-v2 with BatchAllTriplet on MRPC...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0107



[MRPC] Evaluation example:
  Label: 1 | Predicted: 1 | Similarity: 0.98
  Label: 0 | Predicted: 1 | Similarity: 0.93
  Label: 1 | Predicted: 1 | Similarity: 0.93
✅ Completed: Mean Sim = 0.9606000185012817
   Accuracy = 0.6838, F1 = 0.8122
HTML report generated at: results/experiment_report.html


Experiment completed. Results available in: results


Additional Analyses

Analyzing correlation between metrics...

Analyzing time vs. performance relationship...

Analyzing loss functions impact...

Analysis Summary:
- 126 model-loss combinations tested

STS-B:
- Best loss function: Cosine (Mean Pearson: 0.8910)

MRPC:
- Best loss function: Cosine (Mean F1: 0.8613)

✅ Additional analyses completed and saved in: results


<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>

In [None]:
"""
Avaliação Comparativa de Modelos Sentence-Transformer com Diferentes Funções de Perda
para Tarefas de Similaridade Semântica e Paráfrase

Este script realiza uma avaliação sistemática de diferentes modelos Sentence-Transformer
combinados com várias funções de perda em datasets de similaridade textual (STS-B) e
detecção de paráfrase (MRPC).
"""

import torch
import torch.nn.functional as F
import random
import numpy as np
import pandas as pd
import time
import os
import json
from datetime import datetime
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from scipy.stats import spearmanr, pearsonr
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Configuração de diretórios para resultados
RESULTS_DIR = Path("results")
FIGURES_DIR = RESULTS_DIR / "figures"
MODELS_DIR = RESULTS_DIR / "models"

for directory in [RESULTS_DIR, FIGURES_DIR, MODELS_DIR]:
    directory.mkdir(exist_ok=True, parents=True)

# Configuração para reprodutibilidade
def set_seed(seed_value=42):
    """Configura sementes para reprodutibilidade em múltiplos frameworks."""
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

    # Configurações adicionais para determinismo em PyTorch
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    return seed_value

SEED = set_seed(42)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Configuração: Seed={SEED}, Dispositivo={DEVICE}")

# Configurações de experimento
SAMPLE_SIZE = None  # Usar None para todo o dataset ou um número para amostragem
NUM_EPOCHS = 3
BATCH_SIZE = 16
SAVE_MODELS = True  # Salvar modelos treinados

# Carregar e preparar datasets
class DatasetLoader:
    """Gerenciador para carregar e preparar datasets de similaridade textual."""

    def __init__(self, cache_dir=None):
        self.cache_dir = cache_dir

    def load_dataset(self, name, split='train', sample_size=None, random_state=42):
        """
        Carrega e prepara datasets populares de similaridade textual.

        Args:
            name: Nome do dataset ('stsb' ou 'mrpc')
            split: Partição do dataset ('train', 'validation', 'test')
            sample_size: Número de exemplos para amostragem (None para usar todos)
            random_state: Semente para amostragem reproduzível

        Returns:
            DataFrame com os dados processados
        """
        if name.lower() == 'stsb':
            return self._load_stsb(split, sample_size, random_state)
        elif name.lower() == 'mrpc':
            return self._load_mrpc(split, sample_size, random_state)
        else:
            raise ValueError(f"Dataset não suportado: {name}. Use 'stsb' ou 'mrpc'")

    def _load_stsb(self, split, sample_size, random_state):
        """Carrega o dataset STS-B (Semantic Textual Similarity Benchmark)."""
        ds = load_dataset('glue', 'stsb', cache_dir=self.cache_dir)[split]
        df = pd.DataFrame(ds)

        # Processamento dos labels
        df['label'] = pd.to_numeric(df['label'], errors='coerce')
        df = df.dropna(subset=['label'])

        # Normalização para [0, 1]
        df['label'] = df['label'] / 5.0
        df['label'] = df['label'].clip(lower=0.0, upper=1.0)

        # Label binário para classificação
        df['label_bin'] = (df['label'] > 0.5).astype(int)

        # Estatísticas do dataset
        print(f"\n[STS-B - {split}] Estatísticas:")
        print(f"- Exemplos: {len(df)}")
        print(f"- Range de similaridade: [{df['label'].min():.2f}, {df['label'].max():.2f}]")
        print(f"- Distribuição binária: {df['label_bin'].value_counts().to_dict()}")

        # Aplicar amostragem se solicitado
        if sample_size is not None:
            sample_size = min(sample_size, len(df))
            df = df.sample(n=sample_size, random_state=random_state)
            print(f"- Amostra utilizada: {sample_size} exemplos")

        return df

    def _load_mrpc(self, split, sample_size, random_state):
        """Carrega o dataset MRPC (Microsoft Research Paraphrase Corpus)."""
        ds = load_dataset('glue', 'mrpc', cache_dir=self.cache_dir)[split]
        df = pd.DataFrame(ds)

        # Garantir que labels são inteiros
        df['label'] = df['label'].astype(int)
        df['label_bin'] = df['label']

        # Estatísticas do dataset
        print(f"\n[MRPC - {split}] Estatísticas:")
        print(f"- Exemplos: {len(df)}")
        print(f"- Distribuição: {df['label'].value_counts().to_dict()}")

        # Aplicar amostragem se solicitado
        if sample_size is not None:
            sample_size = min(sample_size, len(df))
            df = df.sample(n=sample_size, random_state=random_state)
            print(f"- Amostra utilizada: {sample_size} exemplos")

        return df

    def visualize_dataset_distribution(self, df, dataset_name):
        """Gera visualização da distribuição dos dados."""
        plt.figure(figsize=(10, 6))

        if dataset_name.lower() == 'stsb':
            sns.histplot(df['label'], bins=20, kde=True)
            plt.title('Distribuição de Similaridade no STS-B')
            plt.xlabel('Similaridade Normalizada [0,1]')
        else:  # MRPC
            counts = df['label'].value_counts().sort_index()
            sns.barplot(x=counts.index, y=counts.values)
            plt.title('Distribuição de Classes no MRPC')
            plt.xlabel('Classe (0=Não Paráfrase, 1=Paráfrase)')
            plt.xticks([0, 1], ['Não Paráfrase', 'Paráfrase'])

        plt.ylabel('Contagem')
        plt.tight_layout()

        fig_path = FIGURES_DIR / f"{dataset_name}_distribution.png"
        plt.savefig(fig_path, dpi=300, bbox_inches='tight')
        plt.close()

        return fig_path

# Classes para triplet learning
class TripletGenerator:
    """Gerador de triplas (âncora, positivo, negativo) para Triplet Loss."""

    def __init__(self, dataset, fixed_negative=None, hard_negatives=False):
        """
        Inicializa o gerador de triplas.

        Args:
            dataset: DataFrame com pares de sentenças
            fixed_negative: Sentença negativa fixa (opcional)
            hard_negatives: Se True, seleciona negativos difíceis do dataset
        """
        self.dataset = dataset
        self.fixed_negative = fixed_negative
        self.hard_negatives = hard_negatives

    def generate_triplets(self, n_triplets=None):
        """
        Gera triplas de sentenças para treinamento.

        Args:
            n_triplets: Número de triplas a gerar (padrão: tamanho do dataset)

        Returns:
            Lista de triplas (âncora, positivo, negativo)
        """
        if n_triplets is None:
            n_triplets = len(self.dataset)

        triplets = []
        indices = random.sample(range(len(self.dataset)), k=min(n_triplets, len(self.dataset)))

        for i in indices:
            anchor = self.dataset.iloc[i]['sentence1']
            positive = self.dataset.iloc[i]['sentence2']

            if self.fixed_negative:
                negative = self.fixed_negative
            elif self.hard_negatives:
                # Selecionar uma sentença diferente como negativo
                neg_idx = random.choice([j for j in range(len(self.dataset)) if j != i])
                negative = random.choice([self.dataset.iloc[neg_idx]['sentence1'],
                                         self.dataset.iloc[neg_idx]['sentence2']])
            else:
                # Usar uma sentença aleatória como negativo
                negative = "Esta é uma frase negativa para a tripla."

            triplets.append((anchor, positive, negative))

        return triplets

class TripletDataset(Dataset):
    """Dataset para triplas compatível com PyTorch DataLoader."""

    def __init__(self, triplets):
        self.triplets = triplets

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, idx):
        a, p, n = self.triplets[idx]
        return InputExample(texts=[a, p, n])

# Função de perda customizada
# Funções de perda
class TripletLoss(torch.nn.Module):
    def __init__(self, model, margin=1.0): super().__init__(); self.model = model; self.margin = margin
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        return torch.mean(F.relu(torch.norm(e[0]-e[1], p=2, dim=1) - torch.norm(e[0]-e[2], p=2, dim=1) + self.margin))

class OnlineTripletLoss(TripletLoss): pass
class BatchHardTripletLoss(TripletLoss): pass
class BatchSemiHardTripletLoss(TripletLoss): pass
class BatchAllTripletLoss(TripletLoss): pass

class MSELoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [self.model(f)['sentence_embedding'] for f in sf]; return F.mse_loss(e[0], e[1])

class EuclideanLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        return torch.mean(torch.norm(e[0] - e[1], p=2, dim=1))

class NormalizedEuclideanLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        distance = torch.norm(e[0] - e[1], p=2, dim=1)
        return torch.mean(distance)

class AngularMarginLoss(torch.nn.Module):
    def __init__(self, model, margin=0.5): super().__init__(); self.model = model; self.margin = margin
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        cosine = torch.sum(e[0] * e[1], dim=1)
        theta = torch.acos(torch.clamp(cosine, -1.0 + 1e-7, 1.0 - 1e-7))
        return torch.mean((theta + self.margin * (1.0 - lbl.float())) ** 2)

class CircleLoss(torch.nn.Module):
    def __init__(self, model, m=0.25, gamma=256): super().__init__(); self.model = model; self.m = m; self.gamma = gamma
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        alpha_p = torch.clamp_min(1 + self.m - sim, min=0)
        alpha_n = torch.clamp_min(sim + self.m, min=0)
        delta_p = 1 - self.m
        delta_n = self.m
        logits_p = (-self.gamma) * alpha_p * (sim - delta_p)
        logits_n = self.gamma * alpha_n * (sim - delta_n)
        loss = torch.log1p(torch.exp(logits_n)) + torch.log1p(torch.exp(logits_p))
        return loss.mean()

class SphereLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        return torch.mean(1 - torch.sum(e[0] * e[1], dim=1))

class HistogramLoss(torch.nn.Module):
    def __init__(self, model, num_bins=10): super().__init__(); self.model = model; self.num_bins = num_bins
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        hist_pos = torch.histc(sim[lbl == 1], bins=self.num_bins, min=-1, max=1)
        hist_neg = torch.histc(sim[lbl == 0], bins=self.num_bins, min=-1, max=1)
        hist_pos /= (torch.sum(hist_pos) + 1e-10)
        hist_neg /= (torch.sum(hist_neg) + 1e-10)
        return torch.sum((hist_pos - hist_neg) ** 2)

class CentroidLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        pos_mask = (lbl == 1).unsqueeze(1)
        neg_mask = (lbl == 0).unsqueeze(1)
        pos_centroid = (e[0] * pos_mask).sum(0) / (pos_mask.sum() + 1e-10)
        neg_centroid = (e[0] * neg_mask).sum(0) / (neg_mask.sum() + 1e-10)
        return F.mse_loss(pos_centroid, neg_centroid)

class HyperSphereLoss(torch.nn.Module):
    def __init__(self, model, radius=1.0): super().__init__(); self.model = model; self.radius = radius
    def forward(self, sf, lbl):
        e = [self.model(f)['sentence_embedding'] for f in sf]
        norms = [torch.norm(emb, p=2, dim=1) for emb in e]
        return torch.mean((norms[0] - self.radius) ** 2 + (norms[1] - self.radius) ** 2)

class ProbabilisticLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        prob = torch.sigmoid(sim)
        return F.binary_cross_entropy(prob, lbl.float())

class LiftedStructuredLoss(torch.nn.Module):
    def __init__(self, model, margin=1.0): super().__init__(); self.model = model; self.margin = margin
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        dist_matrix = torch.cdist(e[0], e[1], p=2)
        pos_mask = (lbl == 1).float()
        neg_mask = (lbl == 0).float()
        pos_term = torch.log(torch.exp(dist_matrix * pos_mask).sum() + 1)
        neg_term = torch.log(torch.exp(-dist_matrix * neg_mask + self.margin).sum() + 1)
        return pos_term + neg_term

class GeneralPairLoss(torch.nn.Module):
    def __init__(self, model, pos_weight=1.0, neg_weight=1.0): super().__init__(); self.model = model; self.pos_weight = pos_weight; self.neg_weight = neg_weight
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        pos_pairs = sim[lbl == 1]
        neg_pairs = sim[lbl == 0]
        pos_loss = self.pos_weight * torch.mean((1 - pos_pairs) ** 2)
        neg_loss = self.neg_weight * torch.mean(neg_pairs ** 2)
        return pos_loss + neg_loss

class AngularLoss(torch.nn.Module):
    def __init__(self, model, angle_bound=1.0): super().__init__(); self.model = model; self.angle_bound = angle_bound
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        cos_theta = torch.sum(e[0] * e[1], dim=1)
        theta = torch.acos(torch.clamp(cos_theta, -1.0 + 1e-7, 1.0 - 1e-7))
        target = lbl.float()
        return torch.mean(target * theta + (1 - target) * torch.clamp(self.angle_bound - theta, min=0.0))

class MarginRankingLoss(torch.nn.Module):
    def __init__(self, model, margin=0.5): super().__init__(); self.model = model; self.margin = margin
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        target = 2 * lbl.float() - 1
        return torch.mean(torch.clamp(self.margin - target * sim, min=0.0))

# Dicionário com funções de perda
loss_functions = {
    'MSE': MSELoss,
    'Cosine': losses.CosineSimilarityLoss,
    'Contrastive': losses.ContrastiveLoss,
    'InfoNCE': losses.MultipleNegativesRankingLoss,
    'Euclidean': EuclideanLoss,
    'NormaEuc': NormalizedEuclideanLoss,
    'NPairs': losses.BatchAllTripletLoss,
    'MultiSimilarity': losses.MultipleNegativesRankingLoss,
    'AngularMargin': AngularMarginLoss,
    'Sphere': SphereLoss,
    'HyperSphere': HyperSphereLoss,
    'Probabilistic': ProbabilisticLoss,
    'LiftedStructured': LiftedStructuredLoss,
    'GeneralPair': GeneralPairLoss,
    'Angular': AngularLoss,
    'MarginRanking': MarginRankingLoss,
    'Triplet': TripletLoss,
    'OnlineTriplet': OnlineTripletLoss,
    'BatchHardTriplet': BatchHardTripletLoss,
    'BatchSemiHardTriplet': BatchSemiHardTripletLoss,
    'BatchAllTriplet': BatchAllTripletLoss
}

def gerar_exemplos(df, loss_name, fixed_negative=None):
    """
    Gera exemplos de treinamento compatíveis com diferentes funções de perda.

    Args:
        df: DataFrame com os dados
        loss_name: Nome da função de perda a usar
        fixed_negative: Sentença negativa fixa para Triplet Loss

    Returns:
        Dataset com exemplos formatados para a função de perda especificada
    """
    if 'Triplet' in loss_name:
        triplets = TripletGenerator(df, fixed_negative, hard_negatives=True).generate_triplets()
        return TripletDataset(triplets)
    elif loss_name == 'Contrastive':
        # Para Contrastive Loss, usamos labels binários
        exemplos = [InputExample(texts=[r['sentence1'], r['sentence2']], label=float(r['label_bin']))
                    for _, r in df.iterrows()]
        return exemplos
    else:
        # Para outras funções de perda, usamos similaridade contínua
        exemplos = [InputExample(texts=[r['sentence1'], r['sentence2']], label=float(r['label']))
                    for _, r in df.iterrows()]
        return exemplos

# Funções de avaliação
def avaliar_modelo(model, df_teste, dataset_nome):
    """
    Avalia um modelo em um dataset de teste.

    Args:
        model: Modelo SentenceTransformer treinado
        df_teste: DataFrame com dados de teste
        dataset_nome: Nome do dataset ('stsb' ou 'mrpc')

    Returns:
        Dicionário com métricas de avaliação
    """
    # Preparar dados
    sent1 = df_teste['sentence1'].tolist()
    sent2 = df_teste['sentence2'].tolist()
    labels = df_teste['label'].tolist()

    # Calcular embeddings e similaridades
    embeddings = model.encode(sent1 + sent2, batch_size=32, show_progress_bar=False)
    embeddings1 = embeddings[:len(sent1)]
    embeddings2 = embeddings[len(sent1):]

    # Calcular similaridades de cosseno
    similaridades = []
    for e1, e2 in zip(embeddings1, embeddings2):
        similaridades.append(cosine_similarity([e1], [e2])[0][0])

    # Métricas básicas
    mean_sim = np.mean(similaridades)
    std_sim = np.std(similaridades)
    resultados = {
        'mean_similarity': mean_sim,
        'std_similarity': std_sim
    }

    # Métricas específicas por dataset
    if dataset_nome.lower() == 'stsb':
        # Correlação para tarefas de similaridade
        if len(set(labels)) > 1 and len(set(similaridades)) > 1:
            resultados['pearson'] = pearsonr(labels, similaridades)[0]
            resultados['spearman'] = spearmanr(labels, similaridades)[0]
        else:
            resultados['pearson'] = float('nan')
            resultados['spearman'] = float('nan')

        # Exemplo para depuração
        print("\n[STS-B] Exemplo de avaliação:")
        for i in range(min(3, len(labels))):
            print(f"  Label: {labels[i]:.2f} | Similaridade: {similaridades[i]:.2f}")

    elif dataset_nome.lower() == 'mrpc':
        # Métricas de classificação
        binary_preds = [1 if s >= 0.5 else 0 for s in similaridades]
        resultados['accuracy'] = accuracy_score(labels, binary_preds)
        resultados['f1'] = f1_score(labels, binary_preds)
        resultados['precision'] = precision_score(labels, binary_preds)
        resultados['recall'] = recall_score(labels, binary_preds)

        # Exemplo para depuração
        print("\n[MRPC] Exemplo de avaliação:")
        for i in range(min(3, len(labels))):
            print(f"  Label: {labels[i]} | Predito: {binary_preds[i]} | Similaridade: {similaridades[i]:.2f}")

    return resultados

def plotar_resultados(results_df, metric, dataset_name):
    """
    Gera gráficos comparativos de resultados.

    Args:
        results_df: DataFrame com resultados
        metric: Métrica a visualizar
        dataset_name: Nome do dataset

    Returns:
        Caminho para o arquivo de figura salvo
    """
    plt.figure(figsize=(12, 8))

    # Filtrar dados para o dataset específico
    df = results_df[results_df['Dataset'] == dataset_name].copy()

    # Preparar gráfico de barras agrupadas
    pivot_df = df.pivot(index='Model', columns='Loss Function', values=metric)

    ax = pivot_df.plot(kind='bar', figsize=(12, 8))

    # Configurações do gráfico
    plt.title(f'{metric} por Modelo e Função de Perda - {dataset_name.upper()}', fontsize=14)
    plt.xlabel('Modelo', fontsize=12)
    plt.ylabel(metric, fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(title='Função de Perda', fontsize=10)

    # Adicionar valores nas barras
    for container in ax.containers:
        ax.bar_label(container, fmt='%.3f', fontsize=8)

    plt.tight_layout()

    # Salvar figura
    filename = f"{dataset_name}_{metric}_comparison.png"
    filepath = FIGURES_DIR / filename
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    return filepath

def plot_training_curve(history, model_name, loss_name, dataset_name):
    """
    Plota curva de treinamento.

    Args:
        history: Histórico de treinamento
        model_name: Nome do modelo
        loss_name: Nome da função de perda
        dataset_name: Nome do dataset

    Returns:
        Caminho para o arquivo de figura salvo
    """
    plt.figure(figsize=(10, 6))

    # Extrair dados de histórico
    epochs = range(1, len(history['train_loss']) + 1)

    # Plotar perdas
    plt.plot(epochs, history['train_loss'], 'b-', label='Perda de Treinamento')

    # Adicionar informações ao gráfico
    plt.title(f'Curva de Treinamento: {model_name}\n{loss_name} em {dataset_name}', fontsize=14)
    plt.xlabel('Época', fontsize=12)
    plt.ylabel('Perda', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()

    # Salvar figura
    model_short = model_name.split('/')[-1] if '/' in model_name else model_name
    filename = f"{dataset_name}_{model_short}_{loss_name}_training.png"
    filepath = FIGURES_DIR / filename
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    return filepath

# Função principal de treinamento e avaliação
def treinar_e_avaliar(model_name, dataset_nome, loss_name, train_df, test_df,
                      epochs=3, batch_size=16, save_model=False):
    """
    Treina e avalia um modelo com uma função de perda específica.

    Args:
        model_name: Nome do modelo Sentence-Transformer
        dataset_nome: Nome do dataset ('stsb' ou 'mrpc')
        loss_name: Nome da função de perda
        train_df: DataFrame com dados de treinamento
        test_df: DataFrame com dados de teste
        epochs: Número de épocas de treinamento
        batch_size: Tamanho do lote
        save_model: Se True, salva o modelo treinado

    Returns:
        Dicionário com resultados e métricas
    """
    try:
        # Inicializar modelo
        model = SentenceTransformer(model_name).to(DEVICE)
        model_identificador = model_name.split('/')[-1] if '/' in model_name else model_name

        # Configurar treinamento
        fixed_negative = "Este é um exemplo de sentença negativa para triplas de treinamento."
        dataset = gerar_exemplos(train_df, loss_name, fixed_negative)
        dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
        loss_fn = loss_functions[loss_name](model)

        # Registrar histórico de treinamento
        history = {'train_loss': []}

        class LogCallback:
            def __init__(self, history):
                self.history = history

            def on_epoch_end(self, epoch, loss, *args, **kwargs):
                self.history['train_loss'].append(loss)

        # Executar treinamento
        start_time = time.time()
        model.fit(
            train_objectives=[(dataloader, loss_fn)],
            epochs=epochs,
            warmup_steps=int(len(dataloader) * 0.1),
            show_progress_bar=True,
            output_path=None,
            callback=LogCallback(history)
        )
        training_time = time.time() - start_time

        # Avaliar modelo
        evaluation_results = avaliar_modelo(model, test_df, dataset_nome)

        # Salvar modelo se solicitado
        model_path = None
        if save_model:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_path = MODELS_DIR / f"{dataset_nome}_{model_identificador}_{loss_name}_{timestamp}"
            model.save(str(model_path))

        # Plotar curva de treinamento
        training_plot = plot_training_curve(history, model_name, loss_name, dataset_nome)

        # Consolidar resultados
        results = {
            'Dataset': dataset_nome,
            'Model': model_name,
            'Loss Function': loss_name,
            'Training Time (s)': round(training_time, 2),
            'Mean Similarity': round(evaluation_results['mean_similarity'], 4),
            'STD Similarity': round(evaluation_results['std_similarity'], 4),
            'Epochs': epochs,
            'Batch Size': batch_size,
            'Training Plot': str(training_plot),
            'Model Path': str(model_path) if model_path else None
        }

        # Adicionar métricas específicas
        if dataset_nome.lower() == 'stsb':
            results['Pearson'] = round(evaluation_results['pearson'], 4) if 'pearson' in evaluation_results else None
            results['Spearman'] = round(evaluation_results['spearman'], 4) if 'spearman' in evaluation_results else None
        elif dataset_nome.lower() == 'mrpc':
            results['Accuracy'] = round(evaluation_results['accuracy'], 4) if 'accuracy' in evaluation_results else None
            results['F1 Score'] = round(evaluation_results['f1'], 4) if 'f1' in evaluation_results else None
            results['Precision'] = round(evaluation_results['precision'], 4) if 'precision' in evaluation_results else None
            results['Recall'] = round(evaluation_results['recall'], 4) if 'recall' in evaluation_results else None

        return results

    except Exception as e:
        print(f"Erro em treinar_e_avaliar({model_name}, {dataset_nome}, {loss_name}): {e}")
        import traceback
        traceback.print_exc()
        return {
            'Dataset': dataset_nome,
            'Model': model_name,
            'Loss Function': loss_name,
            'Error': str(e)
        }

# Função principal
def main():
    # Lista de modelos a avaliar
    model_names = [
        'sentence-transformers/paraphrase-MiniLM-L6-v2',
        'sentence-transformers/bert-base-nli-mean-tokens',
        'sentence-transformers/all-mpnet-base-v2'
    ]

    # Datasets a avaliar
    datasets = ['stsb', 'mrpc']

    # Configurações experimentais
    experiment_config = {
        'seed': SEED,
        'device': str(DEVICE),
        'epochs': NUM_EPOCHS,
        'batch_size': BATCH_SIZE,
        'sample_size': SAMPLE_SIZE,
        'save_models': SAVE_MODELS,
        'models': model_names,
        'datasets': datasets,
        'loss_functions': list(loss_functions.keys()),
        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    # Salvar configuração do experimento
    with open(RESULTS_DIR / "experiment_config.json", 'w') as f:
        json.dump(experiment_config, f, indent=2)

    # Inicializar carregador de datasets
    loader = DatasetLoader()

    # Resultados armazenados aqui
    all_results = []
    dataset_figures = {}

    # Loop principal
    for dataset_nome in datasets:
        print(f"\n\n{'='*60}")
        print(f"Dataset: {dataset_nome.upper()}")
        print(f"{'='*60}")

        # Carregar datasets
        train_df = loader.load_dataset(dataset_nome, 'train', sample_size=SAMPLE_SIZE)
        test_df = loader.load_dataset(dataset_nome, 'validation', sample_size=min(1000, SAMPLE_SIZE if SAMPLE_SIZE else 1000))

        # Visualização da distribuição
        dist_fig = loader.visualize_dataset_distribution(train_df, dataset_nome)
        dataset_figures[dataset_nome] = str(dist_fig)

        # Exemplo dos dados
        print(f"\nAmostra dos dados ({dataset_nome.upper()}):")
        print(train_df[['sentence1', 'sentence2', 'label']].head(3).to_string())

        # Executar avaliação para cada combinação
        results_dataset = []

        for model_name in model_names:
            model_short = model_name.split('/')[-1]
            print(f"\n{'-'*40}")
            print(f"Modelo: {model_short}")
            print(f"{'-'*40}")

            for loss_name in loss_functions.keys():
                print(f"\nAvaliando {model_short} com {loss_name} em {dataset_nome.upper()}...")

                result = treinar_e_avaliar(
                    model_name=model_name,
                    dataset_nome=dataset_nome,
                    loss_name=loss_name,
                    train_df=train_df,
                    test_df=test_df,
                    epochs=NUM_EPOCHS,
                    batch_size=BATCH_SIZE,
                    save_model=SAVE_MODELS
                )

                results_dataset.append(result)
                all_results.append(result)

                # Log imediato do resultado
                if 'Error' in result:
                    print(f"❌ Erro: {result['Error']}")
                else:
                    print(f"✅ Concluído: Mean Sim = {result['Mean Similarity']}")
                    if dataset_nome.lower() == 'stsb':
                        print(f"   Pearson = {result['Pearson']}")
                    else:
                        print(f"   Accuracy = {result['Accuracy']}, F1 = {result['F1 Score']}")

        # Salvar resultados por dataset
        results_df = pd.DataFrame(results_dataset)
        results_df.to_csv(RESULTS_DIR / f"resultados_{dataset_nome}.csv", index=False)

        # Gerar visualizações
        if dataset_nome.lower() == 'stsb':
            plotar_resultados(results_df, 'Pearson', dataset_nome)
        else:
            plotar_resultados(results_df, 'F1 Score', dataset_nome)
            plotar_resultados(results_df, 'Accuracy', dataset_nome)

    # Consolidar todos os resultados
    all_results_df = pd.DataFrame(all_results)
    all_results_df.to_csv(RESULTS_DIR / "resultados_completos.csv", index=False)

    # Gerar relatório HTML
    generate_html_report(all_results_df, experiment_config, dataset_figures)

    print("\n\nExperimento concluído. Resultados disponíveis em:", RESULTS_DIR)
    return all_results_df

def generate_html_report(results_df, config, dataset_figures):

    """
    Gera um relatório HTML com os resultados do experimento.

    Args:
        results_df: DataFrame com todos os resultados
        config: Configuração do experimento
        dataset_figures: Dicionário com caminhos para figuras de datasets
    """
    report_path = RESULTS_DIR / "relatorio_experimento.html"

    # Preparar tabelas de resultados
    stsb_df = results_df[results_df['Dataset'] == 'stsb'].copy()
    mrpc_df = results_df[results_df['Dataset'] == 'mrpc'].copy()

    # Selecionar colunas relevantes
    stsb_cols = ['Model', 'Loss Function', 'Training Time (s)', 'Mean Similarity', 'Pearson', 'Spearman']
    mrpc_cols = ['Model', 'Loss Function', 'Training Time (s)', 'Mean Similarity', 'Accuracy', 'F1 Score', 'Precision', 'Recall']

    # Limpar nomes de modelos para exibição
    for df in [stsb_df, mrpc_df]:
        df['Model'] = df['Model'].apply(lambda x: x.split('/')[-1] if '/' in x else x)

    # Gerar HTML
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Relatório de Avaliação de Sentence-Transformers</title>
        <style>
            body {{ font-family: Arial, sans-serif; line-height: 1.6; margin: 0; padding: 20px; color: #333; }}
            h1, h2, h3 {{ color: #2c3e50; }}
            table {{ border-collapse: collapse; width: 100%; margin-bottom: 20px; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            th {{ background-color: #f2f2f2; color: #333; font-weight: bold; }}
            tr:nth-child(even) {{ background-color: #f9f9f9; }}
            tr:hover {{ background-color: #f5f5f5; }}
            .container {{ max-width: 1200px; margin: 0 auto; padding: 20px; }}
            .section {{ margin-bottom: 30px; }}
            .best-result {{ font-weight: bold; color: #27ae60; }}
            img {{ max-width: 100%; height: auto; margin: 10px 0; border: 1px solid #ddd; }}
            .config {{ background-color: #f8f9fa; padding: 15px; border-radius: 4px; margin-bottom: 20px; }}
            footer {{ margin-top: 30px; padding-top: 10px; border-top: 1px solid #eee; color: #7f8c8d; font-size: 0.9em; }}
        </style>
    </head>
    <body>
        <div class="container">
            <header>
                <h1>Avaliação Comparativa de Modelos Sentence-Transformer</h1>
                <p>Relatório gerado em: {config['timestamp']}</p>
            </header>

            <div class="section">
                <h2>Configuração do Experimento</h2>
                <div class="config">
                    <p><strong>Dispositivo:</strong> {config['device']}</p>
                    <p><strong>Semente:</strong> {config['seed']}</p>
                    <p><strong>Épocas:</strong> {config['epochs']}</p>
                    <p><strong>Tamanho do Lote:</strong> {config['batch_size']}</p>
                    <p><strong>Amostra:</strong> {config['sample_size'] if config['sample_size'] else 'Dataset Completo'}</p>
                    <p><strong>Modelos:</strong> {', '.join([m.split('/')[-1] if '/' in m else m for m in config['models']])}</p>
                    <p><strong>Funções de Perda:</strong> {', '.join(config['loss_functions'])}</p>
                </div>
            </div>

            <div class="section">
                <h2>Resultados - STS-B (Similaridade Semântica)</h2>
                <p>Distribuição dos dados de treinamento:</p>
                <img src="{dataset_figures['stsb']}" alt="Distribuição STS-B">

                <h3>Métricas de Desempenho</h3>
                <table>
                    <tr>
                        <th>Modelo</th>
                        <th>Função de Perda</th>
                        <th>Tempo (s)</th>
                        <th>Similaridade Média</th>
                        <th>Correlação de Pearson</th>
                        <th>Correlação de Spearman</th>
                    </tr>
                    {stsb_df[stsb_cols].sort_values('Pearson', ascending=False).to_html(index=False, header=False, classes='results-table')}
                </table>

                <h3>Visualização de Resultados</h3>
                <img src="{FIGURES_DIR / 'stsb_Pearson_comparison.png'}" alt="Comparação Pearson STS-B">
            </div>

            <div class="section">
                <h2>Resultados - MRPC (Detecção de Paráfrase)</h2>
                <p>Distribuição dos dados de treinamento:</p>
                <img src="{dataset_figures['mrpc']}" alt="Distribuição MRPC">

                <h3>Métricas de Desempenho</h3>
                <table>
                    <tr>
                        <th>Modelo</th>
                        <th>Função de Perda</th>
                        <th>Tempo (s)</th>
                        <th>Similaridade Média</th>
                        <th>Acurácia</th>
                        <th>F1 Score</th>
                        <th>Precisão</th>
                        <th>Recall</th>
                    </tr>
                    {mrpc_df[mrpc_cols].sort_values('F1 Score', ascending=False).to_html(index=False, header=False, classes='results-table')}
                </table>

                <h3>Visualização de Resultados</h3>
                <img src="{FIGURES_DIR / 'mrpc_F1 Score_comparison.png'}" alt="Comparação F1 MRPC">
                <img src="{FIGURES_DIR / 'mrpc_Accuracy_comparison.png'}" alt="Comparação Acurácia MRPC">
            </div>

            <div class="section">
                <h2>Análise das Curvas de Treinamento</h2>
                <p>Exemplos de curvas de treinamento para os melhores modelos:</p>

                <h3>STS-B (Melhor modelo)</h3>
                <img src="{stsb_df.sort_values('Pearson', ascending=False).iloc[0]['Training Plot']}" alt="Melhor Curva STS-B">

                <h3>MRPC (Melhor modelo)</h3>
                <img src="{mrpc_df.sort_values('F1 Score', ascending=False).iloc[0]['Training Plot']}" alt="Melhor Curva MRPC">
            </div>

            <div class="section">
                <h2>Conclusões</h2>
                <p><strong>Melhor configuração para STS-B:</strong> {stsb_df.sort_values('Pearson', ascending=False).iloc[0]['Model']} com {stsb_df.sort_values('Pearson', ascending=False).iloc[0]['Loss Function']} (Pearson: {stsb_df.sort_values('Pearson', ascending=False).iloc[0]['Pearson']})</p>
                <p><strong>Melhor configuração para MRPC:</strong> {mrpc_df.sort_values('F1 Score', ascending=False).iloc[0]['Model']} com {mrpc_df.sort_values('F1 Score', ascending=False).iloc[0]['Loss Function']} (F1: {mrpc_df.sort_values('F1 Score', ascending=False).iloc[0]['F1 Score']})</p>

                <p>Observações gerais:</p>
                <ul>
                    <li>As funções de perda têm impacto significativo no desempenho dos modelos.</li>
                    <li>Modelos especializados em paráfrase tendem a ter melhor desempenho nas tarefas avaliadas.</li>
                    <li>O tempo de treinamento varia consideravelmente entre os modelos.</li>
                </ul>
            </div>

            <footer>
                <p>Relatório gerado automaticamente pelo script de avaliação de Sentence-Transformers.</p>
                <p>Todos os modelos e resultados estão disponíveis no diretório: {RESULTS_DIR}</p>
            </footer>
        </div>
    </body>
    </html>
    """

    # Salvar relatório
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"Relatório HTML gerado em: {report_path}")
    return report_path

# Funções adicionais para análise avançada

def analisar_correlacao_similaridade_metricas(results_df, dataset_nome):
    """
    Analisa a correlação entre similaridade média e métricas de desempenho.

    Args:
        results_df: DataFrame com resultados
        dataset_nome: Nome do dataset a analisar

    Returns:
        Figura com matriz de correlação
    """
    # Filtrar dados para o dataset específico
    df = results_df[results_df['Dataset'] == dataset_nome].copy()

    # Colunas para analisar
    if dataset_nome.lower() == 'stsb':
        cols = ['Mean Similarity', 'STD Similarity', 'Pearson', 'Spearman', 'Training Time (s)']
    else:  # MRPC
        cols = ['Mean Similarity', 'STD Similarity', 'Accuracy', 'F1 Score',
                'Precision', 'Recall', 'Training Time (s)']

    # Calcular matriz de correlação
    corr_matrix = df[cols].corr()

    # Visualizar matriz de correlação
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title(f'Correlação entre Métricas - {dataset_nome.upper()}')
    plt.tight_layout()

    # Salvar figura
    filepath = FIGURES_DIR / f"{dataset_nome}_metric_correlation.png"
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    return filepath

def analisar_tempo_vs_desempenho(results_df):
    """
    Analisa a relação entre tempo de treinamento e métricas de desempenho.

    Args:
        results_df: DataFrame com resultados

    Returns:
        Figura com gráficos de dispersão
    """
    plt.figure(figsize=(12, 10))

    # Dividir em subplots
    fig, axes = plt.subplots(2, 1, figsize=(12, 12))

    # Dados para STS-B
    stsb_df = results_df[results_df['Dataset'] == 'stsb'].copy()
    stsb_df['Model'] = stsb_df['Model'].apply(lambda x: x.split('/')[-1] if '/' in x else x)

    # Dados para MRPC
    mrpc_df = results_df[results_df['Dataset'] == 'mrpc'].copy()
    mrpc_df['Model'] = mrpc_df['Model'].apply(lambda x: x.split('/')[-1] if '/' in x else x)

    # Gráfico para STS-B
    ax = axes[0]
    for model in stsb_df['Model'].unique():
        model_df = stsb_df[stsb_df['Model'] == model]
        ax.scatter(model_df['Training Time (s)'], model_df['Pearson'],
                  label=model, alpha=0.7, s=80)

        # Adicionar rótulos para cada ponto
        for _, row in model_df.iterrows():
            ax.annotate(row['Loss Function'],
                       (row['Training Time (s)'], row['Pearson']),
                       fontsize=8, alpha=0.8)

    ax.set_title('STS-B: Correlação de Pearson vs. Tempo de Treinamento')
    ax.set_xlabel('Tempo de Treinamento (segundos)')
    ax.set_ylabel('Correlação de Pearson')
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend()

    # Gráfico para MRPC
    ax = axes[1]
    for model in mrpc_df['Model'].unique():
        model_df = mrpc_df[mrpc_df['Model'] == model]
        ax.scatter(model_df['Training Time (s)'], model_df['F1 Score'],
                  label=model, alpha=0.7, s=80)

        # Adicionar rótulos para cada ponto
        for _, row in model_df.iterrows():
            ax.annotate(row['Loss Function'],
                       (row['Training Time (s)'], row['F1 Score']),
                       fontsize=8, alpha=0.8)

    ax.set_title('MRPC: F1 Score vs. Tempo de Treinamento')
    ax.set_xlabel('Tempo de Treinamento (segundos)')
    ax.set_ylabel('F1 Score')
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend()

    plt.tight_layout()

    # Salvar figura
    filepath = FIGURES_DIR / "tempo_vs_desempenho.png"
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    return filepath

def analisar_impacto_funcao_perda(results_df):
    """
    Analisa o impacto das diferentes funções de perda no desempenho.

    Args:
        results_df: DataFrame com resultados

    Returns:
        DataFrame com estatísticas de impacto
    """
    # Estatísticas por função de perda
    impacto = []

    # Análise para STS-B
    stsb_df = results_df[results_df['Dataset'] == 'stsb'].copy()
    stsb_metrics = ['Pearson', 'Spearman']

    for loss_fn in stsb_df['Loss Function'].unique():
        loss_stats = {
            'Dataset': 'STS-B',
            'Loss Function': loss_fn,
            'Count': len(stsb_df[stsb_df['Loss Function'] == loss_fn])
        }

        for metric in stsb_metrics:
            loss_stats[f'Mean {metric}'] = stsb_df[stsb_df['Loss Function'] == loss_fn][metric].mean()
            loss_stats[f'Std {metric}'] = stsb_df[stsb_df['Loss Function'] == loss_fn][metric].std()
            loss_stats[f'Max {metric}'] = stsb_df[stsb_df['Loss Function'] == loss_fn][metric].max()
            loss_stats[f'Min {metric}'] = stsb_df[stsb_df['Loss Function'] == loss_fn][metric].min()

        impacto.append(loss_stats)

    # Análise para MRPC
    mrpc_df = results_df[results_df['Dataset'] == 'mrpc'].copy()
    mrpc_metrics = ['Accuracy', 'F1 Score']

    for loss_fn in mrpc_df['Loss Function'].unique():
        loss_stats = {
            'Dataset': 'MRPC',
            'Loss Function': loss_fn,
            'Count': len(mrpc_df[mrpc_df['Loss Function'] == loss_fn])
        }

        for metric in mrpc_metrics:
            loss_stats[f'Mean {metric}'] = mrpc_df[mrpc_df['Loss Function'] == loss_fn][metric].mean()
            loss_stats[f'Std {metric}'] = mrpc_df[mrpc_df['Loss Function'] == loss_fn][metric].std()
            loss_stats[f'Max {metric}'] = mrpc_df[mrpc_df['Loss Function'] == loss_fn][metric].max()
            loss_stats[f'Min {metric}'] = mrpc_df[mrpc_df['Loss Function'] == loss_fn][metric].min()

        impacto.append(loss_stats)

    # Criar DataFrame com estatísticas
    impacto_df = pd.DataFrame(impacto)

    # Salvar análise
    impacto_df.to_csv(RESULTS_DIR / "impacto_funcoes_perda.csv", index=False)

    return impacto_df

def experimento_estendido():
    """
    Função principal que executa o experimento e análises adicionais.
    """
    try:
        # Executar experimento principal
        results_df = main()

        # Validar se temos resultados para análises
        if results_df is None or len(results_df) == 0:
            print("❌ Sem resultados para análises adicionais.")
            return

        print("\n\n" + "="*60)
        print("Análises Adicionais")
        print("="*60)

        # Análise de correlação entre métricas
        print("\nAnalisando correlação entre métricas...")
        analisar_correlacao_similaridade_metricas(results_df, 'stsb')
        analisar_correlacao_similaridade_metricas(results_df, 'mrpc')

        # Análise de tempo vs. desempenho
        print("\nAnalisando relação tempo vs. desempenho...")
        analisar_tempo_vs_desempenho(results_df)

        # Análise do impacto das funções de perda
        print("\nAnalisando impacto das funções de perda...")
        impacto_df = analisar_impacto_funcao_perda(results_df)

        # Resumo das análises
        print("\nResumo das Análises:")
        print(f"- {len(results_df)} combinações de modelo-loss testadas")

        for dataset in ['STS-B', 'MRPC']:
            print(f"\n{dataset}:")
            dataset_impact = impacto_df[impacto_df['Dataset'] == dataset]

            if dataset == 'STS-B':
                best_loss = dataset_impact.sort_values('Mean Pearson', ascending=False).iloc[0]
                print(f"- Melhor função de perda: {best_loss['Loss Function']} (Pearson médio: {best_loss['Mean Pearson']:.4f})")
            else:
                best_loss = dataset_impact.sort_values('Mean F1 Score', ascending=False).iloc[0]
                print(f"- Melhor função de perda: {best_loss['Loss Function']} (F1 médio: {best_loss['Mean F1 Score']:.4f})")

        print("\n✅ Análises adicionais concluídas e salvas em:", RESULTS_DIR)

    except Exception as e:
        print(f"❌ Erro nas análises adicionais: {e}")
        import traceback
        traceback.print_exc()

# Executar se for o script principal
if __name__ == "__main__":
    print("="*80)
    print("Avaliação de Modelos Sentence-Transformer para Similaridade Semântica")
    print("="*80)
    print("Configurações:")
    print(f"- Seed: {SEED}")
    print(f"- Dispositivo: {DEVICE}")
    print(f"- Épocas: {NUM_EPOCHS}")
    print(f"- Batch Size: {BATCH_SIZE}")
    print(f"- Tamanho da amostra: {SAMPLE_SIZE if SAMPLE_SIZE else 'Dataset completo'}")
    print(f"- Diretório de resultados: {RESULTS_DIR}")
    print("="*80)

    # Executar experimento completo com análises adicionais
    experimento_estendido()

Configuração: Seed=42, Dispositivo=cuda
Avaliação de Modelos Sentence-Transformer para Similaridade Semântica
Configurações:
- Seed: 42
- Dispositivo: cuda
- Épocas: 3
- Batch Size: 16
- Tamanho da amostra: Dataset completo
- Diretório de resultados: results


Dataset: STSB


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/502k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/151k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/114k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]


[STS-B - train] Estatísticas:
- Exemplos: 5749
- Range de similaridade: [0.00, 1.00]
- Distribuição binária: {1: 3385, 0: 2364}

[STS-B - validation] Estatísticas:
- Exemplos: 1500
- Range de similaridade: [0.00, 1.00]
- Distribuição binária: {1: 750, 0: 750}
- Amostra utilizada: 1000 exemplos

Amostra dos dados (STSB):
                                       sentence1                                                 sentence2  label
0                         A plane is taking off.                               An air plane is taking off.   1.00
1                A man is playing a large flute.                                 A man is playing a flute.   0.76
2  A man is spreading shreded cheese on a pizza.  A man is spreading shredded cheese on an uncooked pizza.   0.76

----------------------------------------
Modelo: paraphrase-MiniLM-L6-v2
----------------------------------------

Avaliando paraphrase-MiniLM-L6-v2 com MSE em STSB...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mleonardp315[0m ([33mleonardp315-utad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.01
1000,0.0017



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.99
  Label: 0.48 | Similaridade: 0.98
  Label: 0.00 | Similaridade: 0.97
✅ Concluído: Mean Sim = 0.9581999778747559
   Pearson = 0.2387

Avaliando paraphrase-MiniLM-L6-v2 com Cosine em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0208
1000,0.0141



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.79
  Label: 0.48 | Similaridade: 0.76
  Label: 0.00 | Similaridade: -0.16
✅ Concluído: Mean Sim = 0.5178999900817871
   Pearson = 0.8851

Avaliando paraphrase-MiniLM-L6-v2 com Contrastive em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0142
1000,0.0111



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.91
  Label: 0.48 | Similaridade: 0.89
  Label: 0.00 | Similaridade: 0.10
✅ Concluído: Mean Sim = 0.7210000157356262
   Pearson = 0.83

Avaliando paraphrase-MiniLM-L6-v2 com InfoNCE em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2861
1000,0.1663



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.80
  Label: 0.48 | Similaridade: 0.88
  Label: 0.00 | Similaridade: 0.04
✅ Concluído: Mean Sim = 0.5982000231742859
   Pearson = 0.776

Avaliando paraphrase-MiniLM-L6-v2 com Euclidean em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


KeyboardInterrupt: 

In [None]:
"""
Avaliação Comparativa de Modelos Sentence-Transformer com Diferentes Funções de Perda
para Tarefas de Similaridade Semântica e Paráfrase

Este script realiza uma avaliação sistemática de diferentes modelos Sentence-Transformer
combinados com várias funções de perda em datasets de similaridade textual (STS-B) e
detecção de paráfrase (MRPC).
"""

import torch
import torch.nn.functional as F
import random
import numpy as np
import pandas as pd
import time
import os
import json
from datetime import datetime
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from scipy.stats import spearmanr, pearsonr
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Configuração de diretórios para resultados
RESULTS_DIR = Path("results")
FIGURES_DIR = RESULTS_DIR / "figures"
MODELS_DIR = RESULTS_DIR / "models"

for directory in [RESULTS_DIR, FIGURES_DIR, MODELS_DIR]:
    directory.mkdir(exist_ok=True, parents=True)

# Configuração para reprodutibilidade
def set_seed(seed_value=42):
    """Configura sementes para reprodutibilidade em múltiplos frameworks."""
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

    # Configurações adicionais para determinismo em PyTorch
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    return seed_value

SEED = set_seed(42)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Configuração: Seed={SEED}, Dispositivo={DEVICE}")

# Configurações de experimento
SAMPLE_SIZE = None  # Usar None para todo o dataset ou um número para amostragem
NUM_EPOCHS = 3
BATCH_SIZE = 16
SAVE_MODELS = True  # Salvar modelos treinados

# Carregar e preparar datasets
class DatasetLoader:
    """Gerenciador para carregar e preparar datasets de similaridade textual."""

    def __init__(self, cache_dir=None):
        self.cache_dir = cache_dir

    def load_dataset(self, name, split='train', sample_size=None, random_state=42):
        """
        Carrega e prepara datasets populares de similaridade textual.

        Args:
            name: Nome do dataset ('stsb' ou 'mrpc')
            split: Partição do dataset ('train', 'validation', 'test')
            sample_size: Número de exemplos para amostragem (None para usar todos)
            random_state: Semente para amostragem reproduzível

        Returns:
            DataFrame com os dados processados
        """
        if name.lower() == 'stsb':
            return self._load_stsb(split, sample_size, random_state)
        elif name.lower() == 'mrpc':
            return self._load_mrpc(split, sample_size, random_state)
        else:
            raise ValueError(f"Dataset não suportado: {name}. Use 'stsb' ou 'mrpc'")

    def _load_stsb(self, split, sample_size, random_state):
        """Carrega o dataset STS-B (Semantic Textual Similarity Benchmark)."""
        ds = load_dataset('glue', 'stsb', cache_dir=self.cache_dir)[split]
        df = pd.DataFrame(ds)

        # Processamento dos labels
        df['label'] = pd.to_numeric(df['label'], errors='coerce')
        df = df.dropna(subset=['label'])

        # Normalização para [0, 1]
        df['label'] = df['label'] / 5.0
        df['label'] = df['label'].clip(lower=0.0, upper=1.0)

        # Label binário para classificação
        df['label_bin'] = (df['label'] > 0.5).astype(int)

        # Estatísticas do dataset
        print(f"\n[STS-B - {split}] Estatísticas:")
        print(f"- Exemplos: {len(df)}")
        print(f"- Range de similaridade: [{df['label'].min():.2f}, {df['label'].max():.2f}]")
        print(f"- Distribuição binária: {df['label_bin'].value_counts().to_dict()}")

        # Aplicar amostragem se solicitado
        if sample_size is not None:
            sample_size = min(sample_size, len(df))
            df = df.sample(n=sample_size, random_state=random_state)
            print(f"- Amostra utilizada: {sample_size} exemplos")

        return df

    def _load_mrpc(self, split, sample_size, random_state):
        """Carrega o dataset MRPC (Microsoft Research Paraphrase Corpus)."""
        ds = load_dataset('glue', 'mrpc', cache_dir=self.cache_dir)[split]
        df = pd.DataFrame(ds)

        # Garantir que labels são inteiros
        df['label'] = df['label'].astype(int)
        df['label_bin'] = df['label']

        # Estatísticas do dataset
        print(f"\n[MRPC - {split}] Estatísticas:")
        print(f"- Exemplos: {len(df)}")
        print(f"- Distribuição: {df['label'].value_counts().to_dict()}")

        # Aplicar amostragem se solicitado
        if sample_size is not None:
            sample_size = min(sample_size, len(df))
            df = df.sample(n=sample_size, random_state=random_state)
            print(f"- Amostra utilizada: {sample_size} exemplos")

        return df

    def visualize_dataset_distribution(self, df, dataset_name):
        """Gera visualização da distribuição dos dados."""
        plt.figure(figsize=(10, 6))

        if dataset_name.lower() == 'stsb':
            sns.histplot(df['label'], bins=20, kde=True)
            plt.title('Distribuição de Similaridade no STS-B')
            plt.xlabel('Similaridade Normalizada [0,1]')
        else:  # MRPC
            counts = df['label'].value_counts().sort_index()
            sns.barplot(x=counts.index, y=counts.values)
            plt.title('Distribuição de Classes no MRPC')
            plt.xlabel('Classe (0=Não Paráfrase, 1=Paráfrase)')
            plt.xticks([0, 1], ['Não Paráfrase', 'Paráfrase'])

        plt.ylabel('Contagem')
        plt.tight_layout()

        fig_path = FIGURES_DIR / f"{dataset_name}_distribution.png"
        plt.savefig(fig_path, dpi=300, bbox_inches='tight')
        plt.close()

        return fig_path

# Classes para triplet learning
class TripletGenerator:
    """Gerador de triplas (âncora, positivo, negativo) para Triplet Loss."""

    def __init__(self, dataset, fixed_negative=None, hard_negatives=False):
        """
        Inicializa o gerador de triplas.

        Args:
            dataset: DataFrame com pares de sentenças
            fixed_negative: Sentença negativa fixa (opcional)
            hard_negatives: Se True, seleciona negativos difíceis do dataset
        """
        self.dataset = dataset
        self.fixed_negative = fixed_negative
        self.hard_negatives = hard_negatives

    def generate_triplets(self, n_triplets=None):
        """
        Gera triplas de sentenças para treinamento.

        Args:
            n_triplets: Número de triplas a gerar (padrão: tamanho do dataset)

        Returns:
            Lista de triplas (âncora, positivo, negativo)
        """
        if n_triplets is None:
            n_triplets = len(self.dataset)

        triplets = []
        indices = random.sample(range(len(self.dataset)), k=min(n_triplets, len(self.dataset)))

        for i in indices:
            anchor = self.dataset.iloc[i]['sentence1']
            positive = self.dataset.iloc[i]['sentence2']

            if self.fixed_negative:
                negative = self.fixed_negative
            elif self.hard_negatives:
                # Selecionar uma sentença diferente como negativo
                neg_idx = random.choice([j for j in range(len(self.dataset)) if j != i])
                negative = random.choice([self.dataset.iloc[neg_idx]['sentence1'],
                                         self.dataset.iloc[neg_idx]['sentence2']])
            else:
                # Usar uma sentença aleatória como negativo
                negative = "Esta é uma frase negativa para a tripla."

            triplets.append((anchor, positive, negative))

        return triplets

class TripletDataset(Dataset):
    """Dataset para triplas compatível com PyTorch DataLoader."""

    def __init__(self, triplets):
        self.triplets = triplets

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, idx):
        a, p, n = self.triplets[idx]
        return InputExample(texts=[a, p, n])

# Função de perda customizada
# Funções de perda
class TripletLoss(torch.nn.Module):
    def __init__(self, model, margin=1.0): super().__init__(); self.model = model; self.margin = margin
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        return torch.mean(F.relu(torch.norm(e[0]-e[1], p=2, dim=1) - torch.norm(e[0]-e[2], p=2, dim=1) + self.margin))

class OnlineTripletLoss(TripletLoss): pass
class BatchHardTripletLoss(TripletLoss): pass
class BatchSemiHardTripletLoss(TripletLoss): pass
class BatchAllTripletLoss(TripletLoss): pass

class MSELoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [self.model(f)['sentence_embedding'] for f in sf]; return F.mse_loss(e[0], e[1])

class EuclideanLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        return torch.mean(torch.norm(e[0] - e[1], p=2, dim=1))

class NormalizedEuclideanLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        distance = torch.norm(e[0] - e[1], p=2, dim=1)
        return torch.mean(distance)

class AngularMarginLoss(torch.nn.Module):
    def __init__(self, model, margin=0.5): super().__init__(); self.model = model; self.margin = margin
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        cosine = torch.sum(e[0] * e[1], dim=1)
        theta = torch.acos(torch.clamp(cosine, -1.0 + 1e-7, 1.0 - 1e-7))
        return torch.mean((theta + self.margin * (1.0 - lbl.float())) ** 2)

class CircleLoss(torch.nn.Module):
    def __init__(self, model, m=0.25, gamma=256): super().__init__(); self.model = model; self.m = m; self.gamma = gamma
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        alpha_p = torch.clamp_min(1 + self.m - sim, min=0)
        alpha_n = torch.clamp_min(sim + self.m, min=0)
        delta_p = 1 - self.m
        delta_n = self.m
        logits_p = (-self.gamma) * alpha_p * (sim - delta_p)
        logits_n = self.gamma * alpha_n * (sim - delta_n)
        loss = torch.log1p(torch.exp(logits_n)) + torch.log1p(torch.exp(logits_p))
        return loss.mean()

class SphereLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        return torch.mean(1 - torch.sum(e[0] * e[1], dim=1))

class HistogramLoss(torch.nn.Module):
    def __init__(self, model, num_bins=10): super().__init__(); self.model = model; self.num_bins = num_bins
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        hist_pos = torch.histc(sim[lbl == 1], bins=self.num_bins, min=-1, max=1)
        hist_neg = torch.histc(sim[lbl == 0], bins=self.num_bins, min=-1, max=1)
        hist_pos /= (torch.sum(hist_pos) + 1e-10)
        hist_neg /= (torch.sum(hist_neg) + 1e-10)
        return torch.sum((hist_pos - hist_neg) ** 2)

class CentroidLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        pos_mask = (lbl == 1).unsqueeze(1)
        neg_mask = (lbl == 0).unsqueeze(1)
        pos_centroid = (e[0] * pos_mask).sum(0) / (pos_mask.sum() + 1e-10)
        neg_centroid = (e[0] * neg_mask).sum(0) / (neg_mask.sum() + 1e-10)
        return F.mse_loss(pos_centroid, neg_centroid)

class HyperSphereLoss(torch.nn.Module):
    def __init__(self, model, radius=1.0): super().__init__(); self.model = model; self.radius = radius
    def forward(self, sf, lbl):
        e = [self.model(f)['sentence_embedding'] for f in sf]
        norms = [torch.norm(emb, p=2, dim=1) for emb in e]
        return torch.mean((norms[0] - self.radius) ** 2 + (norms[1] - self.radius) ** 2)

class ProbabilisticLoss(torch.nn.Module):
    def __init__(self, model): super().__init__(); self.model = model
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        prob = torch.sigmoid(sim)
        return F.binary_cross_entropy(prob, lbl.float())

class LiftedStructuredLoss(torch.nn.Module):
    def __init__(self, model, margin=1.0): super().__init__(); self.model = model; self.margin = margin
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        dist_matrix = torch.cdist(e[0], e[1], p=2)
        pos_mask = (lbl == 1).float()
        neg_mask = (lbl == 0).float()
        pos_term = torch.log(torch.exp(dist_matrix * pos_mask).sum() + 1)
        neg_term = torch.log(torch.exp(-dist_matrix * neg_mask + self.margin).sum() + 1)
        return pos_term + neg_term

class GeneralPairLoss(torch.nn.Module):
    def __init__(self, model, pos_weight=1.0, neg_weight=1.0): super().__init__(); self.model = model; self.pos_weight = pos_weight; self.neg_weight = neg_weight
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        pos_pairs = sim[lbl == 1]
        neg_pairs = sim[lbl == 0]
        pos_loss = self.pos_weight * torch.mean((1 - pos_pairs) ** 2)
        neg_loss = self.neg_weight * torch.mean(neg_pairs ** 2)
        return pos_loss + neg_loss

class AngularLoss(torch.nn.Module):
    def __init__(self, model, angle_bound=1.0): super().__init__(); self.model = model; self.angle_bound = angle_bound
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        cos_theta = torch.sum(e[0] * e[1], dim=1)
        theta = torch.acos(torch.clamp(cos_theta, -1.0 + 1e-7, 1.0 - 1e-7))
        target = lbl.float()
        return torch.mean(target * theta + (1 - target) * torch.clamp(self.angle_bound - theta, min=0.0))

class MarginRankingLoss(torch.nn.Module):
    def __init__(self, model, margin=0.5): super().__init__(); self.model = model; self.margin = margin
    def forward(self, sf, lbl):
        e = [F.normalize(self.model(f)['sentence_embedding'], p=2, dim=1) for f in sf]
        sim = torch.sum(e[0] * e[1], dim=1)
        target = 2 * lbl.float() - 1
        return torch.mean(torch.clamp(self.margin - target * sim, min=0.0))

# Dicionário com funções de perda
loss_functions = {
    'MSE': MSELoss,
    'Cosine': losses.CosineSimilarityLoss,
    'Contrastive': losses.ContrastiveLoss,
    'InfoNCE': losses.MultipleNegativesRankingLoss,
    'Euclidean': EuclideanLoss,
    'NormaEuc': NormalizedEuclideanLoss,
    'NPairs': losses.BatchAllTripletLoss,
    'MultiSimilarity': losses.MultipleNegativesRankingLoss,
    'AngularMargin': AngularMarginLoss,
    'Sphere': SphereLoss,
    'HyperSphere': HyperSphereLoss,
    'Probabilistic': ProbabilisticLoss,
    'LiftedStructured': LiftedStructuredLoss,
    'GeneralPair': GeneralPairLoss,
    'Angular': AngularLoss,
    'MarginRanking': MarginRankingLoss,
    'Triplet': TripletLoss,
    'OnlineTriplet': OnlineTripletLoss,
    'BatchHardTriplet': BatchHardTripletLoss,
    'BatchSemiHardTriplet': BatchSemiHardTripletLoss,
    'BatchAllTriplet': BatchAllTripletLoss
}

def gerar_exemplos(df, loss_name, fixed_negative=None):
    """
    Gera exemplos de treinamento compatíveis com diferentes funções de perda.

    Args:
        df: DataFrame com os dados
        loss_name: Nome da função de perda a usar
        fixed_negative: Sentença negativa fixa para Triplet Loss

    Returns:
        Dataset com exemplos formatados para a função de perda especificada
    """
    if 'Triplet' in loss_name:
        triplets = TripletGenerator(df, fixed_negative, hard_negatives=True).generate_triplets()
        return TripletDataset(triplets)
    elif loss_name == 'Contrastive':
        # Para Contrastive Loss, usamos labels binários
        exemplos = [InputExample(texts=[r['sentence1'], r['sentence2']], label=float(r['label_bin']))
                    for _, r in df.iterrows()]
        return exemplos
    else:
        # Para outras funções de perda, usamos similaridade contínua
        exemplos = [InputExample(texts=[r['sentence1'], r['sentence2']], label=float(r['label']))
                    for _, r in df.iterrows()]
        return exemplos

# Funções de avaliação
def avaliar_modelo(model, df_teste, dataset_nome):
    """
    Avalia um modelo em um dataset de teste.

    Args:
        model: Modelo SentenceTransformer treinado
        df_teste: DataFrame com dados de teste
        dataset_nome: Nome do dataset ('stsb' ou 'mrpc')

    Returns:
        Dicionário com métricas de avaliação
    """
    # Preparar dados
    sent1 = df_teste['sentence1'].tolist()
    sent2 = df_teste['sentence2'].tolist()
    labels = df_teste['label'].tolist()

    # Calcular embeddings e similaridades
    embeddings = model.encode(sent1 + sent2, batch_size=32, show_progress_bar=False)
    embeddings1 = embeddings[:len(sent1)]
    embeddings2 = embeddings[len(sent1):]

    # Calcular similaridades de cosseno
    similaridades = []
    for e1, e2 in zip(embeddings1, embeddings2):
        similaridades.append(cosine_similarity([e1], [e2])[0][0])

    # Métricas básicas
    mean_sim = np.mean(similaridades)
    std_sim = np.std(similaridades)
    resultados = {
        'mean_similarity': mean_sim,
        'std_similarity': std_sim
    }

    # Métricas específicas por dataset
    if dataset_nome.lower() == 'stsb':
        # Correlação para tarefas de similaridade
        if len(set(labels)) > 1 and len(set(similaridades)) > 1:
            resultados['pearson'] = pearsonr(labels, similaridades)[0]
            resultados['spearman'] = spearmanr(labels, similaridades)[0]
        else:
            resultados['pearson'] = float('nan')
            resultados['spearman'] = float('nan')

        # Exemplo para depuração
        print("\n[STS-B] Exemplo de avaliação:")
        for i in range(min(3, len(labels))):
            print(f"  Label: {labels[i]:.2f} | Similaridade: {similaridades[i]:.2f}")

    elif dataset_nome.lower() == 'mrpc':
        # Métricas de classificação
        binary_preds = [1 if s >= 0.5 else 0 for s in similaridades]
        resultados['accuracy'] = accuracy_score(labels, binary_preds)
        resultados['f1'] = f1_score(labels, binary_preds)
        resultados['precision'] = precision_score(labels, binary_preds)
        resultados['recall'] = recall_score(labels, binary_preds)

        # Exemplo para depuração
        print("\n[MRPC] Exemplo de avaliação:")
        for i in range(min(3, len(labels))):
            print(f"  Label: {labels[i]} | Predito: {binary_preds[i]} | Similaridade: {similaridades[i]:.2f}")

    return resultados

def plotar_resultados(results_df, metric, dataset_name):
    """
    Gera gráficos comparativos de resultados.

    Args:
        results_df: DataFrame com resultados
        metric: Métrica a visualizar
        dataset_name: Nome do dataset

    Returns:
        Caminho para o arquivo de figura salvo
    """
    plt.figure(figsize=(12, 8))

    # Filtrar dados para o dataset específico
    df = results_df[results_df['Dataset'] == dataset_name].copy()

    # Preparar gráfico de barras agrupadas
    pivot_df = df.pivot(index='Model', columns='Loss Function', values=metric)

    ax = pivot_df.plot(kind='bar', figsize=(12, 8))

    # Configurações do gráfico
    plt.title(f'{metric} por Modelo e Função de Perda - {dataset_name.upper()}', fontsize=14)
    plt.xlabel('Modelo', fontsize=12)
    plt.ylabel(metric, fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(title='Função de Perda', fontsize=10)

    # Adicionar valores nas barras
    for container in ax.containers:
        ax.bar_label(container, fmt='%.3f', fontsize=8)

    plt.tight_layout()

    # Salvar figura
    filename = f"{dataset_name}_{metric}_comparison.png"
    filepath = FIGURES_DIR / filename
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    return filepath

def plot_training_curve(history, model_name, loss_name, dataset_name):
    """
    Plota curva de treinamento.

    Args:
        history: Histórico de treinamento
        model_name: Nome do modelo
        loss_name: Nome da função de perda
        dataset_name: Nome do dataset

    Returns:
        Caminho para o arquivo de figura salvo
    """
    plt.figure(figsize=(10, 6))

    # Extrair dados de histórico
    epochs = range(1, len(history['train_loss']) + 1)

    # Plotar perdas
    plt.plot(epochs, history['train_loss'], 'b-', label='Perda de Treinamento')

    # Adicionar informações ao gráfico
    plt.title(f'Curva de Treinamento: {model_name}\n{loss_name} em {dataset_name}', fontsize=14)
    plt.xlabel('Época', fontsize=12)
    plt.ylabel('Perda', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()

    # Salvar figura
    model_short = model_name.split('/')[-1] if '/' in model_name else model_name
    filename = f"{dataset_name}_{model_short}_{loss_name}_training.png"
    filepath = FIGURES_DIR / filename
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    return filepath

# Função principal de treinamento e avaliação
def treinar_e_avaliar(model_name, dataset_nome, loss_name, train_df, test_df,
                      epochs=3, batch_size=16, save_model=False):
    """
    Treina e avalia um modelo com uma função de perda específica.

    Args:
        model_name: Nome do modelo Sentence-Transformer
        dataset_nome: Nome do dataset ('stsb' ou 'mrpc')
        loss_name: Nome da função de perda
        train_df: DataFrame com dados de treinamento
        test_df: DataFrame com dados de teste
        epochs: Número de épocas de treinamento
        batch_size: Tamanho do lote
        save_model: Se True, salva o modelo treinado

    Returns:
        Dicionário com resultados e métricas
    """
    try:
        # Inicializar modelo
        model = SentenceTransformer(model_name).to(DEVICE)
        model_identificador = model_name.split('/')[-1] if '/' in model_name else model_name

        # Configurar treinamento
        fixed_negative = "Este é um exemplo de sentença negativa para triplas de treinamento."
        dataset = gerar_exemplos(train_df, loss_name, fixed_negative)
        dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
        loss_fn = loss_functions[loss_name](model)

        # Registrar histórico de treinamento
        history = {'train_loss': []}

        class LogCallback:
            def __init__(self, history):
                self.history = history

            def on_epoch_end(self, epoch, loss, *args, **kwargs):
                self.history['train_loss'].append(loss)

        # Executar treinamento
        start_time = time.time()
        model.fit(
            train_objectives=[(dataloader, loss_fn)],
            epochs=epochs,
            warmup_steps=int(len(dataloader) * 0.1),
            show_progress_bar=True,
            output_path=None,
            callback=LogCallback(history)
        )
        training_time = time.time() - start_time

        # Avaliar modelo
        evaluation_results = avaliar_modelo(model, test_df, dataset_nome)

        # Salvar modelo se solicitado
        model_path = None
        if save_model:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_path = MODELS_DIR / f"{dataset_nome}_{model_identificador}_{loss_name}_{timestamp}"
            model.save(str(model_path))

        # Plotar curva de treinamento
        training_plot = plot_training_curve(history, model_name, loss_name, dataset_nome)

        # Consolidar resultados
        results = {
            'Dataset': dataset_nome,
            'Model': model_name,
            'Loss Function': loss_name,
            'Training Time (s)': round(training_time, 2),
            'Mean Similarity': round(evaluation_results['mean_similarity'], 4),
            'STD Similarity': round(evaluation_results['std_similarity'], 4),
            'Epochs': epochs,
            'Batch Size': batch_size,
            'Training Plot': str(training_plot),
            'Model Path': str(model_path) if model_path else None
        }

        # Adicionar métricas específicas
        if dataset_nome.lower() == 'stsb':
            results['Pearson'] = round(evaluation_results['pearson'], 4) if 'pearson' in evaluation_results else None
            results['Spearman'] = round(evaluation_results['spearman'], 4) if 'spearman' in evaluation_results else None
        elif dataset_nome.lower() == 'mrpc':
            results['Accuracy'] = round(evaluation_results['accuracy'], 4) if 'accuracy' in evaluation_results else None
            results['F1 Score'] = round(evaluation_results['f1'], 4) if 'f1' in evaluation_results else None
            results['Precision'] = round(evaluation_results['precision'], 4) if 'precision' in evaluation_results else None
            results['Recall'] = round(evaluation_results['recall'], 4) if 'recall' in evaluation_results else None

        return results

    except Exception as e:
        print(f"Erro em treinar_e_avaliar({model_name}, {dataset_nome}, {loss_name}): {e}")
        import traceback
        traceback.print_exc()
        return {
            'Dataset': dataset_nome,
            'Model': model_name,
            'Loss Function': loss_name,
            'Error': str(e)
        }

# Função principal
def main():
    # Lista de modelos a avaliar
    model_names = [
        'sentence-transformers/paraphrase-MiniLM-L6-v2',
        'sentence-transformers/bert-base-nli-mean-tokens',
        'sentence-transformers/all-mpnet-base-v2'
    ]

    # Datasets a avaliar
    datasets = ['stsb', 'mrpc']

    # Configurações experimentais
    experiment_config = {
        'seed': SEED,
        'device': str(DEVICE),
        'epochs': NUM_EPOCHS,
        'batch_size': BATCH_SIZE,
        'sample_size': SAMPLE_SIZE,
        'save_models': SAVE_MODELS,
        'models': model_names,
        'datasets': datasets,
        'loss_functions': list(loss_functions.keys()),
        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    # Salvar configuração do experimento
    with open(RESULTS_DIR / "experiment_config.json", 'w') as f:
        json.dump(experiment_config, f, indent=2)

    # Inicializar carregador de datasets
    loader = DatasetLoader()

    # Resultados armazenados aqui
    all_results = []
    dataset_figures = {}

    # Loop principal
    for dataset_nome in datasets:
        print(f"\n\n{'='*60}")
        print(f"Dataset: {dataset_nome.upper()}")
        print(f"{'='*60}")

        # Carregar datasets
        train_df = loader.load_dataset(dataset_nome, 'train', sample_size=SAMPLE_SIZE)
        test_df = loader.load_dataset(dataset_nome, 'validation', sample_size=min(1000, SAMPLE_SIZE if SAMPLE_SIZE else 1000))

        # Visualização da distribuição
        dist_fig = loader.visualize_dataset_distribution(train_df, dataset_nome)
        dataset_figures[dataset_nome] = str(dist_fig)

        # Exemplo dos dados
        print(f"\nAmostra dos dados ({dataset_nome.upper()}):")
        print(train_df[['sentence1', 'sentence2', 'label']].head(3).to_string())

        # Executar avaliação para cada combinação
        results_dataset = []

        for model_name in model_names:
            model_short = model_name.split('/')[-1]
            print(f"\n{'-'*40}")
            print(f"Modelo: {model_short}")
            print(f"{'-'*40}")

            for loss_name in loss_functions.keys():
                print(f"\nAvaliando {model_short} com {loss_name} em {dataset_nome.upper()}...")

                result = treinar_e_avaliar(
                    model_name=model_name,
                    dataset_nome=dataset_nome,
                    loss_name=loss_name,
                    train_df=train_df,
                    test_df=test_df,
                    epochs=NUM_EPOCHS,
                    batch_size=BATCH_SIZE,
                    save_model=SAVE_MODELS
                )

                results_dataset.append(result)
                all_results.append(result)

                # Log imediato do resultado
                if 'Error' in result:
                    print(f"❌ Erro: {result['Error']}")
                else:
                    print(f"✅ Concluído: Mean Sim = {result['Mean Similarity']}")
                    if dataset_nome.lower() == 'stsb':
                        print(f"   Pearson = {result['Pearson']}")
                    else:
                        print(f"   Accuracy = {result['Accuracy']}, F1 = {result['F1 Score']}")

        # Salvar resultados por dataset
        results_df = pd.DataFrame(results_dataset)
        results_df.to_csv(RESULTS_DIR / f"resultados_{dataset_nome}.csv", index=False)

        # Gerar visualizações
        if dataset_nome.lower() == 'stsb':
            plotar_resultados(results_df, 'Pearson', dataset_nome)
        else:
            plotar_resultados(results_df, 'F1 Score', dataset_nome)
            plotar_resultados(results_df, 'Accuracy', dataset_nome)

    # Consolidar todos os resultados
    all_results_df = pd.DataFrame(all_results)
    all_results_df.to_csv(RESULTS_DIR / "resultados_completos.csv", index=False)

    # Gerar relatório HTML
    generate_html_report(all_results_df, experiment_config, dataset_figures)

    print("\n\nExperimento concluído. Resultados disponíveis em:", RESULTS_DIR)
    return all_results_df

def generate_html_report(results_df, config, dataset_figures):

    """
    Gera um relatório HTML com os resultados do experimento.

    Args:
        results_df: DataFrame com todos os resultados
        config: Configuração do experimento
        dataset_figures: Dicionário com caminhos para figuras de datasets
    """
    report_path = RESULTS_DIR / "relatorio_experimento.html"

    # Preparar tabelas de resultados
    stsb_df = results_df[results_df['Dataset'] == 'stsb'].copy()
    mrpc_df = results_df[results_df['Dataset'] == 'mrpc'].copy()

    # Selecionar colunas relevantes
    stsb_cols = ['Model', 'Loss Function', 'Training Time (s)', 'Mean Similarity', 'Pearson', 'Spearman']
    mrpc_cols = ['Model', 'Loss Function', 'Training Time (s)', 'Mean Similarity', 'Accuracy', 'F1 Score', 'Precision', 'Recall']

    # Limpar nomes de modelos para exibição
    for df in [stsb_df, mrpc_df]:
        df['Model'] = df['Model'].apply(lambda x: x.split('/')[-1] if '/' in x else x)

    # Gerar HTML
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Relatório de Avaliação de Sentence-Transformers</title>
        <style>
            body {{ font-family: Arial, sans-serif; line-height: 1.6; margin: 0; padding: 20px; color: #333; }}
            h1, h2, h3 {{ color: #2c3e50; }}
            table {{ border-collapse: collapse; width: 100%; margin-bottom: 20px; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            th {{ background-color: #f2f2f2; color: #333; font-weight: bold; }}
            tr:nth-child(even) {{ background-color: #f9f9f9; }}
            tr:hover {{ background-color: #f5f5f5; }}
            .container {{ max-width: 1200px; margin: 0 auto; padding: 20px; }}
            .section {{ margin-bottom: 30px; }}
            .best-result {{ font-weight: bold; color: #27ae60; }}
            img {{ max-width: 100%; height: auto; margin: 10px 0; border: 1px solid #ddd; }}
            .config {{ background-color: #f8f9fa; padding: 15px; border-radius: 4px; margin-bottom: 20px; }}
            footer {{ margin-top: 30px; padding-top: 10px; border-top: 1px solid #eee; color: #7f8c8d; font-size: 0.9em; }}
        </style>
    </head>
    <body>
        <div class="container">
            <header>
                <h1>Avaliação Comparativa de Modelos Sentence-Transformer</h1>
                <p>Relatório gerado em: {config['timestamp']}</p>
            </header>

            <div class="section">
                <h2>Configuração do Experimento</h2>
                <div class="config">
                    <p><strong>Dispositivo:</strong> {config['device']}</p>
                    <p><strong>Semente:</strong> {config['seed']}</p>
                    <p><strong>Épocas:</strong> {config['epochs']}</p>
                    <p><strong>Tamanho do Lote:</strong> {config['batch_size']}</p>
                    <p><strong>Amostra:</strong> {config['sample_size'] if config['sample_size'] else 'Dataset Completo'}</p>
                    <p><strong>Modelos:</strong> {', '.join([m.split('/')[-1] if '/' in m else m for m in config['models']])}</p>
                    <p><strong>Funções de Perda:</strong> {', '.join(config['loss_functions'])}</p>
                </div>
            </div>

            <div class="section">
                <h2>Resultados - STS-B (Similaridade Semântica)</h2>
                <p>Distribuição dos dados de treinamento:</p>
                <img src="{dataset_figures['stsb']}" alt="Distribuição STS-B">

                <h3>Métricas de Desempenho</h3>
                <table>
                    <tr>
                        <th>Modelo</th>
                        <th>Função de Perda</th>
                        <th>Tempo (s)</th>
                        <th>Similaridade Média</th>
                        <th>Correlação de Pearson</th>
                        <th>Correlação de Spearman</th>
                    </tr>
                    {stsb_df[stsb_cols].sort_values('Pearson', ascending=False).to_html(index=False, header=False, classes='results-table')}
                </table>

                <h3>Visualização de Resultados</h3>
                <img src="{FIGURES_DIR / 'stsb_Pearson_comparison.png'}" alt="Comparação Pearson STS-B">
            </div>

            <div class="section">
                <h2>Resultados - MRPC (Detecção de Paráfrase)</h2>
                <p>Distribuição dos dados de treinamento:</p>
                <img src="{dataset_figures['mrpc']}" alt="Distribuição MRPC">

                <h3>Métricas de Desempenho</h3>
                <table>
                    <tr>
                        <th>Modelo</th>
                        <th>Função de Perda</th>
                        <th>Tempo (s)</th>
                        <th>Similaridade Média</th>
                        <th>Acurácia</th>
                        <th>F1 Score</th>
                        <th>Precisão</th>
                        <th>Recall</th>
                    </tr>
                    {mrpc_df[mrpc_cols].sort_values('F1 Score', ascending=False).to_html(index=False, header=False, classes='results-table')}
                </table>

                <h3>Visualização de Resultados</h3>
                <img src="{FIGURES_DIR / 'mrpc_F1 Score_comparison.png'}" alt="Comparação F1 MRPC">
                <img src="{FIGURES_DIR / 'mrpc_Accuracy_comparison.png'}" alt="Comparação Acurácia MRPC">
            </div>

            <div class="section">
                <h2>Análise das Curvas de Treinamento</h2>
                <p>Exemplos de curvas de treinamento para os melhores modelos:</p>

                <h3>STS-B (Melhor modelo)</h3>
                <img src="{stsb_df.sort_values('Pearson', ascending=False).iloc[0]['Training Plot']}" alt="Melhor Curva STS-B">

                <h3>MRPC (Melhor modelo)</h3>
                <img src="{mrpc_df.sort_values('F1 Score', ascending=False).iloc[0]['Training Plot']}" alt="Melhor Curva MRPC">
            </div>

            <div class="section">
                <h2>Conclusões</h2>
                <p><strong>Melhor configuração para STS-B:</strong> {stsb_df.sort_values('Pearson', ascending=False).iloc[0]['Model']} com {stsb_df.sort_values('Pearson', ascending=False).iloc[0]['Loss Function']} (Pearson: {stsb_df.sort_values('Pearson', ascending=False).iloc[0]['Pearson']})</p>
                <p><strong>Melhor configuração para MRPC:</strong> {mrpc_df.sort_values('F1 Score', ascending=False).iloc[0]['Model']} com {mrpc_df.sort_values('F1 Score', ascending=False).iloc[0]['Loss Function']} (F1: {mrpc_df.sort_values('F1 Score', ascending=False).iloc[0]['F1 Score']})</p>

                <p>Observações gerais:</p>
                <ul>
                    <li>As funções de perda têm impacto significativo no desempenho dos modelos.</li>
                    <li>Modelos especializados em paráfrase tendem a ter melhor desempenho nas tarefas avaliadas.</li>
                    <li>O tempo de treinamento varia consideravelmente entre os modelos.</li>
                </ul>
            </div>

            <footer>
                <p>Relatório gerado automaticamente pelo script de avaliação de Sentence-Transformers.</p>
                <p>Todos os modelos e resultados estão disponíveis no diretório: {RESULTS_DIR}</p>
            </footer>
        </div>
    </body>
    </html>
    """

    # Salvar relatório
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"Relatório HTML gerado em: {report_path}")
    return report_path

# Funções adicionais para análise avançada

def analisar_correlacao_similaridade_metricas(results_df, dataset_nome):
    """
    Analisa a correlação entre similaridade média e métricas de desempenho.

    Args:
        results_df: DataFrame com resultados
        dataset_nome: Nome do dataset a analisar

    Returns:
        Figura com matriz de correlação
    """
    # Filtrar dados para o dataset específico
    df = results_df[results_df['Dataset'] == dataset_nome].copy()

    # Colunas para analisar
    if dataset_nome.lower() == 'stsb':
        cols = ['Mean Similarity', 'STD Similarity', 'Pearson', 'Spearman', 'Training Time (s)']
    else:  # MRPC
        cols = ['Mean Similarity', 'STD Similarity', 'Accuracy', 'F1 Score',
                'Precision', 'Recall', 'Training Time (s)']

    # Calcular matriz de correlação
    corr_matrix = df[cols].corr()

    # Visualizar matriz de correlação
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title(f'Correlação entre Métricas - {dataset_nome.upper()}')
    plt.tight_layout()

    # Salvar figura
    filepath = FIGURES_DIR / f"{dataset_nome}_metric_correlation.png"
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    return filepath

def analisar_tempo_vs_desempenho(results_df):
    """
    Analisa a relação entre tempo de treinamento e métricas de desempenho.

    Args:
        results_df: DataFrame com resultados

    Returns:
        Figura com gráficos de dispersão
    """
    plt.figure(figsize=(12, 10))

    # Dividir em subplots
    fig, axes = plt.subplots(2, 1, figsize=(12, 12))

    # Dados para STS-B
    stsb_df = results_df[results_df['Dataset'] == 'stsb'].copy()
    stsb_df['Model'] = stsb_df['Model'].apply(lambda x: x.split('/')[-1] if '/' in x else x)

    # Dados para MRPC
    mrpc_df = results_df[results_df['Dataset'] == 'mrpc'].copy()
    mrpc_df['Model'] = mrpc_df['Model'].apply(lambda x: x.split('/')[-1] if '/' in x else x)

    # Gráfico para STS-B
    ax = axes[0]
    for model in stsb_df['Model'].unique():
        model_df = stsb_df[stsb_df['Model'] == model]
        ax.scatter(model_df['Training Time (s)'], model_df['Pearson'],
                  label=model, alpha=0.7, s=80)

        # Adicionar rótulos para cada ponto
        for _, row in model_df.iterrows():
            ax.annotate(row['Loss Function'],
                       (row['Training Time (s)'], row['Pearson']),
                       fontsize=8, alpha=0.8)

    ax.set_title('STS-B: Correlação de Pearson vs. Tempo de Treinamento')
    ax.set_xlabel('Tempo de Treinamento (segundos)')
    ax.set_ylabel('Correlação de Pearson')
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend()

    # Gráfico para MRPC
    ax = axes[1]
    for model in mrpc_df['Model'].unique():
        model_df = mrpc_df[mrpc_df['Model'] == model]
        ax.scatter(model_df['Training Time (s)'], model_df['F1 Score'],
                  label=model, alpha=0.7, s=80)

        # Adicionar rótulos para cada ponto
        for _, row in model_df.iterrows():
            ax.annotate(row['Loss Function'],
                       (row['Training Time (s)'], row['F1 Score']),
                       fontsize=8, alpha=0.8)

    ax.set_title('MRPC: F1 Score vs. Tempo de Treinamento')
    ax.set_xlabel('Tempo de Treinamento (segundos)')
    ax.set_ylabel('F1 Score')
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend()

    plt.tight_layout()

    # Salvar figura
    filepath = FIGURES_DIR / "tempo_vs_desempenho.png"
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    return filepath

def analisar_impacto_funcao_perda(results_df):
    """
    Analisa o impacto das diferentes funções de perda no desempenho.

    Args:
        results_df: DataFrame com resultados

    Returns:
        DataFrame com estatísticas de impacto
    """
    # Estatísticas por função de perda
    impacto = []

    # Análise para STS-B
    stsb_df = results_df[results_df['Dataset'] == 'stsb'].copy()
    stsb_metrics = ['Pearson', 'Spearman']

    for loss_fn in stsb_df['Loss Function'].unique():
        loss_stats = {
            'Dataset': 'STS-B',
            'Loss Function': loss_fn,
            'Count': len(stsb_df[stsb_df['Loss Function'] == loss_fn])
        }

        for metric in stsb_metrics:
            loss_stats[f'Mean {metric}'] = stsb_df[stsb_df['Loss Function'] == loss_fn][metric].mean()
            loss_stats[f'Std {metric}'] = stsb_df[stsb_df['Loss Function'] == loss_fn][metric].std()
            loss_stats[f'Max {metric}'] = stsb_df[stsb_df['Loss Function'] == loss_fn][metric].max()
            loss_stats[f'Min {metric}'] = stsb_df[stsb_df['Loss Function'] == loss_fn][metric].min()

        impacto.append(loss_stats)

    # Análise para MRPC
    mrpc_df = results_df[results_df['Dataset'] == 'mrpc'].copy()
    mrpc_metrics = ['Accuracy', 'F1 Score']

    for loss_fn in mrpc_df['Loss Function'].unique():
        loss_stats = {
            'Dataset': 'MRPC',
            'Loss Function': loss_fn,
            'Count': len(mrpc_df[mrpc_df['Loss Function'] == loss_fn])
        }

        for metric in mrpc_metrics:
            loss_stats[f'Mean {metric}'] = mrpc_df[mrpc_df['Loss Function'] == loss_fn][metric].mean()
            loss_stats[f'Std {metric}'] = mrpc_df[mrpc_df['Loss Function'] == loss_fn][metric].std()
            loss_stats[f'Max {metric}'] = mrpc_df[mrpc_df['Loss Function'] == loss_fn][metric].max()
            loss_stats[f'Min {metric}'] = mrpc_df[mrpc_df['Loss Function'] == loss_fn][metric].min()

        impacto.append(loss_stats)

    # Criar DataFrame com estatísticas
    impacto_df = pd.DataFrame(impacto)

    # Salvar análise
    impacto_df.to_csv(RESULTS_DIR / "impacto_funcoes_perda.csv", index=False)

    return impacto_df

def experimento_estendido():
    """
    Função principal que executa o experimento e análises adicionais.
    """
    try:
        # Executar experimento principal
        results_df = main()

        # Validar se temos resultados para análises
        if results_df is None or len(results_df) == 0:
            print("❌ Sem resultados para análises adicionais.")
            return

        print("\n\n" + "="*60)
        print("Análises Adicionais")
        print("="*60)

        # Análise de correlação entre métricas
        print("\nAnalisando correlação entre métricas...")
        analisar_correlacao_similaridade_metricas(results_df, 'stsb')
        analisar_correlacao_similaridade_metricas(results_df, 'mrpc')

        # Análise de tempo vs. desempenho
        print("\nAnalisando relação tempo vs. desempenho...")
        analisar_tempo_vs_desempenho(results_df)

        # Análise do impacto das funções de perda
        print("\nAnalisando impacto das funções de perda...")
        impacto_df = analisar_impacto_funcao_perda(results_df)

        # Resumo das análises
        print("\nResumo das Análises:")
        print(f"- {len(results_df)} combinações de modelo-loss testadas")

        for dataset in ['STS-B', 'MRPC']:
            print(f"\n{dataset}:")
            dataset_impact = impacto_df[impacto_df['Dataset'] == dataset]

            if dataset == 'STS-B':
                best_loss = dataset_impact.sort_values('Mean Pearson', ascending=False).iloc[0]
                print(f"- Melhor função de perda: {best_loss['Loss Function']} (Pearson médio: {best_loss['Mean Pearson']:.4f})")
            else:
                best_loss = dataset_impact.sort_values('Mean F1 Score', ascending=False).iloc[0]
                print(f"- Melhor função de perda: {best_loss['Loss Function']} (F1 médio: {best_loss['Mean F1 Score']:.4f})")

        print("\n✅ Análises adicionais concluídas e salvas em:", RESULTS_DIR)

    except Exception as e:
        print(f"❌ Erro nas análises adicionais: {e}")
        import traceback
        traceback.print_exc()

# Executar se for o script principal

# ============================
# >>> FUNÇÕES DE ROBUSTEZ <<<
# ============================

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet

def substituir_por_sinonimos(texto):
    palavras = texto.split()
    nova_sentenca = []
    for palavra in palavras:
        sinonimos = wordnet.synsets(palavra)
        if sinonimos:
            lemmas = sinonimos[0].lemma_names()
            if lemmas:
                nova_palavra = random.choice(lemmas).replace('_', ' ')
                nova_sentenca.append(nova_palavra)
                continue
        nova_sentenca.append(palavra)
    return ' '.join(nova_sentenca)

def reordenar_sentenca(texto):
    palavras = texto.split()
    random.shuffle(palavras)
    return ' '.join(palavras)

def perturbar_dados(df, tipo='sinonimos'):
    df_mod = df.copy()
    if tipo == 'sinonimos':
        df_mod['sentence1'] = df_mod['sentence1'].apply(substituir_por_sinonimos)
        df_mod['sentence2'] = df_mod['sentence2'].apply(substituir_por_sinonimos)
    elif tipo == 'reordenacao':
        df_mod['sentence1'] = df_mod['sentence1'].apply(reordenar_sentenca)
        df_mod['sentence2'] = df_mod['sentence2'].apply(reordenar_sentenca)
    elif tipo == 'desequilibrio':
        df_0 = df[df['label_bin'] == 0].sample(frac=0.8, random_state=42)
        df_1 = df[df['label_bin'] == 1].sample(frac=0.2, random_state=42)
        df_mod = pd.concat([df_0, df_1])
    return df_mod

def avaliar_cross_task(model, df_teste, origem, destino):
    from sklearn.metrics import accuracy_score, f1_score
    s1, s2 = df_teste['sentence1'].tolist(), df_teste['sentence2'].tolist()
    labels = df_teste['label_bin'].tolist()
    emb1 = model.encode(s1, convert_to_tensor=True)
    emb2 = model.encode(s2, convert_to_tensor=True)
    sim = np.array([np.dot(e1.cpu(), e2.cpu()) for e1, e2 in zip(emb1, emb2)])
    pred = (sim >= 0.5).astype(int)
    acc = accuracy_score(labels, pred)
    f1 = f1_score(labels, pred)
    print(f"\nAvaliação cruzada: treinado em {origem.upper()}, testado em {destino.upper()}")
    print(f"Acurácia: {acc:.4f} | F1: {f1:.4f}")
    return acc, f1

def plotar_embeddings(model, df, nome='embeddings'):
    from sklearn.manifold import TSNE
    sents = df['sentence1'].tolist() + df['sentence2'].tolist()
    labels = df['label_bin'].tolist() * 2
    embeddings = model.encode(sents)
    tsne = TSNE(n_components=2, perplexity=30, n_iter=1000)
    reduced = tsne.fit_transform(embeddings)
    reduced_df = pd.DataFrame(reduced, columns=['x', 'y'])
    reduced_df['label'] = labels

    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=reduced_df, x='x', y='y', hue='label', palette='Set1')
    plt.title(f'Visualização de Embeddings: {nome}')
    plt.tight_layout()
    plt.savefig(f"{nome}.png", dpi=300)
    plt.close()


if __name__ == "__main__":
    print("="*80)
    print("Avaliação de Modelos Sentence-Transformer para Similaridade Semântica")
    print("="*80)
    print("Configurações:")
    print(f"- Seed: {SEED}")
    print(f"- Dispositivo: {DEVICE}")
    print(f"- Épocas: {NUM_EPOCHS}")
    print(f"- Batch Size: {BATCH_SIZE}")
    print(f"- Tamanho da amostra: {SAMPLE_SIZE if SAMPLE_SIZE else 'Dataset completo'}")
    print(f"- Diretório de resultados: {RESULTS_DIR}")
    print("="*80)

    # Executar experimento completo com análises adicionais
    experimento_estendido()

Configuração: Seed=42, Dispositivo=cuda


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Avaliação de Modelos Sentence-Transformer para Similaridade Semântica
Configurações:
- Seed: 42
- Dispositivo: cuda
- Épocas: 3
- Batch Size: 16
- Tamanho da amostra: Dataset completo
- Diretório de resultados: results


Dataset: STSB


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/502k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/151k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/114k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]


[STS-B - train] Estatísticas:
- Exemplos: 5749
- Range de similaridade: [0.00, 1.00]
- Distribuição binária: {1: 3385, 0: 2364}

[STS-B - validation] Estatísticas:
- Exemplos: 1500
- Range de similaridade: [0.00, 1.00]
- Distribuição binária: {1: 750, 0: 750}
- Amostra utilizada: 1000 exemplos

Amostra dos dados (STSB):
                                       sentence1                                                 sentence2  label
0                         A plane is taking off.                               An air plane is taking off.   1.00
1                A man is playing a large flute.                                 A man is playing a flute.   0.76
2  A man is spreading shreded cheese on a pizza.  A man is spreading shredded cheese on an uncooked pizza.   0.76

----------------------------------------
Modelo: paraphrase-MiniLM-L6-v2
----------------------------------------

Avaliando paraphrase-MiniLM-L6-v2 com MSE em STSB...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mleonardp315[0m ([33mleonardp315-utad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.01
1000,0.0017



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.99
  Label: 0.48 | Similaridade: 0.98
  Label: 0.00 | Similaridade: 0.97
✅ Concluído: Mean Sim = 0.9581999778747559
   Pearson = 0.2387

Avaliando paraphrase-MiniLM-L6-v2 com Cosine em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0208
1000,0.0141



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.79
  Label: 0.48 | Similaridade: 0.76
  Label: 0.00 | Similaridade: -0.16
✅ Concluído: Mean Sim = 0.5178999900817871
   Pearson = 0.8851

Avaliando paraphrase-MiniLM-L6-v2 com Contrastive em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0142
1000,0.0111



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.91
  Label: 0.48 | Similaridade: 0.89
  Label: 0.00 | Similaridade: 0.10
✅ Concluído: Mean Sim = 0.7210000157356262
   Pearson = 0.83

Avaliando paraphrase-MiniLM-L6-v2 com InfoNCE em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2861
1000,0.1663



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.80
  Label: 0.48 | Similaridade: 0.88
  Label: 0.00 | Similaridade: 0.04
✅ Concluído: Mean Sim = 0.5982000231742859
   Pearson = 0.776

Avaliando paraphrase-MiniLM-L6-v2 com Euclidean em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1533
1000,0.0617



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 1.00
  Label: 0.48 | Similaridade: 1.00
  Label: 0.00 | Similaridade: 1.00
✅ Concluído: Mean Sim = 0.9998999834060669
   Pearson = 0.4104

Avaliando paraphrase-MiniLM-L6-v2 com NormaEuc em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1533
1000,0.0617



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 1.00
  Label: 0.48 | Similaridade: 1.00
  Label: 0.00 | Similaridade: 1.00
✅ Concluído: Mean Sim = 0.9998999834060669
   Pearson = 0.4104

Avaliando paraphrase-MiniLM-L6-v2 com NPairs em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,4.8476
1000,4.899



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.95
  Label: 0.48 | Similaridade: 0.85
  Label: 0.00 | Similaridade: 0.37
✅ Concluído: Mean Sim = 0.7116000056266785
   Pearson = 0.7477

Avaliando paraphrase-MiniLM-L6-v2 com MultiSimilarity em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2861
1000,0.1663



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.80
  Label: 0.48 | Similaridade: 0.88
  Label: 0.00 | Similaridade: 0.04
✅ Concluído: Mean Sim = 0.5982000231742859
   Pearson = 0.776

Avaliando paraphrase-MiniLM-L6-v2 com AngularMargin em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2367
1000,0.1105



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 1.00
  Label: 0.48 | Similaridade: 1.00
  Label: 0.00 | Similaridade: 1.00
✅ Concluído: Mean Sim = 0.9998999834060669
   Pearson = 0.4574

Avaliando paraphrase-MiniLM-L6-v2 com Sphere em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0387
1000,0.0037



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 1.00
  Label: 0.48 | Similaridade: 1.00
  Label: 0.00 | Similaridade: 1.00
✅ Concluído: Mean Sim = 0.9997000098228455
   Pearson = 0.5637

Avaliando paraphrase-MiniLM-L6-v2 com HyperSphere em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,3.6504
1000,0.0144



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.85
  Label: 0.48 | Similaridade: 0.42
  Label: 0.00 | Similaridade: 0.77
✅ Concluído: Mean Sim = 0.8442000150680542
   Pearson = 0.2453

Avaliando paraphrase-MiniLM-L6-v2 com Probabilistic em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.6432
1000,0.6301



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.89
  Label: 0.48 | Similaridade: 0.46
  Label: 0.00 | Similaridade: -0.12
✅ Concluído: Mean Sim = 0.45080000162124634
   Pearson = 0.8129

Avaliando paraphrase-MiniLM-L6-v2 com LiftedStructured em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,12.09
1000,12.0824



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 1.00
  Label: 0.48 | Similaridade: 1.00
  Label: 0.00 | Similaridade: 0.68
✅ Concluído: Mean Sim = 0.9702000021934509
   Pearson = 0.3643

Avaliando paraphrase-MiniLM-L6-v2 com GeneralPair em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0431
1000,0.0256



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.79
  Label: 0.48 | Similaridade: 0.93
  Label: 0.00 | Similaridade: -0.13
✅ Concluído: Mean Sim = 0.6119999885559082
   Pearson = 0.8414

Avaliando paraphrase-MiniLM-L6-v2 com Angular em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3928
1000,0.3705



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.99
  Label: 0.48 | Similaridade: 0.97
  Label: 0.00 | Similaridade: 0.26
✅ Concluído: Mean Sim = 0.8492000102996826
   Pearson = 0.633

Avaliando paraphrase-MiniLM-L6-v2 com MarginRanking em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3468
1000,0.3211



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.84
  Label: 0.48 | Similaridade: 0.70
  Label: 0.00 | Similaridade: -0.08
✅ Concluído: Mean Sim = 0.5422999858856201
   Pearson = 0.7988

Avaliando paraphrase-MiniLM-L6-v2 com Triplet em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0256
1000,0.0



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.96
  Label: 0.48 | Similaridade: 0.98
  Label: 0.00 | Similaridade: 0.82
✅ Concluído: Mean Sim = 0.9165999889373779
   Pearson = 0.7505

Avaliando paraphrase-MiniLM-L6-v2 com OnlineTriplet em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0256
1000,0.0



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.96
  Label: 0.48 | Similaridade: 0.98
  Label: 0.00 | Similaridade: 0.82
✅ Concluído: Mean Sim = 0.9165999889373779
   Pearson = 0.7505

Avaliando paraphrase-MiniLM-L6-v2 com BatchHardTriplet em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0256
1000,0.0



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.96
  Label: 0.48 | Similaridade: 0.98
  Label: 0.00 | Similaridade: 0.82
✅ Concluído: Mean Sim = 0.9165999889373779
   Pearson = 0.7505

Avaliando paraphrase-MiniLM-L6-v2 com BatchSemiHardTriplet em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0256
1000,0.0



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.96
  Label: 0.48 | Similaridade: 0.98
  Label: 0.00 | Similaridade: 0.82
✅ Concluído: Mean Sim = 0.9165999889373779
   Pearson = 0.7505

Avaliando paraphrase-MiniLM-L6-v2 com BatchAllTriplet em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0256
1000,0.0



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.96
  Label: 0.48 | Similaridade: 0.98
  Label: 0.00 | Similaridade: 0.82
✅ Concluído: Mean Sim = 0.9165999889373779
   Pearson = 0.7505

----------------------------------------
Modelo: bert-base-nli-mean-tokens
----------------------------------------

Avaliando bert-base-nli-mean-tokens com MSE em STSB...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0126
1000,0.0009



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 1.00
  Label: 0.48 | Similaridade: 1.00
  Label: 0.00 | Similaridade: 1.00
✅ Concluído: Mean Sim = 0.9998000264167786
   Pearson = 0.4957

Avaliando bert-base-nli-mean-tokens com Cosine em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0258
1000,0.0103



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.66
  Label: 0.48 | Similaridade: 0.70
  Label: 0.00 | Similaridade: 0.06
✅ Concluído: Mean Sim = 0.5149999856948853
   Pearson = 0.8557

Avaliando bert-base-nli-mean-tokens com Contrastive em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.013
1000,0.0074



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.92
  Label: 0.48 | Similaridade: 0.74
  Label: 0.00 | Similaridade: 0.19
✅ Concluído: Mean Sim = 0.7026000022888184
   Pearson = 0.8008

Avaliando bert-base-nli-mean-tokens com InfoNCE em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2999
1000,0.1312



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.79
  Label: 0.48 | Similaridade: 0.92
  Label: 0.00 | Similaridade: 0.38
✅ Concluído: Mean Sim = 0.6847000122070312
   Pearson = 0.7611

Avaliando bert-base-nli-mean-tokens com Euclidean em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1237
1000,0.0416



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 1.00
  Label: 0.48 | Similaridade: 1.00
  Label: 0.00 | Similaridade: 1.00
✅ Concluído: Mean Sim = 1.0
   Pearson = 0.5206

Avaliando bert-base-nli-mean-tokens com NormaEuc em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1237
1000,0.0416



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 1.00
  Label: 0.48 | Similaridade: 1.00
  Label: 0.00 | Similaridade: 1.00
✅ Concluído: Mean Sim = 1.0
   Pearson = 0.5206

Avaliando bert-base-nli-mean-tokens com NPairs em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,4.9918
1000,4.9797



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.91
  Label: 0.48 | Similaridade: 0.96
  Label: 0.00 | Similaridade: 0.62
✅ Concluído: Mean Sim = 0.8295999765396118
   Pearson = 0.7252

Avaliando bert-base-nli-mean-tokens com MultiSimilarity em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2999
1000,0.1312



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.79
  Label: 0.48 | Similaridade: 0.92
  Label: 0.00 | Similaridade: 0.38
✅ Concluído: Mean Sim = 0.6847000122070312
   Pearson = 0.7611

Avaliando bert-base-nli-mean-tokens com AngularMargin em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1835
1000,0.1019



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 1.00
  Label: 0.48 | Similaridade: 1.00
  Label: 0.00 | Similaridade: 1.00
✅ Concluído: Mean Sim = 1.0
   Pearson = 0.4885

Avaliando bert-base-nli-mean-tokens com Sphere em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0224
1000,0.0037



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 1.00
  Label: 0.48 | Similaridade: 1.00
  Label: 0.00 | Similaridade: 1.00
✅ Concluído: Mean Sim = 0.9998999834060669
   Pearson = 0.5286

Avaliando bert-base-nli-mean-tokens com HyperSphere em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,32.5382
1000,1.0689



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.05
  Label: 0.48 | Similaridade: -0.23
  Label: 0.00 | Similaridade: -0.24
✅ Concluído: Mean Sim = 0.5206999778747559
   Pearson = 0.1241

Avaliando bert-base-nli-mean-tokens com Probabilistic em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.6421
1000,0.6126



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.52
  Label: 0.48 | Similaridade: -0.06
  Label: 0.00 | Similaridade: 0.02
✅ Concluído: Mean Sim = 0.4171000123023987
   Pearson = 0.6805

Avaliando bert-base-nli-mean-tokens com LiftedStructured em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,12.085
1000,12.0793



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 1.00
  Label: 0.48 | Similaridade: 1.00
  Label: 0.00 | Similaridade: 0.49
✅ Concluído: Mean Sim = 0.9524000287055969
   Pearson = 0.3206

Avaliando bert-base-nli-mean-tokens com GeneralPair em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0628
1000,0.0126



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.80
  Label: 0.48 | Similaridade: 0.97
  Label: 0.00 | Similaridade: -0.01
✅ Concluído: Mean Sim = 0.6388000249862671
   Pearson = 0.7823

Avaliando bert-base-nli-mean-tokens com Angular em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3781
1000,0.3473



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.99
  Label: 0.48 | Similaridade: 0.91
  Label: 0.00 | Similaridade: 0.41
✅ Concluído: Mean Sim = 0.8360999822616577
   Pearson = 0.6541

Avaliando bert-base-nli-mean-tokens com MarginRanking em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3397
1000,0.2799



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.98
  Label: 0.48 | Similaridade: -0.24
  Label: 0.00 | Similaridade: 0.26
✅ Concluído: Mean Sim = 0.48649999499320984
   Pearson = 0.598

Avaliando bert-base-nli-mean-tokens com Triplet em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0176
1000,0.0



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.99
  Label: 0.48 | Similaridade: 0.99
  Label: 0.00 | Similaridade: 0.94
✅ Concluído: Mean Sim = 0.9718000292778015
   Pearson = 0.6366

Avaliando bert-base-nli-mean-tokens com OnlineTriplet em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0176
1000,0.0



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.99
  Label: 0.48 | Similaridade: 0.99
  Label: 0.00 | Similaridade: 0.94
✅ Concluído: Mean Sim = 0.9718000292778015
   Pearson = 0.6366

Avaliando bert-base-nli-mean-tokens com BatchHardTriplet em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0176
1000,0.0



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.99
  Label: 0.48 | Similaridade: 0.99
  Label: 0.00 | Similaridade: 0.94
✅ Concluído: Mean Sim = 0.9718000292778015
   Pearson = 0.6366

Avaliando bert-base-nli-mean-tokens com BatchSemiHardTriplet em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0176
1000,0.0



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.99
  Label: 0.48 | Similaridade: 0.99
  Label: 0.00 | Similaridade: 0.94
✅ Concluído: Mean Sim = 0.9718000292778015
   Pearson = 0.6366

Avaliando bert-base-nli-mean-tokens com BatchAllTriplet em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0176
1000,0.0



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.99
  Label: 0.48 | Similaridade: 0.99
  Label: 0.00 | Similaridade: 0.94
✅ Concluído: Mean Sim = 0.9718000292778015
   Pearson = 0.6366

----------------------------------------
Modelo: all-mpnet-base-v2
----------------------------------------

Avaliando all-mpnet-base-v2 com MSE em STSB...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0001
1000,0.0



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 1.00
  Label: 0.48 | Similaridade: 1.00
  Label: 0.00 | Similaridade: 1.00
✅ Concluído: Mean Sim = 0.9998000264167786
   Pearson = 0.3685

Avaliando all-mpnet-base-v2 com Cosine em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0168
1000,0.0079



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.55
  Label: 0.48 | Similaridade: 0.71
  Label: 0.00 | Similaridade: -0.03
✅ Concluído: Mean Sim = 0.508400022983551
   Pearson = 0.9027

Avaliando all-mpnet-base-v2 com Contrastive em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0121
1000,0.0074



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.87
  Label: 0.48 | Similaridade: 0.82
  Label: 0.00 | Similaridade: 0.07
✅ Concluído: Mean Sim = 0.6818000078201294
   Pearson = 0.8623

Avaliando all-mpnet-base-v2 com InfoNCE em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2269
1000,0.1171



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.79
  Label: 0.48 | Similaridade: 0.86
  Label: 0.00 | Similaridade: 0.23
✅ Concluído: Mean Sim = 0.6032999753952026
   Pearson = 0.8244

Avaliando all-mpnet-base-v2 com Euclidean em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1308
1000,0.0499



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 1.00
  Label: 0.48 | Similaridade: 1.00
  Label: 0.00 | Similaridade: 1.00
✅ Concluído: Mean Sim = 0.9998999834060669
   Pearson = 0.2858

Avaliando all-mpnet-base-v2 com NormaEuc em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1308
1000,0.0499



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 1.00
  Label: 0.48 | Similaridade: 1.00
  Label: 0.00 | Similaridade: 1.00
✅ Concluído: Mean Sim = 0.9998999834060669
   Pearson = 0.2858

Avaliando all-mpnet-base-v2 com NPairs em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,4.8887
1000,4.9043



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.89
  Label: 0.48 | Similaridade: 0.78
  Label: 0.00 | Similaridade: 0.93
✅ Concluído: Mean Sim = 0.7961999773979187
   Pearson = 0.4188

Avaliando all-mpnet-base-v2 com MultiSimilarity em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2269
1000,0.1171



[STS-B] Exemplo de avaliação:
  Label: 0.65 | Similaridade: 0.79
  Label: 0.48 | Similaridade: 0.86
  Label: 0.00 | Similaridade: 0.23
✅ Concluído: Mean Sim = 0.6032999753952026
   Pearson = 0.8244

Avaliando all-mpnet-base-v2 com AngularMargin em STSB...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


In [None]:
import shutil
import os
from datetime import datetime
from google.colab import files

# 1. Defina a pasta onde estão os resultados
pasta_resultados = '/content/resultados_experimentos'

# 2. Cria a pasta, se não existir
os.makedirs(pasta_resultados, exist_ok=True)

# 3. Copie arquivos relevantes para a pasta
# Adapte essas linhas para incluir tudo o que quiser no .zip
arquivos_para_incluir = [
    'stsb_original.png',
    'stsb_sinonimos.png',
    'mrpc_original.png',
    'mrpc_sinonimos.png',
    'comparacao_metricas.png',
    'tabela_resultados.csv',
]

for arquivo in arquivos_para_incluir:
    if os.path.exists(arquivo):
        shutil.copy(arquivo, pasta_resultados)

# 4. Compacta a pasta em um .zip
nome_zip = f"resultados_robustez_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
shutil.make_archive(nome_zip.replace(".zip", ""), 'zip', pasta_resultados)

# 5. Faz o download para o computador local
files.download(nome_zip)
