In [None]:
# Montar Google Drive y configurar el entorno
from google.colab import drive
drive.mount('/content/drive')

# Instalación de bibliotecas necesarias
!pip install transformers datasets torch evaluate scikit-learn pandas numpy seaborn matplotlib mlflow wandb nltk tqdm

# Importaciones básicas
import pandas as pd
import numpy as np
import torch
from transformers import (
    BertTokenizer, 
    BertForSequenceClassification,
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, 
    precision_recall_fscore_support,
    confusion_matrix
)
# Añadir esta importación
from sklearn.utils.class_weight import compute_class_weight  # <- Añadir esta línea
import evaluate
import seaborn as sns
import matplotlib.pyplot as plt
import logging
import json
from datetime import datetime
import mlflow
import os  
import dataclasses
import torch.cuda.amp
import re
import nltk
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')


# Logging configuration: INFO and above goes to both `training.log` and the
# notebook output.
# force=True replaces any handlers already attached to the root logger. Without
# it, basicConfig() does nothing when a handler is already installed (as hosted
# notebook runtimes may do), so the settings below would be silently ignored.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('training.log'),
        logging.StreamHandler()
    ],
    force=True
)

# Device selection: use the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Cargar los datos
def load_and_preprocess_data(data_path='/content/drive/MyDrive/IronHack/Proyecto4/nlp-project/data/processed/consolidated_reviews.csv'):
    """Load the consolidated reviews CSV and split it into train/validation/test sets.

    Rows without review text are dropped, the `sentiment` column is mapped to
    integer labels, and balanced class weights are computed over the cleaned
    data. 20% of the data is held out for testing and 20% of the remainder for
    validation; both splits are stratified by `sentiment` and use
    `random_state=42`.

    Args:
        data_path: Path of the CSV file to read. It must provide the
            `reviews.text` and `sentiment` columns.

    Returns:
        Tuple `(train_df, val_df, test_df, class_weights, label_mapping)` where
        `class_weights` is a float tensor placed on the notebook-level `device`
        and `label_mapping` maps each sentiment name to its integer label.
    """
    logging.info("Iniciando carga y preprocesamiento de datos...")

    # Load the dataset
    df = pd.read_csv(data_path)
    logging.info(f"Dataset cargado con {len(df)} registros")

    # Basic cleaning: a review without text cannot be tokenized
    df_cleaned = df.dropna(subset=['reviews.text']).copy()

    # Map sentiment names to integer labels
    label_mapping = {'positive': 0, 'neutral': 1, 'negative': 2}
    df_cleaned['label'] = df_cleaned['sentiment'].map(label_mapping)

    # A missing or unexpected sentiment maps to NaN. That would turn the label
    # column into floats and make the stratified split and the loss fail, so
    # those rows are dropped and the column is stored as integers.
    unmapped = df_cleaned['label'].isna()
    if unmapped.any():
        logging.warning(
            f"Se descartaron {int(unmapped.sum())} registros con 'sentiment' ausente o desconocido"
        )
        df_cleaned = df_cleaned.loc[~unmapped].copy()
    df_cleaned['label'] = df_cleaned['label'].astype(int)

    # Balanced class weights to compensate for class imbalance
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(df_cleaned['label']),
        y=df_cleaned['label']
    )
    class_weights = torch.FloatTensor(class_weights).to(device)

    # Hold out the test set first, then carve the validation set from the rest
    train_df, test_df = train_test_split(
        df_cleaned,
        test_size=0.2,
        random_state=42,
        stratify=df_cleaned['sentiment']
    )
    train_df, val_df = train_test_split(
        train_df,
        test_size=0.2,
        random_state=42,
        stratify=train_df['sentiment']
    )

    # Show the class distribution of each split
    print_class_distribution(train_df, val_df, test_df)

    return train_df, val_df, test_df, class_weights, label_mapping

def print_class_distribution(train_df, val_df, test_df):
    """Print the relative frequency of each `sentiment` value per split.

    Args:
        train_df: Training split; must contain a `sentiment` column.
        val_df: Validation split; must contain a `sentiment` column.
        test_df: Test split; must contain a `sentiment` column.
    """
    sections = (
        ("\nEntrenamiento:", train_df),
        ("\nValidación:", val_df),
        ("\nPrueba:", test_df),
    )

    print("Distribución de clases en cada conjunto:")
    for heading, frame in sections:
        print(heading)
        # normalize=True reports proportions instead of raw counts
        print(frame['sentiment'].value_counts(normalize=True))

# Clase para el dataset de PyTorch
class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenized review texts with their labels.

    All texts are tokenized once at construction time; `__getitem__` then
    slices the stored encodings.

    Args:
        texts: List of review strings.
        labels: Sequence of class indices, one per text.
        tokenizer: Callable accepting `(texts, truncation, padding, max_length,
            return_tensors)` and returning a mapping of tensors whose first
            dimension indexes the texts.
        max_length: Maximum number of tokens per text; longer texts are truncated.

    Raises:
        ValueError: If `texts` and `labels` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # A length mismatch would otherwise pair texts with the wrong labels
        # or only fail later with an IndexError during training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts ({len(texts)}) y labels ({len(labels)}) deben tener la misma longitud"
            )

        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt'
        )
        # Class-index targets for cross-entropy must be integer (long) tensors.
        # Without an explicit dtype, labels that arrive as floats (e.g. from a
        # float pandas column) would produce a float tensor.
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the encoding tensors for sample `idx` plus its `labels` entry."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

def create_datasets(train_df, val_df, test_df, tokenizer):
    """Wrap the three DataFrame splits in `ReviewDataset` instances.

    Args:
        train_df: Training split with `reviews.text` and `label` columns.
        val_df: Validation split with the same columns.
        test_df: Test split with the same columns.
        tokenizer: Tokenizer handed to every `ReviewDataset`.

    Returns:
        Tuple `(train_dataset, val_dataset, test_dataset)`.
    """
    logging.info("Creando datasets de PyTorch...")

    def _wrap(frame):
        # Convert one split's text and label columns into a ReviewDataset
        return ReviewDataset(
            frame['reviews.text'].tolist(),
            frame['label'].tolist(),
            tokenizer
        )

    # Built in the order train, validation, test
    train_dataset, val_dataset, test_dataset = [
        _wrap(frame) for frame in (train_df, val_df, test_df)
    ]

    # Report the size of each dataset
    logging.info(f"Tamaño del conjunto de entrenamiento: {len(train_dataset)}")
    logging.info(f"Tamaño del conjunto de validación: {len(val_dataset)}")
    logging.info(f"Tamaño del conjunto de prueba: {len(test_dataset)}")

    return train_dataset, val_dataset, test_dataset

def analyze_text_lengths(train_df, tokenizer):
    """Print word-count statistics for the reviews and a tokenization example.

    Word counts come from whitespace splitting of `reviews.text`; the example
    uses the first review of `train_df`.

    Args:
        train_df: DataFrame with a `reviews.text` column.
        tokenizer: Object providing `encode(text, truncation=True)` and
            `decode(token_ids)`.
    """
    review_texts = train_df['reviews.text']
    word_counts = review_texts.str.split().str.len()

    mean_words = word_counts.mean()
    median_words = word_counts.median()
    p95_words = word_counts.quantile(0.95)
    max_words = word_counts.max()

    print("Estadísticas de longitud de las reseñas:")
    print(f"Media: {mean_words:.2f}")
    print(f"Mediana: {median_words:.2f}")
    print(f"95 percentil: {p95_words:.2f}")
    print(f"Máximo: {max_words:.2f}")

    # Tokenization example on the first review
    first_review = review_texts.iloc[0]
    token_ids = tokenizer.encode(first_review, truncation=True)
    print("\nEjemplo de tokenización:")
    print(f"Texto original: {first_review}")
    print(f"Número de tokens: {len(token_ids)}")
    print(f"Tokens decodificados: {tokenizer.decode(token_ids)}")

# Run the preprocessing. The results are kept as module-level names because
# later cells read them directly (e.g. `class_weights` inside
# train_and_evaluate_model and `label_mapping` in the saved configuration).
train_df, val_df, test_df, class_weights, label_mapping = load_and_preprocess_data()

# Initialise the tokenizer and print length statistics for the training reviews
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
analyze_text_lengths(train_df, tokenizer)

# Build the datasets used for training, validation and testing
train_dataset, val_dataset, test_dataset = create_datasets(
    train_df, val_df, test_df, tokenizer
)

In [None]:
def setup_training_args():
    """Build the `TrainingArguments` used to fine-tune the classifier.

    Evaluation and checkpointing both happen every 100 steps, and the
    checkpoint with the highest `f1` is reloaded at the end of training.
    Mixed-precision (fp16) training is enabled only when CUDA is available.

    Returns:
        A configured `TrainingArguments` instance.
    """
    # The evaluation-schedule keyword is `eval_strategy` in newer transformers
    # releases and `evaluation_strategy` in older ones. The notebook installs
    # an unpinned version, so pick whichever name the installed class declares.
    declared_fields = {field.name for field in dataclasses.fields(TrainingArguments)}
    eval_strategy_key = (
        "eval_strategy" if "eval_strategy" in declared_fields else "evaluation_strategy"
    )

    return TrainingArguments(
        output_dir='./results',
        num_train_epochs=10,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_ratio=0.1,
        weight_decay=0.01,
        learning_rate=2e-5,
        logging_dir='./logs',
        logging_steps=50,
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        gradient_accumulation_steps=2,
        # fp16 needs a GPU; requesting it unconditionally fails on a CPU runtime
        fp16=torch.cuda.is_available(),
        max_grad_norm=1.0,  # gradient clipping threshold
        report_to="none",
        **{eval_strategy_key: "steps"}
    )

# Clase personalizada de Trainer
class CustomTrainer(Trainer):
    """`Trainer` that uses a class-weighted cross-entropy loss.

    Args:
        class_weights: 1-D tensor with one weight per class, passed as the
            `weight` of `torch.nn.CrossEntropyLoss`.
        *args: Forwarded to `Trainer.__init__`.
        **kwargs: Forwarded to `Trainer.__init__`.
    """

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run; its output must expose `.logits`.
            inputs: Batch mapping that includes a `labels` entry.
            return_outputs: When true, also return the model outputs.
            num_items_in_batch: Accepted for compatibility with the `Trainer`
                signature; not used by this loss.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Read the number of classes from the logits instead of hard-coding 3,
        # and keep the weights on the same device as the logits.
        num_classes = logits.size(-1)
        weights = self.class_weights.to(logits.device)

        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, num_classes), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a prediction batch.

    Args:
        pred: Object with `label_ids` (true labels) and `predictions` (per-class
            scores; the arg-max over the last axis is taken as the prediction).

    Returns:
        Dict with the keys `accuracy`, `f1`, `precision` and `recall`.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element (support) is not reported
    scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    precision, recall, f1 = scores[0], scores[1], scores[2]

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = f1
    metrics['precision'] = precision
    metrics['recall'] = recall
    return metrics

In [None]:
def train_and_evaluate_model():
    """Fine-tune `bert-base-uncased` for 3-class classification and evaluate it.

    Reads the notebook-level `device`, `class_weights`, `train_dataset` and
    `val_dataset`. Training stops early after 3 evaluations without an
    improvement of at least 0.01 in the monitored metric.

    Returns:
        Tuple `(trainer, model, eval_results)` with the trainer, the model it
        trained, and the metrics returned by `trainer.evaluate()`.
    """
    logging.info("Iniciando configuración del modelo y entrenamiento...")

    # BERT with a 3-way classification head, moved to the selected device
    classifier = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=3,
        problem_type="single_label_classification"
    )
    classifier = classifier.to(device)

    # Early stopping configuration
    stopper = EarlyStoppingCallback(
        early_stopping_patience=3,
        early_stopping_threshold=0.01
    )

    # Training arguments, then the trainer itself
    run_args = setup_training_args()
    fit_trainer = CustomTrainer(
        class_weights=class_weights,
        model=classifier,
        args=run_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[stopper]
    )

    logging.info("Iniciando entrenamiento...")
    fit_trainer.train()

    logging.info("Evaluando el modelo...")
    metrics = fit_trainer.evaluate()

    return fit_trainer, classifier, metrics

def hyperparameter_search(base_args, model_name='bert-base-uncased', num_labels=3):
    """Train and evaluate one model per hyperparameter combination.

    Each trial starts from a freshly loaded pretrained model and writes its
    checkpoints to its own sub-directory of `base_args.output_dir`. This keeps
    the trials comparable with each other and leaves the model and checkpoints
    of the main training run untouched.

    Reads the notebook-level `class_weights`, `train_dataset` and `val_dataset`.

    Args:
        base_args: `TrainingArguments` used as the template for every trial;
            only `learning_rate`, `weight_decay` and `output_dir` are replaced.
        model_name: Pretrained checkpoint loaded at the start of each trial.
        num_labels: Number of output classes of the classification head.

    Returns:
        List of dicts, one per trial, with the keys `params` (the combination
        tried) and `results` (the metrics from `evaluate()`).
    """
    logging.info("Iniciando búsqueda de hiperparámetros...")
    param_combinations = [
        {'learning_rate': 2e-5, 'weight_decay': 0.01},
        {'learning_rate': 3e-5, 'weight_decay': 0.1},
        {'learning_rate': 5e-5, 'weight_decay': 0.01}
    ]

    results = []
    for trial_idx, params in enumerate(param_combinations):
        logging.info(f"Probando parámetros: {params}")

        current_args = dataclasses.replace(
            base_args,
            learning_rate=params['learning_rate'],
            weight_decay=params['weight_decay'],
            # Separate directory so trial checkpoints never overwrite each
            # other or those of the main run
            output_dir=os.path.join(base_args.output_dir, f"hp_search_{trial_idx}")
        )

        # Fresh weights per trial: reusing an already fine-tuned model would
        # make every trial continue from the previous one.
        trial_model = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            problem_type="single_label_classification"
        )

        temp_trainer = CustomTrainer(
            class_weights=class_weights,
            model=trial_model,
            args=current_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics
        )

        temp_trainer.train()
        eval_results = temp_trainer.evaluate()

        results.append({
            'params': params,
            'results': eval_results
        })

        logging.info(f"Resultados para parámetros {params}: {eval_results}")

        # Release the trial's model before loading the next one
        del temp_trainer, trial_model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return results

# Run training and evaluation, then the hyperparameter search.
# `trainer.args` is passed as the base configuration from which every search
# trial derives its own training arguments.
trainer, model, eval_results = train_and_evaluate_model()
hyperparam_results = hyperparameter_search(trainer.args)

In [None]:
def visualize_training_metrics(trainer):
    """Plot the training and validation loss recorded by the trainer.

    Args:
        trainer: Object whose `state.log_history` is a list of dicts; entries
            with a `loss` key feed the left panel and entries with an
            `eval_loss` key feed the right panel.
    """
    logging.info("Generando visualizaciones de métricas de entrenamiento...")

    # Pull both loss series out of the training log
    log_entries = trainer.state.log_history
    train_loss = [entry['loss'] for entry in log_entries if 'loss' in entry]
    eval_loss = [entry['eval_loss'] for entry in log_entries if 'eval_loss' in entry]

    # (subplot position, series, legend label, title, x-axis label)
    panels = (
        (1, train_loss, 'Training Loss', 'Training Loss over Time', 'Steps'),
        (2, eval_loss, 'Validation Loss', 'Validation Loss over Time', 'Evaluation Steps'),
    )

    plt.figure(figsize=(15, 5))
    for position, series, legend_label, title, x_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(series, label=legend_label)
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel('Loss')
        plt.legend()

    plt.tight_layout()
    plt.show()

def plot_confusion_matrix(trainer, test_dataset, class_names=('Positive', 'Neutral', 'Negative')):
    """Predict on `test_dataset` and draw the confusion matrix as a heatmap.

    Args:
        trainer: Object whose `predict(dataset)` returns `predictions`
            (per-class scores) and `label_ids` (true labels).
        test_dataset: Dataset to predict on.
        class_names: Display names ordered by class index (0, 1, 2, ...).

    Returns:
        The confusion matrix, with true classes as rows and predicted classes
        as columns.
    """
    logging.info("Generando matriz de confusión...")

    predictions = trainer.predict(test_dataset)
    preds = predictions.predictions.argmax(-1)
    labels = predictions.label_ids

    # Pin the label set so the matrix always has one row/column per class.
    # Otherwise a class that appears in neither the labels nor the predictions
    # is dropped and the tick labels no longer line up with the cells.
    tick_labels = list(class_names)
    cm = confusion_matrix(labels, preds, labels=list(range(len(tick_labels))))

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=tick_labels,
        yticklabels=tick_labels
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

    return cm

def analyze_errors(trainer, test_dataset, tokenizer):
    """Print up to five misclassified samples from `test_dataset`.

    Args:
        trainer: Object whose `predict(dataset)` returns `predictions`
            (per-class scores) and `label_ids` (true labels).
        test_dataset: Indexable dataset whose items expose `input_ids`.
        tokenizer: Object providing `decode(ids, skip_special_tokens=True)`.
    """
    logging.info("Analizando errores de predicción...")

    output = trainer.predict(test_dataset)
    predicted = output.predictions.argmax(-1)
    expected = output.label_ids

    # Positions where the predicted class differs from the true class
    wrong_positions = np.nonzero(predicted != expected)[0]
    print("\nAnálisis de errores más comunes:")

    for position in wrong_positions[:5]:
        token_ids = test_dataset[position]['input_ids']
        # Only the first 100 characters of the decoded text are shown
        snippet = tokenizer.decode(token_ids, skip_special_tokens=True)[:100]
        print(f"\nTexto: {snippet}...")
        print(f"Predicción: {predicted[position]}")
        print(f"Real: {expected[position]}")

# Run the analysis and visualisations: loss curves, confusion matrix on the
# test set, and a printout of misclassified test samples
visualize_training_metrics(trainer)
cm = plot_confusion_matrix(trainer, test_dataset)
analyze_errors(trainer, test_dataset, tokenizer)

In [None]:
def test_model_robustness(trainer, test_samples):
    logging.info("Iniciando pruebas de robustez del modelo...")
    
    results = []
    for text in test_samples:
        variations = [
            text.lower(),  # Minúsculas
            text.upper(),  # Mayúsculas
            ' '.join(text.split()[:len(text.split())//2]),  # Texto truncado
            text + ' ' * 50,  # Espacios extra
            text.replace('.', '').replace(',', ''),  # Sin puntuación
            ' '.join(text.split()[::-1]),  # Palabras en orden inverso
            text.replace(' ', '  '),  # Doble espacio
            ''.join(char + ' ' for char in text)  # Espacios entre caracteres
        ]
        
        variation_results = []
        for variant in variations:
            inputs = tokenizer(
                variant,
                return_tensors='pt',
                truncation=True,
                padding=True,
                max_length=128
            ).to(device)
            
            with torch.no_grad():
                outputs = trainer.model(**inputs)
                pred = outputs.logits.argmax(-1).item()
                confidence = torch.softmax(outputs.logits, dim=1).max().item()
                variation_results.append({
                    'prediction': pred,
                    'confidence': confidence
                })
        
        # Calcular consistencia y confianza promedio
        predictions = [r['prediction'] for r in variation_results]
        confidences = [r['confidence'] for r in variation_results]
        consistency = len(set(predictions)) == 1
        avg_confidence = sum(confidences) / len(confidences)
        
        results.append({
            'original': text,
            'predictions': predictions,
            'confidences': confidences,
            'consistent': consistency,
            'avg_confidence': avg_confidence
        })
    
    return results

def analyze_robustness_results(robustness_results):
    """Summarise robustness results and plot the confidence distribution.

    Args:
        robustness_results: List of dicts, each with a boolean `consistent`
            entry and a numeric `avg_confidence` entry.

    Returns:
        Dict with `consistency_rate`, `avg_confidence`, `total_samples` and
        `consistent_samples`. For an empty input all values are zero and
        nothing is printed or plotted.
    """
    logging.info("Analizando resultados de pruebas de robustez...")

    total_samples = len(robustness_results)
    if total_samples == 0:
        # Nothing to average: return zeros instead of dividing by zero
        logging.warning("No hay resultados de robustez para analizar")
        return {
            'consistency_rate': 0.0,
            'avg_confidence': 0.0,
            'total_samples': 0,
            'consistent_samples': 0
        }

    # Overall statistics
    confidences = [r['avg_confidence'] for r in robustness_results]
    consistent_samples = sum(1 for r in robustness_results if r['consistent'])
    consistency_rate = consistent_samples / total_samples
    avg_confidence = sum(confidences) / total_samples

    print("\nResultados de Pruebas de Robustez:")
    print(f"Total de muestras analizadas: {total_samples}")
    print(f"Muestras con predicciones consistentes: {consistent_samples}")
    print(f"Tasa de consistencia: {consistency_rate:.2%}")
    print(f"Confianza promedio: {avg_confidence:.2%}")

    # Histogram of the per-sample mean confidence
    plt.figure(figsize=(10, 6))
    plt.hist(confidences, bins=20)
    plt.title('Distribución de Confianza en Predicciones')
    plt.xlabel('Confianza')
    plt.ylabel('Frecuencia')
    plt.show()

    return {
        'consistency_rate': consistency_rate,
        'avg_confidence': avg_confidence,
        'total_samples': total_samples,
        'consistent_samples': consistent_samples
    }

# Run the robustness tests
test_samples = test_df['reviews.text'].head(10).tolist()  # use the first 10 test reviews as samples
robustness_results = test_model_robustness(trainer, test_samples)
robustness_metrics = analyze_robustness_results(robustness_results)

In [None]:
def save_model_and_artifacts(trainer, model, tokenizer, config, base_path='/content/drive/MyDrive/IronHack/Proyecto4/nlp-project'):
    """Save the trained model, the tokenizer and the run configuration.

    Files are written under `<base_path>/models/bert_sentiment`: the model via
    `trainer.save_model`, the tokenizer in a `tokenizer` sub-directory, and
    `config` as `training_config.json`.

    Args:
        trainer: Object providing `save_model(path)`.
        model: Unused; kept so existing callers keep working.
        tokenizer: Object providing `save_pretrained(path)`.
        config: JSON-serialisable dict describing the run.
        base_path: Project root under which the `models` directory is created.

    Returns:
        The directory the model was saved to.
    """
    logging.info("Guardando modelo y artefactos...")

    # Create the target directory if it does not exist yet
    model_path = f'{base_path}/models/bert_sentiment'
    os.makedirs(model_path, exist_ok=True)

    # Save model and tokenizer
    trainer.save_model(model_path)
    tokenizer.save_pretrained(f'{model_path}/tokenizer')

    # Hugging Face `save_pretrained` stores the model's own configuration as
    # `config.json` in this directory. Writing the run configuration under that
    # name would overwrite it and make the saved model unloadable, so a
    # separate file name is used.
    config_path = f'{model_path}/training_config.json'
    with open(config_path, 'w') as f:
        json.dump(config, f, indent=4)

    logging.info(f"Modelo y artefactos guardados en: {model_path}")

    return model_path

def generate_final_report(trainer, eval_results, robustness_metrics, hyperparam_results):
    """Assemble the final report, write it to `model_report.json` and print a summary.

    Args:
        trainer: Object whose `args` exposes `per_device_train_batch_size`,
            `learning_rate`, `num_train_epochs` and `weight_decay`.
        eval_results: Dict with `eval_accuracy`, `eval_f1`, `eval_precision`
            and `eval_recall`.
        robustness_metrics: Dict with at least `consistency_rate` and
            `avg_confidence`.
        hyperparam_results: JSON-serialisable hyperparameter search results.

    Returns:
        The report dict that was written to disk.
    """
    logging.info("Generando reporte final...")

    created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    run_args = trainer.args

    model_performance = {
        'accuracy': eval_results['eval_accuracy'],
        'f1': eval_results['eval_f1'],
        'precision': eval_results['eval_precision'],
        'recall': eval_results['eval_recall']
    }
    training_config = {
        'batch_size': run_args.per_device_train_batch_size,
        'learning_rate': run_args.learning_rate,
        'epochs': run_args.num_train_epochs,
        'weight_decay': run_args.weight_decay
    }

    report = {
        'timestamp': created_at,
        'model_performance': model_performance,
        'robustness_analysis': robustness_metrics,
        'hyperparameter_search': hyperparam_results,
        'training_config': training_config
    }

    # Persist the report in the current working directory
    report_path = 'model_report.json'
    with open(report_path, 'w') as report_file:
        json.dump(report, report_file, indent=4)

    # Print the summary
    print("\n=== REPORTE FINAL ===")
    print("\nRendimiento del Modelo:")
    for metric, value in model_performance.items():
        print(f"{metric}: {value:.4f}")

    print("\nAnálisis de Robustez:")
    print(f"Tasa de consistencia: {robustness_metrics['consistency_rate']:.2%}")
    print(f"Confianza promedio: {robustness_metrics['avg_confidence']:.2%}")

    logging.info(f"Reporte final guardado en: {report_path}")
    return report

# Run configuration stored next to the saved model. It records the label
# mapping, tokenization length, base checkpoint, number of classes and the
# class weights (converted from a tensor to a plain list so it can be
# serialised as JSON).
config = {
    'label_mapping': label_mapping,
    'max_length': 128,
    'model_type': 'bert-base-uncased',
    'num_labels': 3,
    'class_weights': class_weights.tolist()
}

# Save the model and generate the final report
model_path = save_model_and_artifacts(trainer, model, tokenizer, config)
final_report = generate_final_report(trainer, eval_results, robustness_metrics, hyperparam_results)