In [None]:
# Configuración del entorno en Google Colab

# 1. Instalar dependencias
!pip install transformers==4.30.2 torch pandas scikit-learn tqdm matplotlib seaborn

# 2. Importar librerías necesarias
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.optim import AdamW
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import pickle
import json
from google.colab import files

# 3. Make sure the local 'data' directory exists (no error if it already does)
os.makedirs(name='data', exist_ok=True)

# 4. Ask for the pre-processed CSV files and upload them through Colab
print("Sube los archivos train_data.csv y test_data.csv:")
uploaded = files.upload()

# 5. Read both CSV files.
# NOTE(review): the files are read from the working directory, not from the
# 'data' directory created above.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train_data.csv', 'test_data.csv')
)

# Count missing values in the report column of each split
print("Comprobando valores nulos:")
train_null_count = train_df['Reporte'].isna().sum()
print(f"Hay {train_null_count} valores nulos en train_df['Reporte']")
test_null_count = test_df['Reporte'].isna().sum()
print(f"Hay {test_null_count} valores nulos en test_df['Reporte']")

# Tally the Python types found in the training reports
print("\nTipos de datos en los reportes:")
report_type_counts = train_df['Reporte'].apply(type).value_counts()
print(report_type_counts)

# Preview the first five training reports together with their types
print("\nPrimeros 5 reportes:")
for position, report in enumerate(train_df['Reporte'].head(5), start=1):
    print(f"Reporte {position}: {type(report)} - {report}")


# Overall size (rows, columns) of each split
print(f"Datos de entrenamiento: {train_df.shape}")
print(f"Datos de prueba: {test_df.shape}")

# Show how the training examples are distributed across categories
print("\nDistribución de categorías (entrenamiento):")
train_categories = train_df['Categoria'].value_counts()
print(train_categories)

# Bar chart of the same counts, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
train_categories.plot.bar(ax=ax)
ax.set_title('Distribución de Categorías (Conjunto de Entrenamiento)')
ax.set_ylabel('Número de ejemplos')
ax.set_xlabel('Categoría')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()




# 6. Model configuration
TOKENIZER_NAME = 'bert-base-multilingual-cased'  # checkpoint used for tokenizer and model
MAX_LEN = 128         # max_length passed to the tokenizer (truncate/pad target)
BATCH_SIZE = 16       # batch size of both DataLoaders
LEARNING_RATE = 2e-5  # AdamW learning rate
NUM_EPOCHS = 4        # upper bound on training epochs (early stopping may end sooner)
SEED = 42             # seed for the random number generators below

# Seed the RNGs so that the stochastic parts of the run (batch shuffling,
# dropout, initialisation of new layers) are repeatable after a
# "Restart & Run All".
np.random.seed(SEED)
torch.manual_seed(SEED)

# 7. Tokenization
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_NAME)

def clean_and_validate_texts(texts):
    """Coerce every item of ``texts`` to ``str`` so the result can be tokenized.

    Args:
        texts: Iterable of arbitrary values (for example a DataFrame column
            converted to a list).

    Returns:
        list[str]: One string per input item, in the same order. Strings are
        kept unchanged; ``None`` and scalar missing values (anything for which
        ``pd.isna`` is true, such as ``NaN``) become ``""``; every other value
        is converted with ``str()``, and becomes ``""`` if that conversion
        raises.
    """
    valid_texts = []
    for text in texts:
        if isinstance(text, str):
            valid_texts.append(text)
        # Only ask pd.isna about scalars: for list-like values it returns an
        # array, whose truth value is ambiguous and raises ValueError.
        elif text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            valid_texts.append("")
        else:
            try:
                valid_texts.append(str(text))
            except Exception:
                # Best effort: an object whose __str__ fails is treated as
                # empty text. KeyboardInterrupt/SystemExit still propagate.
                valid_texts.append("")
    return valid_texts

# Helper that tokenizes a batch of texts
def tokenize_texts(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    Truncation is enabled, padding is set to ``'max_length'`` with
    ``max_length=MAX_LEN``, and PyTorch tensors are requested
    (``return_tensors='pt'``).

    Args:
        texts: The texts to encode, passed to the tokenizer unchanged.

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=MAX_LEN,
        return_tensors='pt',
    )
    return tokenizer(texts, **tokenizer_options)

# Tokenize both splits through the shared helpers: the reports are first
# coerced to strings by clean_and_validate_texts(), then encoded by
# tokenize_texts(), so train and test always use the same tokenizer settings
# instead of two hand-copied argument lists.
train_encodings = tokenize_texts(
    clean_and_validate_texts(train_df['Reporte'].tolist())
)

test_encodings = tokenize_texts(
    clean_and_validate_texts(test_df['Reporte'].tolist())
)
# 8. Encode the category labels as integer ids.
# The encoder is fitted on the training categories; the test categories are
# mapped with that same fitted encoder.
label_encoder = LabelEncoder()
train_label_ids = label_encoder.fit_transform(train_df['Categoria'])
train_labels_encoded = torch.tensor(train_label_ids)
test_label_ids = label_encoder.transform(test_df['Categoria'])
test_labels_encoded = torch.tensor(test_label_ids)
num_labels = len(label_encoder.classes_)

print(f"\nNúmero de categorías: {num_labels}")
print(f"Categorías: {label_encoder.classes_}")

# 9. Handle class imbalance with 'balanced' per-class weights computed from
#    the training categories
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=label_encoder.classes_,
    y=train_df['Categoria'],
)
weights_tensor = torch.tensor(class_weights, dtype=torch.float)

print("\nPesos calculados para las clases:")
for cls, weight in zip(label_encoder.classes_, class_weights):
    print(f"{cls}: {weight:.4f}")
# 10. Wrap the encoded inputs and labels in datasets and batch them.
# Every dataset item is an (input_ids, attention_mask, label) triple.
# Only the training loader shuffles its examples.
train_dataset = TensorDataset(
    *(train_encodings[field] for field in ('input_ids', 'attention_mask')),
    train_labels_encoded,
)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = TensorDataset(
    *(test_encodings[field] for field in ('input_ids', 'attention_mask')),
    test_labels_encoded,
)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# 11. Load the pretrained BERT checkpoint with a classification head that has
#     one output per category
model = BertForSequenceClassification.from_pretrained(
    TOKENIZER_NAME,
    num_labels=num_labels,
)

# Use the GPU when one is available, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print(f"\nEntrenando en: {device}")
if use_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

model.to(device)

# 12. Optimizer plus a linear learning-rate schedule with no warmup, sized
#     for one scheduler step per training batch over all epochs
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)
total_steps = NUM_EPOCHS * len(train_loader)
scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# 13. Cross-entropy loss weighted per class; the weights are moved to the
#     same device as the model
class_weights_on_device = weights_tensor.to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights_on_device)

# 14. Evaluation helper
def evaluate_model(model, data_loader, device):
    """Compute accuracy and collect predictions over every batch of ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits`` with one row per example.
        data_loader: Iterable of ``(input_ids, attention_mask, labels)``
            tensor batches.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        tuple: ``(accuracy, all_preds, all_labels)``. ``accuracy`` is the
        fraction of correct predictions, or ``0.0`` when ``data_loader``
        yields no examples. The two lists hold the predicted and the true
        class ids in iteration order.

    The model is switched to eval mode for the duration of the call and is
    then put back into whichever mode (train/eval) it was in on entry, so an
    evaluation between training epochs does not leave dropout disabled.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    try:
        # Gradients are not needed for evaluation
        with torch.no_grad():
            for batch in data_loader:
                input_ids, attention_mask, labels = [b.to(device) for b in batch]
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                # Predicted class = index of the largest logit in each row
                predicted = outputs.logits.argmax(dim=1)

                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                all_preds.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
    finally:
        # Restore the caller's mode even if a batch raised
        model.train(was_training)

    # An empty loader would otherwise divide by zero
    accuracy = correct / total if total else 0.0

    return accuracy, all_preds, all_labels

# 15. Training loop with early stopping.
# NOTE(review): model selection and early stopping are driven by test_loader,
# the same data the final metrics are reported on, so those metrics are
# optimistically biased. Consider carving a validation split out of train_df.
best_val_accuracy = 0
best_model_state = None
patience = 2            # epochs without improvement tolerated before stopping
no_improve_epochs = 0
train_losses = []       # average training loss per epoch
val_accuracies = []     # evaluation accuracy per epoch

for epoch in range(NUM_EPOCHS):
    print(f"\n[Epoch {epoch+1}/{NUM_EPOCHS}]\n" + "="*30)
    total_loss = 0

    # evaluate_model() switches the model to eval mode, so training mode is
    # (re-)enabled at the start of every epoch rather than once before the
    # loop; otherwise later epochs would train with dropout disabled.
    model.train()
    progress_bar = tqdm(train_loader)
    for batch in progress_bar:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        optimizer.zero_grad()
        # Labels are not passed to the model: the loss is computed right below
        # with the class-weighted criterion, so the model's own loss is unused.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_description(f"Loss: {loss.item():.4f}")

    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Evaluation
    val_accuracy, _, _ = evaluate_model(model, test_loader, device)
    val_accuracies.append(val_accuracy)

    print(f"Epoch {epoch+1}/{NUM_EPOCHS} - Train Loss: {avg_train_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

    # Early stopping
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        # state_dict() hands out references to the live parameter tensors and
        # dict.copy() is shallow, so each tensor is cloned to obtain a real
        # snapshot. The copy is kept on the CPU to avoid holding a second set
        # of weights on the GPU.
        best_model_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        if no_improve_epochs >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs")
            break

# Restore the weights of the best epoch
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"Restaurado el mejor modelo con accuracy: {best_val_accuracy:.4f}")

# 16. Plot the training curves side by side: average training loss on the
#     left, evaluation accuracy on the right (one point per completed epoch)
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(train_losses)
loss_ax.set_title('Pérdida de Entrenamiento')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')

accuracy_ax.plot(val_accuracies)
accuracy_ax.set_title('Precisión de Validación')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')

fig.tight_layout()
plt.show()

# 17. Final evaluation of the model
final_accuracy, all_preds, all_labels = evaluate_model(model, test_loader, device)
print(f"\nPrecisión final: {final_accuracy:.4f}")

# Report over every class known to the encoder (ids 0..n-1), not only those
# that happen to occur in all_labels/all_preds. Without an explicit `labels`
# list, classification_report fails when a class is absent (the class count no
# longer matches target_names) and confusion_matrix gets smaller, so its
# rows/columns would no longer line up with the tick labels below.
class_ids = list(range(len(label_encoder.classes_)))

# 18. Detailed per-class metrics
print("\nReporte de clasificación:")
print(classification_report(
    all_labels,
    all_preds,
    labels=class_ids,
    target_names=label_encoder.classes_,
))

# 19. Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
           xticklabels=label_encoder.classes_,
           yticklabels=label_encoder.classes_)
plt.xlabel('Predicción')
plt.ylabel('Real')
plt.title('Matriz de Confusión')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 20. Save the model weights (state dict only)
model_save_path = 'modelo_bert_finetuned.pth'
torch.save(model.state_dict(), model_save_path)

# Pickle the fitted label encoder as well
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

# Save the tokenizer into the 'tokenizer' directory for future use.
# NOTE(review): this directory is not among the files downloaded below.
tokenizer.save_pretrained('tokenizer')

# 21. Download the generated files through Colab
for artifact_path in (model_save_path, 'label_encoder.pkl'):
    files.download(artifact_path)

print("\nModelo entrenado y guardado exitosamente.")
print("Ahora puedes descargar los archivos generados para uso posterior.")



In [3]:
# Sanity check: print whether PyTorch reports an available CUDA device.
print(torch.cuda.is_available())


False
