In [None]:
# 1. Instalar dependencias si no las tienes (una sola vez en tu entorno)
# pip install transformers==4.30.2 torch pandas scikit-learn tqdm matplotlib seaborn

# 2. Importar librerías necesarias
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.optim import AdamW
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import pickle
import json

# 3. Make sure the working directories exist (no-op when already present)
for folder in ('data', 'models'):
    os.makedirs(folder, exist_ok=True)

# 4. Load the CSV files, drop rows whose 'Reporte' text is missing and
#    renumber the index so it is contiguous again
train_df = pd.read_csv('../data/train_data.csv')
test_df = pd.read_csv('../data/test_data.csv')

train_df = train_df.dropna(subset=['Reporte']).reset_index(drop=True)
test_df = test_df.dropna(subset=['Reporte']).reset_index(drop=True)

print("Valores nulos eliminados.")

# 5. Sanity check: count the nulls that remain in the text column
train_nulls = train_df['Reporte'].isna().sum()
test_nulls = test_df['Reporte'].isna().sum()
print("Valores nulos:")
print(f"Train: {train_nulls} nulos")
print(f"Test:  {test_nulls} nulos")

# 6. Model / training configuration
TOKENIZER_NAME = 'bert-base-multilingual-cased'  # also used as the model checkpoint
MAX_LEN = 128          # sequences are padded/truncated to this many tokens
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 4

# 7. Tokenization
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_NAME)

def clean_and_validate_texts(texts):
    """Return a list with one string per input item.

    Items that are already ``str`` are kept (passed through ``str()``);
    anything else (``None``, numbers, NaN, ...) is replaced by an empty
    string so the tokenizer always receives text.
    """
    cleaned = []
    for item in texts:
        if not isinstance(item, str):
            cleaned.append("")
            continue
        cleaned.append(str(item))
    return cleaned

# Both splits are tokenized with the same settings: every sequence is
# truncated/padded to exactly MAX_LEN tokens and returned as PyTorch tensors.
_ENCODE_OPTIONS = dict(
    truncation=True,
    padding='max_length',
    max_length=MAX_LEN,
    return_tensors='pt',
)

train_texts = clean_and_validate_texts(train_df['Reporte'].tolist())
train_encodings = tokenizer(train_texts, **_ENCODE_OPTIONS)

test_texts = clean_and_validate_texts(test_df['Reporte'].tolist())
test_encodings = tokenizer(test_texts, **_ENCODE_OPTIONS)

# 8. Encode labels
# The encoder is fitted on the training categories only; the test categories
# are mapped with the same fitted encoder.
label_encoder = LabelEncoder()

train_label_ids = label_encoder.fit_transform(train_df['Categoria'])
# Explicit int64: CrossEntropyLoss takes class-index targets as Long tensors.
train_labels_encoded = torch.tensor(train_label_ids, dtype=torch.long)

test_label_ids = label_encoder.transform(test_df['Categoria'])
test_labels_encoded = torch.tensor(test_label_ids, dtype=torch.long)

num_labels = len(label_encoder.classes_)

# 9. Class-imbalance handling
# 'balanced' weights are computed from the training labels, one weight per
# class in the order of label_encoder.classes_.
class_weights = compute_class_weight(
    'balanced',
    classes=label_encoder.classes_,
    y=train_df['Categoria'],
)
weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# 10. Dataloaders
# Each dataset item is the triple (input_ids, attention_mask, label).
train_ids = train_encodings['input_ids']
train_mask = train_encodings['attention_mask']
train_dataset = TensorDataset(train_ids, train_mask, train_labels_encoded)

test_ids = test_encodings['input_ids']
test_mask = test_encodings['attention_mask']
test_dataset = TensorDataset(test_ids, test_mask, test_labels_encoded)

# Only the training batches are shuffled; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# 11. Model
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForSequenceClassification.from_pretrained(
    TOKENIZER_NAME,
    num_labels=num_labels,
)
model.to(device)

# 12. Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# One scheduler step per training batch, over every planned epoch.
total_steps = NUM_EPOCHS * len(train_loader)
scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Class-weighted loss; the weight tensor must live on the same device as the logits.
weights_on_device = weights_tensor.to(device)
loss_fn = nn.CrossEntropyLoss(weight=weights_on_device)

# 13. Evaluation
def evaluate_model(model, data_loader, device):
    """Compute accuracy and collect predictions over every batch of a loader.

    The model is put in eval mode for the duration of the call and is then
    returned to whatever mode (train/eval) it was in before, so calling this
    between training epochs does not silently disable dropout afterwards.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes a ``logits`` attribute.
        data_loader: Iterable yielding ``(input_ids, attention_mask, labels)``
            batches of tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, predictions, labels)``. ``accuracy`` is
        correct / total, or ``0.0`` when the loader yields no samples;
        ``predictions`` and ``labels`` are flat lists with one entry per
        sample, in loader order.
    """
    was_training = model.training
    model.eval()
    all_preds = []
    all_labels = []
    correct = 0
    total = 0

    try:
        with torch.no_grad():
            for batch in data_loader:
                input_ids, attention_mask, labels = [b.to(device) for b in batch]
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                # Predicted class = index of the largest logit in each row.
                preds = outputs.logits.argmax(dim=1)
                total += labels.size(0)
                correct += (preds == labels).sum().item()
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)

    # Guard against an empty loader instead of dividing by zero.
    acc = correct / total if total else 0.0
    return acc, all_preds, all_labels

# 14. Training with early stopping
# NOTE(review): test_loader is used as the validation set for early stopping
# and best-model selection, so the accuracy later reported on that same split
# is optimistically biased. Consider carving a validation split out of the
# training data instead.
best_val_accuracy = 0
best_model_state = None   # snapshot of the weights with the best validation accuracy
patience = 2              # epochs without improvement tolerated before stopping
no_improve_epochs = 0
train_losses = []
val_accuracies = []

for epoch in range(NUM_EPOCHS):
    print(f"\n[Epoch {epoch+1}/{NUM_EPOCHS}]")
    # Re-enter training mode every epoch: the per-epoch evaluation below puts
    # the model in eval mode, which would otherwise leave dropout disabled
    # for all epochs after the first.
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc="Entrenando"):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The loss is computed here (not inside the model) so the class
        # weights in loss_fn are applied.
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # the learning-rate schedule advances once per batch
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    val_accuracy, _, _ = evaluate_model(model, test_loader, device)
    val_accuracies.append(val_accuracy)

    print(f"Train Loss: {avg_train_loss:.4f} - Val Accuracy: {val_accuracy:.4f}")

    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        # state_dict() values reference the live parameters, so a plain
        # dict.copy() would keep changing as training continues. Clone each
        # tensor to get a real snapshot.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        if no_improve_epochs >= patience:
            print("Early stopping")
            break

# Roll back to the best snapshot (absent only if no epoch ever improved on 0).
if best_model_state is not None:
    model.load_state_dict(best_model_state)

# 15. Visualize training progress: loss and validation accuracy per epoch
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(train_losses)
plt.title("Loss de Entrenamiento")

plt.subplot(1, 2, 2)
plt.plot(val_accuracies)
plt.title("Accuracy de Validación")
plt.tight_layout()
plt.show()

# 16. Final evaluation
final_acc, y_pred, y_true = evaluate_model(model, test_loader, device)
print(f"Accuracy final: {final_acc:.4f}")

# Pass every encoded class id explicitly. Without `labels`, scikit-learn only
# considers the classes that actually occur in y_true/y_pred, so a class
# missing from the test split would make `target_names` and the heatmap tick
# labels disagree with the report rows / matrix size.
class_ids = list(range(len(label_encoder.classes_)))
print(classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
))

cm = confusion_matrix(y_true, y_pred, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel("Predicho")
plt.ylabel("Real")
plt.title("Matriz de Confusión")
plt.xticks(rotation=45)
plt.tight_layout()
# Save before show(): the figure is written to disk while it is still current.
plt.savefig("models/confusion_matrix.png")
plt.show()

# 17. Persist the model weights and the artifacts needed to reuse them
weights_path = "models/modelo_bert_finetuned.pth"
encoder_path = "models/label_encoder.pkl"
tokenizer_dir = "models/tokenizer"

# Only the state dict (weights) is stored, not the model object itself.
torch.save(model.state_dict(), weights_path)

# The fitted label encoder is needed to map predicted ids back to categories.
with open(encoder_path, mode="wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)

tokenizer.save_pretrained(tokenizer_dir)

print("\n✅ Modelo entrenado y guardado exitosamente.")


Valores nulos eliminados.
Valores nulos:
Train: 0 nulos
Test:  0 nulos


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual


[Epoch 1/4]


Entrenando:   0%|          | 0/699 [00:04<?, ?it/s]


RuntimeError: expected scalar type Long but found Int

In [3]:
# Report whether PyTorch can see a CUDA-capable GPU in this environment.
cuda_available = torch.cuda.is_available()
print(cuda_available)


False
