In [None]:
# Install the third-party packages for this notebook into the kernel's environment.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases
# than the ones that produced the saved outputs — consider pinning.
# NOTE(review): `evaluate` is installed here but is not imported by the cells visible
# in this notebook — confirm it is still needed.
%pip install transformers datasets evaluate scikit-learn accelerate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [None]:
# ==== 0) Installation ====
# Dependencies are installed by the `%pip install` cell at the top of the notebook:
# pip install transformers datasets evaluate scikit-learn accelerate

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    set_seed
)

# ==== 1) Configuration ====
# -- Input data --
RUTA_EXCEL = "Datos_proyecto (1).xlsx"  # Excel workbook with the labelled texts
HOJA = "Sheet1"                         # worksheet to read from that workbook

# -- Model and tokenization --
MODEL_NAME = "dccuchile/bert-base-spanish-wwm-cased"  # BETO (cased, WWM)
MAX_LENGTH = 256  # token cap used when truncating texts

# -- Training hyperparameters --
BATCH_SIZE, EPOCHS, LR = 16, 3, 2e-5

# -- Reproducibility --
SEED = 42
set_seed(SEED)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# ==== 2) Load data ====
# Keep only the text/label columns, drop rows missing either one, and force the
# text column to str.
df = (
    pd.read_excel(RUTA_EXCEL, sheet_name=HOJA)[["textos", "labels"]]
    .dropna()
    .assign(textos=lambda frame: frame["textos"].astype(str))
)

# ==== 3) Map labels ====
# Encode the raw labels as consecutive integer ids and keep both lookup
# directions (id -> name, name -> id) for the model config.
le = LabelEncoder()
df = df.assign(label_id=le.fit_transform(df["labels"]))
num_labels = len(le.classes_)
id2label = dict(enumerate(map(str, le.classes_)))
label2id = {name: idx for idx, name in id2label.items()}

# ==== 4) Stratified split ====
def _stratified_cut(frame, holdout_fraction):
    """Split ``frame`` in two, stratifying on ``label_id`` with the global seed."""
    return train_test_split(
        frame,
        test_size=holdout_fraction,
        random_state=SEED,
        stratify=frame["label_id"],
    )

# Hold out 20% of the rows, then halve that hold-out into validation and test.
train_df, temp_df = _stratified_cut(df, 0.2)
valid_df, test_df = _stratified_cut(temp_df, 0.5)

print("Tamaños:", *(len(part) for part in (train_df, valid_df, test_df)))

# ==== 5) BETO tokenizer ====
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    """Tokenize the ``textos`` entries of ``batch`` with the module-level tokenizer.

    Sequences are truncated to ``MAX_LENGTH`` tokens; no padding is added here.
    """
    encode_options = {
        "truncation": True,
        "padding": False,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(batch["textos"], **encode_options)

# ==== 6) Convert to HF datasets and tokenize ====
def _build_split(frame):
    """Turn one pandas split into a tokenized HF Dataset with a ``labels`` column."""
    hf_split = Dataset.from_pandas(frame[["textos", "label_id"]], preserve_index=False)
    hf_split = hf_split.rename_column("label_id", "labels")
    return hf_split.map(tokenize_fn, batched=True)

ds_train = _build_split(train_df)
ds_valid = _build_split(valid_df)
ds_test = _build_split(test_df)

# Drop the columns the model does not need
def keep_model_cols(ds):
    """Return ``ds`` without any column outside the model's expected inputs.

    ``input_ids``, ``attention_mask`` and ``labels`` are always kept;
    ``token_type_ids`` is kept only when the dataset actually has it.
    """
    wanted = {"input_ids", "attention_mask", "labels"}
    if "token_type_ids" in ds.column_names:
        wanted.add("token_type_ids")
    extras = [name for name in ds.column_names if name not in wanted]
    return ds.remove_columns(extras)

# Apply the column filter to every split in one pass.
ds_train, ds_valid, ds_test = (
    keep_model_cols(split) for split in (ds_train, ds_valid, ds_test)
)

# ==== 7) Model ====
# Load the pretrained checkpoint for sequence classification, configured with
# the label count and both label lookup tables, then move it to the device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

# ==== 8) Simple metrics (accuracy + weighted F1) ====
def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of the logits.
    Returns a dict with accuracy plus weighted precision, recall and F1
    (classes without predictions count as 0 instead of raising a warning).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    precision, recall, f1_score, _support = precision_recall_fscore_support(
        gold, predicted, average="weighted", zero_division=0
    )
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1_weighted": f1_score,
        "precision_w": precision,
        "recall_w": recall,
    }

# ==== 9) Data collator (dynamic padding) ====
# Examples were tokenized without padding, so the collator pads each batch as
# it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ==== 10) Training (simple) ====
# Hyperparameters are taken from the configuration constants so that editing
# EPOCHS / BATCH_SIZE / LR / SEED at the top actually changes the run.
args = TrainingArguments(
    output_dir="./beto_out",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    weight_decay=0.01,
    logging_steps=50,
    seed=SEED,
    # Optional extras: load_best_model_at_end, metric_for_best_model,
    # greater_is_better, report_to
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    # `tokenizer=` is deprecated in Trainer; `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# ==== 11) Evaluation ====
# No per-epoch evaluation is configured above, so evaluate explicitly:
print("Valid:", trainer.evaluate(ds_valid))
print("Test :", trainer.evaluate(ds_test))

# ==== 12) Simple inference ====
def predecir(textos):
    """Classify one text or a collection of texts with the fine-tuned model.

    Args:
        textos: A single string, or an iterable of strings.

    Returns:
        A list with one ``(text, label, confidence)`` tuple per input, where
        ``label`` comes from ``id2label`` and ``confidence`` is the largest
        softmax probability rounded to 3 decimals. Empty input returns ``[]``.
    """
    # Normalize to a list so a lone string and any iterable are handled alike.
    textos = [textos] if isinstance(textos, str) else list(textos)
    if not textos:
        # Nothing to classify: skip the tokenizer/model call entirely.
        return []

    # Inference must not depend on whether the model was last left in
    # training mode (dropout active), so switch to eval mode explicitly.
    model.eval()
    enc = tokenizer(
        textos,
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        logits = model(**enc).logits
        preds = logits.argmax(dim=-1).cpu().numpy()
        probs = torch.softmax(logits, dim=-1).cpu().numpy()
    etiquetas = [id2label[int(p)] for p in preds]
    return list(zip(textos, etiquetas, probs.max(axis=1).round(3)))

# Quick check of the inference helper on two hand-written sentences.
ejemplos = [
    "La educación inicial es clave para el desarrollo.",
    "Los pacientes pueden elegir a cualquier médico del sistema.",
]
predicciones = predecir(ejemplos)
print(predicciones)

# ==== 13) Save model and tokenizer ====
# Both artifacts go to the same folder.
SAVE_DIR = "./beto_clasificador"
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(SAVE_DIR)
print("Guardado en:", SAVE_DIR)


Device: cuda
Tamaños: 1939 242 243


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

Map:   0%|          | 0/1939 [00:00<?, ? examples/s]

Map:   0%|          | 0/242 [00:00<?, ? examples/s]

Map:   0%|          | 0/243 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjuandiego-nn04[0m ([33mjuandiego-nn04-universidad-de-los-andes[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.3563
100,0.0597
150,0.0215
200,0.0405
250,0.0169
300,0.0192
350,0.0217


Valid: {'eval_loss': 0.09744682908058167, 'eval_accuracy': 0.9834710743801653, 'eval_f1_weighted': 0.9834710743801653, 'eval_precision_w': 0.9834710743801653, 'eval_recall_w': 0.9834710743801653, 'eval_runtime': 3.2129, 'eval_samples_per_second': 75.321, 'eval_steps_per_second': 4.98, 'epoch': 3.0}
Test : {'eval_loss': 0.041164807975292206, 'eval_accuracy': 0.9876543209876543, 'eval_f1_weighted': 0.9876887732019797, 'eval_precision_w': 0.9879708768597658, 'eval_recall_w': 0.9876543209876543, 'eval_runtime': 3.3224, 'eval_samples_per_second': 73.14, 'eval_steps_per_second': 4.816, 'epoch': 3.0}
[('La educación inicial es clave para el desarrollo.', '4', np.float32(0.998)), ('Los pacientes pueden elegir a cualquier médico del sistema.', '3', np.float32(0.999))]
Guardado en: ./beto_clasificador


In [None]:
from google.colab import files
import shutil

# Zip each output folder and hand the archive to the browser for download.
# The folders are addressed relative to the working directory, the same way
# the training cell wrote them ("./beto_clasificador" and "./beto_out"),
# instead of assuming the notebook runs from /content.
for carpeta in ("beto_clasificador", "beto_out"):
    # Creates "<carpeta>.zip" in the working directory with the folder's contents.
    shutil.make_archive(carpeta, "zip", root_dir=carpeta)
    files.download(f"{carpeta}.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>