In [1]:
!pip install torch transformers datasets accelerate seqeval



In [None]:
from datasets import load_dataset

# Announce the download, then fetch the tner/bc5cdr dataset from the Hugging Face Hub.
print("Cargando dataset tner/bc5cdr")
# NOTE(review): trust_remote_code=True permits the dataset repository's loading
# script to execute on this machine — confirm the repository is trusted.
dataset = load_dataset(
    path="tner/bc5cdr",
    trust_remote_code=True,
)
# Show the loaded splits.
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


Cargando dataset bigbio/bc5cdr en formato CoNLL
DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5228
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5330
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5865
    })
})


In [None]:
from transformers import AutoTokenizer
# Load the tokenizer published with the dmis-lab/biobert-v1.1 checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")


def tokenize_and_align_labels(batch, tokenizer):
    """Tokenize pre-split words and project word-level tags onto sub-tokens.

    Args:
        batch: Mapping with "tokens" (one list of words per example) and
            "tags" (one list of word-level label ids per example).
        tokenizer: Callable invoked as
            ``tokenizer(words, truncation=True, is_split_into_words=True)``;
            its result must expose ``word_ids(batch_index=i)`` and accept
            item assignment.

    Returns:
        The tokenizer output with an added "labels" entry. For each example,
        the first sub-token of every word carries that word's tag; special
        tokens (word id ``None``) and continuation sub-tokens carry -100.
    """
    ignore_index = -100
    encoding = tokenizer(batch["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for row, word_tags in enumerate(batch["tags"]):
        word_ids = encoding.word_ids(batch_index=row)
        # Pair every position with the word id of the position before it
        # (None for the very first position) to detect word boundaries.
        preceding = [None, *word_ids[:-1]]
        aligned_labels.append(
            [
                word_tags[current]
                if current is not None and current != before
                else ignore_index
                for current, before in zip(word_ids, preceding)
            ]
        )

    encoding["labels"] = aligned_labels
    return encoding

print("Tokenizando y alineando etiquetas...")

# Run the tokenize-and-align function over every split in batches. The tokenizer
# is forwarded through fn_kwargs, and the original columns are removed so only
# the fields returned by the function remain.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=dataset["train"].column_names,
)

print(tokenized_dataset)

Tokenizando y alineando etiquetas...
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5228
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5330
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5865
    })
})


In [None]:

# Tag name -> integer id mapping, copied from the dataset documentation.
label_map = {"O": 0, "B-Chemical": 1, "B-Disease": 2, "I-Disease": 3, "I-Chemical": 4}

# Tag names ordered by ascending id, so label_list[i] is the name of id i.
label_list = sorted(label_map, key=label_map.get)

num_labels = len(label_list)

print("Lista de etiquetas (label_list):", label_list)
print("Número de etiquetas (num_labels):", num_labels)

Lista de etiquetas (label_list): ['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical']
Número de etiquetas (num_labels): 5


In [None]:
from transformers import AutoModelForTokenClassification

# Build the two label lookups passed to the model config; both follow the
# order of label_list.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

# Load the dmis-lab/biobert-v1.1 checkpoint as a token-classification model
# with one output class per tag.
model = AutoModelForTokenClassification.from_pretrained(
    "dmis-lab/biobert-v1.1",
    id2label=id2label,
    label2id=label2id,
    num_labels=num_labels,
)

# Print the resulting configuration to check the label mappings were applied.
print(model.config)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dtype": "float32",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-Chemical",
    "2": "B-Disease",
    "3": "I-Disease",
    "4": "I-Chemical"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-Chemical": 1,
    "B-Disease": 2,
    "I-Chemical": 4,
    "I-Disease": 3,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}



In [6]:
from transformers import TrainingArguments

# Fine-tuning configuration handed to the Trainer.
training_args = TrainingArguments(
    # Output and logging locations / frequency.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate once per epoch.
    eval_strategy="epoch",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [7]:
from transformers import Trainer, DataCollatorForTokenClassification 

# Collator used by the Trainer to assemble the tokenized examples into batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# NOTE(review): no compute_metrics callback is passed here, so the per-epoch
# evaluation reports only the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Last expression of the cell: runs fine-tuning and displays the returned TrainOutput.
trainer.train()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Epoch,Training Loss,Validation Loss
1,0.0965,0.081767
2,0.0614,0.066173
3,0.0286,0.07662


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


TrainOutput(global_step=981, training_loss=0.20823816482687824, metrics={'train_runtime': 6292.9743, 'train_samples_per_second': 2.492, 'train_steps_per_second': 0.156, 'total_flos': 658536722383800.0, 'train_loss': 0.20823816482687824, 'epoch': 3.0})

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score

# Metric function returning entity-level precision, recall and F1.
def compute_metrics(pred):
    """Compute precision, recall and F1 with seqeval on the labelled positions.

    Positions whose gold label id is -100 (the ignore value assigned to special
    tokens and continuation sub-tokens during label alignment) are removed from
    both the references and the predictions before scoring. They have no entry
    in ``id2label``, so mapping them directly would raise ``KeyError``.

    To have these metrics reported during evaluation, pass this function as
    ``compute_metrics=`` when constructing the ``Trainer``.

    Args:
        pred: Object exposing ``label_ids`` (gold label ids per position) and
            ``predictions`` (per-position class scores; the argmax over the
            last axis is taken as the predicted label id).

    Returns:
        dict with the keys "precision", "recall" and "f1".
    """
    ignore_index = -100
    gold_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)

    true_labels = []
    true_preds = []
    for gold_row, predicted_row in zip(gold_ids, predicted_ids):
        # Keep only positions that carry a real gold label; the mask is driven
        # by the gold ids so references and predictions stay aligned.
        kept = [
            (int(gold), int(guess))
            for gold, guess in zip(gold_row, predicted_row)
            if gold != ignore_index
        ]
        true_labels.append([id2label[gold] for gold, _ in kept])
        true_preds.append([id2label[guess] for _, guess in kept])

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds),
    }

In [None]:
# Save the trained model first and then the tokenizer into the same directory.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("./ner_model")

print("Model and tokenizer saved to ./ner_model")

Model and tokenizer saved to ./ner_model
