In [18]:
from  transformers import BertTokenizer, BertForTokenClassification
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
from tqdm import tqdm
import torch
import os

In [19]:
etiquetas = {
    'O' : 0,
    "FECHA" : 1,
    "DIRECTOR" : 2,
    "ROL_ENTITY": 3,
    "EDICION" : 4,
    "TIPO_DOCUMENTO" : 5,
    "ROMAN_NUM" : 6,
    "NUM_ISSN" : 7,
    "ENTITY": 8,
    "PRESENTATION": 9
}

# load model and tokenizer

tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased', num_labels=len(etiquetas))
model = BertForTokenClassification.from_pretrained('dccuchile/bert-base-spanish-wwm-cased', num_labels=len(etiquetas))

# Move the model to the GPU 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

## TOKENIZAR AND CODIFY THE DATA

In [20]:
def tokenizar_codify(phrase, tokenizer, tags):
    tokens_codified = tokenizer(
        [token for token, tag in phrase],
        is_split_into_words=True,  # True if the input is already tokenized
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    tags_codified = torch.tensor([tags[tag] for token, tag in phrase])
    return tokens_codified, tags_codified

In [21]:
data_train = [
    [("Bogotá", "O"), ("D.C.", "O"), (",", "O"), ("jueves", "O"), (",", "O"), ("13", "FECHA"), ("de", "O"), ("diciembre", "FECHA"), ("de", "O"), ("2019", "FECHA")],
    [("GREGORIO", "DIRECTOR"), ("ELJACH", "DIRECTOR"), ("PACHECO", "DIRECTOR"), ("Y", "O"), ("MARIA", "DIRECTOR"), ("GOMEZ", "DIRECTOR"), (".", "O")],
    [("DIRECORES", "DIRECTOR"), ("SECRETARIO", "ROL_ENTITY"), ("GENERAL", "ROL_ENTITY"), ("DE", "O"), ("LA", "O"), ("CÁMARA", "ROL_ENTITY")],
    [("GREGORIO", "DIRECTOR"), ("ELJACH", "DIRECTOR"), ("PACHECO", "DIRECTOR"), ("Y", "O"), ("MARIA", "DIRECTOR"), ("GOMEZ", "DIRECTOR"), (".", "O")],
    [("DIRECORES", "DIRECTOR"), ("SECRETARIO", "ROL_ENTITY"), ("GENERAL", "ROL_ENTITY"), ("DEL", "O"), ("SENADO", "ROL_ENTITY")]
]

In [22]:
tokens_codified, tags_codified = tokenizar_codify(data_train[0], tokenizer, etiquetas)
print(tokens_codified)
print(tags_codified)

{'input_ids': tensor([[    4, 17521,  1090,  1009,  1059,  1009,  1017, 11573,  1017,  2083,
          1008,  2451,  1008, 17736,     5]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([0, 0, 0, 0, 0, 1, 0, 1, 0, 1])


Este resultado muestra la arquitectura del modelo BERT que se ha cargado, incluyendo las capas de embeddings, atención, y clasificación. La advertencia inicial indica que algunas de las capas del clasificador (classifier.bias y classifier.weight) no se han inicializado desde el punto de control del modelo y se han inicializado de nuevo. Esto es normal cuando se carga un modelo preentrenado y se ajusta para una tarea específica.



In [23]:
name_model = "model_bert_ner.pt_gacetas"
ruta_modelo = os.path.join("modelos", name_model)

# --- Existing the pro
os.makedirs("modelos", exist_ok=True)


## Train model 


In [24]:
def tokenizar_codify(phrase, tokenizer, tags):
    tokens = [token for token, tag in phrase]
    tags_list = [tags[tag] for token, tag in phrase]
    
    tokens_codified = tokenizer(
        tokens,
        is_split_into_words=True,  # True if the input is already tokenized
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    
    # Align the tags with the tokenized input
    word_ids = tokens_codified.word_ids()
    tags_codified = []
    for word_id in word_ids:
        if word_id is None:
            tags_codified.append(-100)  # Special token
        else:
            tags_codified.append(tags_list[word_id])
    
    tags_codified = torch.tensor(tags_codified)
    return tokens_codified, tags_codified


In [25]:

# Example training data and tags dictionary
data_train = [
    [("Hello", "O"), ("world", "O")],
    [("My", "O"), ("name", "O"), ("is", "O"), ("BERT", "B-PER")]
]
etiquetas = {
    "O": 0,
    "B-PER": 1,
    "I-PER": 2,
    "B-LOC": 3,
    "I-LOC": 4,
    "B-ORG": 5,
    "I-ORG": 6,
    "B-MISC": 7,
    "I-MISC": 8
}
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [26]:

# Tokenize and codify the data
tokens_codified, tags_codified = tokenizar_codify(data_train[0], tokenizer, etiquetas)

# Debug prints to check lengths
print(f"Length of input_ids: {len(tokens_codified['input_ids'][0])}")
print(f"Length of attention_mask: {len(tokens_codified['attention_mask'][0])}")
print(f"Length of tags_codified: {len(tags_codified)}")

ValueError: word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast` class).

In [None]:

# Ensure that the tensors have the same length
assert len(tokens_codified["input_ids"][0]) == len(tokens_codified["attention_mask"][0]) == len(tags_codified), "Size mismatch between tensors"

# Create a DataLoader
dataset = TensorDataset(tokens_codified["input_ids"], tokens_codified["attention_mask"], tags_codified)
data_loader = DataLoader(dataset, batch_size=8) 

# Initialize the model and optimizer
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(etiquetas))
optimizer = AdamW(model.parameters(), lr=5e-5)


In [None]:

# Training loop
epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    for batch in tqdm(data_loader, desc=f"Epoch {epoch + 1}"):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()


In [None]:

# Save the model and tokenizer
ruta_modelo = "path/to/save/model"
model.save_pretrained(ruta_modelo)
tokenizer.save_pretrained(ruta_modelo)

print(f"Modelo guardado en: {ruta_modelo}")