# Validación Modelo jamesopeth/bert-base-uncased-finetuned-ner-lung-cancer

* Verificamos que están disponibles todas las dependencias

In [1]:
!pip install datasets transformers
!pip install seqeval
!pip install -U datasets evaluate
!pip install -U huggingface_hub

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=6ec499335f0fb200e10c0e7dcb1d7032a165cb140c0659207b66c7f836f59605
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-p

* Hacemos login en la plataforma de Hugging Face

In [None]:
from huggingface_hub import login

# token cuenta personal, maestria_laptop_james_lectura
maestria_laptop_james_lectura = '___TOKEN___PRUEBA____'
login(maestria_laptop_james_lectura)

* Cargamos la configuración, el tokenizer y el modelo preentrenado previamente

In [3]:
from transformers import AutoConfig
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

hugging_face_NER_model="jamesopeth/xlm-roberta-large-finetuned-ner-lung-cancer"
config = AutoConfig.from_pretrained(hugging_face_NER_model)
id2label = config.id2label
label2id = config.label2id
num_labels = config.num_labels

model = AutoModelForTokenClassification.from_pretrained(hugging_face_NER_model,
        num_labels = num_labels,
        id2label = id2label,
        label2id = {v: k for k, v in id2label.items()}
)

tokenizer = AutoTokenizer.from_pretrained(hugging_face_NER_model, use_fast = True)


# Usar GPU si está disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



all_results = []
batch_size = 8

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

* Algunos ejemplos para validar el modelo

In [4]:
texts = [
    "Paciente con carcinoma escamoso de pulmón estadio IIIB.",
    "Se inicia tratamiento con quimioterapia basada en Carboplatino y Paclitaxel.",
    "Fumadora crónica con diagnóstico reciente de adenocarcinoma pulmonar.",
    "TC muestra masa pulmonar de aspecto neoplásico en lóbulo superior derecho.",
    "Histología reporta carcinoma de células no pequeñas.",
    "Se planifica cirugía torácica para resección del tumor primario.",
    "Estadio clínico T2N1M0 confirmado por TAC y PET.",
    "Paciente exfumador en vigilancia por nódulo pulmonar sospechoso.",
    "Indicada radioterapia como tratamiento adyuvante tras lobectomía.",
    "Biopsia muestra tumor neuroendocrino de célula grande en pulmón izquierdo."
]

# Tokenización
encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        return_offsets_mapping=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        max_length=512,
        is_split_into_words=False
        )


* Realizamos las predicciones usando el modelo preentrenado cargado

In [5]:
import torch
import torch.nn.functional as F

input_ids = torch.tensor(encodings["input_ids"]).to(device)

attention_mask = torch.tensor(encodings["attention_mask"]).to(device)


with torch.no_grad():
 outputs = model(input_ids=input_ids, attention_mask=attention_mask)

logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)
probs = F.softmax(logits, dim=-1)

In [8]:
### Para cada oracion en la lista de oraciones.
for i, text in enumerate(texts):
  word_ids = encodings.word_ids(batch_index=i)
  tokens = tokenizer.convert_ids_to_tokens(encodings["input_ids"][i])
  print("\n \n=================================================================================================\n")
  print (word_ids)
  print (tokens)


  previous_word_id = None
  aligned_words, aligned_labels, aligned_scores = [], [], []

  for token, label_id, word_id in zip(tokens, predictions[i].tolist(), word_ids):
  #print (token, " ", label_id, " ", word_id)

    if word_id is None:
      continue


    if word_id != previous_word_id:
        aligned_words.append(token.replace("▁", ""))  # WordPiece tokens
        aligned_labels.append(id2label[label_id])
        aligned_scores.append(probs[i][word_id][label_id].item())
    else:
        aligned_words[-1] += token.replace("▁", "")
    previous_word_id = word_id



  filtered_results = [
            (word, label, score)
            for word, label, score in zip(aligned_words, aligned_labels, aligned_scores)
            if label != "O"
  ]

  ###Resultados
  print("\n ")
  print("Palabras: ", aligned_words)
  print("Labels: ", aligned_labels)
  print("Score: ", aligned_scores)
  print("\n ")




 

[None, 0, 0, 1, 2, 2, 2, 3, 3, 3, 4, 5, 5, 6, 6, 7, 7, 7, None, None, None, None]
['<s>', '▁Pacient', 'e', '▁con', '▁car', 'cino', 'ma', '▁esca', 'mos', 'o', '▁de', '▁pul', 'món', '▁esta', 'dio', '▁III', 'B', '.', '</s>', '<pad>', '<pad>', '<pad>']

 
Palabras:  ['Paciente', 'con', 'carcinoma', 'escamoso', 'de', 'pulmón', 'estadio', 'IIIB.']
Labels:  ['O', 'O', 'B_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'B_STAGE', 'I_STAGE']
Score:  [0.9694349765777588, 0.9997991919517517, 6.370376013364876e-06, 1.0401334293419495e-05, 1.0698526239139028e-05, 9.873430099105462e-05, 5.4289025683829095e-06, 3.543192633514991e-06]

 

 

[None, 0, 1, 2, 3, 4, 4, 4, 5, 6, 7, 7, 7, 7, 8, 9, 9, 9, 9, None, None, None]
['<s>', '▁Se', '▁inicia', '▁tratamiento', '▁con', '▁qui', 'mi', 'oterapia', '▁basada', '▁en', '▁Car', 'bo', 'plati', 'no', '▁y', '▁Pac', 'lita', 'xel', '.', '</s>', '<pad>', '<pad>']

 
Palabras:  ['Se', 'inicia', 'tratamiento', 'con', 'quimioterapia', '

In [10]:
for i, text in enumerate(texts):
    word_ids = encodings.word_ids(batch_index=i)
    tokens = tokenizer.convert_ids_to_tokens(encodings["input_ids"][i])
    print("\n\n=================================================================================================\n")
    print(word_ids)
    print(tokens)

    previous_word_id = None
    aligned_words, aligned_labels, aligned_scores = [], [], []

    for token, label_id, word_id in zip(tokens, predictions[i].tolist(), word_ids):
        if word_id is None:
            continue

        # Unir subtokens con WordPiece (##) limpiamente
        if word_id != previous_word_id:
            aligned_words.append(token.replace("▁", ""))
            aligned_labels.append(id2label[label_id])
            aligned_scores.append(probs[i][word_id][label_id].item())
        else:
            aligned_words[-1] += token.replace("▁", "")
        previous_word_id = word_id

    # Filtrar etiquetas diferentes a 'O'
    filtered_results = [
        (word, label, score)
        for word, label, score in zip(aligned_words, aligned_labels, aligned_scores)
        if label != "O"
    ]

    # Mostrar alineación cruda
    print("\nPalabras: ", aligned_words)
    print("Labels: ", aligned_labels)
    print("Score: ", aligned_scores)

    ### Combinar etiquetas B- y I- en una sola entidad
    print("\n**** Se unen las etiquetas B, I en una sola entidad ****\n")
    combined_results = []
    temp_entity, temp_label, temp_score = "", "", 0

    for word, label, score in filtered_results:
        if label.startswith("B_"):
            if temp_entity:
                combined_results.append((temp_entity, temp_label, temp_score))
            temp_entity, temp_label, temp_score = word, label, score
        elif label.startswith("I_") and label[2:] == temp_label[2:]:
            temp_entity += " " + word
            temp_score += score
        else:
            if temp_entity:
                combined_results.append((temp_entity, temp_label, temp_score))
            temp_entity, temp_label, temp_score = word, label, score

    if temp_entity:
        combined_results.append((temp_entity, temp_label, temp_score))

    for entity, label, score in combined_results:
        result = {
            "Palabra": entity,
            "Entidad": label[2:],  # remove B_ or I_
            "Score": round(score, 4)
        }
        print(result)
        all_results.append(result)

    print("\n")





[None, 0, 0, 1, 2, 2, 2, 3, 3, 3, 4, 5, 5, 6, 6, 7, 7, 7, None, None, None, None]
['<s>', '▁Pacient', 'e', '▁con', '▁car', 'cino', 'ma', '▁esca', 'mos', 'o', '▁de', '▁pul', 'món', '▁esta', 'dio', '▁III', 'B', '.', '</s>', '<pad>', '<pad>', '<pad>']

Palabras:  ['Paciente', 'con', 'carcinoma', 'escamoso', 'de', 'pulmón', 'estadio', 'IIIB.']
Labels:  ['O', 'O', 'B_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'B_STAGE', 'I_STAGE']
Score:  [0.9694349765777588, 0.9997991919517517, 6.370376013364876e-06, 1.0401334293419495e-05, 1.0698526239139028e-05, 9.873430099105462e-05, 5.4289025683829095e-06, 3.543192633514991e-06]

**** Se unen las etiquetas B, I en una sola entidad ****

{'Palabra': 'carcinoma escamoso de pulmón', 'Entidad': 'CANCER_CONCEPT', 'Score': 0.0001}
{'Palabra': 'estadio IIIB.', 'Entidad': 'STAGE', 'Score': 0.0}





[None, 0, 1, 2, 3, 4, 4, 4, 5, 6, 7, 7, 7, 7, 8, 9, 9, 9, 9, None, None, None]
['<s>', '▁Se', '▁inicia', '▁tratamiento', '▁con