# Proceso de Fine-Tuning para Extracción de Conceptos Médicos relacionados con el Cáncer de Pulmón

### Primero se define la ruta a los archivos del corpus anotado de cáncer de pulmón

In [None]:
# Mount Google Drive so the corpus files stored there are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Folder inside the mounted Drive that holds the annotated lung-cancer corpus CSVs.
# The trailing slash matters: file names are appended to it by plain string concatenation.
ruta_corpus = '/content/drive/MyDrive/Tareas_Analitica_Datos_Salud/Tarea_1/lung_cancer_corpus/'

Mounted at /content/drive


In [None]:
# List the entries found under ruta_corpus to confirm the corpus files are reachable.

import os

# Print one entry name per line (os.listdir does not guarantee any particular order).
for filename in os.listdir(ruta_corpus):
  print(filename)

sentences_dev.csv
sentences_test.csv
sentences_train.csv


### Luego nos aseguramos de tener instaladas las librerías necesarias para hacer el entrenamiento fino del modelo

In [None]:
!pip install datasets transformers
!pip install seqeval
!pip install -U datasets evaluate
!pip install -U huggingface_hub

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=e4dada25064e71b7c8e9d4397b2dfd10d36567f59f8861d806645cca06627cad
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-p

### Cargamos el corpus y alistamos los datos de entrenamiento, testeo y validación

In [None]:
from datasets import DatasetDict, Dataset, Features, Sequence, Value, ClassLabel
from collections import defaultdict
import pandas as pd
from pathlib import Path

In [None]:
def leer_archivo_corpus(ruta_archivo):
    """Read a token-per-row corpus CSV and group its tokens and tags by sentence.

    The CSV must provide the columns ``Sentence #``, ``Word`` and ``Tag``.
    A blank ``Sentence #`` cell means "same sentence as the row above".

    Only truly empty cells are treated as missing. Tokens whose text happens to
    match one of pandas' default NA markers (``NA``, ``null``, ``N/A``, ``nan``...)
    are kept verbatim instead of being collapsed into the string ``'nan'``.

    Args:
        ruta_archivo: Path (or any source accepted by ``pandas.read_csv``) of the CSV.

    Returns:
        A ``defaultdict(list)`` with two parallel lists: ``"tokens"`` (one list of
        word strings per sentence) and ``"ner_tags"`` (one list of tag strings per
        sentence). Sentences follow the sorted order of their ``Sentence #`` value
        (``groupby`` default). Both keys are present even when the file has no rows.
    """
    # Pre-seed both keys so a header-only file still produces both columns.
    datos = defaultdict(list, tokens=[], ner_tags=[])

    df = pd.read_csv(
        ruta_archivo,
        # Read words and tags as raw text: no numeric inference ("007" stays "007").
        dtype={'Word': str, 'Tag': str},
        # Disable the default NA marker list and treat only empty cells as missing,
        # so clinical tokens such as "NA" or "null" survive unchanged.
        keep_default_na=False,
        na_values=[''],
    )

    # Empty Word/Tag cells are still NaN at this point; cast them to str as before.
    df['Word'] = df['Word'].astype(str)
    df['Tag'] = df['Tag'].astype(str)

    # Propagate the last seen sentence id down to the rows that left it blank.
    df['Sentence #'] = df['Sentence #'].ffill()

    # One group per sentence id; row order inside each group is the file order.
    for _, grupo in df.groupby('Sentence #'):
        datos["tokens"].append(grupo['Word'].tolist())
        datos["ner_tags"].append(grupo['Tag'].tolist())

    return datos


def cargar_datasets(rutas_archivos):
    """Build a ``DatasetDict`` holding one ``Dataset`` per corpus CSV file.

    Args:
        rutas_archivos: Mapping of split name -> path of the CSV for that split.
            Each file is parsed with ``leer_archivo_corpus``.

    Returns:
        A ``DatasetDict`` keyed by the same split names. For every split a short
        summary is printed: number of sentences plus the first five token and
        tag sequences.
    """
    splits = {}
    for nombre, ruta in rutas_archivos.items():
        contenido = leer_archivo_corpus(ruta)
        oraciones = contenido['tokens']
        etiquetas = contenido['ner_tags']

        # Quick visual check of what was parsed for this split.
        print(f"Dataset {nombre} cargado correctamente.")
        print(f"Tamaño del dataset {nombre}: {len(oraciones)}")
        print(f"Tokens del dataset {nombre}: {oraciones[:5]}")
        print(f"Etiquetas del dataset {nombre}: {etiquetas[:5]}")

        splits[nombre] = Dataset.from_dict(contenido)

    return DatasetDict(splits)

In [None]:
def detectar_etiquetas_unicas(rutas_archivos):
    """Collect every distinct value of the ``Tag`` column across the given CSV files.

    Tags are cast to ``str`` before being collected, the same way
    ``leer_archivo_corpus`` casts them. A row with a missing tag therefore shows
    up as the label ``'nan'`` (matching what the loader produces for that row)
    instead of a float NaN that makes ``sorted`` raise ``TypeError``. A ``'nan'``
    entry in the result is a hint that the corpus has rows without a tag.

    Args:
        rutas_archivos: Mapping of split name -> CSV path; only the values are used.

    Returns:
        A list of tag strings sorted alphabetically, with ``'O'`` always placed
        last (it is appended even if no file contains it).
    """
    todas_etiquetas = set()

    for ruta in rutas_archivos.values():
        df = pd.read_csv(ruta)
        # astype(str) keeps the set homogeneous (str only), so it can be sorted.
        todas_etiquetas.update(df['Tag'].astype(str).unique())

    # Alphabetical order for every entity tag, then the outside tag 'O' at the end.
    etiquetas_ordenadas = sorted(todas_etiquetas - {'O'}) + ['O']
    return etiquetas_ordenadas


In [None]:
# Split name -> CSV path. The keys become the split names of the DatasetDict;
# paths are built by appending the file name to ruta_corpus.
rutas_archivos = {
    "train": ruta_corpus + "sentences_train.csv",
    "test":  ruta_corpus + "sentences_test.csv",
    "valid": ruta_corpus + "sentences_dev.csv"
}


In [None]:
# Detect every tag present in the three splits; the position of each tag in this
# list becomes its integer class id once the datasets are cast to ClassLabel.
LABELS = detectar_etiquetas_unicas(rutas_archivos)
print("Etiquetas detectadas:", LABELS)


Etiquetas detectadas: ['B_CANCER_CONCEPT', 'B_CHEMOTHERAPY', 'B_DATE', 'B_DRUG', 'B_FAMILY', 'B_FREQ', 'B_IMPLICIT_DATE', 'B_INTERVAL', 'B_METRIC', 'B_OCURRENCE_EVENT', 'B_QUANTITY', 'B_RADIOTHERAPY', 'B_SMOKER_STATUS', 'B_STAGE', 'B_SURGERY', 'B_TNM', 'I_CANCER_CONCEPT', 'I_DATE', 'I_DRUG', 'I_FAMILY', 'I_FREQ', 'I_IMPLICIT_DATE', 'I_INTERVAL', 'I_METRIC', 'I_OCURRENCE_EVENT', 'I_SMOKER_STATUS', 'I_STAGE', 'I_SURGERY', 'I_TNM', 'O']


In [None]:
# Load the three splits as a DatasetDict (tags are still plain strings here).
dataset_dict = cargar_datasets(rutas_archivos)

# Target schema: tokens stay strings, tags become ClassLabel ids indexed by LABELS.
features = Features({
    "tokens": Sequence(Value("string")),
    "ner_tags": Sequence(ClassLabel(names=LABELS))
})

# Cast every split to the target schema (string tags -> integer class ids).
for split in dataset_dict:
    dataset_dict[split] = dataset_dict[split].cast(features)

# Show the resulting splits and their sizes.
print("\nDataset cargado correctamente:")
print(dataset_dict)

Dataset train cargado correctamente.
Tamaño del dataset train: 9788
Tokens del dataset train: [['Abuela', 'materna', 'con', 'cancer', 'de', 'mama', 'a', 'los', '70', 'años', '.'], ['Abuela', 'materna', 'con', 'cancer', 'de', 'mama', 'a', 'los', '70', 'años', '.'], ['-', 'Quiste', 'renal', 'izquierdo', 'complicado', '(', 'ecografia', 'noviembre', '2013', 'quistes', 'renales', 'bilaterales', ')', '.'], ['-Insuficiencia', 'renal', 'cronica', 'etiologia', 'multifactorial', '(', 'inhibidores', 'de', 'calcineurina', '/', 'diabetes', '/', 'HTA)-', 'MDRD', '56', 'ml', '/', 'min', 'dic', '2017'], ['.']]
Etiquetas del dataset train: [['B_FAMILY', 'I_FAMILY', 'O', 'B_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'O', 'O', 'B_QUANTITY', 'B_METRIC', 'O'], ['B_FAMILY', 'I_FAMILY', 'O', 'B_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'O', 'O', 'B_QUANTITY', 'B_METRIC', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_DATE', 'I_DATE', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', '

Casting the dataset:   0%|          | 0/9788 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2496 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2758 [00:00<?, ? examples/s]


Dataset cargado correctamente:
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 9788
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2496
    })
    valid: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2758
    })
})


In [None]:
# Show the first training example; after the cast, ner_tags holds integer class ids.
print("\nEjemplo del train:")
print(dataset_dict["train"][0])


Ejemplo del train:
{'tokens': ['Abuela', 'materna', 'con', 'cancer', 'de', 'mama', 'a', 'los', '70', 'años', '.'], 'ner_tags': [4, 19, 29, 0, 16, 16, 29, 29, 10, 8, 29]}


In [None]:
# Show the dataset schema (feature types and the ClassLabel names).
print("\nCaracterísticas del dataset:")
print(dataset_dict["train"].features)


Características del dataset:
{'tokens': List(Value('string')), 'ner_tags': List(ClassLabel(names=['B_CANCER_CONCEPT', 'B_CHEMOTHERAPY', 'B_DATE', 'B_DRUG', 'B_FAMILY', 'B_FREQ', 'B_IMPLICIT_DATE', 'B_INTERVAL', 'B_METRIC', 'B_OCURRENCE_EVENT', 'B_QUANTITY', 'B_RADIOTHERAPY', 'B_SMOKER_STATUS', 'B_STAGE', 'B_SURGERY', 'B_TNM', 'I_CANCER_CONCEPT', 'I_DATE', 'I_DRUG', 'I_FAMILY', 'I_FREQ', 'I_IMPLICIT_DATE', 'I_INTERVAL', 'I_METRIC', 'I_OCURRENCE_EVENT', 'I_SMOKER_STATUS', 'I_STAGE', 'I_SURGERY', 'I_TNM', 'O']))}


### Iniciamos el proceso de Fine-Tuning

In [None]:
# Sanity check: read the label names back from the dataset schema and print them.
# NOTE: `task` is assigned again, with the same value, in the configuration cell below.
task = 'ner'
x = dataset_dict["train"].features[f"{task}_tags"].feature.names
print(x)

['B_CANCER_CONCEPT', 'B_CHEMOTHERAPY', 'B_DATE', 'B_DRUG', 'B_FAMILY', 'B_FREQ', 'B_IMPLICIT_DATE', 'B_INTERVAL', 'B_METRIC', 'B_OCURRENCE_EVENT', 'B_QUANTITY', 'B_RADIOTHERAPY', 'B_SMOKER_STATUS', 'B_STAGE', 'B_SURGERY', 'B_TNM', 'I_CANCER_CONCEPT', 'I_DATE', 'I_DRUG', 'I_FAMILY', 'I_FREQ', 'I_IMPLICIT_DATE', 'I_INTERVAL', 'I_METRIC', 'I_OCURRENCE_EVENT', 'I_SMOKER_STATUS', 'I_STAGE', 'I_SURGERY', 'I_TNM', 'O']


In [None]:
import os
from getpass import getpass

from huggingface_hub import login


def _leer_token_hf(variable_entorno, descripcion):
    """Return a Hugging Face token from the environment, prompting (hidden input) if unset."""
    return os.environ.get(variable_entorno) or getpass(f"{descripcion}: ")


# Tokens are never written into the notebook: anything stored in a cell ends up in
# the saved file, its history and every shared copy.
# Set HF_READ_TOKEN / HF_WRITE_TOKEN in the environment, or type them when prompted.
maestria_laptop_james_lectura = _leer_token_hf("HF_READ_TOKEN", "Hugging Face read token")
# Write-scoped token, consumed later by TrainingArguments(hub_token=...).
colab_push_token = _leer_token_hf("HF_WRITE_TOKEN", "Hugging Face write token")
login(maestria_laptop_james_lectura)

In [None]:
task = "ner" # selects the "<task>_tags" label column; this corpus only provides "ner_tags"
# NOTE(review): the corpus tokens printed above are Spanish, while this checkpoint name
# suggests the uncased English BERT — confirm this is the intended base model.
model_checkpoint = "bert-base-uncased"
batch_size = 8  # used for both the per-device train and eval batch sizes

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the chosen checkpoint; tokenize_and_align_labels relies on
# the word_ids() mapping of the encodings it returns.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: Batch with ``"tokens"`` (list of word lists) and ``"ner_tags"``
            (list of per-word label id lists, parallel to ``"tokens"``).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry.
        For each sentence, only the first sub-token of every word carries that
        word's label id; positions with no source word and the remaining
        sub-tokens of a word are set to ``-100``.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
    )

    aligned = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=batch_idx):
            # A label is emitted only where a new source word begins.
            starts_new_word = current_word is not None and current_word != last_word
            row.append(word_labels[current_word] if starts_new_word else -100)
            last_word = current_word
        aligned.append(row)

    encoded["labels"] = aligned
    return encoded

# Encode every split in batches; each example gains the tokenizer outputs and
# the aligned `labels` column produced by tokenize_and_align_labels.
tokenized_datasets = dataset_dict.map(
    tokenize_and_align_labels,
    batched=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/9788 [00:00<?, ? examples/s]

Map:   0%|          | 0/2496 [00:00<?, ? examples/s]

Map:   0%|          | 0/2758 [00:00<?, ? examples/s]

In [None]:
# Label names in class-id order; compute_metrics uses this list to turn
# predicted/reference ids back into tag strings.
label_list = dataset_dict["train"].features[f"{task}_tags"].feature.names
label_list

['B_CANCER_CONCEPT',
 'B_CHEMOTHERAPY',
 'B_DATE',
 'B_DRUG',
 'B_FAMILY',
 'B_FREQ',
 'B_IMPLICIT_DATE',
 'B_INTERVAL',
 'B_METRIC',
 'B_OCURRENCE_EVENT',
 'B_QUANTITY',
 'B_RADIOTHERAPY',
 'B_SMOKER_STATUS',
 'B_STAGE',
 'B_SURGERY',
 'B_TNM',
 'I_CANCER_CONCEPT',
 'I_DATE',
 'I_DRUG',
 'I_FAMILY',
 'I_FREQ',
 'I_IMPLICIT_DATE',
 'I_INTERVAL',
 'I_METRIC',
 'I_OCURRENCE_EVENT',
 'I_SMOKER_STATUS',
 'I_STAGE',
 'I_SURGERY',
 'I_TNM',
 'O']

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Store the id <-> tag mapping in the model config so the saved/pushed model
# reports the real tag names (e.g. "B_DRUG") rather than generic "LABEL_<n>" names.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

# Pretrained encoder plus a freshly initialised token-classification head with
# one output per tag.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer to assemble batches; built from the same
# tokenizer that encoded the datasets.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Keep only the model name when the checkpoint id has an "org/name" form.
model_bert_base = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    # Output directory; it is also the name of the Hub repository that gets pushed.
    f"{model_bert_base}-finetuned-{task}-lung-cancer",
    eval_strategy="epoch",
    # Checkpoint on the same schedule as evaluation so the best epoch can be restored.
    save_strategy="epoch",
    # The recorded run peaks on validation F1 at epoch 3 and validation loss rises
    # afterwards; reload the best checkpoint instead of keeping the last epoch.
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # "f1" key returned by compute_metrics
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=True,
    hub_token=colab_push_token,  # write-scoped Hub token
)

### Evaluación del entrenamiento

In [None]:
from evaluate import load
# seqeval metric; its result exposes per-entity-type scores plus the overall_* aggregates
# that compute_metrics reads.
metric = load("seqeval")

In [None]:
import numpy as np

def compute_metrics(p):
    """Turn Trainer predictions into overall seqeval scores.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token class
            scores (arg-maxed over axis 2); ``labels`` holds reference class ids,
            with ``-100`` marking positions to ignore.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        taken from the ``overall_*`` entries of the seqeval result.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions that carry a real label (drop the -100 placeholders).
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
# Wire model, training arguments, data, collator and metrics together.
# Training uses the train split; per-epoch evaluation uses the validation split.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated in Trainer; `processing_class` is its replacement
    # and still saves/pushes the tokenizer together with the model.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
# Run the fine-tuning loop configured in `args` (evaluates on the validation split every epoch).
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjames-payan[0m ([33mjames-payan-universidad-del-valle[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1551,0.089487,0.91988,0.950928,0.935146,0.976232
2,0.094,0.085314,0.926821,0.964093,0.94509,0.979487
3,0.0667,0.074328,0.940063,0.963016,0.951401,0.982512
4,0.0468,0.082466,0.939596,0.962537,0.950928,0.982099
5,0.0403,0.083546,0.935188,0.953321,0.944168,0.980128
6,0.0336,0.089633,0.936658,0.95751,0.94697,0.980495
7,0.028,0.089441,0.933396,0.954399,0.94378,0.980335
8,0.0235,0.098647,0.936616,0.958588,0.947474,0.980839
9,0.0195,0.099421,0.934476,0.952484,0.943394,0.980037
10,0.0181,0.101147,0.936568,0.954279,0.94534,0.980656




TrainOutput(global_step=12240, training_loss=0.06400082352893804, metrics={'train_runtime': 2384.6117, 'train_samples_per_second': 41.047, 'train_steps_per_second': 5.133, 'total_flos': 3774807235471680.0, 'train_loss': 0.06400082352893804, 'epoch': 10.0})

In [None]:
# Score the held-out test split and print a short report of the overall metrics.
test_metrics = trainer.evaluate(tokenized_datasets["test"])
print(
    "\n" + "=" * 50,
    "Resultados finales en conjunto de test:",
    f"F1-score: {test_metrics['eval_f1']:.3f}",
    f"Precisión: {test_metrics['eval_precision']:.3f}",
    f"Recall: {test_metrics['eval_recall']:.3f}",
    "=" * 50,
    sep="\n",
)


Resultados finales en conjunto de test:
F1-score: 0.936
Precisión: 0.921
Recall: 0.952


In [None]:
# Upload the trained model and its training artifacts to the Hugging Face Hub repository.
trainer.push_to_hub()

events.out.tfevents.1752375900.431922a661de.2219.1:   0%|          | 0.00/16.4k [00:00<?, ?B/s]

events.out.tfevents.1752378702.431922a661de.2219.2:   0%|          | 0.00/560 [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/jamesopeth/bert-base-uncased-finetuned-ner-lung-cancer/commit/112ca7858171e504d7adf4270a0f3b267a6578e5', commit_message='End of training', commit_description='', oid='112ca7858171e504d7adf4270a0f3b267a6578e5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jamesopeth/bert-base-uncased-finetuned-ner-lung-cancer', endpoint='https://huggingface.co', repo_type='model', repo_id='jamesopeth/bert-base-uncased-finetuned-ner-lung-cancer'), pr_revision=None, pr_num=None)

In [None]:
# Label names in class-id order, read from the dataset schema
# (same source as `label_list`, with the column name written out).
label_names =  dataset_dict["train"].features["ner_tags"].feature.names
label_names

['B_CANCER_CONCEPT',
 'B_CHEMOTHERAPY',
 'B_DATE',
 'B_DRUG',
 'B_FAMILY',
 'B_FREQ',
 'B_IMPLICIT_DATE',
 'B_INTERVAL',
 'B_METRIC',
 'B_OCURRENCE_EVENT',
 'B_QUANTITY',
 'B_RADIOTHERAPY',
 'B_SMOKER_STATUS',
 'B_STAGE',
 'B_SURGERY',
 'B_TNM',
 'I_CANCER_CONCEPT',
 'I_DATE',
 'I_DRUG',
 'I_FAMILY',
 'I_FREQ',
 'I_IMPLICIT_DATE',
 'I_INTERVAL',
 'I_METRIC',
 'I_OCURRENCE_EVENT',
 'I_SMOKER_STATUS',
 'I_STAGE',
 'I_SURGERY',
 'I_TNM',
 'O']

In [None]:
# Predict on the test split and reduce the per-token class scores to class ids.
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Drop positions labelled -100 (ignored index) and map the remaining ids to tag names,
# keeping predictions and references aligned position by position.
true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

# Full seqeval report, displayed as the cell output (includes per-entity-type scores).
results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'_CANCER_CONCEPT': {'precision': np.float64(0.897364771151179),
  'recall': np.float64(0.9390420899854862),
  'f1': np.float64(0.9177304964539006),
  'number': np.int64(689)},
 '_CHEMOTHERAPY': {'precision': np.float64(0.9794871794871794),
  'recall': np.float64(1.0),
  'f1': np.float64(0.9896373056994818),
  'number': np.int64(191)},
 '_DATE': {'precision': np.float64(0.9833333333333333),
  'recall': np.float64(0.9845956354300385),
  'f1': np.float64(0.9839640795381653),
  'number': np.int64(779)},
 '_DRUG': {'precision': np.float64(0.9228611500701263),
  'recall': np.float64(0.9748148148148148),
  'f1': np.float64(0.9481268011527377),
  'number': np.int64(675)},
 '_FAMILY': {'precision': np.float64(0.9931506849315068),
  'recall': np.float64(0.9863945578231292),
  'f1': np.float64(0.9897610921501707),
  'number': np.int64(147)},
 '_FREQ': {'precision': np.float64(0.88268156424581),
  'recall': np.float64(0.9813664596273292),
  'f1': np.float64(0.9294117647058824),
  'number': np.int