#Proceso de Fine-Tuning para Extracción de Conceptos Medicos relacionados con el Cancer de Pulmon

### Primero se define la ruta a los archivos del corpus anotado de cancer de pulmon

In [1]:
from google.colab import drive
drive.mount('/content/drive')

ruta_corpus = '/content/drive/MyDrive/Tareas_Analitica_Datos_Salud/Tarea_1/lung_cancer_corpus/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# prompt: I want to print files available on the route declared in ruta_corpus

import os

# List files in the specified directory
for filename in os.listdir(ruta_corpus):
  print(filename)

sentences_dev.csv
sentences_test.csv
sentences_train.csv


### Luego aseguramos de tener instaladas las librería necesarias para hacer el entrenamiento fino del modelo

In [3]:
!pip install datasets transformers
!pip install seqeval
!pip install -U datasets evaluate
!pip install -U huggingface_hub



### Cargamos el corpus y alistamos los datos de entrenamiento, testeo y validacion

In [4]:
from datasets import DatasetDict, Dataset, Features, Sequence, Value, ClassLabel
from collections import defaultdict
import pandas as pd
from pathlib import Path

In [5]:
def leer_archivo_corpus(ruta_archivo):
    """Lee un archivo csv y devuelve un diccionario con tokens y etiquetas agrupados por sentencia."""
    datos = defaultdict(list)

    # Leer el archivo CSV usando pandas
    df = pd.read_csv(ruta_archivo)

    # Asegurar que Word y Tag sean strings
    df['Word'] = df['Word'].astype(str)
    df['Tag'] = df['Tag'].astype(str)

    # Rellenar valores faltantes en Sentence # con el último valor válido
    df['Sentence #'] = df['Sentence #'].ffill()

    # Agrupar por número de sentencia
    for _, grupo in df.groupby('Sentence #'):
        # Extraer tokens y etiquetas de la sentencia actual
        tokens_sentencia = grupo['Word'].tolist()
        labels_sentencia = grupo['Tag'].tolist()

        # Agregar los tokens y labels de la sentencia actual
        datos["tokens"].append(tokens_sentencia)
        datos["ner_tags"].append(labels_sentencia)

    return datos


def cargar_datasets(rutas_archivos):
    """Carga archivos .bio y devuelve un DatasetDict."""
    datasets = {}
    for nombre, ruta in rutas_archivos.items():
        datos = leer_archivo_corpus(ruta)
        print(f"Dataset {nombre} cargado correctamente.")
        print(f"Tamaño del dataset {nombre}: {len(datos['tokens'])}")
        print(f"Tokens del dataset {nombre}: {datos['tokens'][:5]}")
        print(f"Etiquetas del dataset {nombre}: {datos['ner_tags'][:5]}")
        datasets[nombre] = Dataset.from_dict(datos)

    return DatasetDict(datasets)

In [6]:
def detectar_etiquetas_unicas(rutas_archivos):
    """Detecta automáticamente todas las etiquetas únicas en los archivos."""
    todas_etiquetas = set()

    for ruta in rutas_archivos.values():
      df = pd.read_csv(ruta)
      etiquetas = df['Tag'].unique()
      todas_etiquetas.update(etiquetas)

    # Ordenamos las etiquetas para que 'O' sea la última
    etiquetas_ordenadas = sorted(todas_etiquetas - {'O'}) + ['O']
    return etiquetas_ordenadas

In [7]:
rutas_archivos = {
    "train": ruta_corpus + "sentences_train.csv",
    "test":  ruta_corpus + "sentences_test.csv",
    "valid": ruta_corpus + "sentences_dev.csv"
}

In [8]:
# Detectar automáticamente todas las etiquetas
LABELS = detectar_etiquetas_unicas(rutas_archivos)
print("Etiquetas detectadas:", LABELS)

Etiquetas detectadas: ['B_CANCER_CONCEPT', 'B_CHEMOTHERAPY', 'B_DATE', 'B_DRUG', 'B_FAMILY', 'B_FREQ', 'B_IMPLICIT_DATE', 'B_INTERVAL', 'B_METRIC', 'B_OCURRENCE_EVENT', 'B_QUANTITY', 'B_RADIOTHERAPY', 'B_SMOKER_STATUS', 'B_STAGE', 'B_SURGERY', 'B_TNM', 'I_CANCER_CONCEPT', 'I_DATE', 'I_DRUG', 'I_FAMILY', 'I_FREQ', 'I_IMPLICIT_DATE', 'I_INTERVAL', 'I_METRIC', 'I_OCURRENCE_EVENT', 'I_SMOKER_STATUS', 'I_STAGE', 'I_SURGERY', 'I_TNM', 'O']


In [9]:
# Cargar los datasets
dataset_dict = cargar_datasets(rutas_archivos)

# Definir la estructura de features con las etiquetas detectadas
features = Features({
    "tokens": Sequence(Value("string")),
    "ner_tags": Sequence(ClassLabel(names=LABELS))
})

# Aplicar el casting a cada split
for split in dataset_dict:
    dataset_dict[split] = dataset_dict[split].cast(features)

# Mostrar información del dataset
print("\nDataset cargado correctamente:")
print(dataset_dict)

Dataset train cargado correctamente.
Tamaño del dataset train: 9788
Tokens del dataset train: [['Abuela', 'materna', 'con', 'cancer', 'de', 'mama', 'a', 'los', '70', 'años', '.'], ['Abuela', 'materna', 'con', 'cancer', 'de', 'mama', 'a', 'los', '70', 'años', '.'], ['-', 'Quiste', 'renal', 'izquierdo', 'complicado', '(', 'ecografia', 'noviembre', '2013', 'quistes', 'renales', 'bilaterales', ')', '.'], ['-Insuficiencia', 'renal', 'cronica', 'etiologia', 'multifactorial', '(', 'inhibidores', 'de', 'calcineurina', '/', 'diabetes', '/', 'HTA)-', 'MDRD', '56', 'ml', '/', 'min', 'dic', '2017'], ['.']]
Etiquetas del dataset train: [['B_FAMILY', 'I_FAMILY', 'O', 'B_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'O', 'O', 'B_QUANTITY', 'B_METRIC', 'O'], ['B_FAMILY', 'I_FAMILY', 'O', 'B_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'O', 'O', 'B_QUANTITY', 'B_METRIC', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_DATE', 'I_DATE', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', '

Casting the dataset:   0%|          | 0/9788 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2496 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2758 [00:00<?, ? examples/s]


Dataset cargado correctamente:
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 9788
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2496
    })
    valid: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2758
    })
})


In [10]:
# Mostrar un ejemplo del conjunto de entrenamiento
print("\nEjemplo del train:")
print(dataset_dict["train"][0])


Ejemplo del train:
{'tokens': ['Abuela', 'materna', 'con', 'cancer', 'de', 'mama', 'a', 'los', '70', 'años', '.'], 'ner_tags': [4, 19, 29, 0, 16, 16, 29, 29, 10, 8, 29]}


In [11]:
# Mostrar las características del dataset
print("\nCaracterísticas del dataset:")
print(dataset_dict["train"].features)


Características del dataset:
{'tokens': List(Value('string')), 'ner_tags': List(ClassLabel(names=['B_CANCER_CONCEPT', 'B_CHEMOTHERAPY', 'B_DATE', 'B_DRUG', 'B_FAMILY', 'B_FREQ', 'B_IMPLICIT_DATE', 'B_INTERVAL', 'B_METRIC', 'B_OCURRENCE_EVENT', 'B_QUANTITY', 'B_RADIOTHERAPY', 'B_SMOKER_STATUS', 'B_STAGE', 'B_SURGERY', 'B_TNM', 'I_CANCER_CONCEPT', 'I_DATE', 'I_DRUG', 'I_FAMILY', 'I_FREQ', 'I_IMPLICIT_DATE', 'I_INTERVAL', 'I_METRIC', 'I_OCURRENCE_EVENT', 'I_SMOKER_STATUS', 'I_STAGE', 'I_SURGERY', 'I_TNM', 'O']))}


### Iniamos el proceso de Fine-Tuning

In [12]:
task = 'ner'
x = dataset_dict["train"].features[f"{task}_tags"].feature.names
print(x)

['B_CANCER_CONCEPT', 'B_CHEMOTHERAPY', 'B_DATE', 'B_DRUG', 'B_FAMILY', 'B_FREQ', 'B_IMPLICIT_DATE', 'B_INTERVAL', 'B_METRIC', 'B_OCURRENCE_EVENT', 'B_QUANTITY', 'B_RADIOTHERAPY', 'B_SMOKER_STATUS', 'B_STAGE', 'B_SURGERY', 'B_TNM', 'I_CANCER_CONCEPT', 'I_DATE', 'I_DRUG', 'I_FAMILY', 'I_FREQ', 'I_IMPLICIT_DATE', 'I_INTERVAL', 'I_METRIC', 'I_OCURRENCE_EVENT', 'I_SMOKER_STATUS', 'I_STAGE', 'I_SURGERY', 'I_TNM', 'O']


In [None]:
from huggingface_hub import login

# token cuenta personal, maestria_laptop_james_lectura
maestria_laptop_james_lectura = '___TOKEN___PRUEBA____'
colab_push_token = '___TOKEN___PRUEBA____'
login(maestria_laptop_james_lectura)

In [14]:
task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "xlm-roberta-large"
batch_size = 8

In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset_dict.map(
    tokenize_and_align_labels,
    batched=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/9788 [00:00<?, ? examples/s]

Map:   0%|          | 0/2496 [00:00<?, ? examples/s]

Map:   0%|          | 0/2758 [00:00<?, ? examples/s]

In [16]:
label_list = dataset_dict["train"].features[f"{task}_tags"].feature.names
label_list

['B_CANCER_CONCEPT',
 'B_CHEMOTHERAPY',
 'B_DATE',
 'B_DRUG',
 'B_FAMILY',
 'B_FREQ',
 'B_IMPLICIT_DATE',
 'B_INTERVAL',
 'B_METRIC',
 'B_OCURRENCE_EVENT',
 'B_QUANTITY',
 'B_RADIOTHERAPY',
 'B_SMOKER_STATUS',
 'B_STAGE',
 'B_SURGERY',
 'B_TNM',
 'I_CANCER_CONCEPT',
 'I_DATE',
 'I_DRUG',
 'I_FAMILY',
 'I_FREQ',
 'I_IMPLICIT_DATE',
 'I_INTERVAL',
 'I_METRIC',
 'I_OCURRENCE_EVENT',
 'I_SMOKER_STATUS',
 'I_STAGE',
 'I_SURGERY',
 'I_TNM',
 'O']

In [17]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Create id2label and label2id mappings
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [19]:
model_bert_base = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_bert_base}-finetuned-{task}-lung-cancer",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=4, # Reduced number of epochs
    weight_decay=0.01,
    push_to_hub=True,
    hub_token=colab_push_token, # hub token para escritura
)

### Evaluación del entrenamiento

In [20]:
from evaluate import load
metric = load("seqeval")

In [21]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [22]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [23]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mjames-payan[0m ([33mjames-payan-universidad-del-valle[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1302,0.090792,0.921756,0.944704,0.933089,0.975315
2,0.0943,0.08863,0.92149,0.956673,0.938752,0.97747
3,0.0583,0.077861,0.942116,0.960383,0.951162,0.982031
4,0.0399,0.085437,0.938709,0.958707,0.948603,0.981297




TrainOutput(global_step=4896, training_loss=0.11599353910271638, metrics={'train_runtime': 3236.8346, 'train_samples_per_second': 12.096, 'train_steps_per_second': 1.513, 'total_flos': 4634578997872080.0, 'train_loss': 0.11599353910271638, 'epoch': 4.0})

In [24]:
test_metrics = trainer.evaluate(tokenized_datasets["test"])
print("\n" + "="*50)
print(f"Resultados finales en conjunto de test:")
print(f"F1-score: {test_metrics['eval_f1']:.3f}")
print(f"Precisión: {test_metrics['eval_precision']:.3f}")
print(f"Recall: {test_metrics['eval_recall']:.3f}")
print("="*50)




Resultados finales en conjunto de test:
F1-score: 0.942
Precisión: 0.925
Recall: 0.961


In [25]:
trainer.push_to_hub()

events.out.tfevents.1752445764.2de79090b3f0.34796.1:   0%|          | 0.00/560 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jamesopeth/xlm-roberta-large-finetuned-ner-lung-cancer/commit/da0489a16048303cd82bc84b081507218d9152a6', commit_message='End of training', commit_description='', oid='da0489a16048303cd82bc84b081507218d9152a6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jamesopeth/xlm-roberta-large-finetuned-ner-lung-cancer', endpoint='https://huggingface.co', repo_type='model', repo_id='jamesopeth/xlm-roberta-large-finetuned-ner-lung-cancer'), pr_revision=None, pr_num=None)

In [26]:
label_names =  dataset_dict["train"].features["ner_tags"].feature.names
label_names

['B_CANCER_CONCEPT',
 'B_CHEMOTHERAPY',
 'B_DATE',
 'B_DRUG',
 'B_FAMILY',
 'B_FREQ',
 'B_IMPLICIT_DATE',
 'B_INTERVAL',
 'B_METRIC',
 'B_OCURRENCE_EVENT',
 'B_QUANTITY',
 'B_RADIOTHERAPY',
 'B_SMOKER_STATUS',
 'B_STAGE',
 'B_SURGERY',
 'B_TNM',
 'I_CANCER_CONCEPT',
 'I_DATE',
 'I_DRUG',
 'I_FAMILY',
 'I_FREQ',
 'I_IMPLICIT_DATE',
 'I_INTERVAL',
 'I_METRIC',
 'I_OCURRENCE_EVENT',
 'I_SMOKER_STATUS',
 'I_STAGE',
 'I_SURGERY',
 'I_TNM',
 'O']

In [27]:
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results



{'_CANCER_CONCEPT': {'precision': np.float64(0.9202797202797203),
  'recall': np.float64(0.9550072568940493),
  'f1': np.float64(0.9373219373219374),
  'number': np.int64(689)},
 '_CHEMOTHERAPY': {'precision': np.float64(0.9794871794871794),
  'recall': np.float64(1.0),
  'f1': np.float64(0.9896373056994818),
  'number': np.int64(191)},
 '_DATE': {'precision': np.float64(0.9859154929577465),
  'recall': np.float64(0.9884467265725289),
  'f1': np.float64(0.9871794871794871),
  'number': np.int64(779)},
 '_DRUG': {'precision': np.float64(0.9132231404958677),
  'recall': np.float64(0.9822222222222222),
  'f1': np.float64(0.9464668094218416),
  'number': np.int64(675)},
 '_FAMILY': {'precision': np.float64(0.9865771812080537),
  'recall': np.float64(1.0),
  'f1': np.float64(0.9932432432432432),
  'number': np.int64(147)},
 '_FREQ': {'precision': np.float64(0.9075144508670521),
  'recall': np.float64(0.9751552795031055),
  'f1': np.float64(0.940119760479042),
  'number': np.int64(161)},
 '_