In [1]:
import os

import datasets
import pandas as pd
import spacy
import torch
import wandb
from datasets import load_dataset, Dataset
from span_marker import SpanMarkerModel, Trainer
from tqdm import tqdm
from transformers import TrainingArguments

In [2]:
# Expose only GPU 0 to this process (read again further down when the model is built).
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [3]:
# Load the SympTEMIST entity configuration through the local BigBio loader script
# (path is relative to the notebook's working directory) and order the rows by
# document_id so that the positional train/eval split below is deterministic.
data = load_dataset(
    path="../../biomedical/bigbio/hub/hub_repos/symptemist/symptemist.py",
    name="symptemist_entities_bigbio_kb",
).sort("document_id")
data

Found cached dataset symptemist (/home/Ignacio.Rodriguez/.cache/huggingface/datasets/symptemist/symptemist_entities_bigbio_kb/2.0.0/2542aaab0d6c9963785fca5b4b0712501e06aa5a2e136b7b4d26d1fd7a2c382a)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached sorted indices for dataset at /home/Ignacio.Rodriguez/.cache/huggingface/datasets/symptemist/symptemist_entities_bigbio_kb/2.0.0/2542aaab0d6c9963785fca5b4b0712501e06aa5a2e136b7b4d26d1fd7a2c382a/cache-a1bc75b8e00708bf.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 744
    })
})

In [4]:
# Positional hold-out split: the first N_TRAIN documents (in document_id order,
# see the sort in the loading cell) are used for training, the remainder for
# evaluation. The upper bound is taken from the dataset instead of being
# hard-coded, so no rows are silently dropped if the dataset size changes.
N_TRAIN = 600
data_train = data["train"].select(range(N_TRAIN))
data_eval = data["train"].select(range(N_TRAIN, len(data["train"])))

In [5]:
# Label inventory passed to SpanMarkerModel.from_pretrained further down.
# bigbio2spanmarker emits the integer tags 0 and 1, which are presumably
# interpreted as indices into this list (0 -> "O", 1 -> "SINTOMA") — confirm
# against the SpanMarker label handling.
labels = ["O", "SINTOMA"]

def bigbio2spanmarker(split: Dataset, nlp=None) -> Dataset:
    """Convert a BigBio-KB split into sentence-level, token-tagged rows.

    The text of each row's first passage is segmented into sentences and tokens
    with spaCy. A token receives tag 1 when its starting character index
    (``token.idx``) equals an entity offset start or lies in the half-open
    interval ``[start, end)`` of any entity offset pair; otherwise it gets 0.
    Entity offsets are assumed to index into that same passage text.

    Note that the tags carry no begin/inside distinction, so two entities that
    sit on consecutive tokens cannot be told apart in the output.

    Args:
        split: Dataset whose rows provide ``document_id``, ``passages`` and
            ``entities`` (each entity holding a list of ``offsets`` pairs).
        nlp: Optional, already loaded spaCy pipeline. When omitted,
            ``es_core_news_sm`` is loaded; pass a pipeline explicitly to avoid
            reloading the model when converting several splits.

    Returns:
        A Dataset with one row per sentence and the columns ``filename``
        (original ``document_id``), ``document_id`` (position of the document in
        ``split``), ``sentence_id``, ``tokens``, ``ner_tags`` and ``text``.
    """
    if nlp is None:
        nlp = spacy.load("es_core_news_sm")

    output = {
        "filename": [],
        "document_id": [],
        "sentence_id": [],
        "tokens": [],
        "ner_tags": [],
        "text": [],
    }

    # Wrap the split itself rather than enumerate(split): tqdm can then read
    # its length and display a percentage and ETA instead of a bare counter.
    for doc_id, row in enumerate(tqdm(split, desc="Document progress:")):

        text = row["passages"][0]["text"][0]
        # Flatten every (start, end) pair once per document instead of walking
        # the nested entity structure again for each token of each sentence.
        spans = [
            (offset[0], offset[1])
            for entity in row["entities"]
            for offset in entity["offsets"]
        ]
        doc = nlp(text)

        for sentence_id, sentence in enumerate(doc.sents):

            tokens = [token.text for token in sentence]
            # The explicit equality keeps zero-length offsets (start == end)
            # tagging the token that begins exactly at that position.
            ner_tags = [
                int(
                    any(
                        token.idx == start or start <= token.idx < end
                        for start, end in spans
                    )
                )
                for token in sentence
            ]

            output["filename"].append(row["document_id"])
            output["document_id"].append(doc_id)
            output["sentence_id"].append(sentence_id)
            output["tokens"].append(tokens)
            output["ner_tags"].append(ner_tags)
            output["text"].append(sentence.text)

    return datasets.Dataset.from_dict(output)

# Convert both splits (train first, then eval) to the sentence-level format.
# NOTE: this rebinds data_train / data_eval to the converted datasets, so the
# split cell has to be re-run before this cell can be executed a second time.
data_train, data_eval = map(bigbio2spanmarker, (data_train, data_eval))

Document progress:: 600it [00:18, 31.76it/s]
Document progress:: 144it [00:04, 30.83it/s]


In [14]:
# Start from the Spanish biomedical-clinical RoBERTa encoder and wrap it in a
# SpanMarker model for the label set defined above.
model_name = "PlanTL-GOB-ES/roberta-base-biomedical-clinical-es"
# NOTE(review): `device` receives the raw CUDA_VISIBLE_DEVICES string ("0").
# That variable names a physical GPU, not an in-process device index, so the
# two only coincide for "0" — confirm how SpanMarker interprets this argument.
model = SpanMarkerModel.from_pretrained(
    model_name,
    labels=labels,
    model_max_length=256,
    entity_max_length=15,
    device=os.environ["CUDA_VISIBLE_DEVICES"],
)

Some weights of RobertaModel were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-biomedical-clinical-es and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 52002. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


In [15]:
# Fine-tuning configuration (TrainingArguments is imported in the top import cell).
args = TrainingArguments(
    # Checkpointing: one checkpoint per epoch, keeping at most two on disk.
    output_dir="../checkpoints/v2",
    overwrite_output_dir=True,
    save_strategy="epoch",
    save_total_limit=2,
    # Evaluation and model selection: evaluate every epoch (same cadence as
    # saving) and restore the checkpoint with the best eval_overall_f1 at the end.
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_overall_f1",
    # Batching and precision.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=True,
    # Optimisation schedule: linear decay from 5e-5 with no warmup and no
    # regularisation.
    num_train_epochs=30,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.0,
    weight_decay=0.0,
    label_smoothing_factor=0.0,
)


In [16]:
# SpanMarker's own Trainer (imported in the top import cell), bound to the
# model, the training arguments and the converted sentence-level datasets.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=data_train,
    eval_dataset=data_eval,
)

In [17]:
# Run training inside a Weights & Biases run so the run is closed when the
# block exits, including on interruption (wandb is imported in the top import cell).
with wandb.init(project="symptemist_ner", tags=["dev"]):
    trainer.train()

Label normalizing the train dataset:   0%|          | 0/10463 [00:00<?, ? examples/s]

Tokenizing the train dataset:   0%|          | 0/10463 [00:00<?, ? examples/s]

This SpanMarker model will ignore 1.928413% of all annotated entities in the train dataset. This is caused by the SpanMarkerModel maximum entity length of 15 words.
These are the frequencies of the missed entities due to maximum entity length out of 7208 total entities:
- 33 missed entities with 16 words (0.457825%)
- 11 missed entities with 17 words (0.152608%)
- 22 missed entities with 18 words (0.305216%)
- 10 missed entities with 19 words (0.138735%)
- 9 missed entities with 20 words (0.124861%)
- 14 missed entities with 21 words (0.194229%)
- 6 missed entities with 22 words (0.083241%)
- 11 missed entities with 23 words (0.152608%)
- 2 missed entities with 24 words (0.027747%)
- 2 missed entities with 25 words (0.027747%)
- 1 missed entities with 26 words (0.013873%)
- 2 missed entities with 28 words (0.027747%)
- 1 missed entities with 29 words (0.013873%)
- 1 missed entities with 31 words (0.013873%)
- 1 missed entities with 32 words (0.013873%)
- 1 missed entities with 33 words

Adding document-level context:   0%|          | 0/10463 [00:00<?, ?it/s]

Spreading data between multiple samples:   0%|          | 0/10463 [00:00<?, ? examples/s]

VBox(children=(Label(value='0.015 MB of 0.015 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

KeyboardInterrupt: 