# Loading data

In [1]:
import json

with open('clinais.train.json') as f:
    data = json.load(f)

from tqdm import tqdm

finalresult = []
for key in tqdm(data['annotated_entries'].keys()):
    ident = data['annotated_entries'][key]['note_id']
    res = []
    tags = []
    gold = data['annotated_entries'][key]['boundary_annotation']['gold']
    currentboundary = ''
    for g in gold:
        res.append(g['span'])
        if(g['boundary'] is None):
            tags.append('I-'+currentboundary)
        else:
            currentboundary = g['boundary']
            tags.append('B-'+currentboundary)
    finalresult.append([ident,res,tags])

# finalresult    

import numpy as np
import itertools
tags = [x[2] for x in finalresult]
tags = np.unique(list(itertools.chain(*tags)))
id2label = {}
label2id = {}
for i,tag in enumerate(tags):
    id2label[i] = tag
    label2id[tag] = i

100%|██████████| 781/781 [00:00<00:00, 9228.90it/s]


In [2]:
from datasets import load_from_disk

dataset = load_from_disk('augmented_dataset')

# Processing dataset

In [4]:
from transformers import AutoTokenizer

In [5]:
modelCheckpoint = "joheras/xlm-roberta-base-finetuned-clinais"
tokenizer = AutoTokenizer.from_pretrained(modelCheckpoint)

In [6]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [7]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [8]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2850
    })
    val: Dataset({
        features: ['tokens', 'tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 127
    })
})

In [9]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [10]:
import evaluate

seqeval = evaluate.load("seqeval")

In [11]:
import numpy as np



def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [12]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    modelCheckpoint, num_labels=len(id2label), id2label=id2label, label2id=label2id
)

Some weights of the model checkpoint at joheras/xlm-roberta-base-finetuned-clinais were not used when initializing XLMRobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at joheras/xlm-roberta-base-finetuned-clinais and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRA

In [13]:
training_args = TrainingArguments(
    output_dir="clinico-xlm-roberta-finetuned-augmented1",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=50,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Cloning https://huggingface.co/joheras/clinico-xlm-roberta-large-finetuned-augmented1 into local empty directory.


Download file pytorch_model.bin:   0%|          | 1.40k/2.08G [00:00<?, ?B/s]

Download file runs/Mar16_21-03-10_minion/1678996997.521049/events.out.tfevents.1678996997.minion.41089.1: 100%…

Download file training_args.bin: 100%|##########| 3.50k/3.50k [00:00<?, ?B/s]

Download file runs/Mar16_21-03-10_minion/events.out.tfevents.1678996997.minion.41089.0:  55%|#####5    | 17.0k…

Clean file runs/Mar16_21-03-10_minion/1678996997.521049/events.out.tfevents.1678996997.minion.41089.1:  18%|#7…

Clean file training_args.bin:  29%|##8       | 1.00k/3.50k [00:00<?, ?B/s]

Clean file runs/Mar16_21-03-10_minion/events.out.tfevents.1678996997.minion.41089.0:   3%|3         | 1.00k/30…

Download file tokenizer.json:   0%|          | 16.4k/16.3M [00:00<?, ?B/s]

Download file sentencepiece.bpe.model:   0%|          | 16.5k/4.83M [00:00<?, ?B/s]

Clean file sentencepiece.bpe.model:   0%|          | 1.00k/4.83M [00:00<?, ?B/s]

Clean file tokenizer.json:   0%|          | 1.00k/16.3M [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/2.08G [00:00<?, ?B/s]

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `XLMRobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2850
  Num Epochs = 50
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 8950
  Number of trainable parameters = 277463822
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjoheras[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.607421,0.300272,0.505721,0.376812,0.814729
2,No log,0.575764,0.30914,0.526316,0.3895,0.841496
3,0.492100,0.70894,0.335659,0.550343,0.416992,0.843825
4,0.492100,0.821916,0.354094,0.578947,0.439427,0.839134
5,0.492100,0.885721,0.404984,0.594966,0.481928,0.850681
6,0.062800,0.938622,0.388806,0.59611,0.470641,0.848548
7,0.062800,1.030217,0.407213,0.620137,0.49161,0.845826
8,0.062800,0.996015,0.43517,0.614416,0.509488,0.853469
9,0.019500,1.059258,0.434887,0.618993,0.510859,0.854617
10,0.019500,1.126231,0.436642,0.618993,0.512068,0.851173


The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `XLMRobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 127
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to clinico-xlm-roberta-large-finetuned-augmented1/checkpoint-179
Configuration saved in clinico-xlm-roberta-large-finetuned-augmented1/checkpoint-179/config.json
Model weights saved in clinico-xlm-roberta-large-finetuned-augmented1/checkpoint-179/pytorch_model.bin
tokenizer config file saved in clinico-xlm-roberta-large-finetuned-augmented1/checkpoint-179/tokenizer_config.json
Special tokens file saved in clinico-xlm-roberta-large-finetuned-augmented1/checkpoint-179/special_tokens_map.json
tokenizer config file saved in clinico-xlm-roberta-large-finetuned-augmented

TrainOutput(global_step=8950, training_loss=0.033443204037970006, metrics={'train_runtime': 5114.718, 'train_samples_per_second': 27.861, 'train_steps_per_second': 1.75, 'total_flos': 3.723745042335199e+16, 'train_loss': 0.033443204037970006, 'epoch': 50.0})

In [14]:
trainer.push_to_hub()

Saving model checkpoint to clinico-xlm-roberta-large-finetuned-augmented1
Configuration saved in clinico-xlm-roberta-large-finetuned-augmented1/config.json
Model weights saved in clinico-xlm-roberta-large-finetuned-augmented1/pytorch_model.bin
tokenizer config file saved in clinico-xlm-roberta-large-finetuned-augmented1/tokenizer_config.json
Special tokens file saved in clinico-xlm-roberta-large-finetuned-augmented1/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 32.0k/1.03G [00:00<?, ?B/s]

Upload file runs/Mar17_08-07-55_minion/events.out.tfevents.1679037210.minion.117171.0: 100%|##########| 30.8k/…

To https://huggingface.co/joheras/clinico-xlm-roberta-large-finetuned-augmented1
   2e5baa2..4bdbd51  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Token Classification', 'type': 'token-classification'}, 'metrics': [{'name': 'Precision', 'type': 'precision', 'value': 0.507380073800738}, {'name': 'Recall', 'type': 'recall', 'value': 0.6292906178489702}, {'name': 'F1', 'type': 'f1', 'value': 0.5617977528089888}, {'name': 'Accuracy', 'type': 'accuracy', 'value': 0.8559619484992619}]}
To https://huggingface.co/joheras/clinico-xlm-roberta-large-finetuned-augmented1
   4bdbd51..f908991  main -> main



'https://huggingface.co/joheras/clinico-xlm-roberta-large-finetuned-augmented1/commit/4bdbd51389de0903ec74c225c25b495b46c20ac3'

In [15]:
!rm -rf clinico-xlm-roberta-large-finetuned-augmented1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
!

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
