# Loading data

In [1]:
import json

In [2]:
# Load the ClinAIS training split. JSON text is UTF-8, so the encoding is
# given explicitly instead of relying on the platform's locale default
# (which would mis-decode or fail on non-ASCII characters on some systems).
with open('clinais.train.json', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
from tqdm import tqdm

In [4]:
# Turn every annotated training note into a [note_id, spans, BIO tags] triple.
# A span whose 'boundary' is not None opens a section and is tagged
# 'B-<boundary>'; a span whose 'boundary' is None continues the most recently
# opened section and is tagged 'I-<boundary>'.
finalresult = []
annotated = data['annotated_entries']
for entry in tqdm(annotated.values()):
    spans, bio_tags = [], []
    # Stays '' until the first non-null boundary appears, so a note that
    # starts with a null boundary yields the bare tag 'I-'.
    open_boundary = ''
    for piece in entry['boundary_annotation']['gold']:
        spans.append(piece['span'])
        starts_section = piece['boundary'] is not None
        if starts_section:
            open_boundary = piece['boundary']
        bio_tags.append(('B-' if starts_section else 'I-') + open_boundary)
    finalresult.append([entry['note_id'], spans, bio_tags])

# finalresult    

100%|██████████| 781/781 [00:00<00:00, 9436.02it/s]


In [5]:
import numpy as np
import itertools
# Gather every distinct BIO tag used in the training notes (np.unique also
# sorts them) and number them to get the id <-> label lookup tables.
tags = np.unique([tag for example in finalresult for tag in example[2]])
id2label = dict(enumerate(tags))
label2id = {tag: i for i, tag in id2label.items()}

In [6]:
# Swap each note's string tags for their integer ids.
# NOTE: this rebinds `finalresult`, so the cell cannot be run twice in a row —
# a second run would look up integers in label2id and fail.
finalresult = [
    [note_id, spans, [label2id[tag] for tag in note_tags]]
    for note_id, spans, note_tags in finalresult
]
#finalresult[0]


In [7]:
# Load the ClinAIS development split (this rebinds `data`, replacing the
# training JSON). JSON text is UTF-8, so the encoding is given explicitly
# instead of relying on the platform's locale default.
with open('clinais.dev.json', encoding='utf-8') as f:
    data = json.load(f)

In [8]:
# Turn every annotated development note into a [note_id, spans, BIO tags]
# triple, using the same tagging rule as for the training notes: a non-null
# 'boundary' gives 'B-<boundary>', a null one gives 'I-<last boundary>'.
finalresultdev = []
annotated_dev = data['annotated_entries']
for entry in tqdm(annotated_dev.values()):
    spans, bio_tags = [], []
    # Stays '' until the first non-null boundary appears, so a note that
    # starts with a null boundary yields the bare tag 'I-'.
    open_boundary = ''
    for piece in entry['boundary_annotation']['gold']:
        spans.append(piece['span'])
        starts_section = piece['boundary'] is not None
        if starts_section:
            open_boundary = piece['boundary']
        bio_tags.append(('B-' if starts_section else 'I-') + open_boundary)
    finalresultdev.append([entry['note_id'], spans, bio_tags])

100%|██████████| 127/127 [00:00<00:00, 11479.11it/s]


In [9]:
# Swap each development note's string tags for their integer ids.
# label2id was built from the training notes only, so a tag that occurs
# solely in the development split raises KeyError here.
# NOTE: this rebinds `finalresultdev`, so the cell cannot be run twice in a row.
finalresultdev = [
    [note_id, spans, [label2id[tag] for tag in note_tags]]
    for note_id, spans, note_tags in finalresultdev
]


In [10]:
from datasets import Dataset,DatasetDict

In [11]:
import pandas as pd
# One row per training note; the three columns mirror the
# [note_id, spans, tag ids] triples built earlier.
train_columns = ['id', 'tokens', 'tags']
df = pd.DataFrame(finalresult, columns=train_columns)
dataset_train = Dataset.from_pandas(df)

In [12]:
# One row per development note, same three columns as the training frame.
# `df` is rebound here; the training frame is no longer reachable by name.
val_columns = ['id', 'tokens', 'tags']
df = pd.DataFrame(finalresultdev, columns=val_columns)
dataset_val = Dataset.from_pandas(df)

In [13]:
# Bundle both splits under the names used later by the Trainer
# ("train" for fitting, "val" for evaluation).
dataset = DatasetDict({'train': dataset_train, 'val': dataset_val})

# Processing dataset

In [14]:
from transformers import AutoTokenizer

In [15]:
# Hub id of the starting checkpoint. It is used here for the tokenizer and
# again further down when the token-classification model is loaded.
modelCheckpoint = "joheras/xlm-roberta-large-finetuned-clinais"
tokenizer = AutoTokenizer.from_pretrained(modelCheckpoint)

In [16]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split notes and align tag ids to sub-tokens.

    Args:
        examples: batch mapping with "tokens" (one list of spans per note)
            and "tags" (one list of tag ids per note, parallel to "tokens").

    Returns:
        The tokenizer output with an extra "labels" entry. For each note the
        label list has one value per sub-token: the span's tag id on the
        first sub-token of every span, and -100 on special tokens and on the
        remaining sub-tokens of a span.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, span_tags in enumerate(examples["tags"]):
        word_ids = encoded.word_ids(batch_index=row)  # sub-token -> span index (None for specials)
        # Pair each sub-token's span index with the one just before it
        # (None for the very first position); zip stops at the shorter input.
        with_previous = zip(word_ids, [None, *word_ids])
        aligned.append([
            span_tags[current] if current is not None and current != previous else -100
            for current, previous in with_previous
        ])

    encoded["labels"] = aligned
    return encoded

In [17]:
# Run tokenize_and_align_labels over both splits, one batch of notes per call.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [18]:
# Display the splits with their columns and row counts as a sanity check.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 781
    })
    val: Dataset({
        features: ['id', 'tokens', 'tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 127
    })
})

In [19]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer below to assemble batches of tokenized notes.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [20]:
import evaluate

# Sequence-labelling metric; compute_metrics calls seqeval.compute on the
# predicted and reference tag sequences.
seqeval = evaluate.load("seqeval")

In [21]:
import numpy as np



def compute_metrics(p):
    """Score a batch of predictions with seqeval.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores (argmax is taken over axis 2); ``labels`` holds the
            reference tag ids, with -100 on positions to be ignored.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        corresponding "overall_*" entries of the seqeval result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label (not -100).
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([id2label[predicted] for predicted, _ in kept])
        true_labels.append([id2label[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [22]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Load the checkpoint with a token-classification head that has one output
# per BIO tag, passing both label lookup tables along. The cell output reports
# that the classifier weights are newly initialised.
model = AutoModelForTokenClassification.from_pretrained(
    modelCheckpoint, num_labels=len(id2label), id2label=id2label, label2id=label2id
)

Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at joheras/xlm-roberta-large-finetuned-clinais were not used when initializing XLMRobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at joheras/xlm-roberta-large-finetuned-clinais and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably T

In [23]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="clinico-xlm-roberta-large-finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=100,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # A checkpoint is written every epoch and each one holds a ~2 GB weights
    # file; cap how many stay on disk. The best checkpoint is still retained
    # because load_best_model_at_end is on.
    save_total_limit=2,
    fp16=True,
    load_best_model_at_end=True,
    # Pick the "best" checkpoint by the seqeval F1 returned from
    # compute_metrics. When this is left unset the Trainer falls back to the
    # evaluation loss, which in the logged epochs bottoms out early while F1
    # is still improving.
    metric_for_best_model="f1",
    greater_is_better=True,
    push_to_hub=True,
)

# Wire the model, both tokenized splits, the collator and the metric function
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run the full fine-tuning loop; returns a TrainOutput summary.
trainer.train()

Cloning https://huggingface.co/joheras/clinico-xlm-roberta-large-finetuned into local empty directory.


Download file pytorch_model.bin:   0%|          | 8.00k/2.08G [00:00<?, ?B/s]

Download file runs/Mar15_08-48-07_minion/1678866492.937202/events.out.tfevents.1678866492.minion.3916099.1: 10…

Download file runs/Mar15_07-36-32_minion/1678862200.6369078/events.out.tfevents.1678862200.minion.3903462.1: 1…

Download file runs/Mar15_08-48-07_minion/events.out.tfevents.1678866492.minion.3916099.0:  43%|####2     | 8.1…

Clean file runs/Mar15_08-48-07_minion/1678866492.937202/events.out.tfevents.1678866492.minion.3916099.1:  18%|…

Clean file runs/Mar15_07-36-32_minion/1678862200.6369078/events.out.tfevents.1678862200.minion.3903462.1:  18%…

Clean file runs/Mar15_08-48-07_minion/events.out.tfevents.1678866492.minion.3916099.0:   5%|5         | 1.00k/…

Download file runs/Mar15_07-40-39_minion/1678862444.515115/events.out.tfevents.1678862444.minion.3905698.1: 10…

Download file runs/Mar15_07-40-39_minion/events.out.tfevents.1678862444.minion.3905698.0:  43%|####2     | 8.1…

Clean file runs/Mar15_07-40-39_minion/1678862444.515115/events.out.tfevents.1678862444.minion.3905698.1:  18%|…

Download file sentencepiece.bpe.model:   0%|          | 17.4k/4.83M [00:00<?, ?B/s]

Clean file runs/Mar15_07-40-39_minion/events.out.tfevents.1678862444.minion.3905698.0:   5%|5         | 1.00k/…

Download file tokenizer.json:   0%|          | 12.5k/16.3M [00:00<?, ?B/s]

Download file runs/Mar15_07-36-32_minion/events.out.tfevents.1678862200.minion.3903462.0: 100%|##########| 5.6…

Download file training_args.bin: 100%|##########| 3.50k/3.50k [00:00<?, ?B/s]

Clean file runs/Mar15_07-36-32_minion/events.out.tfevents.1678862200.minion.3903462.0:  18%|#7        | 1.00k/…

Clean file training_args.bin:  29%|##8       | 1.00k/3.50k [00:00<?, ?B/s]

Clean file sentencepiece.bpe.model:   0%|          | 1.00k/4.83M [00:00<?, ?B/s]

Clean file tokenizer.json:   0%|          | 1.00k/16.3M [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/2.08G [00:00<?, ?B/s]

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been ignored: tokens, id, tags. If tokens, id, tags are not expected by `XLMRobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 781
  Num Epochs = 100
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4900
  Number of trainable parameters = 558855182
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjoheras[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.59928,0.224215,0.457666,0.300978,0.816762
2,No log,0.576157,0.236522,0.466819,0.313967,0.828014
3,No log,0.562665,0.24605,0.498856,0.329554,0.832573
4,No log,0.526019,0.302934,0.543478,0.389025,0.853403
5,No log,0.5838,0.305828,0.534325,0.389005,0.855864
6,No log,0.608459,0.336949,0.56865,0.423159,0.854486
7,No log,0.648066,0.353227,0.58238,0.439741,0.859144
8,No log,0.68091,0.352281,0.574371,0.436712,0.857963
9,No log,0.803953,0.386381,0.577803,0.46309,0.856848
10,No log,0.750515,0.377483,0.586957,0.459472,0.863539


The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been ignored: tokens, id, tags. If tokens, id, tags are not expected by `XLMRobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 127
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to clinico-xlm-roberta-large-finetuned/checkpoint-49
Configuration saved in clinico-xlm-roberta-large-finetuned/checkpoint-49/config.json
Model weights saved in clinico-xlm-roberta-large-finetuned/checkpoint-49/pytorch_model.bin
tokenizer config file saved in clinico-xlm-roberta-large-finetuned/checkpoint-49/tokenizer_config.json
Special tokens file saved in clinico-xlm-roberta-large-finetuned/checkpoint-49/special_tokens_map.json
tokenizer config file saved in clinico-xlm-roberta-large-finetuned/tokenizer_config.json
Special tokens file saved in clinico-xl

TrainOutput(global_step=4900, training_loss=0.035870392730041424, metrics={'train_runtime': 7290.9678, 'train_samples_per_second': 10.712, 'train_steps_per_second': 0.672, 'total_flos': 7.25349671405568e+16, 'train_loss': 0.035870392730041424, 'epoch': 100.0})

In [24]:
# Save the model currently held by the trainer into output_dir and push it to
# the Hub repository.
trainer.push_to_hub()

Saving model checkpoint to clinico-xlm-roberta-large-finetuned
Configuration saved in clinico-xlm-roberta-large-finetuned/config.json
Model weights saved in clinico-xlm-roberta-large-finetuned/pytorch_model.bin
tokenizer config file saved in clinico-xlm-roberta-large-finetuned/tokenizer_config.json
Special tokens file saved in clinico-xlm-roberta-large-finetuned/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 32.0k/2.08G [00:00<?, ?B/s]

Upload file runs/Mar15_20-23-36_minion/events.out.tfevents.1678908575.minion.4017099.0:  61%|######    | 32.0k…

remote: Scanning LFS files of refs/heads/main for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/joheras/clinico-xlm-roberta-large-finetuned
   d79c48e..63438aa  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Token Classification', 'type': 'token-classification'}, 'metrics': [{'name': 'Precision', 'type': 'precision', 'value': 0.5417867435158501}, {'name': 'Recall', 'type': 'recall', 'value': 0.6453089244851259}, {'name': 'F1', 'type': 'f1', 'value': 0.5890339425587467}, {'name': 'Accuracy', 'type': 'accuracy', 'value': 0.870821715597835}]}
To https://huggingface.co/joheras/clinico-xlm-roberta-large-finetuned
   63438aa..a0d03c0  main -> main



'https://huggingface.co/joheras/clinico-xlm-roberta-large-finetuned/commit/63438aa372b4c090265b75c0093dade6c7604136'

In [25]:
# Shell escape: recursively delete a local directory.
# NOTE(review): the directory named here is not the output_dir used for
# training in this notebook ("clinico-xlm-roberta-large-finetuned") — confirm
# this is the directory that was meant to be removed.
!rm -rf xlm-roberta-large-finetuned-clinais/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [26]:
# NOTE(review): bare shell escape with no command — looks like a leftover
# cell; consider deleting it.
!

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
