# Loading data

In [1]:
import json

In [2]:
with open('clinais.train.json') as f:
    data = json.load(f)

In [3]:
from tqdm import tqdm

In [4]:
finalresult = []
for key in tqdm(data['annotated_entries'].keys()):
    ident = data['annotated_entries'][key]['note_id']
    res = data['annotated_entries'][key]['note_text']
    finalresult.append([ident,res])

# finalresult    

100%|██████████| 781/781 [00:00<00:00, 424870.48it/s]


In [5]:
with open('clinais.dev.json') as f:
    data = json.load(f)

In [6]:
finalresultdev = []
for key in tqdm(data['annotated_entries'].keys()):
    ident = data['annotated_entries'][key]['note_id']
    res = data['annotated_entries'][key]['note_text']
    finalresultdev.append([ident,res])

100%|██████████| 127/127 [00:00<00:00, 486462.66it/s]


In [7]:
from datasets import Dataset,DatasetDict

In [8]:
import pandas as pd
df = pd.DataFrame(data=finalresult,columns=['id','text'])
dataset_train = Dataset.from_pandas(df)

In [9]:
df = pd.DataFrame(data=finalresultdev,columns=['id','text'])
dataset_val = Dataset.from_pandas(df)

In [10]:
dataset = DatasetDict(train=dataset_train,val=dataset_val)

# Processing dataset

In [11]:
from transformers import AutoTokenizer

In [12]:
modelCheckpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(modelCheckpoint)

In [13]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


tokenized_dataset = dataset.map(tokenize_function, batched=True,remove_columns=["id","text"])

  0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (876 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
chunk_size = 128

In [15]:
tokenized_samples = tokenized_dataset["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 876'
'>>> Review 1 length: 608'
'>>> Review 2 length: 724'


In [16]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 781
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 127
    })
})

In [17]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 2208'


In [18]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [19]:
lm_datasets = tokenized_dataset.map(group_texts, batched=True)
lm_datasets

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 3562
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 565
    })
})

In [20]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [21]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> <s> En Mayo de 1997, una mujer de 29 años<mask> edad fue intervenida, en otro centro, de un carcinoma de la glándula supra<mask>al izquierda clínicamente<mask> funciona<mask><mask> se manifest<mask> clínicamente como molestias en el flanco izquier叹<mask> poco específicas<mask><mask> el postparto inmediato; la ecografía y la tomografía axi<mask> computerizada abdominales mostraron una masa suprarren<mask> izquierda de 10 cmts<mask>, sólida y con áreas АТО calcificación y necrosis en su interior, siendo la radiografía de tóra<mask> y la<mask>grafía ósea'

'>>> normales. En los análisis, presentaba ligero nämligen de la cortisoluria (284.5 m<mask>gr.<mask>h.<mask> y de 17-OH deineesteroides en<mask>rina (12.7 mcg./24h.), sin síntomas de hipercor<mask>solismo sistémico.<mask> realizó<mask>cción completa de la tumoración, con el diagnóstico histológico de carcinoma supra<mask>al de 10 x 7 x 5 cmts. (215 gr pump.) bien<mask><mask><mask>ado<mask><mask> con invasión vascular, amplias zon

In [22]:
from transformers import TrainingArguments


batch_size = 8
# Show the training loss with every epoch
logging_steps = len(lm_datasets["train"]) // batch_size
model_name = modelCheckpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-clinais",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

In [23]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained(modelCheckpoint)

In [24]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Cloning https://huggingface.co/joheras/xlm-roberta-base-finetuned-clinais into local empty directory.
Using cuda_amp half precision backend


In [25]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `XLMRobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 565
  Batch size = 16


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjoheras[0m. Use [1m`wandb login --relogin`[0m to force relogin


>>> Perplexity: 10.34


In [26]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `XLMRobertaForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `XLMRobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3562
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1115
  Number of trainable parameters = 278295186


Epoch,Training Loss,Validation Loss
1,No log,1.781785
2,1.959100,1.689564
3,1.959100,1.61952
4,1.705500,1.580444
5,1.705500,1.610419


The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `XLMRobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 565
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `XLMRobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 565
  Batch size = 16
Saving model checkpoint to xlm-roberta-base-finetuned-clinais/checkpoint-500
Configuration saved in xlm-roberta-base-finetuned-clinais/checkpoint-500/config.json
Model weights saved in xlm-roberta-base-finetuned-clinais/checkpoint-500/pytorch_model.bin
tokenizer config file saved in xlm-roberta-base-finetuned-clinais/checkpoint-500/tokenizer_config.json
Spec

TrainOutput(global_step=1115, training_loss=1.798773439689602, metrics={'train_runtime': 640.6893, 'train_samples_per_second': 27.798, 'train_steps_per_second': 1.74, 'total_flos': 1174921493990400.0, 'train_loss': 1.798773439689602, 'epoch': 5.0})

In [27]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `XLMRobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 565
  Batch size = 16


>>> Perplexity: 5.19


In [28]:
trainer.push_to_hub()

Saving model checkpoint to xlm-roberta-base-finetuned-clinais
Configuration saved in xlm-roberta-base-finetuned-clinais/config.json
Model weights saved in xlm-roberta-base-finetuned-clinais/pytorch_model.bin
tokenizer config file saved in xlm-roberta-base-finetuned-clinais/tokenizer_config.json
Special tokens file saved in xlm-roberta-base-finetuned-clinais/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 32.0k/1.04G [00:00<?, ?B/s]

Upload file runs/Mar15_12-43-39_minion/events.out.tfevents.1678880640.minion.3960056.0: 100%|##########| 6.05k…

Upload file runs/Mar15_12-43-39_minion/events.out.tfevents.1678881295.minion.3960056.2: 100%|##########| 311/3…

remote: Scanning LFS files of refs/heads/main for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/joheras/xlm-roberta-base-finetuned-clinais
   a058922..7a38899  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Masked Language Modeling', 'type': 'fill-mask'}}
To https://huggingface.co/joheras/xlm-roberta-base-finetuned-clinais
   7a38899..64cd2e9  main -> main



'https://huggingface.co/joheras/xlm-roberta-base-finetuned-clinais/commit/7a38899a0e13752ec4b23426ff6ab25c58a5973b'

In [31]:
!rm -rf clinico-roberta-biomedical-finetuned/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
