# Loading data

In [1]:
import json

In [2]:
with open('clinais.train.json') as f:
    data = json.load(f)

In [3]:
from tqdm import tqdm

In [4]:
finalresult = []
for key in tqdm(data['annotated_entries'].keys()):
    ident = data['annotated_entries'][key]['note_id']
    res = data['annotated_entries'][key]['note_text']
    finalresult.append([ident,res])

# finalresult    

100%|██████████| 781/781 [00:00<00:00, 503342.26it/s]


In [5]:
with open('clinais.test&background.blind.json') as f:
    data = json.load(f)

In [6]:
for key in tqdm(data['annotated_entries'].keys()):
    ident = data['annotated_entries'][key]['note_id']
    res = data['annotated_entries'][key]['note_text']
    finalresult.append([ident,res])


100%|██████████| 2843/2843 [00:00<00:00, 44739.96it/s]


In [7]:
with open('clinais.dev.json') as f:
    data = json.load(f)

In [8]:
finalresultdev = []
for key in tqdm(data['annotated_entries'].keys()):
    ident = data['annotated_entries'][key]['note_id']
    res = data['annotated_entries'][key]['note_text']
    finalresultdev.append([ident,res])

100%|██████████| 127/127 [00:00<00:00, 393409.61it/s]


In [9]:
from datasets import Dataset,DatasetDict

In [10]:
import pandas as pd
df = pd.DataFrame(data=finalresult,columns=['id','text'])
dataset_train = Dataset.from_pandas(df)

In [11]:
df = pd.DataFrame(data=finalresultdev,columns=['id','text'])
dataset_val = Dataset.from_pandas(df)

In [12]:
dataset = DatasetDict(train=dataset_train,val=dataset_val)

# Processing dataset

In [13]:
from transformers import AutoTokenizer

In [14]:
modelCheckpoint = "PlanTL-GOB-ES/bsc-bio-ehr-es"
tokenizer = AutoTokenizer.from_pretrained(modelCheckpoint)

In [15]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


tokenized_dataset = dataset.map(tokenize_function, batched=True,remove_columns=["id","text"])

  0%|          | 0/4 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (659 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
chunk_size = 128

In [17]:
tokenized_samples = tokenized_dataset["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 659'
'>>> Review 1 length: 458'
'>>> Review 2 length: 544'


In [18]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 3624
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 127
    })
})

In [19]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 1661'


In [20]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [21]:
lm_datasets = tokenized_dataset.map(group_texts, batched=True)
lm_datasets

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 13865
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 433
    })
})

In [22]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [23]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> <s> En Mayo de 1997, una mujer de 29 años<mask> edad fue<mask>, en<mask> centro<mask> de 1935 carcinoma de la<mask> suprarrenal izquierda clínicamente<mask> funcionante que se manifestó clínicamente como molestias en el flanco izquierdo,<mask> específicas, en el postparto inmediato<mask> la ecografía y la tomografía axial computerizada<mask> mostraron una masa suprarrenal izquierda<mask> 10 cm percibida., sólida y con áreas de calcificación<mask><mask><mask> su<mask>,<mask> la<mask> de tórax y la gammagrafía ósea normales. En los análisis, presentaba ligero aumento de la cortisoluria (284.5 mcgr<mask>24h<mask> y de 17-OH-esteroides en orina (12.'

'>>> 7 mcg./24h.), sin síntomas de hipercortisolismo sistémico<mask> Se realizó resección completa<mask> la tumoración, con el diagnóstico histológico<mask> carcinoma suprarrenal de 10 x<mask> x 5 cmts. (215 grs.) bien encapsulado, aunque con<mask> vascular, amplias zonas de necrosis y un índice mitótico de 5.8/50<mask> el estudio histo

In [24]:
from transformers import TrainingArguments


batch_size = 8
# Show the training loss with every epoch
logging_steps = len(lm_datasets["train"]) // batch_size
model_name = modelCheckpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-clinais-v2",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

In [25]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained(modelCheckpoint)

In [26]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Cloning https://huggingface.co/joheras/bsc-bio-ehr-es-finetuned-clinais-v2 into local empty directory.
Using cuda_amp half precision backend


In [27]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 433
  Batch size = 16


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjoheras[0m. Use [1m`wandb login --relogin`[0m to force relogin


>>> Perplexity: 7.10


In [28]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 13865
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4335
  Number of trainable parameters = 124695126


Epoch,Training Loss,Validation Loss
1,No log,1.356743
2,1.415500,1.321178
3,1.415500,1.297978
4,1.326300,1.262797
5,1.326300,1.269203


Saving model checkpoint to bsc-bio-ehr-es-finetuned-clinais-v2/checkpoint-500
Configuration saved in bsc-bio-ehr-es-finetuned-clinais-v2/checkpoint-500/config.json
Model weights saved in bsc-bio-ehr-es-finetuned-clinais-v2/checkpoint-500/pytorch_model.bin
tokenizer config file saved in bsc-bio-ehr-es-finetuned-clinais-v2/checkpoint-500/tokenizer_config.json
Special tokens file saved in bsc-bio-ehr-es-finetuned-clinais-v2/checkpoint-500/special_tokens_map.json
tokenizer config file saved in bsc-bio-ehr-es-finetuned-clinais-v2/tokenizer_config.json
Special tokens file saved in bsc-bio-ehr-es-finetuned-clinais-v2/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 433
  Batch size = 16
Saving model checkpoint to bsc-bio-ehr-es-fi

TrainOutput(global_step=4335, training_loss=1.355947626063293, metrics={'train_runtime': 484.7874, 'train_samples_per_second': 143.001, 'train_steps_per_second': 8.942, 'total_flos': 4562719401024000.0, 'train_loss': 1.355947626063293, 'epoch': 5.0})

In [29]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 433
  Batch size = 16


>>> Perplexity: 3.83


In [30]:
trainer.push_to_hub()

Saving model checkpoint to bsc-bio-ehr-es-finetuned-clinais-v2
Configuration saved in bsc-bio-ehr-es-finetuned-clinais-v2/config.json
Model weights saved in bsc-bio-ehr-es-finetuned-clinais-v2/pytorch_model.bin
tokenizer config file saved in bsc-bio-ehr-es-finetuned-clinais-v2/tokenizer_config.json
Special tokens file saved in bsc-bio-ehr-es-finetuned-clinais-v2/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 1.00/476M [00:00<?, ?B/s]

Upload file runs/May03_08-31-20_minion/events.out.tfevents.1683095500.minion.533668.0:   0%|          | 1.00/5…

Upload file runs/May03_08-31-20_minion/events.out.tfevents.1683095994.minion.533668.2:   0%|          | 1.00/3…

To https://huggingface.co/joheras/bsc-bio-ehr-es-finetuned-clinais-v2
   dbc70fc..24459f2  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Masked Language Modeling', 'type': 'fill-mask'}}
To https://huggingface.co/joheras/bsc-bio-ehr-es-finetuned-clinais-v2
   24459f2..7212c7e  main -> main



'https://huggingface.co/joheras/bsc-bio-ehr-es-finetuned-clinais-v2/commit/24459f27f79351431da3fb00e26f5fdb1e0e4a73'

In [31]:
!rm -rf bsc-bio-ehr-es-finetuned-clinais-v2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
