# Loading data

In [1]:
import json

In [2]:
with open('clinais.train.json') as f:
    data = json.load(f)

In [3]:
from tqdm import tqdm

In [4]:
finalresult = []
for key in tqdm(data['annotated_entries'].keys()):
    ident = data['annotated_entries'][key]['note_id']
    res = data['annotated_entries'][key]['note_text']
    finalresult.append([ident,res])

# finalresult    

100%|██████████| 781/781 [00:00<00:00, 200954.02it/s]


In [5]:
with open('clinais.dev.json') as f:
    data = json.load(f)

In [6]:
finalresultdev = []
for key in tqdm(data['annotated_entries'].keys()):
    ident = data['annotated_entries'][key]['note_id']
    res = data['annotated_entries'][key]['note_text']
    finalresultdev.append([ident,res])

100%|██████████| 127/127 [00:00<00:00, 472650.05it/s]


In [7]:
from datasets import Dataset,DatasetDict

In [8]:
import pandas as pd
df = pd.DataFrame(data=finalresult,columns=['id','text'])
dataset_train = Dataset.from_pandas(df)

In [9]:
df = pd.DataFrame(data=finalresultdev,columns=['id','text'])
dataset_val = Dataset.from_pandas(df)

In [10]:
dataset = DatasetDict(train=dataset_train,val=dataset_val)

# Processing dataset

In [11]:
from transformers import AutoTokenizer

In [12]:
modelCheckpoint = "dccuchile/distilbert-base-spanish-uncased"
tokenizer = AutoTokenizer.from_pretrained(modelCheckpoint)

In [13]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


tokenized_dataset = dataset.map(tokenize_function, batched=True,remove_columns=["id","text"])

  0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (826 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
chunk_size = 128

In [15]:
tokenized_samples = tokenized_dataset["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 826'
'>>> Review 1 length: 550'
'>>> Review 2 length: 640'


In [16]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 781
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 127
    })
})

In [17]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 2016'


In [18]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [19]:
lm_datasets = tokenized_dataset.map(group_texts, batched=True)
lm_datasets

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 3267
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 521
    })
})

In [20]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [21]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> [CLS] en mayo de 1997, una mujer de 29 años de edad fue interven [MASK], en otro centro, de un carcino [MASK] [MASK] la glándula suprarrenal izquierda clínicamente no funcionan [MASK] que se manifestó clínicamente como molestias en el flanco izquierdo, poco específicas, en [MASK] postparto inmediato ; la ecografía y la [MASK]grafía axial bodega [MASK] [MASK] abdominales mostraron una masa suprarrenal izquierda de 10 protestantets., sólida y con áreas de calcificación y necros [MASK] en su interior, siendo la radiografía de tó [MASK] [MASK] la gammagrafía ósea normales. en los análisis, presentaba ligero aumento'

'>>> de la cortisolu [MASK] ( 284. 5 mcgr [MASK] / 24h. ) y vegetación 1 [MASK] - oh - esteroides [MASK] [MASK] ( 12. 7 mcg. / 24h. ), sin síntomas de hipercortisolismo sistémico. se realizó [MASK] [MASK] completa de [MASK] tumoración [MASK] con el diagnóstico [MASK]ológico de carcinoma supra [MASK] [MASK]al de [MASK] x [MASK] x 5 cmts. ( [MASK] grs. ) bien encapsulado, 

In [22]:
from transformers import TrainingArguments


batch_size = 8
# Show the training loss with every epoch
logging_steps = len(lm_datasets["train"]) // batch_size
model_name = modelCheckpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-clinais",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

In [23]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained(modelCheckpoint)

In [24]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Cloning https://huggingface.co/joheras/distilbert-base-spanish-uncased-finetuned-clinais into local empty directory.
Using cuda_amp half precision backend


In [25]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 521
  Batch size = 16


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjoheras[0m. Use [1m`wandb login --relogin`[0m to force relogin


>>> Perplexity: 221.96


In [26]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3267
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1025
  Number of trainable parameters = 66961434


Epoch,Training Loss,Validation Loss
1,No log,3.387962
2,3.574500,3.118402
3,3.574500,2.993953
4,3.072100,2.967062
5,3.072100,2.909892


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 521
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 521
  Batch size = 16
Saving model checkpoint to distilbert-base-spanish-uncased-finetuned-clinais/checkpoint-500
Configuration saved in distilbert-base-spanish-uncased-finetuned-clinais/checkpoint-500/config.json
Model weights saved in distilbert-base-spanish-uncased-finetuned-clinais/checkpoint-500/pytorch_model.bin
tokenizer config file saved in distilbert-base-spanish-uncased

TrainOutput(global_step=1025, training_loss=3.2524210413490855, metrics={'train_runtime': 155.8893, 'train_samples_per_second': 104.786, 'train_steps_per_second': 6.575, 'total_flos': 541352642941440.0, 'train_loss': 3.2524210413490855, 'epoch': 5.0})

In [27]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 521
  Batch size = 16


>>> Perplexity: 17.99


In [28]:
trainer.push_to_hub()

Saving model checkpoint to distilbert-base-spanish-uncased-finetuned-clinais
Configuration saved in distilbert-base-spanish-uncased-finetuned-clinais/config.json
Model weights saved in distilbert-base-spanish-uncased-finetuned-clinais/pytorch_model.bin
tokenizer config file saved in distilbert-base-spanish-uncased-finetuned-clinais/tokenizer_config.json
Special tokens file saved in distilbert-base-spanish-uncased-finetuned-clinais/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 32.0k/257M [00:00<?, ?B/s]

Upload file runs/Mar16_07-45-17_minion/events.out.tfevents.1678949131.minion.4087395.0: 100%|##########| 5.99k…

Upload file runs/Mar16_07-45-17_minion/events.out.tfevents.1678949296.minion.4087395.2: 100%|##########| 311/3…

To https://huggingface.co/joheras/distilbert-base-spanish-uncased-finetuned-clinais
   622e728..905beb9  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Masked Language Modeling', 'type': 'fill-mask'}}
To https://huggingface.co/joheras/distilbert-base-spanish-uncased-finetuned-clinais
   905beb9..7172067  main -> main



'https://huggingface.co/joheras/distilbert-base-spanish-uncased-finetuned-clinais/commit/905beb95c7bc81a9ce382a33b8520d87b05ad529'