In [None]:
# Adapted HuggingFace example notebook on Github
# https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv('../Thesis/data/complete-clean-preprocessed-data-2010-2020-1.tsv', sep='\t')


X_train, X_test = train_test_split(df.preprocessed_hlead, test_size=0.33, random_state=42)
X_train.to_csv('train_texts_1.txt', sep=' ', index=False)
X_test.to_csv('test_texts_1.txt', sep=' ', index=False)

In [3]:
from datasets import load_dataset

datasets = load_dataset("text", data_files={"train": 'train_texts_1.txt', "validation": 'test_texts_1.txt'})

Using custom data configuration default-0157bbaf91f80aec


Downloading and preparing dataset text/default to /data/michellechan/.cache/huggingface/datasets/text/default-0157bbaf91f80aec/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /data/michellechan/.cache/huggingface/datasets/text/default-0157bbaf91f80aec/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
datasets["train"]

Dataset({
    features: ['text'],
    num_rows: 7399
})

In [5]:
from transformers import RobertaTokenizer

block_size = 128 #256

tokenizer = RobertaTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-base")

def tokenize_function(examples, tokenizer = tokenizer):
    return tokenizer(examples["text"])

def group_texts(examples, block_size = block_size):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [6]:
group_texts

<function __main__.group_texts(examples, block_size=128)>

In [5]:
from transformers import RobertaTokenizer, RobertaForMaskedLM

model = RobertaForMaskedLM.from_pretrained("pdelobelle/robbert-v2-dutch-base")

In [7]:
from transformers import Trainer, TrainingArguments
model_name = 'pdelobelle/robbert-v2-dutch-base'.split("/")[-1]
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-model_1",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_gpu_train_batch_size=8,
    save_steps=1_000,
#     push_to_hub=True,
)

In [8]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
)

In [10]:
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 29386
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 11022


Epoch,Training Loss,Validation Loss
1,2.1031,1.956935
2,1.9983,1.898727
3,1.9592,1.87365


Saving model checkpoint to robbert-v2-dutch-base-finetuned-model\checkpoint-1000
Configuration saved in robbert-v2-dutch-base-finetuned-model\checkpoint-1000\config.json
Model weights saved in robbert-v2-dutch-base-finetuned-model\checkpoint-1000\pytorch_model.bin
Saving model checkpoint to robbert-v2-dutch-base-finetuned-model\checkpoint-2000
Configuration saved in robbert-v2-dutch-base-finetuned-model\checkpoint-2000\config.json
Model weights saved in robbert-v2-dutch-base-finetuned-model\checkpoint-2000\pytorch_model.bin
Saving model checkpoint to robbert-v2-dutch-base-finetuned-model\checkpoint-3000
Configuration saved in robbert-v2-dutch-base-finetuned-model\checkpoint-3000\config.json
Model weights saved in robbert-v2-dutch-base-finetuned-model\checkpoint-3000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 14652
  Batch size = 8
Saving model checkpoint to robbert-v2-dutch-base-finetuned-model\checkpoint-4000
Configuration saved in robbert-v2-dutch-base-finetune

TrainOutput(global_step=11022, training_loss=2.0629372303422273, metrics={'train_runtime': 1277.2859, 'train_samples_per_second': 69.02, 'train_steps_per_second': 8.629, 'total_flos': 5801544182956032.0, 'train_loss': 2.0629372303422273, 'epoch': 3.0})

In [12]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 14652
  Batch size = 8


Perplexity: 6.52


# Demo

In [16]:
?AutoModelForMaskedLM.from_pretrained

[1;31mSignature:[0m [0mAutoModelForMaskedLM[0m[1;33m.[0m[0mfrom_pretrained[0m[1;33m([0m[1;33m*[0m[0mmodel_args[0m[1;33m,[0m [1;33m**[0m[0mkwargs[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Instantiate one of the model classes of the library (with a masked language modeling head) from a pretrained model.

The model class to instantiate is selected based on the :obj:`model_type` property of the config object (either
passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's missing,
by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:

    - **albert** -- :class:`~transformers.AlbertForMaskedLM` (ALBERT model)
    - **bart** -- :class:`~transformers.BartForConditionalGeneration` (BART model)
    - **bert** -- :class:`~transformers.BertForMaskedLM` (BERT model)
    - **big_bird** -- :class:`~transformers.BigBirdForMaskedLM` (BigBird model)
    - **camembert** -- :class:`~transfor

In [4]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import RobertaTokenizer
#import torch

#fine-tuned model
tokenizer = RobertaTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-base")
#trained_model = AutoModelForMaskedLM.from_pretrained(os.path.join(data_dir, "roberta_model_tf_idf_tokens"))
trained_model = AutoModelForMaskedLM.from_pretrained("../Thesis/robbert-v2-dutch-base-finetuned-model/checkpoint-11000")
vanilla_model = AutoModelForMaskedLM.from_pretrained("pdelobelle/robbert-v2-dutch-base")


In [15]:
import torch
#sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
sequence = f"Pfas is aangetroffen in  {tokenizer.mask_token}."

input = tokenizer.encode(sequence, return_tensors="pt")
mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
for model in [vanilla_model, trained_model]:
    token_logits = model(input)[0]
    mask_token_logits = token_logits[0, mask_token_index, :]

    top_5_tokens = torch.topk(mask_token_logits, 10, dim=1).indices[0].tolist()

    for token in top_5_tokens:
        print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
    print()


Pfas is aangetroffen in   Groningen.
Pfas is aangetroffen in   Nederland.
Pfas is aangetroffen in   Amsterdam.
Pfas is aangetroffen in   Leiden.
Pfas is aangetroffen in   Rotterdam.
Pfas is aangetroffen in   Utrecht.
Pfas is aangetroffen in   Canada.
Pfas is aangetroffen in   Zeeland.
Pfas is aangetroffen in   Duitsland.
Pfas is aangetroffen in   Suriname.

Pfas is aangetroffen in   groningen.
Pfas is aangetroffen in   duitsland.
Pfas is aangetroffen in   nederland.
Pfas is aangetroffen in   zwolle.
Pfas is aangetroffen in   rotterdam.
Pfas is aangetroffen in   tuinen.
Pfas is aangetroffen in   2016.
Pfas is aangetroffen in   nijmegen.
Pfas is aangetroffen in   2011.
Pfas is aangetroffen in   amsterdam.

