In [None]:
with open('../input/articles/2015_articles.txt', 'r') as f:
    articles = f.readlines()
    print(f'Found {len(articles)} articles')

In [None]:
articles[0]

In [None]:
len(articles[0].split())

# Training the MLM

In [1]:
!pip install transformers datasets evaluate



In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

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

In [2]:
!huggingface-cli whoami

imene-kolli


In [3]:
%env PYTORCH_ENABLE_MPS_FALLBACK= 1

env: PYTORCH_ENABLE_MPS_FALLBACK=1


In [4]:
from datasets import load_dataset

In [5]:
eli5 = load_dataset("eli5", split="train_asks[:5000]")

Found cached dataset eli5 (/Users/imenekolli/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa)


In [6]:
eli5 = eli5.train_test_split(test_size=0.2)

In [7]:
eli5["train"][0]

{'q_id': '6v2om3',
 'title': 'Please clear things up for me about the Great Barrier Reef. How much is gone, how much is bleached, can it be saved, what are the long term effects?',
 'selftext': '',
 'document': '',
 'subreddit': 'askscience',
 'answers': {'a_id': ['dlxlj4h', 'dlyb289', 'dlyjz9a'],
  'text': ['*Apologies in advance for a lack of scientific sourcing. If you would like anything sourced better, let me know.*\n\nCoral "bleaching" is the algae having left the "coral". That is, the coral itself is literally the skeleton, so being "bleached" is literally  to be dead.\n\n_URL_2_\n\nBut, since the "coral" is [mostly] just the skeleton, new algae can be reintroduced to the skeleton. However, if the cause is not mitigated, then recolonization of the coral is very unlikely. Whatever caused it to bleach, would prevent it from coming back. In this case, the main causes seem to be particulate pollution (eg sediments), increased temperature, increased acidity and increased nitrogen.\n\

### Preprocess

In [8]:
from transformers import AutoTokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

In [10]:
eli5 = eli5.flatten()
eli5["train"][0]

{'q_id': '6v2om3',
 'title': 'Please clear things up for me about the Great Barrier Reef. How much is gone, how much is bleached, can it be saved, what are the long term effects?',
 'selftext': '',
 'document': '',
 'subreddit': 'askscience',
 'answers.a_id': ['dlxlj4h', 'dlyb289', 'dlyjz9a'],
 'answers.text': ['*Apologies in advance for a lack of scientific sourcing. If you would like anything sourced better, let me know.*\n\nCoral "bleaching" is the algae having left the "coral". That is, the coral itself is literally the skeleton, so being "bleached" is literally  to be dead.\n\n_URL_2_\n\nBut, since the "coral" is [mostly] just the skeleton, new algae can be reintroduced to the skeleton. However, if the cause is not mitigated, then recolonization of the coral is very unlikely. Whatever caused it to bleach, would prevent it from coming back. In this case, the main causes seem to be particulate pollution (eg sediments), increased temperature, increased acidity and increased nitrogen.

In [11]:
max_sequence_length = 512  # Maximum sequence length allowed by the model

def preprocess_function(examples):
    # Split the input sequences into smaller parts if they exceed the maximum length
    texts = [" ".join(x) for x in examples["answers.text"]]
    tokenized_examples = tokenizer(texts, truncation=True, max_length=max_sequence_length)
    return tokenized_examples

In [12]:
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
block_size = 128


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [14]:
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [16]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [17]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
from transformers import TrainingArguments
from transformers import Trainer
import torch

In [19]:
training_args = TrainingArguments(
    output_dir="my_awesome_eli5_mlm_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

  0%|          | 0/2817 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask


{'loss': 0.35, 'learning_rate': 1.6450124245651406e-05, 'epoch': 0.53}


  0%|          | 0/226 [00:00<?, ?it/s]

{'eval_loss': 0.30741310119628906, 'eval_runtime': 25.8692, 'eval_samples_per_second': 69.851, 'eval_steps_per_second': 8.736, 'epoch': 1.0}
{'loss': 0.3317, 'learning_rate': 1.2900248491302805e-05, 'epoch': 1.06}
{'loss': 0.3279, 'learning_rate': 9.350372736954207e-06, 'epoch': 1.6}


  0%|          | 0/226 [00:00<?, ?it/s]

{'eval_loss': 0.30345389246940613, 'eval_runtime': 25.7735, 'eval_samples_per_second': 70.111, 'eval_steps_per_second': 8.769, 'epoch': 2.0}
{'loss': 0.3209, 'learning_rate': 5.800496982605609e-06, 'epoch': 2.13}
{'loss': 0.3167, 'learning_rate': 2.250621228257011e-06, 'epoch': 2.66}


  0%|          | 0/226 [00:00<?, ?it/s]

{'eval_loss': 0.29302874207496643, 'eval_runtime': 141.8687, 'eval_samples_per_second': 12.737, 'eval_steps_per_second': 1.593, 'epoch': 3.0}
{'train_runtime': 3513.6486, 'train_samples_per_second': 6.414, 'train_steps_per_second': 0.802, 'train_loss': 0.32798996757098364, 'epoch': 3.0}


TrainOutput(global_step=2817, training_loss=0.32798996757098364, metrics={'train_runtime': 3513.6486, 'train_samples_per_second': 6.414, 'train_steps_per_second': 0.802, 'train_loss': 0.32798996757098364, 'epoch': 3.0})

In [20]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/226 [00:00<?, ?it/s]

Perplexity: 1.35


In [26]:
trainer.push_to_hub()

'https://huggingface.co/imene-kolli/my_awesome_eli5_mlm_model/tree/main/'

In [31]:
tokenizer.push_to_hub('imene-kolli/my_awesome_eli5_mlm_model')

CommitInfo(commit_url='https://huggingface.co/imene-kolli/my_awesome_eli5_mlm_model/commit/814a95c64b632d44c7d1f86f768b6760619da3b9', commit_message='Upload tokenizer', commit_description='', oid='814a95c64b632d44c7d1f86f768b6760619da3b9', pr_url=None, pr_revision=None, pr_num=None)

### Inference

In [22]:
text = "The Milky Way is a <mask> galaxy."

In [23]:
from transformers import pipeline

mask_filler = pipeline("fill-mask", "stevhliu/my_awesome_eli5_mlm_model")
mask_filler(text, top_k=3)

Downloading (…)lve/main/config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/386 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

[{'score': 0.5150937438011169,
  'token': 21300,
  'token_str': ' spiral',
  'sequence': 'The Milky Way is a spiral galaxy.'},
 {'score': 0.07087340205907822,
  'token': 2232,
  'token_str': ' massive',
  'sequence': 'The Milky Way is a massive galaxy.'},
 {'score': 0.06434684991836548,
  'token': 650,
  'token_str': ' small',
  'sequence': 'The Milky Way is a small galaxy.'}]

In [27]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_eli5_mlm_model")
inputs = tokenizer(text, return_tensors="pt")
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

In [28]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained("stevhliu/my_awesome_eli5_mlm_model")
logits = model(**inputs).logits
mask_token_logits = logits[0, mask_token_index, :]

In [29]:
top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist()

for token in top_3_tokens:
    print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))

The Milky Way is a  spiral galaxy.
The Milky Way is a  massive galaxy.
The Milky Way is a  small galaxy.
