## Imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

import torch
import torch.nn as nn
from torch.nn.functional import gelu
from torch.nn import CrossEntropyLoss

from datasets import load_dataset, DatasetDict

from tokenizers import ByteLevelBPETokenizer
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

from transformers import (RobertaTokenizer, PreTrainedModel, RobertaConfig, 
                          RobertaForMaskedLM, DataCollatorForLanguageModeling,
                          Trainer, TrainingArguments)

from transformers.modeling_outputs import MaskedLMOutput

## Helper functions

In [None]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render a random sample of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature type
            (a ``datasets.Dataset`` split in this notebook).
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices in a single pass; it replaces the
    # previous rejection loop whose `pick in picks` check was quadratic.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Show the human-readable class name instead of the integer id.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

## Paths

In [None]:
# Directory holding the raw corpus and the cached, tokenized dataset.
data = Path("data/")
!ls {data}

oscar.eo.ds  oscar.eo.txt


In [None]:
# Directory for the tokenizer files and model artifacts (kept as a plain string, not a Path).
model_dir = "models/esperberto"
!ls {model_dir}

config.json  merges.txt  pytorch_model.bin  training_args.bin  vocab.json


## Get data

In [None]:
!wget -c -O data/oscar.eo.txt https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt

--2021-04-16 09:19:52--  https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt
Resolving cdn-datasets.huggingface.co (cdn-datasets.huggingface.co)... 99.84.114.112, 99.84.114.24, 99.84.114.120, ...
Connecting to cdn-datasets.huggingface.co (cdn-datasets.huggingface.co)|99.84.114.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 312733741 (298M) [text/plain]
Saving to: ‘data/oscar.eo.txt’


2021-04-16 09:19:58 (56.3 MB/s) - ‘data/oscar.eo.txt’ saved [312733741/312733741]



In [None]:
!head {data/"oscar.eo.txt"}

Ĉu ... preĝi | mediti | ricevi instigojn || kanti | muziki || informiĝi | legi | studi || prepari Diservon
Temas pri kolekto de kristanaj kantoj, eldonita de Adolf Burkhardt inter 1974 kaj 1990 en dek kajeretoj. Ili estas reeldonitaj inter 1995 kaj 1998 de Bernhard Eichkorn en tri kajeroj, kies tria estas pliampleksigita per Dek Novaj Kantoj kaj suplemento, same de Adolf Burkhardt.
En la dua kaj tria kajero oni adiciis 300 al la originaj kantonumeroj, por ke oni povu pli facile uzi la kajerojn kune kun la KELI-himnaro Adoru Kantante, kiu havas malpli ol 300 numerojn.
Ni ĝojus, se iu trovus bonajn ekzemplerojn de la dek originaj kajeretoj kaj tempon por skani ankaŭ ilin. Bonvolu ekkontaktiĝi kun ni!
Lerni Esperanton per telefono, novaĵoj Poŝtkarto 120 jaroj de fervojo Svitavy-Polička 189… T.n.migranta poŝtkarto el 1908 BK - Kongresa Biblioteko en Vaŝingtono 1- 910 BK - Nederlando- Esperanta elektra tramo en Hago (… La lernolibro "Esperanto per rekta metodo" jam en… IMG 7181 Nova poŝtkar

## Train tokenizer

In [None]:
# Every .txt file found (recursively) under the data directory, as strings.
paths = [str(x) for x in data.glob("**/*.txt")]
paths

['data/oscar.eo.txt']

In [None]:
# Start from an empty (untrained) byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()

In [None]:
# Train on the corpus files with a 52k vocabulary; the five special tokens are
# passed in this fixed order.
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

In [None]:
# Persist the trained tokenizer; returns the list of files written into model_dir.
tokenizer.save_model(model_dir)

['models/esperberto/vocab.json', 'models/esperberto/merges.txt']

In [None]:
# Rebuild the tokenizer from the vocab/merges files saved in model_dir
# (rebinds the `tokenizer` global).
tokenizer = ByteLevelBPETokenizer(
    f"{model_dir}/vocab.json",
    f"{model_dir}/merges.txt",
)

In [None]:
# Attach a post-processor built from the "</s>" and "<s>" tokens and their ids;
# the encoded example in the following cells shows sequences wrapped as <s> ... </s>.
# NOTE(review): this reaches into the private `_tokenizer` attribute.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
# Cap encodings at 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [None]:
# Sanity check: encode one sentence and display the resulting Encoding object.
tokenizer.encode("Mi estas Julien.")

Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [None]:
# Same sentence, showing the token strings (including the added special tokens).
tokenizer.encode("Mi estas Julien.").tokens

['<s>', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '</s>']

## Load data

In [None]:
# Load the first corpus file with the generic `text` loader as a single `train` split.
ds = load_dataset('text', data_files={'train': [paths[0]]})
ds

Using custom data configuration default-31220d7f73477105
Reusing dataset text (/root/.cache/huggingface/datasets/text/default-31220d7f73477105/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 974616
    })
})

In [None]:
# Eyeball a few random raw rows (default: 10) before tokenizing.
show_random_elements(ds["train"])

Unnamed: 0,text
0,"Ĉiu uzanto havas sian propran profilon, en kiu li povas skribi pri si mem, kaj aldoni ligilojn al siaj aliaj kontoj."
1,"Egalmezure kaj obtuze ekmurmuris la tamburoj. Samtakte kun iliaj malrapidaj batoj la junulino, facile paŝante per la nudaj piedoj, alproksimiĝis al Pandiono kaj per fleksa, besta moviĝo kliniĝis antaŭ la statueto de la nekonata diino, etendante antaŭen la manojn en sopira kaj pasia atendo. Ravita Pandiono observis ĉiun geston de Iruma. Nun eĉ ombro de ruza moko ne estis sur la vizaĝo de la junulino — serioza, severa, kun sulkigitaj brovoj, ŝi, ŝajne, aŭskultis voĉojn de sia koro. Laŭ la etenditaj al Pandiono brakoj onde moviĝis streĉiĝintaj muskoloj. Tiuj ondoj dekuradis de la glataj ŝultroj al la fingroj, balanciĝantaj antaŭ la vizaĝo de Pandiono, kvazaŭ ĉiu ero de ŝia korpo strebis al li. La juna heleno neniam vidis ion similan — la mistera vivo de la brakoj estis kuniĝanta kun la verva impeto de la supren levita vizaĝo de la junulino."
2,"Inĝ. Miroslav Hruška (*1989) laboras por la turisma informcentro de la urbo Brno. Li estas membro de la komitato de la Ĉeĥa Esperanto-Asocio, interesiĝas pri publika transporto, fervojo kaj Pollando, kontribuas al la Esperanta Vikipedio. Krom tio li ankaŭ aktivas en la ĉeĥa civitana societo “Movado Brontosaurus”."
3,"La nun recenzata kvara volumo de la vortaro komenciĝas per la kutimaj listoj de la mallongigoj kaj fontoj. Iom konfuze, la mallongigo de la portugala estas nun P, ne Pg kiel en la unua volumo; oni atendus, ke nura P estus rezervita por la pola, kiel unu el la lingvoj de la Fundamento (ĝia mallongigo estas Po!). Enestas ankaŭ bibliografio, kiu pro iu kaŭzo mankis almenaŭ en la unua volumo."
4,"1627 Ferdinando la 2-a proklamis Renovigitan landan establon, permesita nur katolika religio, forpreno de preskaŭ plena politika potenco al la reĝaj urboj kaj firmigo de absoluta potenco de la reganto kaj liaj centraj oficejoj en Vieno"
5,En la jaro 1925 fondis membro de bolŝevika gvidantaro Jemeljan Jaroslavskij (Minej Izrailjeviĉ Gubelman - Мине́й Изра́илевич Губельма́н) tutsovetian „Asocion de ateistoj“ (ekde la jaro 1929 ĝi ekzistis sub nomo „Asocio de batalantaj ateistoj“).
6,"En frazoj, kiuj havas ke-frazon kiel objekton, aperas ofte alia frazparto kun N-finaĵo. Tiam povas ofte ŝajni, ke estas du rektaj objektoj kun malsama rilato al la ĉefverbo. Tio normale estus eraro."
7,"La plej konata turisma loko estas Altaj Tatroj. Tio estas plej alta slovaka montaro, en kiu troveblas multaj raraj specoj de bestoj kaj kreskaĵoj. En Altaj Tatroj estas tri ĉefaj turismaj centroj - Štrbské Pleso, Tatranská Lomnica kaj Starý Smokovec."
8,"6 Kaj ekiris kurieroj kun leteroj de la reĝo kaj de liaj eminentuloj en la tutan landon de Izrael kaj Jehuda, kun jena ordono de la reĝo:Ho idoj de Izrael, revenu al la Eternulo, Dio de Abraham, Isaak, kaj Izrael, kaj tiam Li revenos al la saviĝintoj, kiuj restis ĉe vi de la mano de la reĝoj de Asirio."
9,"nafto estas ""frakcio"" de rafinita petrolo, kiu enhavas ""naftenojn"", t.e. specifa tipo de hidrokarbono. Kiam la petrolo jam de sia natura formo enhavas multege da naftenoj, ni ja povas diri ĝin naftena oleo, sed ne nafto mem. Nafto portas la ideon rafinita."


In [None]:
# Reload the saved vocab/merges as a transformers tokenizer (rebinds `tokenizer`,
# which until here was a `tokenizers` ByteLevelBPETokenizer).
# `model_max_length` is the documented argument; `max_len` is only a legacy alias.
tokenizer = RobertaTokenizer.from_pretrained(model_dir, model_max_length=512)

In [None]:
# ds_enc = ds.map(lambda x: tokenizer(x["text"], truncation=True))

In [None]:
# ds_enc.save_to_disk("data/oscar.eo.ds")

In [None]:
# Load the already-tokenized dataset from disk instead of re-running the
# (commented-out) `ds.map(...)` tokenization and `save_to_disk` cells.
ds_enc = DatasetDict.load_from_disk("data/oscar.eo.ds")
ds_enc

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'text'],
        num_rows: 974616
    })
})

In [None]:
# Round-trip check: decode the first encoded row back to text.
tokenizer.decode(ds_enc["train"][0]["input_ids"])

'<s>Ĉu... preĝi | mediti | ricevi instigojn || kanti | muziki || informiĝi | legi | studi || prepari Diservon</s>'

## Baseline model

In [None]:
# Shared configuration used by every model factory in this notebook.
config = RobertaConfig(
    vocab_size=52_000,  # same value as the tokenizer's training vocab_size
    # NOTE(review): presumably 512 tokens plus the position offset introduced by
    # padding_idx + 1 -- confirm against the RoBERTa position-id scheme.
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [None]:
# Masked-LM collator (mlm=True) configured with a masking probability of 0.15;
# shared by all Trainer instances below.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
def baseline_init():
    """Build a fresh reference ``RobertaForMaskedLM`` from the notebook-level ``config``.

    Passed to ``Trainer(model_init=...)`` in the baseline training cell.
    """
    model = RobertaForMaskedLM(config=config)
    return model

In [None]:
# Baseline run: train the stock RobertaForMaskedLM on a tiny, seeded subsample.
# reference values
# Step	Training Loss	Validation Loss
# 16	10.374800	9.909284
# 32	9.829200	9.645468
# 48	9.572800	9.329382
# 64	9.437400	9.316128

# Fixed seed so every run in this notebook sees the same 512 train / 128 test rows.
sample_ds = ds_enc["train"].train_test_split(train_size=512, test_size=128, seed=42)
bs = 8
# 512 rows // batch size 8 // 4 -> log every 16 steps, i.e. four times per epoch.
logging_steps = sample_ds["train"].num_rows // bs // 4

# NOTE(review): output_dir repeats the literal path stored in `model_dir`.
training_args = TrainingArguments(
    output_dir="models/esperberto",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=bs,
    prediction_loss_only=True,
    evaluation_strategy="steps",
    disable_tqdm=False,
    logging_steps=logging_steps
)

# model_init (rather than model=) makes the Trainer build the model itself.
trainer = Trainer(
    model_init=baseline_init,
    args=training_args,
    data_collator=data_collator,
    train_dataset=sample_ds["train"],
    eval_dataset=sample_ds["test"]
)

# Trailing semicolon suppresses the cell's return-value repr.
trainer.train();

Loading cached split indices for dataset at data/oscar.eo.ds/train/cache-8b96f05ff2fc9096.arrow and data/oscar.eo.ds/train/cache-e43a2b26405c9c65.arrow


Step,Training Loss,Validation Loss
16,10.3748,9.909284
32,9.8292,9.645468
48,9.5728,9.329382
64,9.4374,9.316128


## Custom model

Goal: implement RoBERTa LM from scratch :) Remove as much boilerplate as possible while maintaining compatibility with the trainer.

In [None]:
from transformers.modeling_outputs import BaseModelOutput
import math
import torch.nn.functional as F


In [None]:
class MyRobertaPreTrainedModel(PreTrainedModel):
    """
    Common base class for the hand-written RoBERTa models in this notebook.

    Binds the models to ``RobertaConfig`` and defines ``_init_weights``, the
    per-module weight initialisation routine.
    """

    config_class = RobertaConfig
    base_model_prefix = "roberta"

    # Mirrors transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
    def _init_weights(self, module):
        """Initialise ``module`` in place; module types not listed here are left untouched."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Plain normal init; the TF version uses truncated_normal instead,
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    module.bias.data.zero_()
            elif module.padding_idx is not None:
                # The padding row is reset to exactly zero after the random init.
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

In [None]:
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Build position ids that skip padding. Non-padding tokens are numbered from
    ``padding_idx + 1`` (plus ``past_key_values_length``) along dim 1; padding
    tokens keep ``padding_idx``. Modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: torch.Tensor of token ids; positions are counted along dim 1.
        padding_idx: id of the padding token.
        past_key_values_length: offset added to every non-padding position.

    Returns: torch.Tensor (long) with the same shape as ``input_ids``.
    """
    # The casts below are deliberately kept as-is: they are balanced to work with both ONNX export and XLA.
    not_padding = input_ids.ne(padding_idx).int()
    running_count = torch.cumsum(not_padding, dim=1).type_as(not_padding)
    # Multiplying by the mask sends padded slots back to 0 before the final shift.
    positions = (running_count + past_key_values_length) * not_padding
    return positions.long() + padding_idx

In [None]:
class MyRobertaEmbeddings(nn.Module):
    """
    Sum of word, token-type and position embeddings, followed by LayerNorm and dropout.

    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing:
    position ids are derived from the non-padding tokens of ``input_ids``.

    LT: For some reason, removing the token_type_embeddings produces small numerical differences in the losses - safe to remove?
    """

    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # NOTE(review): this first position table is replaced a few lines below. It is
        # kept on purpose: presumably every randomly initialised module consumes RNG
        # state, so dropping it (like dropping token_type_embeddings, see the LT note)
        # would shift the initial weights away from the reference run -- confirm.
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        # End copy
        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        """Embed a batch of tokens.

        Args:
            input_ids: Token ids. Required unless ``inputs_embeds`` and
                ``position_ids`` are both given.
            token_type_ids: Optional segment ids; defaults to all zeros.
            position_ids: Optional explicit position ids; by default they are
                derived from ``input_ids`` so that padded tokens remain padded.
            inputs_embeds: Optional precomputed word embeddings used instead of
                looking up ``input_ids``.
            past_key_values_length: Offset added to the derived position ids.

        Returns:
            Embedding tensor with a trailing ``hidden_size`` dimension.

        Raises:
            ValueError: If neither ``input_ids`` nor ``inputs_embeds`` is given, or
                if position ids would have to be derived without ``input_ids``.
        """
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if position_ids is None:
            if input_ids is None:
                raise ValueError("position_ids must be provided when input_ids is None")
            # Create the position ids from the input token ids. Any padded tokens remain padded.
            position_ids = create_position_ids_from_input_ids(
                input_ids, self.padding_idx, past_key_values_length
            ).to(input_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        if token_type_ids is None:
            # Default segment ids live on the same device as the inputs themselves.
            token_type_ids = torch.zeros(
                inputs_embeds.size()[:-1], dtype=torch.long, device=inputs_embeds.device
            )
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        # The sum allocates a new tensor, so the in-place add below never touches the caller's inputs_embeds.
        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

In [None]:
class MyRobertaOutput(nn.Module):
    """Second half of the feed-forward block: project back to hidden_size, dropout, residual add, LayerNorm."""

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
        projected = self.dropout(self.dense(hidden_states))
        # input_tensor is the residual branch.
        return self.LayerNorm(projected + input_tensor)

In [None]:
# Lookup from the config's `hidden_act` string to the activation callable.
# Only "gelu" is registered in this stripped-down implementation.
ACT2FN = dict(gelu=F.gelu)

In [None]:
class MyRobertaIntermediate(nn.Module):
    """First half of the feed-forward block: hidden_size -> intermediate_size, then the configured activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # Resolved once at construction; a `hidden_act` missing from ACT2FN raises KeyError here.
        self.intermediate_act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states):
        """Apply the linear expansion followed by the activation."""
        return self.intermediate_act_fn(self.dense(hidden_states))

In [None]:
class MyRobertaSelfOutput(nn.Module):
    """Post-attention projection: dense (hidden -> hidden), dropout, residual add, LayerNorm."""

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
        projected = self.dropout(self.dense(hidden_states))
        # input_tensor is the residual branch.
        return self.LayerNorm(projected + input_tensor)

In [None]:
class MyRobertaSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention (no cross-attention, no cache)."""

    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            # Fail early with a clear message instead of a shape error inside forward().
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dim into heads: (..., seq, all_head) -> (batch, heads, seq, head_size)."""
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
    ):
        """Run self-attention over ``hidden_states``.

        Args:
            hidden_states: Tensor of shape (batch, seq, hidden_size).
            attention_mask: Optional additive mask broadcastable to the
                (batch, heads, seq, seq) score tensor. ``None`` means every
                position may be attended to.

        Returns:
            A 1-tuple ``(context_layer,)`` with the same shape as ``hidden_states``.
        """
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the additive attention mask (precomputed once for all layers by the model's forward()).
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)

        # Merge the heads back: (batch, heads, seq, head_size) -> (batch, seq, all_head_size).
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        return (context_layer,)

In [None]:
class MyRobertaAttention(nn.Module):
    """Self-attention followed by its output projection with residual connection."""

    def __init__(self, config):
        super().__init__()
        self.self = MyRobertaSelfAttention(config)
        self.output = MyRobertaSelfOutput(config)

    def forward(self, hidden_states, attention_mask=None):
        """Return a 1-tuple holding the attention block's output."""
        context = self.self(hidden_states, attention_mask)[0]
        # hidden_states doubles as the residual input of the output projection.
        return (self.output(context, hidden_states),)

In [None]:
class MyRobertaLayer(nn.Module):
    """One transformer layer: the attention block followed by the feed-forward block."""

    def __init__(self, config):
        super().__init__()
        self.attention = MyRobertaAttention(config)
        self.intermediate = MyRobertaIntermediate(config)
        self.output = MyRobertaOutput(config)

    def forward(self, hidden_states, attention_mask=None):
        """Return a 1-tuple holding the layer output (attention weights are not returned)."""
        attended = self.attention(hidden_states, attention_mask)[0]
        expanded = self.intermediate(attended)
        # The attention output is the residual input of the feed-forward projection.
        return (self.output(expanded, attended),)

In [None]:
class MyRobertaEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` layers applied one after another."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([MyRobertaLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask=None):
        """Feed ``hidden_states`` through every layer; the same mask is passed to each one."""
        for block in self.layer:
            hidden_states = block(hidden_states, attention_mask)[0]
        return BaseModelOutput(last_hidden_state=hidden_states)

In [None]:
class MyRobertaModel(MyRobertaPreTrainedModel):
    """Bare encoder: embeddings followed by the layer stack, without any task head."""

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.embeddings = MyRobertaEmbeddings(config)
        self.encoder = MyRobertaEncoder(config)
        self.init_weights()

    def get_input_embeddings(self):
        """Return the word-embedding table.

        Needed so that a wrapping head model can tie its output projection to
        these weights: PreTrainedModel's default accessor raises
        NotImplementedError on a base model that does not override it.
        """
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word-embedding table."""
        self.embeddings.word_embeddings = value

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
    ):
        """Encode a batch of token ids.

        Args:
            input_ids: Token ids of shape (batch, seq).
            attention_mask: Optional mask over ``input_ids``; when omitted,
                every position is attended to.

        Returns:
            ``BaseModelOutput`` whose ``last_hidden_state`` is the final layer output.
        """
        input_shape = input_ids.size()
        device = input_ids.device

        if attention_mask is None:
            # Without this default, get_extended_attention_mask would be handed None.
            attention_mask = torch.ones(input_shape, device=device)
        token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            token_type_ids=token_type_ids
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
        )
        sequence_output = encoder_outputs.last_hidden_state

        return BaseModelOutput(
            last_hidden_state=sequence_output,
        )

In [None]:
class MyRobertaLMHead(nn.Module):
    """Masked-LM head: dense -> GELU -> LayerNorm -> projection onto the vocabulary."""

    def __init__(self, config):
        super().__init__()
        hidden_size, vocab_size = config.hidden_size, config.vocab_size
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab_size))

        # Share one Parameter between `self.bias` and `self.decoder.bias` so that the bias is
        # correctly resized with `resize_token_embeddings`.
        self.decoder.bias = self.bias

    def forward(self, features, **kwargs):
        """Map hidden states to vocabulary logits; extra keyword arguments are ignored."""
        transformed = self.layer_norm(F.gelu(self.dense(features)))
        # Project back to the size of the vocabulary (the shared bias is applied by the decoder).
        return self.decoder(transformed)

In [None]:
class MyRobertaForMaskedLM(MyRobertaPreTrainedModel):
    """Hand-written RoBERTa encoder with a masked-language-modelling head."""

    def __init__(self, config):
        super().__init__(config)
        self.roberta = MyRobertaModel(config)
        self.lm_head = MyRobertaLMHead(config)
        self.init_weights()

    def get_output_embeddings(self):
        """Return the vocabulary projection layer of the LM head."""
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the vocabulary projection layer of the LM head."""
        self.lm_head.decoder = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
    ):
        """Compute vocabulary logits and, when ``labels`` is given, the masked-LM loss.

        Args:
            input_ids: Token ids of shape (batch, seq).
            attention_mask: Optional mask forwarded to the encoder.
            labels: Optional target ids with the same shape as ``input_ids``.
                Positions set to -100 are skipped by ``CrossEntropyLoss``
                (its default ``ignore_index``).

        Returns:
            ``MaskedLMOutput`` with ``logits`` and ``loss`` (``None`` without labels).
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            # Only compute the loss when targets exist, so label-free inference calls work.
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(
                prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [None]:
def model_init():
    """Build a fresh hand-written ``MyRobertaForMaskedLM`` from the notebook-level ``config``.

    Passed to ``Trainer(model_init=...)`` in the custom-model training cell.
    """
    model = MyRobertaForMaskedLM(config=config)
    return model

In [None]:
# reference values

# Step	Training Loss	Validation Loss
# 16	10.272600	9.798569
# 32	9.720300	9.541902
# 48	9.513300	9.259138
# 64	9.403300	9.244128

In [None]:
# Custom-model run: same data split and arguments as the baseline cell, only the
# model factory differs.
sample_ds = ds_enc["train"].train_test_split(train_size=512, test_size=128, seed=42)
bs = 8
# 512 rows // batch size 8 // 4 -> log every 16 steps, i.e. four times per epoch.
logging_steps = sample_ds["train"].num_rows // bs // 4

training_args = TrainingArguments(
    output_dir="models/esperberto",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=bs,
    prediction_loss_only=True,
    evaluation_strategy="steps",
    disable_tqdm=False,
    logging_steps=logging_steps
)

# NOTE(review): the saved output of this cell is a NameError for `model_init`,
# i.e. it was last executed before the cell defining `model_init` had been run.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    train_dataset=sample_ds["train"],
    eval_dataset=sample_ds["test"]
)

# Trailing semicolon suppresses the cell's return-value repr.
trainer.train();

Loading cached split indices for dataset at data/oscar.eo.ds/train/cache-8b96f05ff2fc9096.arrow and data/oscar.eo.ds/train/cache-e43a2b26405c9c65.arrow


NameError: name 'model_init' is not defined

In [None]:
from transformers import pipeline

# reference outputs
# [{'sequence': 'La suno la.',
#   'score': 0.032828059047460556,
#   'token': 264,
#   'token_str': ' la'},
#  {'sequence': 'La suno,.',
#   'score': 0.02230573445558548,
#   'token': 16,
#   'token_str': ','},
#  {'sequence': 'La suno..',
#   'score': 0.011032713577151299,
#   'token': 18,
#   'token_str': '.'},
#  {'sequence': 'La suno de.',
#   'score': 0.0063017127104103565,
#   'token': 274,
#   'token_str': ' de'},
#  {'sequence': 'La suno kaj.',
#   'score': 0.0009269547881558537,
#   'token': 288,
#   'token_str': ' kaj'}]

# Build a fill-mask pipeline from the files stored under models/esperberto.
# Model and tokenizer are loaded from that directory on disk, not from the
# in-memory `trainer` object.
fill_mask = pipeline(
    "fill-mask",
    model="models/esperberto",
    tokenizer="models/esperberto"
)

# The sun <mask>.
# =>

# Ask the model to fill in the masked token and display the candidates.
result = fill_mask("La suno <mask>.")
result

[{'sequence': 'La suno la.',
  'score': 0.032828059047460556,
  'token': 264,
  'token_str': ' la'},
 {'sequence': 'La suno,.',
  'score': 0.02230573445558548,
  'token': 16,
  'token_str': ','},
 {'sequence': 'La suno..',
  'score': 0.011032713577151299,
  'token': 18,
  'token_str': '.'},
 {'sequence': 'La suno de.',
  'score': 0.0063017127104103565,
  'token': 274,
  'token_str': ' de'},
 {'sequence': 'La suno kaj.',
  'score': 0.0009269547881558537,
  'token': 288,
  'token_str': ' kaj'}]

## Custom model without bugs :)

In [None]:
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaModel, RobertaLMHead

class MyRobertaForMaskedLM(MyRobertaPreTrainedModel):
    """Library ``RobertaModel`` encoder combined with the hand-written ``MyRobertaLMHead``.

    NOTE: this re-uses the name of the earlier, fully hand-written class and
    shadows it once this cell has been executed.
    """
#     _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.bias"]
#     _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            # No module-level `logger` exists in this notebook, so obtain one locally.
            import logging

            logging.getLogger(__name__).warning(
                "If you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.lm_head = MyRobertaLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        """Return the vocabulary projection layer of the LM head."""
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        """Replace the vocabulary projection layer of the LM head."""
        self.lm_head.decoder = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Run the encoder and the LM head; all arguments except ``labels`` are
        forwarded unchanged to ``RobertaModel``.

        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
        return_dict (:obj:`bool`, `optional`):
            When falsy, a plain tuple is returned instead of a ``MaskedLMOutput``; defaults to
            ``config.use_return_dict``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            # Tuple form: (loss?, logits, *everything after the first two encoder outputs).
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [None]:
def roberta_init():
    """Build a fresh ``MyRobertaForMaskedLM`` (as currently defined) from the notebook-level ``config``.

    Passed to ``Trainer(model_init=...)`` in the final training cell.
    """
    model = MyRobertaForMaskedLM(config=config)
    return model

In [None]:
# Final run: library encoder + custom LM head, compared against the baseline numbers below.
# baseline reference values
# Step	Training Loss	Validation Loss
# 16	10.374800	9.909284
# 32	9.829200	9.645468
# 48	9.572800	9.329382
# 64	9.437400	9.316128

# Same seeded 512 / 128 split as the earlier training cells.
sample_ds = ds_enc["train"].train_test_split(train_size=512, test_size=128, seed=42)
bs = 8
# 512 rows // batch size 8 // 4 -> log every 16 steps, i.e. four times per epoch.
logging_steps = sample_ds["train"].num_rows // bs // 4

training_args = TrainingArguments(
    output_dir="models/esperberto",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=bs,
    prediction_loss_only=True,
    evaluation_strategy="steps",
    disable_tqdm=False,
    logging_steps=logging_steps
)

trainer = Trainer(
    model_init=roberta_init,
    args=training_args,
    data_collator=data_collator,
    train_dataset=sample_ds["train"],
    eval_dataset=sample_ds["test"]
)

# Trailing semicolon suppresses the cell's return-value repr.
trainer.train();

Loading cached split indices for dataset at data/oscar.eo.ds/train/cache-8b96f05ff2fc9096.arrow and data/oscar.eo.ds/train/cache-e43a2b26405c9c65.arrow


Step,Training Loss,Validation Loss
16,10.2898,9.839578


KeyboardInterrupt: 