In [73]:
from transformers import DataCollatorForLanguageModeling
from datasets import *
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import create_optimizer, AdamWeightDecay
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf
from transformers import TFAutoModelForMaskedLM
import datetime



## Loading

In [50]:
# Load the dataset stored in Arrow format in this local directory.
raw_dataset_dir = "AR-dotted-2MediumPlus-arrow"
dataset = load_from_disk(raw_dataset_dir)


In [51]:
# Upload the raw dataset to the Hub under this repository id.
raw_dataset_repo = "AR-dotted-2MediumPlus-arrow"
dataset.push_to_hub(raw_dataset_repo)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4447 [00:00<?, ?ba/s]

In [52]:
# Load the tokenizer identified by this name.
# NOTE(review): could be a local directory or a Hub id — cannot tell from here.
tokenizer_source = "AR-dotted-tokenizer"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)


In [53]:
# Keep only the 'train' split of the loaded dataset.
# This cell rebinds `dataset` from the multi-split container to a single
# split, so running it a second time used to fail (a single split has no
# 'train' entry). The isinstance guard makes the cell safe to re-run.
# `DatasetDict` is in scope via the notebook's `from datasets import *`.
if isinstance(dataset, DatasetDict):
    dataset = dataset['train']

In [54]:
# Hold out 20% of the rows as a test split.
# A fixed seed makes the shuffle (and therefore the split) identical on every
# run, so tokenized/pushed artifacts and evaluation numbers are reproducible.
dataset_tt = dataset.train_test_split(test_size=0.2, seed=42)


In [55]:
# Display the split summary (split names, features, row counts).
dataset_tt

DatasetDict({
    train: Dataset({
        features: ['clean'],
        num_rows: 3557064
    })
    test: Dataset({
        features: ['clean'],
        num_rows: 889266
    })
})

## Mapping / Processing


In [56]:
def preprocess_function(examples):
    """Tokenize the 'clean' column of a batch.

    Args:
        examples: Batch mapping that exposes a 'clean' entry holding the
            texts to tokenize.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that list of texts.
    """
    texts = list(examples['clean'])
    return tokenizer(texts)


In [57]:
# Tokenize both splits in batches across 4 worker processes, dropping the
# original columns so only the tokenizer output remains.
source_columns = dataset_tt["train"].column_names
tokenized_dataset = dataset_tt.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=source_columns,
)


Map (num_proc=4):   0%|          | 0/3557064 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/889266 [00:00<?, ? examples/s]

In [58]:
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized sequences and re-split it into blocks.

    Args:
        examples: Mapping from feature name to a list of per-example lists.
            The total length is measured on the first key, so all features
            are expected to have the same concatenated length.

    Returns:
        dict with the same keys, each mapped to a list of consecutive chunks
        of ``block_size`` items. When the concatenated length is at least
        ``block_size`` the trailing remainder is dropped; when it is shorter,
        a single chunk holding everything is returned (nothing is dropped).
    """
    # Imported locally so the function stays self-contained.
    from itertools import chain

    # Flatten every feature in linear time. The previous `sum(lists, [])`
    # copied the growing accumulator once per example (quadratic in batch size).
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported
    # it instead of this drop, you can customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [59]:
# Regroup the tokenized examples into fixed-size blocks, batch by batch,
# using 4 worker processes.
lm_dataset = tokenized_dataset.map(
    group_texts,
    batched=True,
    num_proc=4,
)


Map (num_proc=4):   0%|          | 0/3557064 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/889266 [00:00<?, ? examples/s]

In [60]:
# Persist the block-grouped dataset locally.
lm_dataset_dir = "AR-dotted-tokenized-mediumPlus-arrow"
lm_dataset.save_to_disk(lm_dataset_dir)

Saving the dataset (0/1 shards):   0%|          | 0/334273 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/83558 [00:00<?, ? examples/s]

In [61]:
# Upload the block-grouped dataset to the Hub under this repository id.
lm_dataset_repo = "dot-ammar/AR-dotted-tokenized-mediumPlus"
lm_dataset.push_to_hub(lm_dataset_repo)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/335 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [62]:
# Collator for masked-language-modeling batches: masking probability 0.15,
# returning TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    return_tensors="tf",
)


## Training

In [63]:
# Optimizer used for fine-tuning: learning rate 2e-5, weight-decay rate 0.01.
optimizer = AdamWeightDecay(
    learning_rate=2e-5,
    weight_decay_rate=0.01,
)


In [64]:
# Load the pretrained masked-LM checkpoint to fine-tune.
# NOTE(review): the output recorded under this cell mentions
# TFMobileBertForMaskedLM / google/mobilebert-uncased, which does not match
# this checkpoint — the saved output looks stale; re-run to refresh it.
# NOTE(review): the tokenizer comes from "AR-dotted-tokenizer", not from this
# checkpoint — confirm its vocabulary size matches the model's embeddings.
model_checkpoint = "distilbert-base-multilingual-cased"
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)


All model checkpoint layers were used when initializing TFMobileBertForMaskedLM.

All the layers of TFMobileBertForMaskedLM were initialized from the model checkpoint at google/mobilebert-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMobileBertForMaskedLM for predictions without further training.


In [65]:
# Both splits share the batch size and collator; only shuffling differs
# (train is shuffled, test is not).
shared_loader_kwargs = dict(batch_size=16, collate_fn=data_collator)

tf_train_set = model.prepare_tf_dataset(
    lm_dataset["train"], shuffle=True, **shared_loader_kwargs
)

tf_test_set = model.prepare_tf_dataset(
    lm_dataset["test"], shuffle=False, **shared_loader_kwargs
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [66]:
# Compile with the optimizer only; no `loss` is passed on purpose.
# NOTE(review): presumably this relies on the model's built-in task loss —
# confirm against the Transformers Keras documentation.
model.compile(optimizer=optimizer)  # No loss argument!


In [67]:
# Callback that pushes to the Hub during training; `output_dir` is the local
# working directory (the recorded output shows it is a clone of the Hub repo).
hub_output_dir = "dotless_mask_model-small"
callback = PushToHubCallback(output_dir=hub_output_dir, tokenizer=tokenizer)

/Users/ammar/Developer/git-repos/dotless/Models/Model v2/dotless_mask_model-small is already a clone of https://huggingface.co/dot-ammar/dotless_mask_model-small. Make sure you pull the latest changes with `repo.git_pull()`.


In [69]:
# Load the TensorBoard notebook extension (enables the %tensorboard magic).
%load_ext tensorboard


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [71]:
# Remove TensorBoard logs left over from earlier runs.
# Done in pure Python rather than a bare `rm -rf`, which only works through
# IPython automagic on systems that provide `rm`. `ignore_errors=True`
# mirrors `-f`: a missing directory is not an error.
import shutil

shutil.rmtree("./logs/", ignore_errors=True)

In [75]:
# Each run logs into its own timestamped sub-directory of logs/fit/.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f"logs/fit/{run_stamp}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)


In [76]:
# Train for 3 epochs, validating on the test split; the callbacks push to the
# Hub and write TensorBoard logs.
training_callbacks = [callback, tensorboard_callback]
model.fit(
    x=tf_train_set,
    validation_data=tf_test_set,
    epochs=3,
    callbacks=training_callbacks,
)


Epoch 1/3
   30/20892 [..............................] - ETA: 12:21:46 - loss: 12.3918

KeyboardInterrupt: 