# This notebook corresponds with the Day 22 VAIL Action Item

### Experimenting with Autocomplete using the RoBERTa model

#### Source [Article](https://colab.research.google.com/drive/1mXWYYkB9UjRdklPVSDvAcUDralmv3Pgv#scrollTo=-c0w1Xt2bnqR)

In [1]:
#trainig the roBERTa model on Spanish text:
!pip install transformers==2.8.0



In [2]:
#pulling a spanish dataset for fitting our model:
import os

# Download and unzip movie substitle dataset
if not os.path.exists('data/dataset.txt'):
  !wget "https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/es.txt.gz" -O dataset.txt.gz
  !gzip -d dataset.txt.gz
  !mkdir data
  !mv dataset.txt data

In [3]:
# Show the total number of lines in the dataset we pulled
!wc -l data/dataset.txt
# Show 5 random lines from the dataset as a quick sanity check
!shuf -n 5 data/dataset.txt

179287150 data/dataset.txt
¿Qué está pasando?
Se enojaron.
- Silene lo sabe.
Supongo que eso no es elitista.
¡Ahora vete!


In [4]:
# pulling first 1,000,000 lines for training
TRAIN_SIZE = 1000000
!(head -n $TRAIN_SIZE data/dataset.txt) > data/train.txt
# pulling next 10,000 lines for validation
!(sed -n {TRAIN_SIZE + 1},{TRAIN_SIZE + VAL_SIZE}p data/dataset.txt) > data/dev.txt

sed: -e expression #1, char 0: unmatched `{'


In [5]:
!pip install tokenizers



In [8]:
#training a tokenizer specific to spanish:
from tokenizers import ByteLevelBPETokenizer

path = "data/train.txt"

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=path,
                vocab_size=50265,
                min_frequency=2,
                special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# Save files to disk
!mkdir -p "/content/models"

In [10]:
# Write the trained tokenizer's files (vocab.json and merges.txt) into the models directory.
tokenizer.save("/content/models")

['/content/models/vocab.json', '/content/models/merges.txt']

In [13]:
# Write the model and tokenizer configuration files into the models/ directory.
import json
import os

# Hyperparameters for a RoBERTa masked-language model.
# vocab_size is the same value (50265) the tokenizer is trained with.
config = {
    "architectures": [
        "RobertaForMaskedLM"
    ],
    "attention_probs_dropout_prob": 0.1,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-05,
    "max_position_embeddings": 514,
    "model_type": "roberta",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "type_vocab_size": 1,
    "vocab_size": 50265
}

# The files below are written relative to the current working directory, so make
# sure models/ exists there; otherwise open() raises FileNotFoundError whenever
# the notebook is not running from the directory that already contains it.
os.makedirs("models", exist_ok=True)

with open("models/config.json", 'w') as fp:
    json.dump(config, fp)

tokenizer_config = {"max_len": 512}

with open("models/tokenizer_config.json", 'w') as fp:
    json.dump(tokenizer_config, fp)

In [14]:
#training model:
# Download the language-modeling training script that is run later in the notebook
# (-c resumes a partially downloaded file).
!wget -c https://raw.githubusercontent.com/chriskhanhtran/spanish-bert/master/run_language_modeling.py

--2021-03-02 03:16:11--  https://raw.githubusercontent.com/chriskhanhtran/spanish-bert/master/run_language_modeling.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34328 (34K) [text/plain]
Saving to: ‘run_language_modeling.py’


2021-03-02 03:16:11 (125 MB/s) - ‘run_language_modeling.py’ saved [34328/34328]



In [23]:
# Locations and model type used by the training run
MODEL_TYPE = "roberta"
MODEL_DIR = "models"                  # config and tokenizer files live here
OUTPUT_DIR = f"{MODEL_DIR}/output"    # training output goes below the model dir
TRAIN_PATH, EVAL_PATH = "data/train.txt", "data/dev.txt"

In [24]:
# Shell command template for the training script. The {placeholders} are filled
# with str.format(); a placeholder that receives "" simply disappears from the
# command. Each trailing backslash inside the string joins the next line, so the
# formatted result is one single shell line.
# The step limit is spelled out in full as --max_steps rather than relying on the
# argument parser accepting the abbreviated form --max_step.
cmd = """python run_language_modeling.py \
    --output_dir {output_dir} \
    --model_type {model_type} \
    --mlm \
    --config_name {config_name} \
    --tokenizer_name {tokenizer_name} \
    {line_by_line} \
    {should_continue} \
    {model_name_or_path} \
    --train_data_file {train_path} \
    --eval_data_file {eval_path} \
    --do_train \
    {do_eval} \
    {evaluate_during_training} \
    --overwrite_output_dir \
    --block_size 512 \
    --max_steps 25 \
    --warmup_steps 10 \
    --learning_rate 5e-5 \
    --per_gpu_train_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --weight_decay 0.01 \
    --adam_epsilon 1e-6 \
    --max_grad_norm 100.0 \
    --save_total_limit 10 \
    --save_steps 10 \
    --logging_steps 2 \
    --seed 42
"""

In [25]:
# Values substituted into the command template. Flags left as "" are omitted
# from the final command line.
train_params = dict(
    output_dir=OUTPUT_DIR,
    model_type=MODEL_TYPE,
    config_name=MODEL_DIR,
    tokenizer_name=MODEL_DIR,
    train_path=TRAIN_PATH,
    eval_path=EVAL_PATH,
    do_eval="--do_eval",
    evaluate_during_training="",
    line_by_line="",
    should_continue="",
    model_name_or_path="",
)

In [26]:
!pip install tensorboard==2.1.0
!tensorboard dev upload --logdir runs

2021-03-02 03:21:53.280821: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
TensorBoard.dev now supports the "graphs", "histograms", "hparams", and "text" plugins. To upload data for these plugins, please upgrade to TensorBoard 2.2.2 or newer.
Upload started and will continue reading any new data as it's added
to the logdir. To stop uploading, press Ctrl-C.
View your TensorBoard live at: https://tensorboard.dev/experiment/TZv7jTauQVe9yp1DXjKjvg/

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/tensorboard/uploader/uploader_main.py", line 426, in execute
    uploader.start_uploading()
  File "/usr/local/lib/python3.7/dist-packages/tensorboard/uploader/uploader.py", line 111, in start_uploading
    self._upload_once()
  File "/usr/local/lib/python3.7/dist-packages/tensorboard/uploader/uploader.py", line 116, in _upload_once
    self._rate_limiter.tick()
  File "/usr/local/lib/python3.7

In [27]:
# Fill the command template with train_params and run the resulting command in a shell.
!{cmd.format(**train_params)}

2021-03-02 03:22:47.700670: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
03/02/2021 03:22:49 - INFO - transformers.configuration_utils -   loading configuration file models/config.json
03/02/2021 03:22:49 - INFO - transformers.configuration_utils -   Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "m

In [22]:
# Use the published chriskhanhtran/spanberta model to predict masked words.
# Note: a masked word is a hidden token that the model predicts from
# the context provided by the surrounding words.
"""
Masked Word Example:
Input: "I have watched this [MASK] and it was awesome."
Output: "I have watched this movie and it was awesome." 
"""

from transformers import pipeline

# The same identifier serves as both the model and the tokenizer source
spanberta_checkpoint = "chriskhanhtran/spanberta"
fill_mask = pipeline("fill-mask", model=spanberta_checkpoint, tokenizer=spanberta_checkpoint)

# Last expression of the cell: the candidate fills for <mask> are displayed as output
fill_mask("Lavarse frecuentemente las manos con agua y <mask>.")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=487.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=954339.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512068.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=16.0, style=ProgressStyle(description_w…




Model name 'chriskhanhtran/spanberta' was not found in model name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, bert-base-japanese, bert-base-japanese-whole-word-masking, bert-base-japanese-char, bert-base-japanese-char-whole-word-masking, bert-base-finnish-cased-v1, bert-base-finnish-uncased-v1, bert-base-dutch-cased, bart-large, bart-large-mnli, bart-large-cnn, bart-large-xsum, openai-gpt, transfo-xl-wt103, gpt2, gpt2-medium, gpt2-large, gpt2-xl, distilgpt2, ctrl, xlnet-base-cased, xlnet-large-cased, xlm-mlm-en-2048, xlm-mlm-ende-1024, xlm-mlm-enfr-1024, xlm-mlm-enro-1024, x

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501202014.0, style=ProgressStyle(descri…




	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  masked_index = (input_ids == self.tokenizer.mask_token_id).nonzero().item()


[{'score': 0.6469604969024658,
  'sequence': '<s> Lavarse frecuentemente las manos con agua y jabón.</s>',
  'token': 18493},
 {'score': 0.06074365973472595,
  'sequence': '<s> Lavarse frecuentemente las manos con agua y sal.</s>',
  'token': 619},
 {'score': 0.029788149520754814,
  'sequence': '<s> Lavarse frecuentemente las manos con agua y vapor.</s>',
  'token': 11079},
 {'score': 0.0264101754873991,
  'sequence': '<s> Lavarse frecuentemente las manos con agua y limón.</s>',
  'token': 12788},
 {'score': 0.01702934503555298,
  'sequence': '<s> Lavarse frecuentemente las manos con agua y vinagre.</s>',
  'token': 18424}]

# Conclusion: The model is able to fill in masked words in Spanish!

In our sentence:

* "Lavarse frecuentemente las manos con agua y <mask>" 

* Translation: Frequently wash your hands with water and BLANK

The model was able to predict plausible words to fill in the blank:


1.   jabón (soap)
2.   sal (salt)
3.   vapor (steam)
4.   limón (lemon)
5.   vinagre (vinegar)

