<a href="https://colab.research.google.com/github/joaomsimoes/bert-pt/blob/main/BERTpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi

Fri Jan 21 11:24:16 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers tokenizers datasets

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Prepare the data

Import the scrapped data. Make sure we don't have duplicates using a set() object

In [None]:
import os

files = './drive/MyDrive/ColabNotebooks/BERTpt/scrapped/'

corpus = set()

for file in os.listdir(files):
  with open(files + file, mode='r', encoding='utf8') as fIn:
    for line in fIn:
      corpus.add(line)

len(corpus)

7799414

### Save the corpus to a .txt file
Save to a new clean txt file

In [None]:
path = './drive/MyDrive/ColabNotebooks/BERTpt/'

with open(path + 'corpus.txt', mode='a+') as fOut:
  for line in corpus:
    fOut.write(line + '\n')

## Tokenizer

In [None]:
from tokenizers import Tokenizer
from tokenizers.normalizers import Sequence, NFD, StripAccents, BertNormalizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.decoders import BPEDecoder
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer

path = './drive/MyDrive/ColabNotebooks/BERTpt/tokenizer/'

### Prepare the TemplateProcessing for special tokens

In [None]:
from tokenizers.processors import TemplateProcessing

special_tokens=["[UNK]","[CLS]","[SEP]","[PAD]","[MASK]"]

temp_proc = TemplateProcessing(
    single ="[CLS] $A [SEP]",
    pair = "[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens = [
        ("[CLS]", special_tokens.index("[CLS]")),
        ("[SEP]", special_tokens.index("[SEP]")),
      ],
)

### Pipeline from Tokenizer

In [None]:
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

tokenizer.normalizer = Sequence(
    [
     BertNormalizer(clean_text=True, handle_chinese_chars=False,strip_accents=False, lowercase=True), 
     NFD(),
     StripAccents()
    ]
)

tokenizer.pre_tokenizer = Whitespace()
tokenizer.decoder = BPEDecoder()
tokenizer.postprocessor = temp_proc

### Train the tokenizer

In [None]:
trainer = BpeTrainer(
    vocab_size=30000, 
    special_tokens=special_tokens,
    continuing_subword_prefix='##'
  )

tokenizer.train_from_iterator(corpus, trainer=trainer)

### Test it

In [None]:
sentence = 'O covid foi um grande desafio para 2020 como para 2021. Como será 2022?'
sentence_encode = tokenizer.encode(sentence)

print(f"Output: {format(sentence_encode.tokens)}")
print(f"Output: {format(sentence_encode.ids)}")
print(f"Output: {format(tokenizer(sentence))}")

Output: ['o', 'covid', 'foi', 'um', 'grande', 'desafio', 'para', '2020', 'como', 'para', '2021', '.', 'como', 'sera', '2022', '?']
Output: [57, 7093, 5772, 5653, 6216, 9385, 5645, 7233, 5734, 5645, 8045, 18, 5734, 6002, 11172, 35]


In [None]:
tokenizer.save(path + "tokenizer.json")
tokenizer.model.save(path)

['./drive/MyDrive/ColabNotebooks/BERTpt/tokenizer/vocab.json',
 './drive/MyDrive/ColabNotebooks/BERTpt/tokenizer/merges.txt']

### Full tokenizer

In [None]:
from transformers import PreTrainedTokenizerFast

full_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=path + 'tokenizer.json',
    unk_token='[UNK]',
    pad_token='[PAD]',
    cls_token='[CLS]',
    sep_token='[SEP]',
    mask_token='[MASK]'
)

In [None]:
full_tokenizer.save_pretrained(path)

('./drive/MyDrive/ColabNotebooks/BERTpt/tokenizer/tokenizer_config.json',
 './drive/MyDrive/ColabNotebooks/BERTpt/tokenizer/special_tokens_map.json',
 './drive/MyDrive/ColabNotebooks/BERTpt/tokenizer/tokenizer.json')

## BERT

In [None]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased")

Didn't find file ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/added_tokens.json. We won't load it.
loading file None
loading file ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/special_tokens_map.json
loading file ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/tokenizer_config.json
loading file ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/tokenizer.json


In [None]:
from transformers import BertConfig, BertForMaskedLM

bert_config = BertConfig(
    vocab_size=30000,
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

bert = BertForMaskedLM(bert_config)

In [None]:
import datasets
from datasets import load_dataset

features = datasets.Features(
    {'text': datasets.Value("string")}
    )

dataset = load_dataset(
    'text', 
    data_files={'train': './drive/MyDrive/ColabNotebooks/BERTpt/corpus.txt'},
    features=features)

print(dataset)

Using custom data configuration default-e050db18dbfee15c


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-e050db18dbfee15c/0.0.0/d86c40dad297bdddf277b406c6a59f0250b5318c400bf23d420a31aff88c84c4...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-e050db18dbfee15c/0.0.0/d86c40dad297bdddf277b406c6a59f0250b5318c400bf23d420a31aff88c84c4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 15598828
    })
})


In [None]:
import torch

dataset = dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=128), batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])
dataset.save_to_disk("./drive/MyDrive/ColabNotebooks/BERTpt/dataset")

  0%|          | 0/15599 [00:00<?, ?ba/s]

In [None]:
from datasets import load_from_disk
dataset = load_from_disk("./drive/MyDrive/ColabNotebooks/BERTpt/dataset")

In [None]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15)

In [None]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased",
    overwrite_output_dir=True,
    max_steps=800000,
    per_device_train_batch_size=64,
    save_steps=5000,
    logging_dir="./drive/MyDrive/ColabNotebooks/BERTpt/logs",
    logging_steps=100,
    eval_steps=100,
    ignore_data_skip=True)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
from transformers import Trainer
trainer = Trainer(model=bert,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=dataset['train'])

max_steps is given, it will override any value given in num_train_epochs


In [None]:
import torch
torch.cuda.is_available()
torch.cuda.empty_cache()

In [None]:
# trainer.train()
trainer.train('./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/checkpoint-785000')

Loading model from ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/checkpoint-785000).
The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: text.
***** Running training *****
  Num examples = 15598828
  Num Epochs = 4
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 800000
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 3
  Continuing training from global step 785000


Step,Training Loss
785100,1.5028
785200,1.4814
785300,1.4739
785400,1.4614
785500,1.5032
785600,1.4671
785700,1.4706
785800,1.4779
785900,1.4633
786000,1.4718


Saving model checkpoint to ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/checkpoint-790000
Configuration saved in ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/checkpoint-790000/config.json
Model weights saved in ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/checkpoint-790000/pytorch_model.bin
Saving model checkpoint to ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/checkpoint-795000
Configuration saved in ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/checkpoint-795000/config.json
Model weights saved in ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/checkpoint-795000/pytorch_model.bin
Saving model checkpoint to ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/checkpoint-800000
Configuration saved in ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/checkpoint-800000/config.json
Model weights saved in ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/checkpoint-800000/pytorch_model.bin


Training completed. Do 

TrainOutput(global_step=800000, training_loss=0.027155688781738282, metrics={'train_runtime': 9412.6229, 'train_samples_per_second': 5439.504, 'train_steps_per_second': 84.992, 'total_flos': 1.6967616093558374e+18, 'train_loss': 0.027155688781738282, 'epoch': 3.06})

In [None]:
trainer.save_model("./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/last")

Saving model checkpoint to ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/last
Configuration saved in ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/last/config.json
Model weights saved in ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/last/pytorch_model.bin


## Testing

In [None]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/",
    tokenizer="./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/"
)

In [None]:
import pandas as pd
pd.DataFrame(fill_mask("2020 foi um ano [MASK]."))

Unnamed: 0,score,token,token_str,sequence
0,0.146935,7591,dificil,2020foiumanodificil.
1,0.101181,9902,historico,2020foiumanohistorico.
2,0.080123,19675,terrivel,2020foiumanoterrivel.
3,0.034216,6835,especial,2020foiumanoespecial.
4,0.028791,12082,complicado,2020foiumanocomplicado.


In [None]:
pd.DataFrame(fill_mask("O [MASK] é muito bonito."))

Unnamed: 0,score,token,token_str,sequence
0,0.071869,7846,design,odesignemuitobonito.
1,0.029477,9884,conceito,oconceitoemuitobonito.
2,0.029308,14359,vestido,ovestidoemuitobonito.
3,0.028057,8165,resto,orestoemuitobonito.
4,0.027589,6306,windows,owindowsemuitobonito.


In [None]:
pd.DataFrame(fill_mask("O FCPorto é melhor que o [MASK]."))

Unnamed: 0,score,token,token_str,sequence
0,0.608609,7709,benfica,ofc##portoemelhorqueobenfica.
1,0.188474,7935,sporting,ofc##portoemelhorqueosporting.
2,0.023601,16116,atletico,ofc##portoemelhorqueoatletico.
3,0.010015,13914,boavista,ofc##portoemelhorqueoboavista.
4,0.009242,10609,barcelona,ofc##portoemelhorqueobarcelona.


In [None]:
pd.DataFrame(fill_mask("Eu gosto de [MASK]."))

Unnamed: 0,score,token,token_str,sequence
0,0.409006,6131,tudo,eugostodetudo.
1,0.041077,6013,todos,eugostodetodos.
2,0.026858,5897,ti,eugostodeti.
3,0.023373,8406,ambos,eugostodeambos.
4,0.018875,7357,mim,eugostodemim.


In [None]:
pd.DataFrame(fill_mask("Eu quero [MASK] de casa."))

Unnamed: 0,score,token,token_str,sequence
0,0.850893,8009,sair,euquerosairdecasa.
1,0.111373,7906,mudar,euqueromudardecasa.
2,0.01288,6334,ir,euqueroirdecasa.
3,0.00463,8215,voltar,euquerovoltardecasa.
4,0.003045,8462,andar,euqueroandardecasa.


In [None]:
pd.DataFrame(fill_mask("Eu quero uma [MASK] melhor."))

Unnamed: 0,score,token,token_str,sequence
0,0.138783,6503,vida,euqueroumavidamelhor.
1,0.083636,7479,experiencia,euqueroumaexperienciamelhor.
2,0.059155,7051,internet,euqueroumainternetmelhor.
3,0.04477,6645,coisa,euqueroumacoisamelhor.
4,0.044105,7834,plataforma,euqueroumaplataformamelhor.


In [None]:
pd.DataFrame(fill_mask("[MASK] é uma boa linguagem de programacao"))

Unnamed: 0,score,token,token_str,sequence
0,0.155832,27384,python,pythoneumaboalinguagemdeprogramacao
1,0.152056,14348,java,javaeumaboalinguagemdeprogramacao
2,0.106369,11304,programacao,programacaoeumaboalinguagemdeprogramacao
3,0.056731,6267,isto,istoeumaboalinguagemdeprogramacao
4,0.044161,13206,linguagem,linguagemeumaboalinguagemdeprogramacao


## Push to the Hub

In [None]:
!huggingface-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/token.
        (Deprecated, will be removed in v0.3.0) To login with username and password instead, interrupt with Ctrl+C.
        
Token: 
Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on you

In [None]:
from transformers import AutoModel

model = AutoModel.from_pretrained('./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/')

loading configuration file ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/config.json
Model config BertConfig {
  "_name_or_path": "./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 30000
}

loading weights file ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/pytorch_model.bin
Some weights of the model checkpoint at ./drive/MyDrive/ColabNotebooks/BERTpt/bert-base-uncased/ were not used when in

In [None]:
!sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (2,613 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package git-lfs.
(Reading database ... 155229 files and directories c

In [None]:
model.push_to_hub(repo_url="joaomsimoes/bert-base-cased-pt_Portugal")

Cloning https://huggingface.co/joaomsimoes/bert-base-cased-pt_Portugal into local empty directory.


TypeError: ignored