In [None]:
# Environment setup: uninstall TensorFlow and install `transformers`
# straight from its GitHub repository.
# We won't need TensorFlow here
!pip uninstall -y tensorflow
# Install `transformers` from master
# NOTE(review): this install is unpinned, so the versions recorded at the
# bottom of this cell may not match what actually gets installed -- confirm.
!pip install git+https://github.com/huggingface/transformers
# Show which `transformers` / `tokenizers` versions ended up installed
!pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 2.11.0
# tokenizers version at notebook update --- 0.8.0rc1

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the corpus files used by the
# following cells live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
import os

def remove_empty_lines(filename, encoding="utf-8"):
    """Remove blank lines from a text file, rewriting it in place.

    A line counts as blank when it is empty or contains only whitespace.
    The file is streamed line by line into a temporary sibling file which
    then replaces the original, so a large corpus never has to fit in
    memory and an interrupted run leaves the original file untouched.

    Args:
        filename: Path of the file to clean.
        encoding: Text encoding used for reading and writing. Defaults to
            UTF-8 so the result does not depend on the platform locale.

    Returns:
        None. If ``filename`` is not an existing regular file, a message is
        printed and nothing else happens.
    """
    # Local imports keep this cell self-contained.
    import shutil
    import tempfile

    if not os.path.isfile(filename):
        print("{} does not exist ".format(filename))
        return

    # Resolve symlinks so the real file is replaced, not the link to it.
    target = os.path.realpath(filename)
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(target), suffix=".tmp")
    replaced = False
    try:
        with os.fdopen(fd, "w", encoding=encoding) as cleaned, \
                open(target, encoding=encoding) as original:
            cleaned.writelines(line for line in original if line.strip())
        try:
            # mkstemp creates the file as 0600; carry over the original mode.
            shutil.copymode(target, tmp_path)
        except OSError:
            pass  # best effort: some filesystems do not support chmod
        # Atomic swap: readers see either the old or the new file, never a
        # half-written one.
        os.replace(tmp_path, target)
        replaced = True
    finally:
        # Do not leave a stray temp file behind if anything went wrong.
        if not replaced and os.path.exists(tmp_path):
            os.remove(tmp_path)

Corpus (armazenado no Google Drive para facilitar o acesso ao arquivo, que tem cerca de 1 GB)

Remoção de linhas em branco

In [None]:
# Strip blank lines from the raw corpus; the file is rewritten under the same path.
remove_empty_lines('/content/drive/MyDrive/Final.txt')

Embaralhamento das linhas: como os tweets foram adicionados sequencialmente após os textos da Leipzig Collection, essa randomização é importante para que os dados de treinamento contemplem as duas fontes.

In [None]:
import random
# Shuffle the corpus line by line and write the result to a new file.
# The whole file is held in memory because the shuffle needs every line.
with open('/content/drive/MyDrive/Final.txt', encoding='utf-8') as corpus_file:
    lines = corpus_file.readlines()

# A dedicated, seeded generator makes the shuffle reproducible across runs
# without touching the global `random` state.
random.Random(42).shuffle(lines)

# The context manager guarantees the output is flushed and closed before
# later cells read the shuffled file.
with open('/content/drive/MyDrive/Final_Shuffled.txt', 'w', encoding='utf-8') as shuffled_file:
    shuffled_file.writelines(lines)

In [None]:
# Count the lines in the shuffled corpus.
!wc -l /content/drive/MyDrive/Final_Shuffled.txt

In [None]:
# Print 5 randomly chosen lines from the shuffled corpus as a spot check.
!shuf -n 5 /content/drive/MyDrive/Final_Shuffled.txt

In [None]:
!mkdir data

In [None]:
# Get a subset of first 44362 lines for training
#TRAIN_SIZE = 44362 #@param {type:"integer"}
#!(head -n $TRAIN_SIZE /content/drive/MyDrive/Final_Shuffled.txt) > data/train.txt

In [None]:
# Get a subset of next 443 lines for validation
#VAL_SIZE = 443 #@param {type:"integer"}
#!(sed -n {TRAIN_SIZE + 1},{TRAIN_SIZE + VAL_SIZE}p /content/drive/MyDrive/Final_Shuffled.txt) > data/dev.txt

In [None]:
%%time 
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

#paths = [str(x) for x in Path(".").glob("**/*.txt")]

# Start from an untrained byte-level BPE tokenizer
tokenizer = ByteLevelBPETokenizer()

# Learn the vocabulary and merges from the shuffled corpus
tokenizer.train(
    files='/content/drive/MyDrive/Final_Shuffled.txt',
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [None]:
!mkdir roB3rta
tokenizer.save_model("roB3rta")

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


# Reload the tokenizer from the vocabulary and merges files in ./roB3rta.
tokenizer = ByteLevelBPETokenizer(
    "./roB3rta/vocab.json",
    "./roB3rta/merges.txt",
)

In [None]:
# Attach a BertProcessing post-processor built from the `</s>` and `<s>`
# tokens together with their ids in this vocabulary.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
# Cap encodings at 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [None]:
# Sanity check: encode a sample Portuguese sentence with the loaded tokenizer.
tokenizer.encode("Ações da Bolsa de Valores")

In [None]:
# Same sentence, this time displaying the resulting token strings.
tokenizer.encode("Ações da Bolsa de Valores").tokens

In [None]:
from transformers import RobertaConfig

# Model architecture: 6 hidden layers with 8 attention heads each, sized for
# the 52,000-entry vocabulary requested when training the tokenizer.
config = RobertaConfig(
    vocab_size=52_000,
    # RoBERTa numbers positions starting just after the padding index, so
    # the position table needs two extra rows to cover inputs of up to 512
    # tokens (the maximum length configured for the tokenizer). With 512
    # rows, a full-length input would index past the end of the table.
    max_position_embeddings=514,
    num_attention_heads=8,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [None]:
from transformers import RobertaTokenizerFast

# Load the trained tokenizer files through the `transformers` wrapper.
# `model_max_length` is the documented name for the maximum input length;
# `max_len` is only a legacy alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained("./roB3rta", model_max_length=512)

In [None]:
from transformers import RobertaForMaskedLM

# Build the masked-LM model from the config alone; no pretrained weights
# are loaded in this cell.
model = RobertaForMaskedLM(config=config)

In [None]:
# Report the model's parameter count.
model.num_parameters()
# => 84 million parameters

In [None]:
!mkdir ./shards
!split -a 4 -l 256000 -d /content/drive/MyDrive/Final_Shuffled.txt ./shards/shard_

In [None]:
# Install the `datasets` library, which the next cell imports.
# NOTE(review): unpinned install placed mid-notebook -- consider pinning a
# version and moving it next to the other installs in the first cell.
!pip install datasets

In [None]:
import glob

# Sort the shard paths: `glob` returns them in arbitrary order, which would
# make the row order of the dataset differ from one run to the next.
files = sorted(glob.glob('shards/*'))

from datasets import load_dataset

# Load all shards with the 'text' builder as a single 'train' split.
dataset = load_dataset('text', data_files=files, split='train')

In [None]:
def encode(examples):
  """Tokenize a batch of examples.

  Passes the batch's 'text' column to the notebook-level `tokenizer`,
  truncating and padding every entry to exactly 96 tokens, and returns
  whatever the tokenizer returns.
  """
  texts = examples['text']
  encoded = tokenizer(
      texts,
      truncation=True,
      padding='max_length',
      max_length=96,
  )
  return encoded

# Tokenize the dataset in batches with `encode`, then expose only
# `input_ids` and `attention_mask`, formatted as torch tensors.
dataset = dataset.map(encode, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [None]:
#%%time
#from transformers import LineByLineTextDataset
#
#dataset = LineByLineTextDataset(
#    tokenizer=tokenizer,
#    file_path="/content/drive/MyDrive/Final_Shuffled.txt",
#    block_size=128,
#)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Masked-language-modeling collator with a masking probability of 0.15
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Output location (a relative path, resolved against the working directory)
    output_dir="./roB3rta",
    overwrite_output_dir=True,
    # Checkpointing: save every 10,000 steps, keep at most 2 checkpoints
    save_total_limit=2,
    save_steps=10_000,
    # Schedule and batch size
    per_device_train_batch_size=8,
    num_train_epochs=1,
    # Mixed precision
    half_precision_backend='amp',
    fp16=True,
    # Evaluation output
    prediction_loss_only=True,
)

# Wire the model, tokenized dataset, MLM collator and training arguments together.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
# Fetch the NVIDIA apex sources into ./apex.
!git clone https://github.com/NVIDIA/apex

In [None]:
import os
# Move into the cloned apex checkout.
# This changes the working directory for the whole kernel, so relative paths
# in later cells resolve inside apex/ unless the directory is changed back.
os.chdir('apex')

In [None]:
# NOTE(review): this installs the PyPI distribution named "AMP". The
# `half_precision_backend='amp'` setting used for training presumably refers
# to PyTorch's built-in mixed precision, which would not need this package --
# confirm and consider removing this cell.
!pip install AMP

In [None]:
!pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./

In [None]:
#%%time
# Run training with the trainer configured earlier in the notebook.
trainer.train()

In [None]:
# Save the trainer's model to ./roB3rta (a relative path, resolved against
# the current working directory).
trainer.save_model("./roB3rta")