In [1]:
import sys
sys.executable
import os
import tensorflow as tf

In [2]:
# Override JAVA_HOME for this kernel process (and any child processes it starts).
# NOTE(review): this is an absolute, machine-specific path (it looks like a macOS
# AdoptOpenJDK 8 install) and nothing visible in this notebook uses Java -- confirm
# it is still needed, and consider reading it from the environment instead.
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home"

In [3]:
# %pip install --upgrade pip
# %pip install transformers
# %pip install tensorflow_datasets
# %pip install tensorflow
# %pip install keras
# %pip install tqdm

In [4]:
# OPTION A:
# USE HUGGING FACE TO:
# 1. USE ONE OF THEIR VOCABULARIES - BUT! - ADD THE RADIOLOGIST WORDS TO THAT VOCAB: add_tokens method for transformers.SpecialTokensMixin
# 2. AFTER I EXTEND THE VOCAB, FINE TUNE ONE OF THE EXISTING MODELS
# THE ONLY OTHER OPTION WOULD BE TO TRAIN BOTH A NEW TOKENIZER AND A NEW MODEL FROM SCRATCH

# import pandas as pd
# from transformers import BertTokenizer, BertModel

# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = BertModel.from_pretrained("bert-base-uncased")

# print(len(tokenizer))

In [7]:
# OPTION B:
# USE HUGGING FACE TO:
# 1. CREATE MY OWN VOCABULARY USING TXT FILES 
# 2. TRAIN A NEW BERT MODEL

# this uses an example published by hugging face
from tokenizers import ByteLevelBPETokenizer, CharBPETokenizer, SentencePieceBPETokenizer, BertWordPieceTokenizer
from pathlib import Path

# Every .txt file found recursively under ./esperanto/, as plain string paths
# (this list is what gets handed to tokenizer.train(files=...)).
paths = list(map(str, Path("./esperanto/").glob("**/*.txt")))

# Fresh byte-level BPE tokenizer; it has no vocabulary until train() is called.
tokenizer = ByteLevelBPETokenizer()

In [8]:
# Learn the BPE vocabulary from the corpus files collected in `paths`.
# The sample encoding shown later in this notebook starts with id 0 and ends
# with id 2, which is consistent with the special tokens receiving their ids
# in the order they are listed here -- so keep this order stable.
tokenizer.train(
    files=paths,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [9]:
# save_model() writes "<prefix>-vocab.json" and "<prefix>-merges.txt" into the
# given directory but does not create that directory, so make sure it exists
# first; otherwise a fresh checkout (Restart Kernel -> Run All) fails here.
Path("./esperanto_vocab/").mkdir(parents=True, exist_ok=True)
tokenizer.save_model("./esperanto_vocab/", "esperberto")

['./esperanto_vocab/esperberto-vocab.json',
 './esperanto_vocab/esperberto-merges.txt']

In [10]:
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the vocabulary/merges files written to disk above.
tokenizer = ByteLevelBPETokenizer(
    "./esperanto_vocab/esperberto-vocab.json",
    "./esperanto_vocab/esperberto-merges.txt",
)

# Wrap every encoded sequence as "<s> ... </s>". BertProcessing takes the
# closing (sep) pair first and the opening (cls) pair second.
closing_pair = ("</s>", tokenizer.token_to_id("</s>"))
opening_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(closing_pair, opening_pair)

# Cap encoded sequences at 512 tokens.
tokenizer.enable_truncation(max_length=512)

# Smoke test: show the Encoding summary and the token strings for one sentence.
sample_encoding = tokenizer.encode("Mi estas Julien.")
print(sample_encoding)
print(sample_encoding.tokens)

Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['<s>', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '</s>']


In [None]:
# TODO: instead of building the Esperanto dataset by hand as below, can I reuse the technique from the IMDB example?


In [9]:
import torch
from torch.utils.data import Dataset

class EsperantoDataset(Dataset):
    """Line-by-line tokenized text corpus exposed as a PyTorch ``Dataset``.

    On construction, every ``.txt`` file found recursively under ``data_dir``
    is read as UTF-8, split into lines, and each line is encoded with a
    byte-level BPE tokenizer loaded from ``vocab_file`` / ``merges_file``.
    Each encoded line is wrapped as ``<s> ... </s>`` and truncated to
    ``max_length`` tokens. All token-id lists are kept in memory in
    ``self.examples``.

    Args:
        vocab_file: Path to the tokenizer's vocabulary JSON file.
        merges_file: Path to the tokenizer's merges text file.
        data_dir: Directory searched recursively for ``*.txt`` corpus files.
        max_length: Maximum number of tokens kept per encoded line.

    The defaults reproduce the paths used elsewhere in this notebook, so
    ``EsperantoDataset()`` keeps working unchanged.
    """

    def __init__(
        self,
        vocab_file="./esperanto_vocab/esperberto-vocab.json",
        merges_file="./esperanto_vocab/esperberto-merges.txt",
        data_dir="./esperanto/",
        max_length=512,
    ):
        tokenizer = ByteLevelBPETokenizer(str(vocab_file), str(merges_file))
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=max_length)

        self.examples = []

        # glob() yields files in an arbitrary, filesystem-dependent order;
        # sorting keeps example indices reproducible across runs and machines.
        src_files = sorted(Path(data_dir).glob("**/*.txt"))
        for src_file in src_files:
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples.extend(enc.ids for enc in tokenizer.encode_batch(lines))

    def __len__(self):
        """Return the number of encoded lines held by the dataset."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the token ids of example ``i`` as a 1-D integer tensor."""
        # An explicit dtype keeps the result an integer tensor even if an
        # example were ever empty (torch.tensor([]) would default to float).
        return torch.tensor(self.examples[i], dtype=torch.long)
    
# Build the dataset now: the constructor reads and tokenizes every corpus file
# up front and keeps all resulting token-id lists in memory.
espDS = EsperantoDataset()

In [10]:
# src_files = Path("./esperanto/").glob("**/*.txt")
# for src_file in src_files:
#     kmg = src_file.read_text(encoding = "utf-8").splitlines()
# thing = tokenizer.encode_batch(kmg)
# print(thing[1].tokens)

# Sanity check: number of examples, then the raw contents of the second example.
# Each line of the text files has been tokenized, and every token has been
# replaced by its id number in the vocabulary.
print(len(espDS), espDS.examples[1], sep="\n")
type(espDS)

974616
[0, 14488, 373, 5811, 274, 8634, 4616, 16, 5505, 274, 18297, 43129, 428, 10677, 288, 4729, 296, 1410, 288, 1259, 313, 18, 1130, 316, 27025, 1072, 428, 6473, 288, 6531, 274, 23496, 44134, 296, 752, 31297, 16, 1282, 2853, 316, 25989, 1122, 376, 4445, 12086, 31668, 288, 37105, 16, 1485, 274, 18297, 43129, 18, 2]


__main__.EsperantoDataset

In [11]:
# train my own model
from transformers import BertTokenizer#, glue_convert_examples_to_features
import tensorflow_datasets as tfds
from transformers import BertForMaskedLM, Trainer, TrainingArguments
from transformers.data.processors.utils import DataProcessor, InputExample, InputFeatures

# train_dataset = thing

# Start from the saved weights of a published checkpoint instead of training
# from scratch.
# NOTE(review): the custom tokenizer above was trained with vocab_size=52_000
# and the sample encodings contain ids such as 44134 -- confirm this
# checkpoint's embedding table covers that id range (or resize it) before
# feeding it data from that tokenizer.
model = BertForMaskedLM.from_pretrained("bert-large-uncased")

training_args = TrainingArguments(
    # output directory
    output_dir='./esperanto_results',
    # total number of training epochs
    num_train_epochs=3,
    # batch size per device during training
    per_device_train_batch_size=16,
    # batch size per device for evaluation
    per_device_eval_batch_size=64,
    # number of warmup steps for the learning-rate scheduler
    warmup_steps=500,
    # strength of weight decay
    weight_decay=0.01,
    # directory for storing logs
    logging_dir='./logs',
)

# trainer = Trainer(
#     model=model,                         # the instantiated 🤗 Transformers model to be trained
#     args=training_args,                  # training arguments, defined above
#     train_dataset=train_dataset#,         # training dataset
# #     eval_dataset=test_dataset            # evaluation dataset
# )

# # to train
# trainer.train() 

# # to evaluate
# trainer.evaluate() 



Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

# Collect the lines of every .txt file under ./esperanto/ into one list.
# Accumulating (instead of re-assigning inside the loop) keeps the lines of
# all files rather than only the last one visited, and leaves `kmg` defined
# as an empty list when no corpus file is found. Sorting makes the line
# order reproducible, since glob() order depends on the filesystem.
src_files = sorted(Path("./esperanto/").glob("**/*.txt"))
kmg = []
for src_file in src_files:
    kmg.extend(src_file.read_text(encoding="utf-8").splitlines())

def tokenize_sentences(sentences, tokenizer):
    """Encode each sentence with ``tokenizer`` and return the results in order.

    Args:
        sentences: Iterable of sentences to encode.
        tokenizer: Object exposing ``encode(sentence, add_special_tokens=...)``.

    Returns:
        A list with one ``tokenizer.encode`` result per input sentence. Special
        tokens are always requested (``add_special_tokens=True``).
    """
    return [
        tokenizer.encode(text, add_special_tokens=True)
        for text in sentences
    ]

def create_attention_masks(tokenized_and_padded_sentences, pad_token_id=0):
    """Build attention masks (1 = real token, 0 = padding) for padded id sequences.

    Args:
        tokenized_and_padded_sentences: Iterable of token-id sequences that
            have already been padded.
        pad_token_id: Id used for padding positions. Defaults to 0. Pass the
            tokenizer's real padding id when it differs; in this notebook the
            encodings start with id 0, so 0 does not look like the padding id
            of the custom vocabulary -- e.g. use ``tokenizer.token_to_id("<pad>")``.

    Returns:
        ``numpy`` array built from one list of 0/1 flags per input sequence.
    """
    # Imported locally so the function works even though the notebook never
    # imports numpy at the top level.
    import numpy as np

    attention_masks = [
        [int(token_id != pad_token_id) for token_id in sentence]
        for sentence in tokenized_and_padded_sentences
    ]
    return np.asarray(attention_masks)

# Encode every line in `kmg` with the notebook's current `tokenizer`.
# NOTE: despite its name, `input_ids` holds one encoding object per line rather
# than bare id lists -- the next cell reads attributes such as `.ids`,
# `.tokens` and `.attention_mask` from its elements.
input_ids = tokenize_sentences(kmg, tokenizer)
# attention_masks = create_attention_masks(input_ids)

In [76]:
# dir(input_ids[1])
# Print every attribute of one sample encoding, in a fixed order.
# Several of these names (char_to_token, merge, pad, truncate, ...) are methods
# that are printed without being called, so they show up as
# "<built-in method ...>" rather than as data.
sample_encoding = input_ids[1]
for field_name in (
    "attention_mask",
    "char_to_token",
    "char_to_word",
    "ids",
    "merge",
    "offsets",
    "overflowing",
    "pad",
    "special_tokens_mask",
    "token_to_chars",
    "token_to_word",
    "tokens",
    "truncate",
    "type_ids",
    "word_to_chars",
    "word_to_tokens",
    "words",
):
    print(getattr(sample_encoding, field_name))

# def create_dataset(ids, masks, labels):
#     def gen():
#         for i in range(len(train_ids)):
#             yield (
#                 {
#                     "input_ids": ids[i],
#                     "attention_mask": masks[i]
#                 },
#                 labels[i],
#             )

#     return tf.data.Dataset.from_generator(
#         gen,
#         ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
#         (
#             {
#                 "input_ids": tf.TensorShape([None]),
#                 "attention_mask": tf.TensorShape([None])
#             },
#             tf.TensorShape([None]),
#         ),
#     )

# train_dataset = create_dataset(train_ids, train_masks, train_labels)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
<built-in method char_to_token of tokenizers.Encoding object at 0x7ff19a63c030>
<built-in method char_to_word of tokenizers.Encoding object at 0x7ff19a63c030>
[0, 14488, 373, 5811, 274, 8634, 4616, 16, 5505, 274, 18297, 43129, 428, 10677, 288, 4729, 296, 1410, 288, 1259, 313, 18, 1130, 316, 27025, 1072, 428, 6473, 288, 6531, 274, 23496, 44134, 296, 752, 31297, 16, 1282, 2853, 316, 25989, 1122, 376, 4445, 12086, 31668, 288, 37105, 16, 1485, 274, 18297, 43129, 18, 2]
<built-in method merge of type object at 0x1062c5e00>
[(0, 0), (0, 5), (5, 9), (9, 17), (17, 20), (20, 30), (30, 37), (37, 38), (38, 47), (47, 50), (50, 56), (56, 66), (66, 72), (72, 77), (77, 81), (81, 86), (86, 89), (89, 93), (93, 97), (97, 100), (100, 103), (103, 104), (104, 108), (108, 114), (114, 122), (122, 126), (126, 132), (132, 137), (137, 141), (141, 1

In [117]:
# from transformers import pipeline

# fill_mask = pipeline(
#     "fill-mask", 
#     model = "./models/EsperBERTo-small", 
#     tokenizer = "./models/EsperBERTo-small"
# )

# result = fill_mask("La suno <mask>.")