In [29]:
# Imports
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import AlbertTokenizerFast, AutoTokenizer, AutoConfig, AlbertModel, Trainer, TrainingArguments
import pandas as pd
from tqdm import tqdm
import multiprocessing
from itertools import chain

In [6]:
!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /home/hjmuizelaar/.cache/huggingface/token
Login successful


In [10]:
with open('../../input_data/train.sliding.full.txt') as f:
    train_lines = f.readlines()
train_lines_df = pd.DataFrame(train_lines)
train_lines_df = train_lines_df.rename(columrans={0: 'text'})
train_dataset = Dataset.from_pandas(train_lines_df)

In [11]:
with open('../../input_data/eval.sliding.full.txt') as f:
    eval_lines = f.readlines()
eval_lines_df = pd.DataFrame(eval_lines)
eval_lines_df = eval_lines_df.rename(columns={0: 'text'})
eval_dataset = Dataset.from_pandas(eval_lines_df)

In [26]:
raw_datasets = concatenate_datasets([train_dataset, eval_dataset])

In [27]:
tokenizer = AlbertTokenizerFast.from_pretrained('albert-base-v2')

In [6]:
# create a python generator to dynamically load the data
def batch_iterator(batch_size=10000):
    for i in tqdm(range(0, len(raw_datasets), batch_size)):
        yield raw_datasets[i : i + batch_size]["text"]

In [7]:
hagalbert_tokenizer = tokenizer.train_new_from_iterator(text_iterator=batch_iterator(), vocab_size=32_000)
hagalbert_tokenizer.save_pretrained("tokenizer")


100%|██████████| 216/216 [00:27<00:00,  7.81it/s]






('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/tokenizer.json')

In [19]:
context_length = 512


In [7]:
tokenizer = AutoTokenizer.from_pretrained("tokenizer")
num_proc = multiprocessing.cpu_count()
print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")

The max length for the tokenizer is: 512


In [8]:
def group_texts(examples):
    tokenized_inputs = tokenizer(
       examples["text"], return_special_tokens_mask=True, truncation=True, max_length=tokenizer.model_max_length
    )
    return tokenized_inputs


In [13]:
# preprocess dataset
tokenized_datasets = raw_datasets.map(group_texts, batched=True, remove_columns=["text"], num_proc=num_proc)
tokenized_datasets.features

                                                                                       

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'special_tokens_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [14]:
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= tokenizer.model_max_length:
        total_length = (total_length // tokenizer.model_max_length) * tokenizer.model_max_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + tokenizer.model_max_length] for i in range(0, total_length, tokenizer.model_max_length)]
        for k, t in concatenated_examples.items()
    }
    return result

In [15]:
tokenized_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=num_proc)
# shuffle dataset
tokenized_datasets = tokenized_datasets.shuffle(seed=34)

print(f"the dataset contains in total {len(tokenized_datasets)*tokenizer.model_max_length} tokens")

                                                                                        

the dataset contains in total 161628672 tokens


In [20]:
config = AutoConfig.from_pretrained(
    "albert-base-v2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [22]:
model = AlbertModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"HAGALBERT size: {model_size/1000**2:.1f}M parameters")

HAGALBERT size: 11.9M parameters


In [24]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=True)

In [28]:
out = data_collator([tokenized_datasets[i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([5, 512])
token_type_ids shape: torch.Size([5, 512])
attention_mask shape: torch.Size([5, 512])
labels shape: torch.Size([5, 512])


In [30]:
tokenized_datasets

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 315681
})

In [32]:
tokenized_datasets_split = tokenized_datasets.train_test_split(test_size=0.2)
args = TrainingArguments(
    output_dir="",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

NameError: name 'PartialState' is not defined