# Training llama models from scratch

Import all needed libraries:

In [179]:
import torch
import pandas as pd
from random import sample
from pathlib import Path
from tqdm import tqdm 

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer)

from tokenizers.normalizers import Lowercase, Strip, StripAccents, NFD

from transformers import (
    AutoTokenizer, 
    PreTrainedTokenizerFast, 
    set_seed, 
    Trainer, 
    TrainingArguments, 
    DataCollatorForLanguageModeling, 
    LlamaForCausalLM, 
    LlamaConfig)

from datasets import load_dataset

### Paths

Set paths to training data, eval data, and model directory:

In [180]:
training_files = ['/Users/jitkamuravska/Neural-Networks/10/large_book.txt',]

eval_files = ['/Users/jitkamuravska/Neural-Networks/10/small_book.txt',]

In [181]:
model_path = '/Users/jitkamuravska/Neural-Networks/10/model/'

### Tokenizer

Initialize with BPE:

In [182]:
tokenizer = Tokenizer(models.BPE())

Normalizer that sets everything to normal unicode, lowercase, and strips white spaces and accents

(explanations here: https://huggingface.co/docs/tokenizers/components)

In [183]:
normalizer = normalizers.Sequence([NFD(), Lowercase(), Strip(), StripAccents()])

In [184]:
normalizer.normalize_str("Héllò hôw are ü?")

'hello how are u?'

In [185]:
tokenizer.normalizer = normalizer

Pre-tokenization (division of text into tokens on which BPE can be performed):

In [186]:
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

In [187]:
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!")

[('Let', (0, 3)),
 ("'s", (3, 5)),
 ('Ġtest', (5, 10)),
 ('Ġpre', (10, 14)),
 ('-', (14, 15)),
 ('tokenization', (15, 27)),
 ('!', (27, 28))]

Set vocab size, add special tokens:

In [188]:
trainer = trainers.BpeTrainer(vocab_size=16000,) #special_tokens=["<|endoftext|>", "<pad>",]))

In [189]:
tokenizer.train(files = ['/Users/jitkamuravska/Neural-Networks/10/large_book.txt'], trainer=trainer)






In [190]:
encoding = tokenizer.encode("Let us make a model")
print(encoding.tokens)

['let', 'Ġus', 'Ġmake', 'Ġa', 'Ġmodel']


By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don’t want the offsets to include these whitespaces, then this PostProcessor must be used:

(https://huggingface.co/docs/tokenizers/api/post-processors)

In [191]:
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

In [192]:
sentence = "Let's test this tokenizer."
encoding = tokenizer.encode(sentence)
start, end = encoding.offsets[4]
sentence[start:end]

'token'

In [193]:
tokenizer

Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[], normalizer=Sequence(normalizers=[NFD(), Lowercase(), Strip(strip_left=True, strip_right=True), StripAccents()]), pre_tokenizer=ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True), post_processor=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), decoder=None, model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={"!":0, "#":1, "$":2, "%":3, "&":4, "(":5, ")":6, "*":7, ",":8, "-":9, ".":10, "/":11, "0":12, "1":13, "2":14, "3":15, "4":16, "5":17, "6":18, "7":19, "8":20, "9":21, ":":22, ";":23, "?":24, "[":25, "]":26, "_":27, "a":28, "b":29, "c":30, "d":31, "e":32, "f":33, "g":34, "h":35, "i":36, "j":37, "k":38, "l":39, "m":40, "n":41, "o":42, "p":43, "q":44, "r":45, "s":46, "t":47, "u":48, "v":49, "w":50, "x":51, "y":52, "z":53, "¢":54, "¦":55, "±":56, "´":57, "µ":58, "¶":59, 

In [194]:
tokenizer.decoder = decoders.ByteLevel()

In [195]:
tokenizer.decode(encoding.ids)

'lets test this tokenizer.'

Save it:

In [196]:
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
    pad_token="<pad>",
)    

In [197]:
wrapped_tokenizer.save_pretrained(model_path+'tokenizer/')

('/Users/jitkamuravska/Neural-Networks/10/model/tokenizer/tokenizer_config.json',
 '/Users/jitkamuravska/Neural-Networks/10/model/tokenizer/special_tokens_map.json',
 '/Users/jitkamuravska/Neural-Networks/10/model/tokenizer/tokenizer.json')

### Training 

Load tokenizer:

In [198]:
tokenizer = AutoTokenizer.from_pretrained(model_path+'tokenizer/')
tokenizer.pad_token = tokenizer.eos_token

Load data (now for training):

In [199]:
raw_datasets = load_dataset('text', data_files={'train': training_files, 
                                           'validation': eval_files})

Creates batches (https://huggingface.co/docs/transformers/pad_truncation)

In [200]:
context_length = 64

In [201]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        padding=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True)
    
    input_batch = []
    
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(tokenize, 
                                      batched=True, 
                                      remove_columns=raw_datasets["train"].column_names)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 61684
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 14909
    })
})

In [202]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 61684
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 14909
    })
})

Initiate new Llama with config as wished:

In [203]:
len(tokenizer)

16002

In [204]:
config = LlamaConfig(
    vocab_size=len(tokenizer),
    hidden_size=16,
    num_hidden_layers=2,
    intermediate_size=16,
    num_attention_heads=2,
    bos_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>"),
    eos_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>"),
    pad_token_id=tokenizer.convert_tokens_to_ids("<pad>"),
    max_position_embeddings=16
)

Set seed for weight initialization:

In [205]:
set_seed(42)

New model object:

In [206]:
model = LlamaForCausalLM(config)

In [207]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Check out param size:

In [208]:
print(f'model num parameters = {model.num_parameters()}')

model num parameters = 515728


In [209]:
config_dict = config.to_dict()

Set training parameters:

In [210]:
training_args = TrainingArguments(
    output_dir=model_path,
    overwrite_output_dir=True,
    save_strategy = "epoch", # saves after every epoch
    #save_strategy = "steps", 
    #save_steps = 0.1, # if below zero, then saves after every (n*100)% of training steps
    save_total_limit=0,  # set to zero to avoid saving
    eval_strategy = "epoch",
    #eval_steps = 0.1,
    num_train_epochs= 10,
    #max_steps = 1,
    gradient_accumulation_steps=8,
    per_device_train_batch_size=16,
    warmup_steps=200, 
    lr_scheduler_type="cosine",
    learning_rate=3e-4, # normal: 5e-4
    logging_steps=10,
    #fp16=True, ## only on CUDA
    #load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    #use_mps_device=True, ## only on apple silicon
    #use_cpu = True
)

Initialize trainer object:

In [211]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],#[:15000]['input_ids'],
    eval_dataset=tokenized_datasets['validation']#[:1200]['input_ids'],
)

  trainer = Trainer(


Train:

In [212]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,6.8047,7.633952
2,5.9306,7.0069
3,5.7172,6.785738
4,5.4537,6.585369
5,5.2419,6.425212
6,5.105,6.331116
7,5.0393,6.280762
8,4.944,6.258432
9,5.0023,6.250455
10,4.9047,6.249041


TrainOutput(global_step=4820, training_loss=5.605137153878746, metrics={'train_runtime': 667.6729, 'train_samples_per_second': 923.866, 'train_steps_per_second': 7.219, 'total_flos': 32954269349760.0, 'train_loss': 5.605137153878746, 'epoch': 10.0})

Save logs of losses:

In [213]:
df = pd.DataFrame(trainer.state.log_history)
df.to_csv(model_path+'logs/losses.csv')  

Save final model

In [214]:
trainer.save_model(model_path+'final/')

### Test trained model on text generation

With hf pipelines:

In [215]:
from transformers import pipeline

pipe = pipeline("text-generation", model=model_path+'final/')

Device set to use mps:0


In [218]:
pipe("From the prison he remembers ", do_sample = True, 
     num_return_sequences = 20, 
     max_length=128
     #top_k=50,
     #top_p=0.8,
     #temperature=1.0,
    )

[{'generated_text': 'From the prison he remembers ,, who were not be in an hand, for i am is the letter,, “i am in m; i had just. you have to have been you,” said it to be as he am.” one.” monte cristo into the other. which i to his hands to i had to say that i were, i think of the door, it was as i do be in the young one of the one as who not have been, she replied i do one, i had?” said my young the young this count, “what shall know?” said at me, “i will you?” said is'},
 {'generated_text': 'From the prison he remembers , to be and your father?” said in the young two’s hand, and she is on the house to the father, like the letter of the count, for to be just. i were at them; if not not to not know all at me?” said a man of the count, “i will be the carriage, “you was a young other.” he is a good at the old man, he’s in the count,” said i should be in to the old man to my same young count by his room and the most so he not be a abbe, the time a small of that'},
 {'generated_text': 'Fr