# Training llama models from scratch

Import all needed libraries:

In [71]:
import torch
import pandas as pd
from random import sample
from pathlib import Path
from tqdm import tqdm 

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer)

from tokenizers.normalizers import Lowercase, Strip, StripAccents, NFD

from transformers import (
    AutoTokenizer, 
    PreTrainedTokenizerFast, 
    set_seed, 
    Trainer, 
    TrainingArguments, 
    DataCollatorForLanguageModeling, 
    LlamaForCausalLM, 
    LlamaConfig)

from datasets import load_dataset

### Paths

Set paths to training data, eval data, and model directory:

In [72]:
training_files = ['/Users/jitkamuravska/Neural-Networks/project/training_data/formal_train.txt',]

eval_files = ['/Users/jitkamuravska/Neural-Networks/project/training_data/formal_val.txt',]

In [73]:
model_path = '/Users/jitkamuravska/Neural-Networks/project/formal-model/'

### Tokenizer

Initialize with BPE:

In [74]:
tokenizer = Tokenizer(models.BPE())

Normalizer that sets everything to normal unicode, lowercase, and strips white spaces and accents

(explanations here: https://huggingface.co/docs/tokenizers/components)

In [75]:
normalizer = normalizers.Sequence([NFD(), Lowercase(), Strip(), StripAccents()])

In [76]:
normalizer.normalize_str("Héllò hôw are ü?")

'hello how are u?'

In [77]:
tokenizer.normalizer = normalizer

Pre-tokenization (division of text into tokens on which BPE can be performed):

In [78]:
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

In [79]:
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!")

[('Let', (0, 3)),
 ("'s", (3, 5)),
 ('Ġtest', (5, 10)),
 ('Ġpre', (10, 14)),
 ('-', (14, 15)),
 ('tokenization', (15, 27)),
 ('!', (27, 28))]

Set vocab size, add special tokens:

In [80]:
trainer = trainers.BpeTrainer(vocab_size=16000,) #special_tokens=["<|endoftext|>", "<pad>",]))

In [81]:
tokenizer.train(files = ['/Users/jitkamuravska/Neural-Networks/project/training_data/formal_train.txt'], trainer=trainer)






In [82]:
encoding = tokenizer.encode("Let us make a model")
print(encoding.tokens)

['let', 'Ġus', 'Ġmake', 'Ġa', 'Ġmodel']


By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don’t want the offsets to include these whitespaces, then this PostProcessor must be used:

(https://huggingface.co/docs/tokenizers/api/post-processors)

In [83]:
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

In [84]:
sentence = "Let's test this tokenizer."
encoding = tokenizer.encode(sentence)
start, end = encoding.offsets[4]
sentence[start:end]

'to'

In [85]:
tokenizer

Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[], normalizer=Sequence(normalizers=[NFD(), Lowercase(), Strip(strip_left=True, strip_right=True), StripAccents()]), pre_tokenizer=ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True), post_processor=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), decoder=None, model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={""":0, "#":1, "$":2, "%":3, "&":4, "'":5, "(":6, ")":7, "*":8, "+":9, ",":10, "-":11, ".":12, "/":13, "0":14, "1":15, "2":16, "3":17, "4":18, "5":19, "6":20, "7":21, "8":22, "9":23, ":":24, ";":25, "<":26, "=":27, ">":28, "@":29, "[":30, "]":31, "_":32, "a":33, "b":34, "c":35, "d":36, "e":37, "f":38, "g":39, "h":40, "i":41, "j":42, "k":43, "l":44, "m":45, "n":46, "o":47, "p":48, "q":49, "r":50, "s":51, "t":52, "u":53, "v":54, "w":55, "x":56, "y":57, "z":58, "{":59, 

In [86]:
tokenizer.decoder = decoders.ByteLevel()

In [87]:
tokenizer.decode(encoding.ids)

"let's test this tokenizer."

Save it:

In [88]:
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
    pad_token="<pad>",
)    

In [89]:
wrapped_tokenizer.save_pretrained(model_path+'tokenizer/')

('/Users/jitkamuravska/Neural-Networks/project/formal-model/tokenizer/tokenizer_config.json',
 '/Users/jitkamuravska/Neural-Networks/project/formal-model/tokenizer/special_tokens_map.json',
 '/Users/jitkamuravska/Neural-Networks/project/formal-model/tokenizer/tokenizer.json')

### Training 

Load tokenizer:

In [90]:
tokenizer = AutoTokenizer.from_pretrained(model_path+'tokenizer/')
tokenizer.pad_token = tokenizer.eos_token

Load data (now for training):

In [91]:
raw_datasets = load_dataset('text', data_files={'train': training_files, 
                                           'validation': eval_files})

Creates batches (https://huggingface.co/docs/transformers/pad_truncation)

In [92]:
context_length = 64

In [93]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        padding=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True)
    
    input_batch = []
    
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(tokenize, 
                                      batched=True, 
                                      remove_columns=raw_datasets["train"].column_names)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 31318
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 3047
    })
})

In [94]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 31318
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 3047
    })
})

Initiate new Llama with config as wished:

In [95]:
len(tokenizer)

16002

In [None]:
config = LlamaConfig(
    vocab_size=len(tokenizer),
    hidden_size=128,          # Increased from 16
    num_hidden_layers=6,      # Increased from 2
    intermediate_size=512,    # Increased from 16
    num_attention_heads=8,    # Increased from 2
    bos_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>"),
    eos_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>"),
    pad_token_id=tokenizer.convert_tokens_to_ids("<pad>"),
    max_position_embeddings=512  # Increased from 16
)



# config = LlamaConfig(
#     vocab_size=len(tokenizer),
#     hidden_size=16,
#     num_hidden_layers=2,
#     intermediate_size=16,
#     num_attention_heads=2,
#     bos_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>"),
#     eos_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>"),
#     pad_token_id=tokenizer.convert_tokens_to_ids("<pad>"),
#     max_position_embeddings=16
# )

Set seed for weight initialization:

In [97]:
set_seed(42)

New model object:

In [98]:
model = LlamaForCausalLM(config)

In [99]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Check out param size:

In [100]:
print(f'model num parameters = {model.num_parameters()}')

model num parameters = 5671040


In [101]:
config_dict = config.to_dict()

Set training parameters:

In [102]:
training_args = TrainingArguments(
    output_dir=model_path,
    overwrite_output_dir=True,
    save_strategy = "epoch", # saves after every epoch
    #save_strategy = "steps", 
    #save_steps = 0.1, # if below zero, then saves after every (n*100)% of training steps
    save_total_limit=2,  # set to zero to avoid saving
    eval_strategy = "epoch",
    #eval_steps = 0.1,
    num_train_epochs= 5,
    #max_steps = 1,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=8,
    warmup_steps=100, 
    lr_scheduler_type="cosine",
    learning_rate=1e-4, #original: 3e-4, # normal: 5e-4
    logging_steps=50,
    #fp16=True, ## only on CUDA
    #load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    #use_mps_device=True, ## only on apple silicon
    #use_cpu = True
)

Initialize trainer object:

In [103]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],#[:15000]['input_ids'],
    eval_dataset=tokenized_datasets['validation']#[:1200]['input_ids'],
)

  trainer = Trainer(


Train:

In [104]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,5.4053,6.053078
1,4.9992,5.795939
2,4.8297,5.700454
3,4.7522,5.691287
4,4.6517,5.697586


TrainOutput(global_step=4890, training_loss=5.129633857087367, metrics={'train_runtime': 586.4956, 'train_samples_per_second': 266.993, 'train_steps_per_second': 8.338, 'total_flos': 200692901308416.0, 'train_loss': 5.129633857087367, 'epoch': 4.9992337164750955})

Save logs of losses:

In [106]:
df = pd.DataFrame(trainer.state.log_history)
df.to_csv(model_path+'logs/losses.csv')  

Save final model

In [107]:
trainer.save_model(model_path+'final/')

### Test trained model on text generation

With hf pipelines:

In [108]:
from transformers import pipeline

pipe = pipeline("text-generation", model=model_path+'final/')

Device set to use mps:0


In [109]:
pipe("From the prison he remembers ", do_sample = True, 
     num_return_sequences = 20, 
     max_length=128
     #top_k=50,
     #top_p=0.8,
     #temperature=1.0,
    )

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'From the prison he remembers ) or i was by them to those who be a great other species. that we can have a very that we are to not necessary of the nature in that degree of the state, in a power. it is, in this other the president for every power, and of the members, to find that of the united states, or even to have to produce no few, will not, or we are to probably and if there will be seen in the most of these, are the same states, that to the species of the government, with a single, without the members of the federal powers.'},
 {'generated_text': 'From the prison he remembers ; that the constitution ought to have not been more to never been so at a country. they, as i have been, from many of the people, when the same. from their species, or a more than that which they have shown from the new york-continued as are a, we do not, that in all our government, as the general whole-water, and not all that they can have been to be it, of the states, and their cases, i