In [1]:
# !pip install transformers datasets torch accelerate scipy==1.12 trl

In [2]:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, pipeline
from trl import SFTTrainer

dataset = load_dataset("j2moreno/leo-training-data")

# Checking the dataset
print(dataset)

train_dataset = dataset['train']

# Optionally, split the dataset into training and testing sets
train_test_split = train_dataset.train_test_split(test_size=0.1, seed=42)

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


##################
# Data Processing
##################

# Tokenize the data
def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples['answer'], truncation=True, padding="max_length", max_length=1024)
    # tokenized_outputs = tokenizer(examples['answer'], truncation=True, padding="max_length", max_length=128)

    tokenized_inputs['labels'] = tokenized_inputs["input_ids"].copy()  

    return tokenized_inputs

tokenized_datasets = train_test_split.map(tokenize_function, batched=True)

# Check that the dataset is not empty and has the required fields
print("Dataset size:", len(tokenized_datasets))
print("Sample data:", tokenized_datasets["train"])

print(100*"*")
print("Sample data:", tokenized_datasets["test"][0])

training_args = TrainingArguments(
    output_dir="./model_results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,  # Simulate an effective batch size of 4
    weight_decay=0.01,
    num_train_epochs=4,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
)

2024-04-26 18:11:21.105073: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


DatasetDict({
    train: Dataset({
        features: ['prompt', 'answer'],
        num_rows: 6
    })
})


Some weights of LlamaForCausalLM were not initialized from the model checkpoint at TinyLlama/TinyLlama-1.1B-Chat-v1.0 and are newly initialized: ['model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rot

Dataset size: 2
Sample data: Dataset({
    features: ['prompt', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5
})
****************************************************************************************************
Sample data: {'prompt': 'How does Leonardo Moreno integrate AI in his work?', 'answer': '[{"content": "How does Leonardo Moreno integrate AI in his work?", "role": "user"}, {"content": "Leonardo Moreno integrates AI by developing machine learning models to analyze genomic data and improve diagnostic accuracies. He utilizes frameworks like TensorFlow and PyTorch to create models that can predict disease onset and outcomes based on genetic information.", "role": "assistant"}]', 'input_ids': [1, 518, 6377, 3051, 1115, 376, 5328, 947, 10255, 6491, 3879, 8154, 22782, 319, 29902, 297, 670, 664, 29973, 613, 376, 12154, 1115, 376, 1792, 10758, 8853, 3051, 1115, 376, 3226, 265, 6491, 3879, 8154, 3990, 1078, 319, 29902, 491, 14338, 4933, 6509, 4733, 304, 27599, 20

In [3]:
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    dataset_text_field="answer",
    tokenizer=tokenizer,
    packing=True
)

trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,No log,0.156913
2,No log,0.104839
2,No log,0.099407
3,No log,0.099091


TrainOutput(global_step=8, training_loss=1.4677610397338867, metrics={'train_runtime': 613.2576, 'train_samples_per_second': 0.033, 'train_steps_per_second': 0.013, 'total_flos': 101696705396736.0, 'train_loss': 1.4677610397338867, 'epoch': 3.2})

In [4]:
#############
# Evaluation
#############
tokenizer.padding_side = 'left'
metrics = trainer.evaluate()
metrics["eval_samples"] = len(tokenized_datasets['test'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

############
# Save model
############
trainer.save_model("./saved_model")

***** eval metrics *****
  epoch                   =        3.2
  eval_loss               =     0.0991
  eval_runtime            = 0:00:05.73
  eval_samples            =          1
  eval_samples_per_second =      0.174
  eval_steps_per_second   =      0.174
