This file seeks to use [SFTTrainer](https://huggingface.co/docs/trl/en/sft_trainer) from huggingface to train on custom data to make the process easier than configuring the dataset for llama-recipes which seems much more difficult.

In [1]:
from datasets import load_dataset
from trl import SFTTrainer
from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, AutoModelForCausalLM
import torch
import transformers

In [None]:
# Load Model
model_dir = "./evaluation/local-models/llama-2-7b-chat-hf"

model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    # QUANTIZE MODEL, NECESSARY STEP
    quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type='nf4'
            ))

# Load Tokenizer
tokenizer = LlamaTokenizer.from_pretrained(model_dir)

# Specify device
### NEEDED OR IT WILL BE INCREDIBLY SLOW
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device is:", device)

# Create Pipeline
pipeline = transformers.pipeline(
    "text-generation",

    model=model,

    tokenizer=tokenizer,

    torch_dtype=torch.float16,
)

# Test math skills

In [3]:
prompts = [
    "What is 12345 + 54321?",
    "What is 45678243424 / 98765542?"
    "What is (10*9) + (2 * 5)?"
]
answers = [
    12345 + 54321, 
    45678243424 / 98765542,
    (10*9) + (2 * 5)]

for prompt, answer in zip(prompts, answers):
    sequences = pipeline(
        prompt,

        do_sample=True,

        top_k=10,

        num_return_sequences=1,

        eos_token_id=tokenizer.eos_token_id,

        max_length=400,

        truncation=True

    )

    # Update
    print(f"\nQuestion: {prompt}\n")

    # Print Output
    for seq in sequences:

        print(f"{seq['generated_text']}")
        
    print(f"\nReal Answer: {answer}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Question: What is 12345 + 54321?

What is 12345 + 54321?
 obviously 
$$12345 + 54321 = 66766$$

So the answer is $66766$.

Real Answer: 66666

Question: What is 45678243424 / 98765542?What is (10*9) + (2 * 5)?

What is 45678243424 / 98765542?What is (10*9) + (2 * 5)?

Real Answer: 462.4917000303608


It got them all wrong! We can see it is close but hallucinates random digits on occasion.

# Training

In [4]:
# Print first 5 addition examples
import json

addition_data = open("./data/addition_dataset.json")
addition_json = json.load(addition_data)
addition_data.close()

print(addition_json[:5])

[{'input': '31471416137816 + 3030333', 'output': '31471416137816 + 3030333 = 31471419168149', 'answer': '31471419168149'}, {'input': '5390747146408 + 63', 'output': '5390747146408 + 63 = 5390747146471', 'answer': '5390747146471'}, {'input': '78574 + 84527', 'output': '78574 + 84527 = 163101', 'answer': '163101'}, {'input': '1071589 + 643041674321550', 'output': '1071589 + 643041674321550 = 643041675393139', 'answer': '643041675393139'}, {'input': '182833556 + 2840720893356', 'output': '182833556 + 2840720893356 = 2840903726912', 'answer': '2840903726912'}]


In [5]:
import pandas as pd

# Keep only input and output fields
clean_data = [{'question': entry['input'], 'answer': entry['answer']} for entry in addition_json]

df = pd.DataFrame(clean_data)

df.head()

Unnamed: 0,question,answer
0,31471416137816 + 3030333,31471419168149
1,5390747146408 + 63,5390747146471
2,78574 + 84527,163101
3,1071589 + 643041674321550,643041675393139
4,182833556 + 2840720893356,2840903726912


In [6]:
# From https://huggingface.co/docs/trl/en/sft_trainer
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['question'])):
        text = f"### Question: {example['question'][i]}\n ### Answer: {example['answer'][i]}"
        output_texts.append(text)
    return output_texts

In [7]:
# Make the dataframe tiny to check if that is the problem
tiny_df = df.head(10)

In [8]:
from datasets import Dataset

# Convert dataset to huggingface
data = Dataset.from_pandas(df)
tiny_data = Dataset.from_pandas(tiny_df)

tiny_data

Dataset({
    features: ['question', 'answer'],
    num_rows: 10
})

In [9]:
# Test formatter
formatting_prompts_func(tiny_data[:10])

['### Question: 31471416137816 + 3030333\n ### Answer: 31471419168149',
 '### Question: 5390747146408 + 63\n ### Answer: 5390747146471',
 '### Question: 78574 + 84527\n ### Answer: 163101',
 '### Question: 1071589 + 643041674321550\n ### Answer: 643041675393139',
 '### Question: 182833556 + 2840720893356\n ### Answer: 2840903726912',
 '### Question: 85475562968 + 362970762818695\n ### Answer: 363056238381663',
 '### Question: 442420836941876 + 343536963733045\n ### Answer: 785957800674921',
 '### Question: 93416902250747 + 40563968175152\n ### Answer: 133980870425899',
 '### Question: 9718915233811 + 5833558625795\n ### Answer: 15552473859606',
 '### Question: 70817378248043 + 20419152271408\n ### Answer: 91236530519451']

In [10]:
# Get longest sequence from data
max_seq = max([len(q) for q in df['question']])

print("Max Seq: ", max_seq)

Max Seq:  33


In [11]:
from transformers import TrainingArguments
from peft import LoraConfig, TaskType

model_dir = "./evaluation/local-models/llama-2-7b-chat-hf"

peft_config = LoraConfig(
    r=8,
    task_type="CAUSAL_LM",
)

In [None]:
from transformers import TrainingArguments
from peft import LoraConfig

args = TrainingArguments(
    output_dir = "evaluation/local-models/math-llama-sft",
    # Attempts to minimize dta size
#     eval_accumulation_steps = 5,
#     save_only_model = True,
#     fp16 = True, 
)



trainer = SFTTrainer(
    model,
    args=args,
    train_dataset=data,
    formatting_func=formatting_prompts_func,
    max_seq_length = 50, # To account for padding
    peft_config=peft_config
)

trainer.train()

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Map:   0%|          | 0/304000 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,1.5829
1000,1.2665
1500,1.2352


