In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
import buddygpt

model_id = 'outputs/buddygpt'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model

BuddyGPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (layers): ModuleList(
      (0-11): 12 x Layer(
        (mha): SelfCausalAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (mlp): MLP(
          (ln1): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='none')
          (ln2): Linear(in_features=3072, out_features=768, bias=True)
        )
        (norm1): RMSNorm((768,), eps=None, elementwise_affine=True)
        (norm2): RMSNorm((768,), eps=None, elementwise_affine=True)
      )
    )
    (ln_norm): RMSNorm((768,), eps=None, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [3]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def sample(model, query, max_len):
    input_ids = tokenizer.encode(query, return_tensors='pt').to(model.device)
    outputs = model.generate(input_ids, max_len)
    gen_text = tokenizer.decode(outputs[0])
    return gen_text

FLASH = 0
sample(model.to(device), 'Create an array of length 5 which contains all even numbers between 1 and 10.', max_len=128)
tokenizer.chat_template = '''
{% for message in messages %}
{{ '<|system|>' if message['role'] == 'system' else '<|user|>' if message['role'] == 'user' else '<|assistant|>' }} {{ message['content'] }}
{% endfor %}
<|assistant|>
'''
tokenizer.chat_template

"\n{% for message in messages %}\n{{ '<|system|>' if message['role'] == 'system' else '<|user|>' if message['role'] == 'user' else '<|assistant|>' }} {{ message['content'] }}\n{% endfor %}\n<|assistant|>\n"

In [4]:
messages = [
    {"role": "system", "content": "You are a helpful assistant!"},
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi there! How can I help you today?"},
    {"role": "user", "content": "What's the weather like?"},
]
prompt = tokenizer.apply_chat_template(    
    messages,
    tokenize=False,              # return plain text
    add_generation_prompt=True  # adds trailing "Assistant:" or equivalent)
)
print(prompt)


<|system|> You are a helpful assistant!
<|user|> Hello!
<|assistant|> Hi there! How can I help you today?
<|user|> What's the weather like?
<|assistant|>


In [5]:
from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split="train")

def format_chat_template(example):
    output_texts = []
    for i in range(len(example["instruction"])):
        messages = []
        user_content = f'instruction:{example["instruction"][i]}'+("" if example["input"][i] else f'\ninput:{example["input"][i]}')
        messages.append({"role":"system", "content": "You are a helpful assistant!"})
        messages.append({"role":"user", "content": user_content})
        messages.append({"role":"assistant", "content": example['output']})
        text = tokenizer.apply_chat_template(messages, tokenize=False, return_tensors='pt')
        output_texts.append(text)
    result = {}
    result['conversation'] = output_texts
    return result
    
dataset = dataset.map(format_chat_template, batched=True).select_columns('conversation')
dataset

Dataset({
    features: ['conversation'],
    num_rows: 51760
})

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
import json

output_dir = 'outputs/buddysft'

def formatting_prompts_func(example):
    return example['conversation']

response_template = "<|assistant|>"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

args = SFTConfig(
    output_dir=output_dir,
    learning_rate=2e-5,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    logging_steps=50,
)

trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    args=args,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
)

trainer.train()

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [None]:
trainer.save_model(output_dir)

In [None]:
sample(model, 