## Before fine-tuning

In [1]:
import torch
import os
import transformers
import evaluate
import datasets
from tokenizers import AddedToken
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig, PeftModel
from utils import load_model, test_chat_template

In [2]:
# Load the base Gemma checkpoint and its tokenizer through the project helper from utils.
# NOTE(review): load_model is defined outside this notebook; dtype/device handling is not visible here.
model_path = "models/gemma-2b-it"
model, tokenizer = load_model(model_path)

Loading tokenizer and model from: models/gemma-2b-it


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Load sample data (Summary dataset) and evaluate

### Metrics

In [3]:
# ROUGE scorer from the `evaluate` library; used by compute_metrics and by the manual score check further down.
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE between decoded predictions and decoded reference labels.

    Args:
        eval_preds: Either an object exposing ``predictions`` and ``label_ids``
            or a plain ``(predictions, label_ids)`` tuple.

    Returns:
        The dictionary produced by ``rouge.compute``.
    """
    import numpy as np  # local import: numpy is not imported at the top of this notebook

    if isinstance(eval_preds, tuple):
        pred_ids, labels_ids = eval_preds[0], eval_preds[1]
    else:
        pred_ids, labels_ids = eval_preds.predictions, eval_preds.label_ids

    # Some models return a tuple of outputs; the first entry holds the predictions.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    pred_ids = np.asarray(pred_ids)
    labels_ids = np.asarray(labels_ids)

    # A 3-D array is treated as raw logits and reduced to token ids.
    # NOTE(review): causal-LM logits are shifted by one position relative to the labels.
    if pred_ids.ndim == 3:
        pred_ids = pred_ids.argmax(axis=-1)

    # -100 marks ignored positions and is not a valid token id, so swap it
    # for a real id that skip_special_tokens will drop when decoding.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.where(labels_ids != -100, labels_ids, pad_id)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    return rouge.compute(predictions=pred_str, references=label_str)

### Visualize dataset

In [3]:
# Raw (untokenized) slices of the TL;DR summarization dataset:
# the first 2000 rows of `train` and the first 500 rows of `valid`.
dataset_name = 'CarperAI/openai_summarize_tldr'
train_dataset = datasets.load_dataset(dataset_name, split='train[:2000]')
val_dataset = datasets.load_dataset(dataset_name, split='valid[:500]')

KeyboardInterrupt: 

In [9]:
# Peek at the first training example.
train_dataset[0]

{'prompt': "SUBREDDIT: r/relationships\nTITLE: I (f/22) have to figure out if I want to still know these girls or not and would hate to sound insulting\nPOST: Not sure if this belongs here but it's worth a try. \n\nBackstory:\nWhen I (f/22) went through my first real breakup 2 years ago because he needed space after a year of dating roand  it effected me more than I thought. It was a horrible time in my life due to living with my mother and finally having the chance to cut her out of my life. I can admit because of it was an emotional wreck and this guy was stable and didn't know how to deal with me. We ended by him avoiding for a month or so after going to a festival with my friends. When I think back I wish he just ended. So after he ended it added my depression I suffered but my friends helped me through it and I got rid of everything from him along with cutting contact. \n\nNow: Its been almost 3 years now and I've gotten better after counselling and mild anti depressants. My mothe

In [6]:
def format_prompt(conversation, summary) -> str:
    """Render one training example: instruction, post text, then its summary.

    The post is stripped of surrounding whitespace, and the assembled prompt is
    stripped once more, so trailing whitespace after the summary is removed.
    """
    sections = (
        "### Instruction: Below is infomation in a post. Write a summary of the information.",
        "",
        "### Text:",
        conversation.strip(),
        "",
        "### Summary:",
        f"{summary}",
    )
    return "\n".join(sections).strip()

In [40]:
# Demo: render the first training example with the training prompt template.
print(format_prompt(train_dataset[0]['prompt'], train_dataset[0]['label']))

### Instruction: Below is infomation in a post. Write a summary of the information.

### Text:
SUBREDDIT: r/relationships
TITLE: I (f/22) have to figure out if I want to still know these girls or not and would hate to sound insulting
POST: Not sure if this belongs here but it's worth a try. 

Backstory:
When I (f/22) went through my first real breakup 2 years ago because he needed space after a year of dating roand  it effected me more than I thought. It was a horrible time in my life due to living with my mother and finally having the chance to cut her out of my life. I can admit because of it was an emotional wreck and this guy was stable and didn't know how to deal with me. We ended by him avoiding for a month or so after going to a festival with my friends. When I think back I wish he just ended. So after he ended it added my depression I suffered but my friends helped me through it and I got rid of everything from him along with cutting contact. 

Now: Its been almost 3 years now 

### Process dataset

In [94]:
# NOTE(review): this re-declares format_prompt from an earlier cell with the same template;
# keep the two in sync or drop one of them.
def format_prompt(conversation, summary) -> str:
    """Build the supervised text for one example: instruction, post, and summary.

    Surrounding whitespace of the post is removed, and the final string is
    stripped, which also trims whitespace trailing the summary.
    """
    post = conversation.strip()
    prompt = (
        "### Instruction: Below is infomation in a post. Write a summary of the information.\n"
        "\n"
        "### Text:\n"
        f"{post}\n"
        "\n"
        "### Summary:\n"
        f"{summary}"
    )
    return prompt.strip()
    
def load_dataset(
    dataset_name_or_path: str,
    tokenizer: "transformers.PreTrainedTokenizerBase",
    max_seq_length: int,
    split: str = "train[:2000]"
):
    """Load a dataset split and turn it into tokenized training examples.

    Each row's ``prompt`` and ``label`` are rendered with ``format_prompt`` and
    tokenized with truncation at ``max_seq_length`` tokens. No padding is added
    here; batching/padding is left to the data collator.

    Args:
        dataset_name_or_path: Name or path passed to ``datasets.load_dataset``.
        tokenizer: Tokenizer callable applied to the rendered texts.
        max_seq_length: Maximum number of tokens kept per example.
        split: Split expression passed to ``datasets.load_dataset``.

    Returns:
        The mapped dataset holding only ``input_text``, ``input_ids`` and
        ``attention_mask`` (the original columns are removed).
    """
    dataset = datasets.load_dataset(dataset_name_or_path, split=split)

    def generate_inputs(examples):
        # Render the whole batch first, then tokenize it in a single call.
        texts = [
            format_prompt(prompt, label)
            for prompt, label in zip(examples['prompt'], examples['label'])
        ]
        tokenized = tokenizer(texts, max_length=max_seq_length, truncation=True)
        return {
            'input_text': texts,
            'input_ids': tokenized['input_ids'],
            'attention_mask': tokenized['attention_mask'],
        }

    # Drop the raw columns so only the model inputs (plus input_text) remain.
    return dataset.map(generate_inputs, batched=True, remove_columns=list(dataset.features))

In [95]:
# Build the tokenized train/validation sets; examples longer than max_seq_length tokens are truncated.
# Note: this rebinds train_dataset/val_dataset, which no longer carry the raw 'prompt'/'label' columns.
dataset_name = 'CarperAI/openai_summarize_tldr'
max_seq_length = 2048
train_dataset = load_dataset(dataset_name, tokenizer, max_seq_length=max_seq_length, split="train[:2000]")
val_dataset = load_dataset(dataset_name, tokenizer, max_seq_length=max_seq_length, split="valid[:500]")

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [96]:
# Show the columns and row counts of both tokenized splits.
train_dataset, val_dataset

(Dataset({
     features: ['input_text', 'input_ids', 'attention_mask'],
     num_rows: 2000
 }),
 Dataset({
     features: ['input_text', 'input_ids', 'attention_mask'],
     num_rows: 500
 }))

## Zero-shot summary

In [81]:
def get_inference_prompt(text):
    """Render the inference prompt: instruction plus post, ending at the summary header.

    The post is stripped of surrounding whitespace and the prompt ends with
    '### Summary:' (no trailing newline) so the model writes the summary next.
    """
    instruction = "### Instruction: Below is infomation in a post. Write a summary of the information."
    post = text.strip()
    return f"{instruction}\n\n### Text:\n{post}\n\n### Summary:"

def summarize(model, tokenizer, text: str):
    """Generate a summary of ``text`` with ``model``.

    The text is wrapped with ``get_inference_prompt``, tokenized, and passed to
    ``model.generate``; only the newly generated tokens are decoded.

    Args:
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        text: The post to summarize.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped.
    """
    inputs = tokenizer(
        get_inference_prompt(text),
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        out_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            top_p=0.9,
            top_k=40,
            temperature=0.0001,
            repetition_penalty=1.05,
        )

    # The generated sequence starts with the prompt tokens; skip them when decoding.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.batch_decode(out_ids[:, prompt_length:], skip_special_tokens=True)[0].strip()


def generate_summaries(model, dataset, tokenizer, num_samples=5):
    """Summarize the first ``num_samples`` posts of ``dataset``.

    Args:
        model: Model passed through to ``summarize``.
        dataset: Iterable of examples that each expose a ``'prompt'`` entry
            holding the raw post.
        tokenizer: Tokenizer passed through to ``summarize``.
        num_samples: Maximum number of examples to process.

    Returns:
        A list of ``{'conversation': post, 'generated_summary': summary}`` dicts.
    """
    summaries = []
    for i, example in enumerate(dataset):
        if i >= num_samples:
            break
        print(i)
        # Summarize the raw post; 'input_text' already contains the reference
        # summary and must not be fed back in as the text to summarize.
        post = example['prompt']
        summary = summarize(model, tokenizer, post)
        summaries.append({'conversation': post, 'generated_summary': summary})
    return summaries


In [82]:
# train_dataset now holds only the tokenized columns (input_text, input_ids,
# attention_mask), so the raw post is read from the source dataset instead.
raw_example = datasets.load_dataset(dataset_name, split="train[:1]")[0]
test_prompt = get_inference_prompt(raw_example['prompt'])
test_prompt

"### Instruction: Below is infomation in a post. Write a summary of the information.\n\n### Text:\nSUBREDDIT: r/relationships\nTITLE: I (f/22) have to figure out if I want to still know these girls or not and would hate to sound insulting\nPOST: Not sure if this belongs here but it's worth a try. \n\nBackstory:\nWhen I (f/22) went through my first real breakup 2 years ago because he needed space after a year of dating roand  it effected me more than I thought. It was a horrible time in my life due to living with my mother and finally having the chance to cut her out of my life. I can admit because of it was an emotional wreck and this guy was stable and didn't know how to deal with me. We ended by him avoiding for a month or so after going to a festival with my friends. When I think back I wish he just ended. So after he ended it added my depression I suffered but my friends helped me through it and I got rid of everything from him along with cutting contact. \n\nNow: Its been almost 3

In [83]:
# Tokenize the prompt into PyTorch tensors and move them to the model's device.
test_inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

In [71]:
# Generate up to 256 new tokens for the test prompt (same settings as summarize()).
# NOTE(review): sampling is enabled, but temperature=0.0001 presumably makes decoding
# near-greedy — confirm that is the intent.
out_ids = model.generate(
    **test_inputs,
    max_new_tokens=256,
    do_sample=True,
    top_p=0.9,
    top_k=40,
    temperature=0.0001,
    repetition_penalty=1.05,
)

In [72]:
# Inspect the shape of the generated id tensor.
out_ids.shape

torch.Size([1, 502])

In [14]:
# Number of prompt tokens; used below to slice the prompt off the generated sequence.
inputs_length = len(test_inputs["input_ids"][0])
inputs_length

469

In [23]:
# Decode only the tokens that follow the prompt, dropping special tokens.
pred_str = tokenizer.decode(out_ids[0][inputs_length:], skip_special_tokens=True)
pred_str

'\n\nThe person is struggling with the emotional impact of a past relationship. They are considering cutting contact with former girlfriends due to the pain and discomfort it may cause.'

In [25]:
# Trim surrounding whitespace (the decoded text starts with blank lines).
pred_str = pred_str.strip()
pred_str

'The person is struggling with the emotional impact of a past relationship. They are considering cutting contact with former girlfriends due to the pain and discomfort it may cause.'

In [21]:
# train_dataset now holds only the tokenized columns, so the reference summary
# is read from the first row of the source dataset instead.
label_str = datasets.load_dataset(dataset_name, split="train[:1]")[0]['label']
label_str

"I still have contact with an old ex's friends but can't stand to see or talk to him. His friends are really nice ,so how do I tell them I possibly want to unfriend them on Facebook because of him?"

### Sample ROUGE score calculation

In [26]:
# ROUGE of the single generated summary against its reference summary.
result = rouge.compute(predictions=[pred_str], references=[label_str])
result

{'rouge1': 0.1408450704225352,
 'rouge2': 0.028985507246376812,
 'rougeL': 0.08450704225352113,
 'rougeLsum': 0.08450704225352113}

In [93]:
# Inspect the column schema of train_dataset.
train_dataset.features

{'prompt': Value(dtype='string', id=None),
 'label': Value(dtype='string', id=None),
 'input_text': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

## Train without LoRA

In [88]:
# TRL completion-only collator keyed on the "### Summary:\n" marker that format_prompt
# places before the reference summary.
# NOTE(review): presumably this restricts the loss to tokens after the marker; confirm the
# marker tokenizes the same way in context as it does standalone.
collator = DataCollatorForCompletionOnlyLM(
    response_template="### Summary:\n",                     
    tokenizer=tokenizer,
    mlm=False
)

In [97]:
# Per-device training batch size and the output directory handed to TrainingArguments.
train_batch_size = 2
output_dir = "sft-summary-gemma"

In [98]:
# Training arguments: bf16, cosine schedule with 10% warmup, two epochs,
# a checkpoint every 500 steps (at most 3 kept), and length-grouped batches.
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=train_batch_size,
    # per_gpu_eval_batch_size is deprecated in favour of per_device_eval_batch_size.
    per_device_eval_batch_size=1,
    bf16=True,
    learning_rate=5e-5,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    logging_steps=5,
    save_steps=500,
    save_total_limit=3,
    num_train_epochs=2,
    ddp_find_unused_parameters=False,
    group_by_length=True
)

# Plain Trainer over the tokenized splits, batched by the completion-only collator.
# The unused `dataloader_config = DataLoaderConfiguration(...)` assignment was dropped:
# the name was never imported (NameError) and the object was never passed to anything.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collator
)


In [99]:
# Run full fine-tuning (no LoRA) with the trainer built in the previous cell.
trainer.train()

Step,Training Loss
5,3.0436
10,3.6468
15,3.2035
20,2.6811
25,2.6703
30,2.06
35,2.5953
40,2.4266
45,1.7954
50,2.2927


KeyboardInterrupt: 

## Train SFT with LoRA

In [5]:
def load_model_lora(model_name_or_path, device_map="cuda", use_chatml_template=False, multi_gpu=False):
    """Load a 4-bit quantized causal LM and tokenizer, then wrap the model with LoRA adapters.

    Args:
        model_name_or_path: Checkpoint name or local path for tokenizer and model.
        device_map: Device map passed to ``from_pretrained``; overridden when
            ``multi_gpu`` is set.
        use_chatml_template: When True, install a ChatML chat template on the
            tokenizer and register ``<|im_start|>`` / ``<|im_end|>`` as tokens.
        multi_gpu: When True, place the model on ``cuda:{LOCAL_RANK}`` using the
            ``LOCAL_RANK`` environment variable.

    Returns:
        ``(model, tokenizer)`` where ``model`` is the PEFT-wrapped model.

    Raises:
        KeyError: If ``multi_gpu`` is True and ``LOCAL_RANK`` is not set.
    """
    # get_peft_model is not part of the notebook's top-level peft import.
    from peft import get_peft_model

    print("Loading tokenizer and model from:", model_name_or_path)

    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        use_fast=True, # fast load tokenizer
        padding_side='right' # custom for rotary position embedding
    )

    # apply chat template
    if use_chatml_template:
        # Apply chatml template
        tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

        # Register the ChatML markers as tokens so they are not split into sub-tokens
        tokenizer.add_tokens([AddedToken("<|im_start|>"), AddedToken("<|im_end|>")])

    # BitsAndBytes config: 4-bit NF4 with double quantization, bfloat16 compute
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # set device for multi GPU: each process loads the model on its own local GPU
    if multi_gpu:
        local_rank = os.environ["LOCAL_RANK"]
        device_map = f"cuda:{local_rank}"

    # load model
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        quantization_config=bnb_config,
        device_map=device_map,
        attn_implementation="flash_attention_2" # enable flash attention
    )
    # keep the embedding matrix in sync with the (possibly extended) tokenizer
    model.resize_token_embeddings(len(tokenizer))

    model.gradient_checkpointing_enable()
    # NOTE(review): modules_to_save makes embed_tokens and lm_head fully trainable; the
    # recorded run reports ~29.9% trainable parameters. Confirm this is wanted when no
    # tokens are added (use_chatml_template=False).
    config = LoraConfig(
        r=16,
        lora_alpha=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        modules_to_save=["embed_tokens", "lm_head"],
        lora_dropout=0.0,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, config)
    model.print_trainable_parameters()

    return model, tokenizer


In [6]:
# Reload the same checkpoint as a 4-bit model wrapped with LoRA adapters (default single-device settings).
# Note: this rebinds `model` and `tokenizer` from the earlier cells.
model_path = "models/gemma-2b-it"
model, tokenizer = load_model_lora(model_path)

Loading tokenizer and model from: models/gemma-2b-it


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 1,068,187,648 || all params: 3,574,360,064 || trainable%: 29.8847
