# Fine-tune Llama

## Goal

What results do we get if we fine-tune Llama 2 13B (chat)?

## Imports

In [None]:
import torch
import gc
import time
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from tqdm.auto import tqdm
import yaml
import os
import hashlib

from transformers import (
    AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,
    pipeline, TrainingArguments
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from datasets import Dataset

plt.plot()
plt.close('all')
plt.rcParams["figure.figsize"] = (20, 5)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

pd.set_option('display.max_colwidth', 200)

## Load model

In [None]:
# Quantization settings for loading the base model: 4-bit NF4 weights with
# double quantization and float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_skip_modules=['gate', 'lm_head'],
    llm_int8_enable_fp32_cpu_offload=True,
)

# Release cached CUDA memory held by PyTorch's allocator.
torch.cuda.empty_cache()

In [None]:
# Local checkpoint directory of the model to fine-tune.
model_path = "/home/gbarbadillo/data/llama2-13b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(
    model_path, quantization_config=bnb_config, device_map='auto', trust_remote_code=True,
)
# The notes at the end of this notebook report a matmul shape error after
# quantization unless pretraining_tp is 1.
model.config.pretraining_tp = 1

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Register a dedicated padding token.
tokenizer.add_special_tokens(dict(pad_token='<pad>'))
# Author's note: the default is left padding; right padding seemed to work better for training.
tokenizer.padding_side = 'right'
# The vocabulary grew by the new pad token, so resize the embedding matrix to match.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Text-generation pipeline built from the already loaded model and tokenizer;
# chat_with_mixtral() calls it for every request.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

def chat_with_mixtral(prompt, max_new_tokens=200, verbose=True, do_sample=False, temperature=0.7, top_p=0.95):
    """Generate a completion for ``prompt`` with the module-level ``pipe``.

    Prompts that do not already start with ``<s>[INST]`` are wrapped in the
    ``<s>[INST] ... [/INST]`` instruction format before generation.

    Args:
        prompt: Text sent to the model.
        max_new_tokens: Forwarded to the pipeline as the generation length limit.
        verbose: When true, print the elapsed time and the tokens per second.
        do_sample: When true, sample with ``temperature`` and ``top_p``;
            otherwise ``do_sample=False`` is passed to the pipeline.
        temperature: Sampling temperature, only used when ``do_sample`` is true.
        top_p: ``top_p`` sampling value, only used when ``do_sample`` is true.

    Returns:
        The generated text of the first returned sequence, without the prompt
        (``return_full_text=False``).
    """
    already_formatted = prompt.startswith('<s>[INST]')
    if not already_formatted:
        print('Formatting the prompt to Mixtral needs.')
        prompt = f'<s>[INST] {prompt} [/INST]'

    t0 = time.time()

    generation_kwargs = {'do_sample': False}
    if do_sample:
        generation_kwargs = {'do_sample': True, 'temperature': temperature, 'top_p': top_p}

    # The EOS token id is passed as the padding id for generation, see:
    # https://www.reddit.com/r/LocalLLaMA/comments/184g120/mistral_fine_tuning_eos_and_padding/
    # https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/discussions/106
    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        **generation_kwargs,
        return_full_text=False,
    )
    response = outputs[0]['generated_text']
    if not verbose:
        return response

    elapsed = time.time() - t0
    n_tokens = len(tokenizer.tokenize(response))
    print(f"Execution Time : {elapsed:.1f} s, tokens per second: {n_tokens/elapsed:.1f}")
    return response

In [None]:
def print_gpu_memory():
    """Print the current and peak allocated memory of every CUDA device PyTorch reports."""
    bytes_per_gb = 1024**3
    for device in range(torch.cuda.device_count()):
        allocated = torch.cuda.memory_allocated(device)/bytes_per_gb
        peak = torch.cuda.max_memory_allocated(device)/bytes_per_gb
        print(f'GPU {device} memory allocated: {allocated:.1f} GB, max memory allocated: {peak:.1f} GB')
print_gpu_memory()

## Prepare data

In [None]:
# Full training text for one sample, in `[INST] ... [/INST]` instruction form.
# The three placeholders are filled per row by prepare_dataframe_for_training().
prompt_template = """<s>[INST][prompt-recovery]
Analyze the original and rewritten text and answer with the most likely text prompt that was given to rewrite or make stylistic changes to the original text.

- The text prompt should be a single sentence. Reply just with a short sentence and do not add any notes or comments.
- Sometimes the rewritten text will have hints about the text prompt. For example if it starts by
  Reworded, Rephrased, Translated, Update etc. you should include that word in the text prompt.
- Unless necessary do not make reference to details in the original text and keep the text prompt abstract and generic.

## Original text

{original_text}

## Rewritten text

{rewritten_text}

[/INST] The most likely text prompt given to transform the original text into the rewritten text was: {response} </s>"""
# Start of the answer inside prompt_template. It is handed to the completion-only
# data collator and used to cut the answer off when building inference prompts.
response_template = "The most likely text prompt given to transform the original text into the rewritten text was:"

In [None]:
def prepare_dataframe_for_training(filepath, target_col='rewrite_prompt'):
    """Load a CSV and add the formatted training text and its token count.

    Args:
        filepath: Path of a CSV file with ``original_text`` and
            ``rewritten_text`` columns plus the ``target_col`` column.
        target_col: Column holding the prompt the model should learn to answer.

    Returns:
        The loaded DataFrame with two extra columns: ``text`` (the filled
        ``prompt_template``) and ``n_tokens`` (number of tokens in ``text``
        according to the module-level ``tokenizer``).
    """
    df = pd.read_csv(filepath)
    df['text'] = [
        prompt_template.format(
            original_text=row['original_text'],
            rewritten_text=row['rewritten_text'],
            response=row[target_col],
        )
        for _, row in df.iterrows()
    ]

    def count_tokens(text):
        return len(tokenizer.tokenize(text))

    df['n_tokens'] = df['text'].apply(count_tokens)
    return df

In [None]:
import glob

# glob.glob() returns matches in arbitrary order. Sort them so the row order of
# the concatenated frame (and with it the seeded train/eval split made later in
# this notebook) is the same on every run and machine.
high_quality_paths = sorted(
    glob.glob('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_v*.csv'))
if not high_quality_paths:
    # Fail with an explicit message instead of pandas' generic
    # "No objects to concatenate".
    raise FileNotFoundError(
        'No files match /mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_v*.csv')
high_quality_datasets = pd.concat(
    [prepare_dataframe_for_training(filepath) for filepath in high_quality_paths])

In [None]:

# The GPT-4 labelled datasets use the `gpt4_prompt` column as target.
train_df_2 = prepare_dataframe_for_training(
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/mooney_test_with_gpt4.csv',
    target_col='gpt4_prompt',
)
# Show the rewritten text of the rows listed in bad_indices, then discard those rows.
bad_indices = [164, 181, 235]
print(train_df_2.loc[bad_indices, 'rewritten_text'].values)
train_df_2 = train_df_2.drop(index=bad_indices)

train_df_3 = prepare_dataframe_for_training(
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/gemma_suppl_rewrite_curated_with_gpt4.csv',
    target_col='gpt4_prompt',
)

# ignore_index=True already yields a fresh 0..n-1 index for the merged frame.
train_df = pd.concat([high_quality_datasets, train_df_2, train_df_3], ignore_index=True)
train_df.head()

In [None]:
# Hold out a seeded random 10% of the rows for evaluation and remove them from
# the training frame.
eval_df = train_df.sample(frac=0.1, random_state=42)
eval_df_indices = eval_df.index
train_df = train_df.drop(index=eval_df_indices)

In [None]:
# Cumulative, normalized histogram of the token counts of the training texts;
# useful for choosing the maximum sequence length.
plt.hist(train_df['n_tokens'], bins=50, alpha=0.5, label='train', cumulative=True, density=True)
plt.ylim(0, 1)
plt.grid()
plt.legend(loc=0)
plt.xlabel('Number of tokens')
# The trailing semicolon suppresses the text output of the last expression in the notebook.
plt.title('Token distribution of the texts');

In [None]:
print(f'There were {len(train_df)} samples for training and {len(eval_df)} samples for evaluation.')

# Keep only the samples whose text has at most max_seq_length tokens.
max_seq_length = 640
train_df = train_df.loc[train_df['n_tokens'].le(max_seq_length)]
eval_df = eval_df.loc[eval_df['n_tokens'].le(max_seq_length)]

print(f'There are {len(train_df)} samples for training and {len(eval_df)} samples for evaluation.')

In [None]:
# 16 equals per_device_train_batch_size (8) x gradient_accumulation_steps (2)
# from the TrainingArguments in the Train section of this notebook.
print(f'One epoch is {len(train_df)//16} steps.')

In [None]:
# Convert the pandas frames into `datasets.Dataset` objects; these are what the
# trainer receives as train and eval datasets.
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

## Inference before training

In [None]:
# Cut each training text right after the response template so the model has to
# produce the answer itself.
for sample_text in train_df['text'].values[:5]:
    question, _, _ = sample_text.partition(response_template)
    print(chat_with_mixtral(question + response_template))

In [None]:
# Same check on evaluation samples: keep everything up to and including the
# response template and let the model complete it.
for sample_text in eval_df['text'].values[:5]:
    question, _, _ = sample_text.partition(response_template)
    print(chat_with_mixtral(question + response_template))

In [None]:
# A bare `raise` with no active exception fails with RuntimeError; presumably
# used on purpose to stop a "run all cells" execution before training starts.
raise

## Train

In [None]:
# Wrap the quantized model with PEFT's k-bit training preparation helper.
model = prepare_model_for_kbit_training(model)
# Use the tokenizer's padding token id in the model config as well.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False # Gradient checkpointing is used by default but not compatible with caching

In [None]:
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    # r: rank of the update matrices (int). A lower rank gives smaller update
    # matrices with fewer trainable parameters.
    r=16,
    # lora_alpha: LoRA scaling factor.
    lora_alpha=64,
    # Dropout of 0.1, although Vaca suggested using 0.05 for big models.
    lora_dropout=0.1,
    # target_modules: the modules (for example, attention blocks) that receive
    # the LoRA update matrices.
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj'],
)

In [None]:
# One optimizer step consumes per-device batch size x gradient accumulation
# steps samples. Deriving the steps per epoch from these two settings keeps
# max_steps correct if either is changed.
# NOTE(review): assumes a single training process - confirm for multi-GPU launches.
per_device_train_batch_size = 8  # 4-16 should be fine for lora.
gradient_accumulation_steps = 2
steps_per_epoch = len(train_df) // (per_device_train_batch_size * gradient_accumulation_steps)

# Log, evaluate and save a checkpoint at the same fixed interval.
logging_steps = 50
print(f'Logging steps: {logging_steps}')
training_arguments = TrainingArguments(
    output_dir="/mnt/hdd0/Kaggle/llm_prompt_recovery/trainings/2024-04-08_new_trainings/08_llama_moar_data",
    evaluation_strategy="steps",
    do_eval=True,
    optim="paged_adamw_8bit",
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=8,
    log_level="debug",
    save_steps=logging_steps,
    logging_steps=logging_steps,
    learning_rate=2e-5,  # maybe we can increase this
    eval_steps=logging_steps,
    max_steps=steps_per_epoch * 10,  # train for 10 epochs
    warmup_steps=30,
    lr_scheduler_type="linear",
)

In [None]:
# Collator configured with the response template. Presumably it restricts the
# loss to the tokens after the template (as the class name suggests) - confirm
# against the TRL documentation.
data_collator = DataCollatorForCompletionOnlyLM(tokenizer=tokenizer, response_template=response_template)
# Supervised fine-tuning on the "text" column, with LoRA adapters defined by peft_config.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=data_collator,
    args=training_arguments,
)

# Start the fine-tuning run.
trainer.train()

## Make a few inferences 

In [None]:
# Repeat the check on training samples with the fine-tuned model: keep the text
# up to and including the response template and let the model complete it.
for sample_text in train_df['text'].values[:5]:
    question, _, _ = sample_text.partition(response_template)
    print(chat_with_mixtral(question + response_template))

In [None]:
# Repeat the check on evaluation samples with the fine-tuned model.
for sample_text in eval_df['text'].values[:5]:
    question, _, _ = sample_text.partition(response_template)
    print(chat_with_mixtral(question + response_template))

## Initial experiments with Llama

In [None]:
# A bare `raise` with no active exception fails with RuntimeError; presumably
# used on purpose so a "run all cells" execution does not continue into the
# experiment cells below.
raise

In [None]:
from transformers import AutoTokenizer
import transformers
import torch

# Experiment: let the pipeline load the checkpoint itself in float16.
# The names `model_path` and `generator` are used so this cell does not rebind
# `model` (the loaded model object) or `pipeline` (the function imported from
# transformers at the top of the notebook). Rebinding them would break earlier
# cells if they are re-run afterwards.
model_path = "/home/gbarbadillo/data/llama2-13b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_path)
generator = transformers.pipeline(
    "text-generation",
    model=model_path,
    torch_dtype=torch.float16,
    device_map="auto",
)

sequences = generator(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

# Experiment: load the model explicitly in float16 (no quantization) and hand
# the model object to the pipeline.
model_path = "/home/gbarbadillo/data/llama2-13b-chat-hf"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    #quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)
print(model)

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Named `generator` so the `pipeline` function imported from transformers at the
# top of the notebook is not shadowed by a pipeline instance.
generator = transformers.pipeline(
    task="text-generation",
    model=model,
    device_map="auto",
    tokenizer=tokenizer,
)
sequences = generator(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import transformers
import torch

# Experiment: 4-bit loading with only the minimal quantization options enabled.
# The notes after this cell record the error it produced.
model_path = "/home/gbarbadillo/data/llama2-13b-chat-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype=torch.float16,
    # bnb_4bit_use_double_quant= True,
    # llm_int8_enable_fp32_cpu_offload= True,
    # llm_int8_skip_modules=['gate', 'lm_head'],
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)
print(model)

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Named `generator` so the `pipeline` function imported from transformers at the
# top of the notebook is not shadowed by a pipeline instance.
generator = transformers.pipeline(
    task="text-generation",
    model=model,
    device_map="auto",
    tokenizer=tokenizer,
)

sequences = generator(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")


After quantization I get the following error when running the cell above:

```
RuntimeError: mat1 and mat2 shapes cannot be multiplied (28x5120 and 1x6912)
```

If I change `"pretraining_tp": 2` to `"pretraining_tp": 1` in the `config.json` it works! 
https://discuss.huggingface.co/t/llama2-finetuning-giving-error-mat1-and-mat2-shapes-cannot-be-multiplied-4096x5120-and-1x2560/47466/7

Very weird behaviour.


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import transformers
import torch

# Experiment: 4-bit NF4 loading with double quantization, overriding
# pretraining_tp in code instead of editing config.json.
model_path = "/home/gbarbadillo/data/llama2-13b-chat-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)
# Set after loading; the notes above report a matmul shape error with the
# checkpoint's original value of 2.
model.config.pretraining_tp = 1
print(model)

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Named `generator` so the `pipeline` function imported from transformers at the
# top of the notebook is not shadowed by a pipeline instance.
generator = transformers.pipeline(
    task="text-generation",
    model=model,
    device_map="auto",
    tokenizer=tokenizer,
)

sequences = generator(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

## TODO

- [ ] rslora? https://huggingface.co/docs/peft/main/en/conceptual_guides/lora#common-lora-parameters-in-peft
- [ ] Reduce alpha, or reduce r as well
- [ ] Try with Vaca's parameters