# Finetune Flan-* for Alpaca

using Lora and *int8* quantization

Training UL2 needs a 40 GB GPU; the smaller models will train on a 24 GB card (4-bit quantization is still being looked into).


In [1]:
# Select the Flan checkpoint the rest of the notebook works with: leave exactly one line uncommented.
#using_model='flan-t5-small' # takes about 3 hours to train on a Titan
using_model='flan-t5-xxl'
#using_model='flan-ul2' # no pretrained adapter weights for this one yet; ran out of Lambda Labs credits :-(

In [2]:
# Settings for every supported value of `using_model`.
# `model_name` is a pair: (repo the weights are loaded from, repo the tokenizer is
# loaded from). The two differ only when a re-sharded copy of the weights is used.
_MODEL_CONFIGS = {
    'flan-ul2': {  # currently requires 40G GPU
        'model_name': ('google/flan-ul2', 'google/flan-ul2'),
        'run_name': 'flanul2-lora-int8-alpaca',
        'dataset': 'johnrobinsn/alpaca-cleaned',
        'peft_name': 'flanul2-lora-int8-alpaca',
    },
    'flan-t5-xxl': {  # should run on 24G GPU for debugging
        'model_name': ('philschmid/flan-t5-xxl-sharded-fp16', 'google/flan-t5-xxl'),
        'run_name': 'flant5xxl-lora-int8-alpaca',
        'dataset': 'johnrobinsn/alpaca-cleaned',
        'peft_name': 'flant5xxl-lora-int8-alpaca',
    },
    'flan-t5-small': {  # quick to train
        'model_name': ('google/flan-t5-small', 'google/flan-t5-small'),
        'run_name': 'flant5small-lora-int8-alpaca',
        'dataset': 'johnrobinsn/alpaca-cleaned',
        'peft_name': 'flant5small-lora-int8-alpaca',
    },
}

# Fail here with a clear message rather than with a NameError in a later cell.
if using_model not in _MODEL_CONFIGS:
    raise ValueError(
        f"Unknown using_model {using_model!r}; expected one of {sorted(_MODEL_CONFIGS)}"
    )

_selected = _MODEL_CONFIGS[using_model]
model_name = _selected['model_name']
run_name = _selected['run_name']
dataset = _selected['dataset']
peft_name = _selected['peft_name']

In [3]:
model_name[1],dataset,peft_name,run_name

('google/flan-t5-xxl',
 'johnrobinsn/alpaca-cleaned',
 'flant5xxl-lora-int8-alpaca',
 'flant5xxl-lora-int8-alpaca')

In [4]:
# on lambdalabs bits and bytes couldn't find the cuda runtime so add ld path
# import os

# #if use_flan_ul2: # I'm using llambda labs for training ul2
# os.environ['LD_LIBRARY_PATH'] = '/usr/lib/x86_64-linux-gnu/'
# os.getenv('LD_LIBRARY_PATH')

In [None]:
# Install training dependences
!pip install -Uqq  git+https://github.com/huggingface/peft.git
!pip install -Uqq "transformers==4.27.1" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" "bitsandbytes==0.37.1" loralib --upgrade --quiet
!pip install -Uqq wandb
!pip install -Uqq protobuf==3.20

__If you just want to do inference you can jump all the way down to the ["Evaluate"](#evaluate) cell and start running from there to download my adapter weights from hf hub and try it out.__

## Training

In [None]:
report_to = "wandb" # set to "none" to train without Weights & Biases tracking

if report_to != "none":
    # wandb is imported (and the login prompt shown) only when tracking is enabled
    import wandb
    wandb.login()

In [None]:
# Start a tracking run only when tracking is enabled. `wandb` is imported solely in
# that case, so calling it unconditionally would raise NameError for report_to="none".
if report_to != "none":
    wandb.init(project=run_name, config={
        "model": model_name[1],
        "dataset": dataset,
    })

### Dataset

Alpaca

In [None]:
def generate_prompt(data_point):
    """Format one example as an Alpaca-style prompt.

    `data_point` must provide "instruction" and "input". A non-empty "input"
    selects the template that carries an extra "### Input:" section; otherwise
    the instruction-only template is returned.
    """
    extra_context = data_point["input"]
    if not extra_context:
        header = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
        return f"{header}\n\n### Instruction:\n{data_point['instruction']}"

    header = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."
    sections = [
        header,
        "",
        "### Instruction:",
        f"{data_point['instruction']}",
        "",
        "### Input:",
        f"{extra_context}",
    ]
    return "\n".join(sections)


### Model

In [None]:
from transformers import AutoTokenizer

print("Loading tokenizer for model: ", model_name[1])
tokenizer = AutoTokenizer.from_pretrained(model_name[1])

In [None]:
def tokenize(x, tokenizer, max_length=None):
    """Turn one dataset record into fixed-length model inputs and labels.

    Args:
        x: Record with an "output" field plus whatever `generate_prompt` reads.
        tokenizer: Tokenizer applied to both the prompt and the target text.
        max_length: Length both sequences are truncated/padded to. When left as
            None, the notebook-level CUTOFF_LEN is used.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels".
    """
    if max_length is None:
        # Looked up at call time, so CUTOFF_LEN may be defined in a later cell.
        max_length = CUTOFF_LEN

    prompt = generate_prompt(x)
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    labels = tokenizer(
        text_target=x["output"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    # The loss ignores label positions set to -100, so padding in the target is
    # replaced by -100 to keep it out of the loss.
    pad_id = tokenizer.pad_token_id
    label_ids = [(-100 if token_id == pad_id else token_id) for token_id in labels["input_ids"]]

    return {
        "input_ids": result["input_ids"],
        "attention_mask": result["attention_mask"],
        "labels": label_ids,
    }
    

In [None]:
from datasets import load_dataset

# Load dataset from the hub
data = load_dataset(dataset)

In [None]:
VAL_SET_SIZE = 2000
CUTOFF_LEN = 256  # 256 accounts for about 96% of the data
SEED = 42  # shared by every random step in this cell so reruns produce the same data order

# Hold out VAL_SET_SIZE examples for evaluation.
train_val = data["train"].train_test_split(
    test_size=VAL_SET_SIZE, shuffle=True, seed=SEED
)
train_data = train_val["train"]
val_data = train_val["test"]

# Shuffle each split with a fixed seed, then tokenize every example.
train_data = train_data.shuffle(seed=SEED).map(lambda x: tokenize(x, tokenizer))
val_data = val_data.shuffle(seed=SEED).map(lambda x: tokenize(x, tokenizer))

# print(f"Train dataset size: {len(train_data)}")
# print(f"Validation dataset size: {len(val_data)}")


In [None]:
from transformers import AutoModelForSeq2SeqLM

# model_name[0] is the repo the weights are loaded from (it may be a re-sharded copy).
print("Loading model for model: ", model_name[0])
# Load the base model with 8-bit weights; device_map="auto" leaves module placement to the library.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name[0], load_in_8bit=True, device_map="auto")

### Finetune Flan-UL2 with LoRA and int8

We could shard the Flan-UL2 model to conserve memory while loading the model.

Now, we can prepare our model for the LoRA int-8 training using `peft`.

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

try:
    from peft import prepare_model_for_int8_training
except ImportError:
    # peft is installed from its main branch, where the int8-specific helper has been
    # superseded by the k-bit one; alias it so the rest of this cell stays unchanged.
    from peft import prepare_model_for_kbit_training as prepare_model_for_int8_training

# Define LoRA Config
lora_config = LoraConfig(
    r=8,  # 16,
    lora_alpha=16,  # 32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
    # task_type=TaskType.CAUSAL_LM
)
# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Recorded output from a previous run:
# trainable params: 18874368 || all params: 11154206720 || trainable%: 0.16921300163961817

In [None]:
import transformers
# How often, in training steps, the trainer evaluates, saves a checkpoint, and logs.
eval_steps = 200
save_steps = 200
logging_steps = 20
# Passed to the training arguments as output_dir.
output_dir = 'results'


In [None]:
from transformers import DataCollatorForSeq2Seq

# Label value for padded positions. It matches the -100 that tokenize() writes over
# pad tokens, so padding is ignored in the loss.
label_pad_token_id = -100
# Collator that batches the tokenized examples for the seq2seq trainer
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Build the trainer from the LoRA-wrapped model, the tokenized splits and the collator.
trainer = Seq2SeqTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=Seq2SeqTrainingArguments(
        num_train_epochs=5,
        learning_rate=3e-4,
        logging_steps=logging_steps,
        # evaluate and checkpoint on step counts (eval_steps and save_steps are both set above)
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=eval_steps,
        save_steps=save_steps,
        output_dir=output_dir,
        # falls back to "none" if report_to is empty
        report_to=report_to if report_to else "none",
        # keep at most 3 checkpoints on disk
        save_total_limit=3,
        load_best_model_at_end=True,
        push_to_hub=False,
    ),
    data_collator=data_collator
)

In [None]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

Let's now train our model and run the cells below. Note that for T5 and UL2, some layers are kept in `float32` for stability purposes.

In [None]:
# train model
trainer.train() 

In [None]:
# Save our LoRA model & tokenizer results under the local `peft_name` path
trainer.model.save_pretrained(peft_name)
tokenizer.save_pretrained(peft_name)
# If you also want to save the base model, call save_pretrained on it with a path of your choice, e.g.
# trainer.model.base_model.save_pretrained(f'{peft_name}-base')

### Save to Hub

In [None]:
!pip install -Uqq huggingface_hub
import huggingface_hub
huggingface_hub.login()

In [None]:
# On lambdalabs had to run this in a terminal outside of the notebook
# sudo apt install git-lfs

In [None]:
repo_id = f'{huggingface_hub.whoami()["name"]}/{peft_name}'
trainer.model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

### Free Up Memory

In [None]:
import torch
import gc
# Rebind every name that may hold training objects so they become collectable.
config = None
model = None
tokenizer=None
trainer=None
# Run a collection pass first, then ask torch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

## Evaluate <a id="evaluate"></a>

In [5]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer,T5ForConditionalGeneration


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /home/jr/anaconda3/envs/flan_t5_xxl/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /home/jr/anaconda3/envs/flan_t5_xxl/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


In [6]:
# Load Adapter Weights
model_id = model_name[1] #'google/flan-ul2'
#peft_model_id = peft_name # use locally saved adapter weights if you trained above
peft_model_id = f'johnrobinsn/{peft_name}' # use my pretrained adapter weights

In [7]:
model_id,peft_model_id

('google/flan-t5-xxl', 'johnrobinsn/flant5xxl-lora-int8-alpaca')

In [8]:
# Load peft config for pre-trained checkpoint etc.
# NOTE(review): `config` is not referenced again in the cells shown here.
#peft_model_id = "results"
config = PeftConfig.from_pretrained(peft_model_id)

# load base LLM model and tokenizer (8-bit weights, everything mapped to device 0)
model = T5ForConditionalGeneration.from_pretrained(model_id,  load_in_8bit=True,  device_map={"":0})
#model = AutoModelForSeq2SeqLM.from_pretrained(model_id,  load_in_8bit=True,  device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the Lora model: wrap the base model with the adapter identified by peft_model_id
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"":0})
# switch to evaluation mode for inference
model.eval()

print("Peft model loaded")



Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Peft model loaded


### Try it out
Let’s try the fine-tuned model on a few hand-written instructions.

In [9]:
from transformers import GenerationConfig

In [10]:
def generate_prompt(data_point):
    """Build the prompt text fed to the model at inference time.

    The template is the same one the training section's `generate_prompt`
    produces; keep the two in sync.

    Args:
        data_point: Mapping with an "instruction" entry and an optional "input"
            entry. A missing, None or empty "input" selects the
            instruction-only template.

    Returns:
        The formatted prompt string.
    """
    # .get() lets callers omit "input" entirely instead of passing a placeholder.
    if data_point.get("input"):
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}"""
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}"""

In [11]:
def generate(instruction, input=None, max_new_tokens=1000, top_p=0.9):
    """Sample a response to `instruction` from the loaded model and print it.

    Args:
        instruction: Task description placed in the prompt.
        input: Optional extra context for the task; None leaves the input
            section out of the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Value forwarded to `model.generate` as `top_p`.

    Returns:
        None. The decoded text is printed. Sampling is enabled, so the text can
        differ between calls.
    """
    prompt = generate_prompt({'instruction': instruction, 'input': input})
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True)
    input_ids = encoded.input_ids.cuda()
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
    )
    decoded = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
    # The "summary:" label is kept so fresh output matches the recorded cell outputs.
    print(f"summary:\n{decoded[0]}")

In [12]:
generate('Write a short story in third person narration about a protagonist who has to make an important career decision.')

summary:
The protagonist was faced with an important career decision. He had been working in his current position for years, but the job was not satisfying him. He felt like he needed to make a change to get a more challenging and fulfilling job. After much consideration, he decided to explore his options and find another job.


In [13]:
generate('Who was the first man to walk on the moon')

summary:
The first man to walk on the moon was Neil Armstrong in 1969.


In [14]:
generate('Explain why the following fraction is equivalent to 1/4','4/16')

summary:
The fraction 4/16 is equivalent to 1/4 because 1/16 x 2 equals 1/4.


In [16]:
generate('Write a poem about about a cat')

summary:
When she walks down the street, A cat's purr's like a waterfall The soft silky fur of her tail fluttering in the breeze Her whiskers swish in the breeze and she is my purrr-fect cat The soft meows of her fur meows loudly Bringing joy to those who are blessed with her presence And purr's like a heart of gold Her whimper meows can comfort the weary heart.
