In [None]:
# Mount Google Drive into the Colab VM so that the checkpoints and generated
# text written under the Drive folder by later cells outlive the session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Fine-tune Llama 2 in Google Colab
> 🗣️ Large Language Model Course

❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne), based on Younes Belkada's [GitHub Gist](https://gist.github.com/younesbelkada/9f7f75c94bdc1981c8ca5cc937d4a4da). Special thanks to Tolga HOŞGÖR for his solution to empty the VRAM.

This notebook runs on a T4 GPU. (Last update: 24 Aug 2023)


In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

from datasets import load_dataset

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [None]:
# Central configuration cell: every tunable value used by the training and
# generation cells is defined here as a notebook-global.

# Base model to fine-tune, identified by its Hugging Face Hub repository id
model_name = "NousResearch/Llama-2-7b-chat-hf"

# Hugging Face dataset from which the instruction data is built
dataset_name = "cnn_dailymail"

# Destination (on Google Drive) for the fine-tuned model
# NOTE(review): not referenced by any other visible cell — confirm the
# adapter is actually saved somewhere.
new_model = "/content/drive/MyDrive/llama-2-7b-miniguanaco"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension (rank of the update matrices)
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models; given as the name of a torch dtype
# because the training cell resolves it with getattr(torch, ...)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "/content/drive/MyDrive/results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
# NOTE(review): not passed to TrainingArguments in the training cell.
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
# NOTE(review): not passed to TrainingArguments in the training cell, so this
# flag currently has no effect.
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use (None leaves the choice to SFTTrainer)
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

# Data preparation

In [None]:
#import random
#import glob


# Load dataset (you can process it here)
#PATH = 'drive/MyDrive/bbc-2/'
#domains = ['business', 'entertainment', 'politics', 'sport', 'tech']

#train_1, train_2, train_3, test = random.sample(range(4), 4)
#val = 5

#train_files = glob.glob(PATH + domains[train_1] + '/*.txt')
#train_files.extend(glob.glob(PATH + domains[train_2] + '/*.txt'))
#train_files.extend(glob.glob(PATH + domains[train_3] + '/*.txt'))

#val_files = glob.glob(PATH + domains[val] + '/*.txt')

#test_files = glob.glob(PATH + domains[test] + '/*.txt')

#data = []
#for f_name in train_files:
  # print(f_name)
  #f = open(f_name, 'r')
  #lines = f.readlines()
  #el = "<s>"
  #inst = "[INST] generate a piece of news for the title: " + lines[0][:-1].encode("utf-8").decode('unicode_escape') + "[/INST]"
  #text = ""
  #for l in lines[1:]:
  #  text += l[:-1].encode("utf-8").decode('unicode_escape') + ' '
  #text += "</s>"

  #el += (inst + text)
  #data.append([el])

#df = pd.DataFrame(data, columns=['text'])

# ds_dataset = ds.dataset(pa.Table.from_pandas(df).to_batches())

### convert to Huggingface dataset
#dataset = Dataset(pa.Table.from_pandas(df))

#print(dataset[0])

In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset
from nltk.tokenize import word_tokenize

import nltk
# word_tokenize relies on the Punkt tokenizer models.
nltk.download('punkt')

# Raw CNN/DailyMail training split. It gets its own name so it is not confused
# with the instruction-formatted training set that `dataset` holds afterwards.
raw_train = load_dataset(dataset_name, '1.0.0', split='train')

# Use at most the first 5000 examples, never more than the split contains.
num_train_examples = min(5000, len(raw_train))

# Each row is one training string in the Llama 2 chat layout:
#   <s>[INST] instruction + highlights [/INST] article </s>
data = []
total_words = 0
for i in range(num_train_examples):
  item = raw_train[i]
  article = item['article']
  highlights = item['highlights']
  el = "<s>"
  inst = "[INST] generate a piece of news with the following highlights: " + highlights + "[/INST]"
  text = article + "</s>"

  # Accumulate article lengths; the mean later sizes the generation budget.
  total_words += len(word_tokenize(article))

  el += (inst + text)
  data.append([el])

df = pd.DataFrame(data, columns=['text'])

# Mean article length in words. Divide by the number of examples, not by the
# last loop index (which is one less and made the average slightly too large).
number_of_words = total_words / num_train_examples if num_train_examples else 0

# Kept for backward compatibility; no visible cell reads it.
ds_dataset = ds.dataset(pa.Table.from_pandas(df).to_batches())

# convert to Huggingface dataset (single 'text' column, consumed by SFTTrainer)
dataset = Dataset(pa.Table.from_pandas(df))

print(dataset[0])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/256M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

{'text': '<s>[INST] generate a piece of news with the following highlights: Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday . Young actor says he has no plans to fritter his cash away . Radcliffe\'s earnings from first five Potter films have been held in trust fund .[/INST]LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extra

# Training

In [None]:
# Fine-tune the base model with QLoRA: load it in 4-bit, attach a LoRA
# configuration, and run supervised fine-tuning on the `text` column of
# `dataset`. All settings come from the configuration cell.

# Load tokenizer and model with QLoRA configuration
# The compute dtype is configured as a string; resolve it to the torch dtype.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
# Informational only: prints a hint when the CUDA compute capability major
# version is >= 8; it does not change any setting.
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# NOTE(review): presumably disables the KV cache for training and forces the
# non-tensor-parallel code path — confirm against the transformers docs.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
# NOTE(review): trust_remote_code=True allows code shipped with the Hub
# repository to run locally — confirm it is needed for this model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
# NOTE(review): `gradient_checkpointing` and `per_device_eval_batch_size` from
# the configuration cell are not passed here, so they have no effect.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
# NOTE(review): nothing in this cell writes the trained adapter to
# `new_model`; confirm whether a save step is intended.
trainer.train()

Your GPU supports bfloat16: accelerate training with bf16=True


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]



Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,2.1289
50,1.7886
75,1.8415
100,1.6317
125,1.8247
150,1.5732
175,1.7813
200,1.6187
225,1.7719
250,1.6439


Step,Training Loss
25,2.1289
50,1.7886
75,1.8415
100,1.6317
125,1.8247
150,1.5732
175,1.7813
200,1.6187
225,1.7719
250,1.6439


TrainOutput(global_step=1250, training_loss=1.6993938262939454, metrics={'train_runtime': 2351.7213, 'train_samples_per_second': 2.126, 'train_steps_per_second': 0.532, 'total_flos': 8.519377208328192e+16, 'train_loss': 1.6993938262939454, 'epoch': 1.0})

In [None]:
# model = AutoModelForCausalLM.from_pretrained(new_model)

# Generation

In [None]:
# Build the evaluation material from the first 10 validation examples:
#   prompts        - instruction-only inputs in the layout used for training
#   truths         - the reference articles
#   highlights_vec - the raw highlights each prompt was built from
dataset = load_dataset(dataset_name, '1.0.0', split='validation')

prompts, truths, highlights_vec = [], [], []
for example_idx in range(10):
  example = dataset[example_idx]
  summary = example['highlights']

  highlights_vec.append(summary)
  prompts.append("<s>" + "[INST] generate a piece of news with the following highlights: " + summary + "[/INST]")
  truths.append(example['article'])

In [None]:
# Save the highlights, one per line; embedded newlines are flattened to
# spaces so the line number identifies the example.
with open('drive/MyDrive/llama-2-4/highlights.txt', 'w', encoding="utf-8") as outfile:
  outfile.writelines(summary.replace("\n", " ") + '\n' for summary in highlights_vec)

## Ground-truth

In [None]:
from tqdm import tqdm

# Save the ground-truth articles, one per line (newlines inside an article
# are replaced by spaces).
reference_lines = [article_text.replace("\n", " ") + '\n' for article_text in truths]
with open('drive/MyDrive/llama-2-4/reference.txt', 'w', encoding="utf-8") as outfile:
  outfile.write("".join(reference_lines))

## Greedy

In [None]:
from tqdm import tqdm

# Greedy decoding: one continuation per evaluation prompt, saved one per line.

# trainer.train() normally leaves the model in training mode; switch to eval
# so LoRA dropout does not perturb the (otherwise deterministic) decoding.
model.eval()

greedy_outputs = []
# Iterate over the prompts themselves: indexing a fixed range (10..14) fails
# because only the prompts built above exist.
for inst in tqdm(prompts):
  # Keep the inputs on the same device as the model.
  model_inputs = tokenizer(inst, return_tensors='pt').to(model.device)
  greedy_output = model.generate(
      **model_inputs,
      # Budget of twice the mean article length; must be an integer.
      max_new_tokens=int(number_of_words) * 2
      )

  # Drop the prompt by token count. Slicing the decoded string by len(inst)
  # is wrong because skip_special_tokens removes the literal "<s>", so the
  # decoded prompt is shorter than `inst` and real output would be cut off.
  prompt_length = model_inputs['input_ids'].shape[1]
  text = tokenizer.decode(greedy_output[0][prompt_length:], skip_special_tokens=True)
  greedy_outputs.append(text)

# save output to file, one generation per line
with open('drive/MyDrive/llama-2-4/greedy.txt', 'w', encoding="utf-8") as outfile:
  for text in greedy_outputs:
    outfile.write(text.replace("\n", " ") + '\n')

100%|██████████| 10/10 [1:02:27<00:00, 374.79s/it]


## Beam search

In [None]:
from tqdm import tqdm

# Beam search (5 beams, no repeated bigrams): one continuation per evaluation
# prompt, written to disk as soon as it is produced.

# trainer.train() normally leaves the model in training mode; switch to eval
# so LoRA dropout does not perturb decoding.
model.eval()

beam_outputs = []

# 'w' rather than 'a': every prompt is regenerated below, so appending would
# duplicate lines each time the cell is re-run.
with open('drive/MyDrive/llama-2-4/beam.txt', 'w', encoding="utf-8") as outfile:
  # Iterate over the prompts themselves: indexing a fixed range (10..14) fails
  # because only the prompts built above exist.
  for inst in tqdm(prompts):
    # Keep the inputs on the same device as the model.
    model_inputs = tokenizer(inst, return_tensors='pt').to(model.device)
    beam_output = model.generate(
      **model_inputs,
      max_new_tokens=int(number_of_words) * 2,
      num_beams=5,
      no_repeat_ngram_size=2,
      early_stopping=True
    )

    # Drop the prompt by token count. Slicing the decoded string by len(inst)
    # is wrong because skip_special_tokens removes the literal "<s>", so the
    # decoded prompt is shorter than `inst` and real output would be cut off.
    prompt_length = model_inputs['input_ids'].shape[1]
    text = tokenizer.decode(beam_output[0][prompt_length:], skip_special_tokens=True)
    beam_outputs.append(text)

    # One generation per line; flush so finished work survives an interruption.
    outfile.write(text.replace("\n", " ") + '\n')
    outfile.flush()

100%|██████████| 10/10 [2:04:54<00:00, 749.40s/it]


In [None]:
# Rewrite the beam-search results, one generation per line. Iterating over the
# list itself avoids an IndexError when it holds fewer than 10 entries.
with open('drive/MyDrive/llama-2-4/beam.txt', 'w', encoding="utf-8") as outfile:
  for text in beam_outputs:
    outfile.write(text.replace("\n", " ") + '\n')

## Top-k sampling

In [None]:
from tqdm import tqdm

# Top-k sampling (k=50): one continuation per evaluation prompt, saved one
# per line.

# Sampling is stochastic; fix the seed so a re-run reproduces the same text.
torch.manual_seed(0)

# trainer.train() normally leaves the model in training mode; switch to eval
# so LoRA dropout does not add extra noise on top of the sampling.
model.eval()

topk_outputs = []
# Iterate over the prompts themselves: indexing a fixed range (10..14) fails
# because only the prompts built above exist.
for inst in tqdm(prompts):
  # Keep the inputs on the same device as the model.
  model_inputs = tokenizer(inst, return_tensors='pt').to(model.device)
  topk_output = model.generate(
      **model_inputs,
      # Budget of twice the mean article length; must be an integer.
      max_new_tokens=int(number_of_words) * 2,
      do_sample=True,
      top_k=50
  )

  # Drop the prompt by token count. Slicing the decoded string by len(inst)
  # is wrong because skip_special_tokens removes the literal "<s>", so the
  # decoded prompt is shorter than `inst` and real output would be cut off.
  prompt_length = model_inputs['input_ids'].shape[1]
  text = tokenizer.decode(topk_output[0][prompt_length:], skip_special_tokens=True)
  topk_outputs.append(text)

# save output to file, one generation per line
with open('drive/MyDrive/llama-2-4/topk.txt', 'w', encoding="utf-8") as outfile:
  for text in topk_outputs:
    outfile.write(text.replace("\n", " ") + '\n')

100%|██████████| 10/10 [1:32:43<00:00, 556.35s/it]


## Top-p sampling

In [None]:
from tqdm import tqdm

# Top-p (nucleus) sampling with p=0.92 and top-k filtering disabled: one
# continuation per evaluation prompt, written to disk as soon as it is produced.

# Sampling is stochastic; fix the seed so a re-run reproduces the same text.
torch.manual_seed(0)

# trainer.train() normally leaves the model in training mode; switch to eval
# so LoRA dropout does not add extra noise on top of the sampling.
model.eval()

topp_outputs = []
# 'w' rather than 'a': every prompt is regenerated below, so appending would
# duplicate lines each time the cell is re-run.
with open('drive/MyDrive/llama-2-4/topp.txt', 'w', encoding="utf-8") as outfile:
  # Iterate over the prompts themselves: indexing a fixed range (10..14) fails
  # because only the prompts built above exist.
  for inst in tqdm(prompts):
    # Keep the inputs on the same device as the model.
    model_inputs = tokenizer(inst, return_tensors='pt').to(model.device)
    topp_output = model.generate(
        **model_inputs,
        # Budget of twice the mean article length; must be an integer.
        max_new_tokens=int(number_of_words) * 2,
        do_sample=True,
        top_p=0.92,
        top_k=0
    )

    # Drop the prompt by token count. Slicing the decoded string by len(inst)
    # is wrong because skip_special_tokens removes the literal "<s>", so the
    # decoded prompt is shorter than `inst` and real output would be cut off.
    prompt_length = model_inputs['input_ids'].shape[1]
    text = tokenizer.decode(topp_output[0][prompt_length:], skip_special_tokens=True)
    topp_outputs.append(text)

    # One generation per line; flush so finished work survives an interruption.
    outfile.write(text.replace("\n", " ") + '\n')
    outfile.flush()

100%|██████████| 10/10 [58:29<00:00, 350.97s/it]
