Based on https://www.philschmid.de/fine-tune-llms-in-2024-with-trl, with some additions and changes.

In [1]:
!pip install  --upgrade \
  "transformers==4.36.2" \
  "datasets==2.16.1" \
  "accelerate==0.26.1" \
  "evaluate==0.4.1" \
  "bitsandbytes==0.42.0" \
  # "trl==0.7.10" # \
  # "peft==0.7.1" \

# install peft & trl from github
!pip install git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e --upgrade
!pip install git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f --upgrade

import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'
# install flash-attn
!pip install ninja packaging
!MAX_JOBS=4 pip install flash-attn --no-build-isolation

Collecting transformers==4.36.2
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting datasets==2.16.1
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting accelerate==0.26.1
  Downloading accelerate-0.26.1-py3-none-any.whl.metadata (18 kB)
Collecting evaluate==0.4.1
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting bitsandbytes==0.42.0
  Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers==4.36.2)
  Downloading huggingface_hub-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting regex!=2019.12.17 (from transformers==4.36.2)
  Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40

In [1]:
from datasets import load_dataset

# Read the local JSON conversations file with the 'json' loader and keep its
# 'train' split as `dataset`.
dataset = load_dataset(
    "json",
    data_files="combined_convos.json",
    split="train",
)

In [2]:
# Display the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['messages'],
    num_rows: 20
})

In [3]:
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model_path = "meta-llama/Llama-2-7b-chat-hf"

# Do not paste the access token into the notebook: read it from the HF_TOKEN
# environment variable. If it is unset or empty, use None so the libraries can
# fall back to locally stored credentials (e.g. from `huggingface-cli login`)
# instead of sending an empty token.
huggingface_token = os.environ.get("HF_TOKEN") or None

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    token=huggingface_token,
)
# full discussion here https://github.com/huggingface/transformers/issues/22312
# Reuse the existing unk token for padding rather than calling
# tokenizer.add_special_tokens({'pad_token': '[PAD]'}): that would add a new
# token and make the vocab size 32000+1, while the model was trained with a
# vocab of 32000.
tokenizer.pad_token = tokenizer.unk_token

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    attn_implementation="flash_attention_2",
    device_map={"": 0},  # place every layer on device 0; needed when training on a single GPU
    torch_dtype=torch.bfloat16,
    token=huggingface_token,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
from transformers import AutoTokenizer

# Tokenizer-only setup (no model weights are loaded in this cell).
# NOTE(review): `huggingface_token` is not defined here, so it must already
# exist in the kernel from the model-loading cell; this cell otherwise repeats
# that cell's tokenizer setup.
model_path = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_path, token=huggingface_token)

# Pad with the existing unk token so that no new token is added to the vocab.
tokenizer.pad_token = tokenizer.unk_token


In [6]:
# Inspect the first example; slicing returns a dict mapping each column name to a list.
dataset[:1]

{'messages': [[{'content': "I'm feeling really anxious right now because of a situation with my family. We had a big argument last night and things got really heated. I said some things that I didn't mean, and now they won't even talk to me. I feel like I've ruined everything and I don't know how to fix it.\n\nI just need someone to talk to, to help me navigate through this situation. I hate feeling like this, like my family is falling apart because of me. I wish I could turn back time and take back everything I said.\n\nI just need some advice on how to apologize and make things right with my family. I can't stand this tension and unease between us. I know I messed up, but I just want things to go back to the way they used to be. Can someone please help me?",
    'role': 'user'},
   {'content': "Wow, sorry to hear about the awkward situation. The tension between you and your family must be agonizing. Unfortunately, I don't really have much advice to give on how to mend the situation. 

In [7]:
from trl.extras.dataset_formatting import conversations_formatting_function
from trl.trainer.utils import DataCollatorForCompletionOnlyLM

# Marker strings handed to the completion-only collator: the start of an
# instruction turn and the start of the assistant response.
instruction_template = "<s>"
response_template = "[/INST]"

# SFTTrainer would apply this formatting by default; it is built here only to
# make the step explicit.
# NOTE: the name `format` shadows the builtin, but it is kept because the
# trainer cell passes `formatting_func=format`.
format = conversations_formatting_function(
    tokenizer=tokenizer,
    messages_field="messages",
)
# Print the first conversation rendered as a single string.
print(format(dataset[0]))

collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template=response_template,
    instruction_template=instruction_template,
)

# Uncomment (and index further) to see what the collator does.
# NOTE(review): the raw dataset only has a 'messages' column, so this snippet
# presumably needs a tokenized dataset with 'input_ids' - confirm before use.
#input_ids = collator.torch_call(dataset['input_ids'][:2])['input_ids'][0]
#labels = collator.torch_call(dataset['input_ids'][:2])['labels'][0]
#print(labels)
#print(tokenizer.decode([input_ids[i] for i in range(len(input_ids)) if labels[i] != -100]))

<s>[INST] I'm feeling really anxious right now because of a situation with my family. We had a big argument last night and things got really heated. I said some things that I didn't mean, and now they won't even talk to me. I feel like I've ruined everything and I don't know how to fix it.

I just need someone to talk to, to help me navigate through this situation. I hate feeling like this, like my family is falling apart because of me. I wish I could turn back time and take back everything I said.

I just need some advice on how to apologize and make things right with my family. I can't stand this tension and unease between us. I know I messed up, but I just want things to go back to the way they used to be. Can someone please help me? [/INST] Wow, sorry to hear about the awkward situation. The tension between you and your family must be agonizing. Unfortunately, I don't really have much advice to give on how to mend the situation. If you don't mind me asking though, what led to the a

In [8]:
from transformers import TrainingArguments, GenerationConfig

# Hyperparameters for the fine-tuning run. Checkpoints are written to "ckpts"
# once per epoch; nothing is reported to external trackers.
training_args = TrainingArguments(
    output_dir="ckpts",
    # 1 sample per device x 4 accumulation steps = total train batch size of 4
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="adamw_torch_fused",  # https://github.com/huggingface/transformers/issues/22141
    learning_rate=5e-5,
    # NOTE(review): with the plain "constant" schedule the warmup_ratio below
    # presumably has no effect ("constant_with_warmup" is the variant with
    # warmup) - confirm which one is intended.
    lr_scheduler_type="constant",
    logging_steps=1,
    num_train_epochs=5,
    seed=42,
    data_seed=42,
    save_strategy="epoch",
    report_to="none",
    log_level="debug",
    logging_first_step=True,
    max_grad_norm=0.3,  # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,  # warmup ratio based on QLoRA paper
    bf16=True,
)

# Attach a greedy-decoding generation config derived from the base model's.
# The access token is passed like in the other from_pretrained calls on this
# gated repo; an empty placeholder token is turned into None so that locally
# stored credentials can be used instead.
training_args.generation_config = GenerationConfig.from_pretrained(
    model_path,
    token=huggingface_token or None,
    do_sample=False,
    max_new_tokens=1024,
)





In [9]:
from peft import LoraConfig, get_peft_model
from peft.tuners.lora import LoraLayer

# LoRA adapter settings, following the QLoRA paper & Sebastian Raschka's
# experiments: adapters on all linear layers, no bias training.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=256,
    lora_alpha=128,
    lora_dropout=0.05,
    bias="none",
)

In [10]:
from trl import SFTTrainer

# Wire the quantized model, LoRA config, formatted dataset and completion-only
# collator into a supervised fine-tuning trainer.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    peft_config=peft_config,
    train_dataset=dataset,
    formatting_func=format,
    data_collator=collator,
    max_seq_length=1024,
    # packing stays off for now; might be tried in the future
    packing=False,
    # The formatted text already begins with "<s>", so presumably the tokenizer
    # must not add special tokens a second time.
    dataset_kwargs={"add_special_tokens": False},
)

Using auto half precision backend


In [11]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

Currently training with a batch size of: 1
***** Running training *****
  Num examples = 20
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 25
  Number of trainable parameters = 639,631,360
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss
1,2.7029
2,2.3286
3,2.2392
4,2.2873
5,1.9382
6,1.9221
7,1.7198
8,1.733
9,1.7744
10,1.726


Saving model checkpoint to ckpts/tmp-checkpoint-5
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/92011f62d7604e261f748ec0cfe6329f31193e33/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-chat-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 32000
}

tokenizer config file saved in ckpts/tmp-checkpoint

TrainOutput(global_step=25, training_loss=1.5464777207374574, metrics={'train_runtime': 164.5486, 'train_samples_per_second': 0.608, 'train_steps_per_second': 0.152, 'total_flos': 3334188246958080.0, 'train_loss': 1.5464777207374574, 'epoch': 5.0})

In [2]:
from huggingface_hub import upload_folder

# Push a saved checkpoint directory to the Hugging Face Hub. The whole folder
# is uploaded, including optimizer/scheduler state files.
# NOTE(review): `huggingface_token` is not defined in this cell; it has to
# exist in the kernel already.
upload_folder(
    folder_path="ckpts/checkpoint-25",  # or whichever checkpoint
    repo_id="",  # ur huggingface repo
    token=huggingface_token,
)

optimizer.pt:   0%|          | 0.00/2.56G [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/1.28G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.66k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jhlim8/listenerckpt25_19mar/commit/80b31808d3772d0a9b86d61abbd2b35f8abd1d84', commit_message='Upload folder using huggingface_hub', commit_description='', oid='80b31808d3772d0a9b86d61abbd2b35f8abd1d84', pr_url=None, pr_revision=None, pr_num=None)

In [12]:
# Print the current GPU status (memory usage, utilization) via the shell.
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Mon Mar 18 23:28:26 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.146.02             Driver Version: 535.146.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  | 00000000:A1:00.0 Off |                  N/A |
| 39%   38C    P8              33W / 350W |  19011MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    