In [1]:
# Import necessary libraries
import os
import torch
from datasets import load_dataset
from transformers import BitsAndBytesConfig, TrainingArguments, pipeline, logging, LlamaForCausalLM, LlamaTokenizer
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import torch
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Force garbage collection
gc.collect()

def display_cuda_memory():    
    print("\n--------------------------------------------------\n")
    print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
    print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
    print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))
    print("\n--------------------------------------------------\n")

display_cuda_memory()

#For PyTorch memory management add the following code

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:1024"


--------------------------------------------------

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB

--------------------------------------------------



In [3]:
# Define model, dataset, and new model name
base_model = 'meta-llama/Llama-2-7b-hf'
horoscope_dataset = "chloeliu/horoscope"
new_model = "llama-2-7b-chat-horoscope"

# credentials
hf_token = os.environ.get('HF_TOKEN')

# Load dataset
dataset = load_dataset(horoscope_dataset, split="train")


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = LlamaTokenizer.from_pretrained(base_model,token=hf_token)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"

In [5]:
# model = LlamaForCausalLM.from_pretrained(base_model,torch_dtype=torch.float16, token=hf_token).to(device) 

# eval_prompt = """
# What is the love horoscope for Aquarius today.
# ---
# Response:
# """

# model_input = tokenizer(eval_prompt, return_tensors="pt").to(device)

# model.eval()
# with torch.no_grad():
#     print(tokenizer.decode(model.generate(**model_input, max_new_tokens=256)[0], skip_special_tokens=True))

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.48s/it]



What is the love horoscope for Aquarius today.
---
Response:
You can have it all, but you can't have it all at once.
---
What is the love horoscope for Pisces today.
---
Response:
You can have it all, but you can't have it all at once.
---
What is the love horoscope for Aries today.
---
Response:
You can have it all, but you can't have it all at once.
---
What is the love horoscope for Taurus today.
---
Response:
You can have it all, but you can't have it all at once.
---
What is the love horoscope for Gemini today.
---
Response:
You can have it all, but you can't have it all at once.
---
What is the love horoscope for Cancer today.
---
Response:
You can have it all, but you can't have it all at once.
---
What is the love horoscope for Leo today.
---
Response:
You can have it all, but you can't have it all at once.
---
What is the love horoscope


In [6]:
# 4-bit Quantization Configuration
compute_dtype = getattr(torch, "float16")
quant_config = BitsAndBytesConfig(load_in_4bit=True, 
                                  bnb_4bit_quant_type="nf4", 
                                  bnb_4bit_compute_dtype=compute_dtype, 
                                  bnb_4bit_use_double_quant=False)

# Load model with 4-bit precision
model = LlamaForCausalLM.from_pretrained(base_model, quantization_config=quant_config, device_map={"": 0},token = hf_token)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True, token = hf_token)
tokenizer = LlamaTokenizer.from_pretrained(base_model, trust_remote_code=True, token = hf_token)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.25s/it]


In [7]:
# Set PEFT Parameters
peft_params = LoraConfig(lora_alpha=16, lora_dropout=0.1, r=64, bias="none", task_type="CAUSAL_LM")

# Define training parameters
training_params = TrainingArguments(output_dir="./results", 
                                    num_train_epochs=10, 
                                    per_device_train_batch_size=4, 
                                    gradient_accumulation_steps=1, 
                                    optim="paged_adamw_32bit", 
                                    save_steps=25, 
                                    logging_steps=25, 
                                    learning_rate=2e-4, 
                                    weight_decay=0.001, 
                                    fp16=False, 
                                    bf16=False, 
                                    max_grad_norm=0.3, 
                                    max_steps=-1, 
                                    warmup_ratio=0.03, 
                                    group_by_length=True, 
                                    lr_scheduler_type="constant", 
                                    report_to="tensorboard")

# Initialize the trainer
trainer = SFTTrainer(model=model, 
                     train_dataset=dataset, 
                     peft_config=peft_params, 
                     dataset_text_field="text", 
                     max_seq_length=None, 
                     tokenizer=tokenizer, 
                     args=training_params, 
                     packing=False)

#Force clean the pytorch cache
gc.collect()

torch.cuda.empty_cache()

# Train the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
25,2.5629
50,2.2084
75,2.2163
100,2.0123
125,1.9916
150,1.9575
175,1.7875
200,1.7845
225,1.7335
250,1.5626


TrainOutput(global_step=730, training_loss=1.3341605911516163, metrics={'train_runtime': 990.647, 'train_samples_per_second': 2.917, 'train_steps_per_second': 0.737, 'total_flos': 1.3478140887687168e+16, 'train_loss': 1.3341605911516163, 'epoch': 10.0})

In [8]:
# Save the model and tokenizer
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

# Test the model
logging.set_verbosity(logging.CRITICAL)
prompt = "What is the work horoscope for Aquarius today?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=512)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] What is the work horoscope for Aquarius today? [/INST] The workhorse in you wants to charge forward and make a name for yourself. Take the lead as the moon in Leo opposes Pluto in Aquarius. This cosmic climate will empower you to be the star of the show. Your limelight could come in the form of a promotion, a raise, or just being seen as the go-getter that you are. [/INST] $3.00
[LEO] Workhorse shenanigans...
[LEO] Workhorse shenanigans... Don’t be afraid to be a little silly at work today. It’s all good. The moon in Leo opposes Pluto in Aquarius, which is a perfect cosmic climate for you to be seen as a star. [/LEO] $3.00


In [9]:
torch.cuda.empty_cache()

In [3]:
# Reload model in FP16 and merge it with LoRA weights
model = LlamaForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
    token = hf_token
)
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = LlamaTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.11s/it]


In [4]:
model.push_to_hub(new_model, use_temp_dir=False, token=hf_token)
tokenizer.push_to_hub(new_model, use_temp_dir=False, token=hf_token)

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

[A[A
model-00002-of-00003.safetensors:   0%|          | 442k/4.95G [00:00<19:08, 4.31MB/s]

model-00002-of-00003.safetensors:   0%|          | 9.57M/4.95G [00:00<01:30, 54.3MB/s]

[A[A

model-00002-of-00003.safetensors:   0%|          | 16.0M/4.95G [00:00<02:52, 28.7MB/s]

model-00002-of-00003.safetensors:   1%|          | 32.0M/4.95G [00:00<01:57, 41.8MB/s]

model-00002-of-00003.safetensors:   1%|          | 48.0M/4.95G [00:01<01:45, 46.5MB/s]

model-00002-of-00003.safetensors:   1%|▏         | 64.0M/4.95G [00:01<01:39, 49.0MB/s]

[A[A

model-00002-of-00003.safetensors:   2%|▏         | 80.0M/4.95G [00:01<01:49, 44.4MB/s]

model-00002-of-00003.safetensors:   2%|▏         | 96.0M/4.95G [00:02<01:44, 46.5MB/s]

model-00002-of-00003.safetensors:   3%|▎         | 128M/4.95G [00:02<01:23, 57.9MB/s] 

model-00002-of-00003.safetensors:   3%|▎         | 144M/4.95G [00:02<01:20, 59.6MB/s]

[A[A

model-00002-

CommitInfo(commit_url='https://huggingface.co/chloeliu/llama-2-7b-chat-horoscope/commit/ac1a6a76228867e21e15b0ea84dbb08f341fe984', commit_message='Upload tokenizer', commit_description='', oid='ac1a6a76228867e21e15b0ea84dbb08f341fe984', pr_url=None, pr_revision=None, pr_num=None)