In [1]:
!pip install peft transformers trl bitsandbytes accelerate datasets

Collecting peft
  Obtaining dependency information for peft from https://files.pythonhosted.org/packages/14/0b/8402305043884c76a9d98e5e924c3f2211c75b02acd5b742e6c45d70506d/peft-0.6.2-py3-none-any.whl.metadata
  Downloading peft-0.6.2-py3-none-any.whl.metadata (23 kB)
Collecting trl
  Obtaining dependency information for trl from https://files.pythonhosted.org/packages/0d/44/c406c3cf5981bddb16ff72acb5ca235888db4073d868cf51bd143bef3aad/trl-0.7.4-py3-none-any.whl.metadata
  Downloading trl-0.7.4-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes
  Obtaining dependency information for bitsandbytes from https://files.pythonhosted.org/packages/c2/49/557f8f4aa9cfc1e9d7875fd850a44a6d3d881a42c483bc8cf56a6b597dfe/bitsandbytes-0.41.2.post2-py3-none-any.whl.metadata
  Downloading bitsandbytes-0.41.2.post2-py3-none-any.whl.metadata (9.8 kB)
Collecting tyro>=0.5.11 (from trl)
  Obtaining dependency information for tyro>=0.5.11 from https://files.pythonhosted.org/packages/19/c3/35e23412b4c9b38

In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer



In [3]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Fetch the Hugging Face access token stored as the Kaggle secret "hftoken".
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("hftoken")

# Authenticate this session against the Hugging Face Hub.
login(token=secret_value_0)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
# Hub repo id of the base checkpoint to fine-tune.
model_name = "Open-Orca/Mistral-7B-OpenOrca"

# Training file (Kaggle input dataset) and the directory used both for trainer
# outputs and for the saved adapter.
dataset_path = "/kaggle/input/mistral7b-orca-data/mistral7b-orca-data.jsonl"
output_dir = "mistral-7b-orca-new"

# Load the instruction data with the generic "json" builder as a single "train" split.
# The formatting function further down reads the "system", "question" and "answer" columns.
dataset = load_dataset(path="json", data_files=dataset_path, split="train")

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-93b3e1bbf9d3944e/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-93b3e1bbf9d3944e/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


In [5]:
# 4-bit NF4 quantization with nested (double) quantization to shrink the memory
# footprint of the base model.
# NOTE(review): the compute dtype is bfloat16 while the TrainingArguments cell uses
# fp16=True -- confirm the target GPU supports bfloat16, or align the two dtypes.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# trust_remote_code is deliberately not passed: the Mistral architecture ships with
# transformers, so there is no reason to allow code from the model repository to run.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Turn off the generation key/value cache on the config for the training run.
model.config.use_cache = False

Downloading (…)lve/main/config.json:   0%|          | 0.00/623 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

In [6]:
# Load the tokenizer published with the base checkpoint. trust_remote_code is
# deliberately not passed: the tokenizer class used here is part of transformers
# (the log below reports LlamaTokenizerFast), so no repository code needs to run.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pad with the unknown token, and pad on the right-hand side of each sequence.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
def generate_example(sys_message, prompt, answer):
    """Build a three-turn chat (system, user, assistant) as a list of message dicts.

    Each argument is rendered to text with an f-string, so non-string values
    are converted to their string form. Every message is a dict with the keys
    "role" and "content", in that order.
    """
    turns = (
        ("system", sys_message),
        ("user", prompt),
        ("assistant", answer),
    )
    return [{"role": role, "content": f"{text}"} for role, text in turns]

def formatting_prompts_func(dataset):
    """Render a batch of records into chat-formatted training strings.

    `dataset` is indexed by column name and must provide the parallel
    "question", "system" and "answer" columns. One string is returned per
    entry of the "question" column, produced by the module-level tokenizer's
    chat template with tokenization disabled.
    """
    questions = dataset['question']
    systems = dataset['system']
    answers = dataset['answer']
    return [
        tokenizer.apply_chat_template(
            generate_example(systems[idx], questions[idx], answers[idx]),
            tokenize=False,
        )
        for idx in range(len(questions))
    ]

In [8]:
# LoRA adapter configuration

lora_config = LoraConfig(
    task_type='CAUSAL_LM',
    r=128,            # rank of the low-rank update matrices
    lora_alpha=256,   # LoRA scaling parameter (scale = lora_alpha / r)
    lora_dropout=0.05,
    bias="none",
    # Module names that receive adapters: the attention projections, the MLP
    # projections and the output head.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
)


In [9]:
# Training hyperparameters

training_arguments = TrainingArguments(
    # Where trainer outputs are written.
    output_dir=output_dir,
    # Schedule: five epochs; max_steps=-1 leaves the epoch count in control.
    num_train_epochs=5,
    max_steps=-1,
    warmup_steps=10,
    # Batch size is discovered automatically; gradients accumulate over 4 steps.
    auto_find_batch_size=True,
    gradient_accumulation_steps=4,
    # Optimizer and precision.
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    fp16=True,
    # Log every 3 steps to TensorBoard.
    logging_steps=3,
    report_to="tensorboard",
)

In [10]:
# Supervised fine-tuning: each record is turned into one chat-formatted string by
# formatting_prompts_func, sequences are capped at 512 tokens, and examples are
# not packed together.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    tokenizer=tokenizer,
    formatting_func=formatting_prompts_func,
    peft_config=lora_config,
    max_seq_length=512,
    packing=False,
)

trainer.train()

# Write the trained model held by the trainer to output_dir.
trainer.model.save_pretrained(output_dir)

  0%|          | 0/1 [00:00<?, ?ba/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
3,3.5784
6,1.7792
9,1.0723
12,0.97
15,0.8787
18,0.5028
21,1.4668
24,0.8866
27,0.4705
30,0.5386


In [11]:
# Wrap `model` as a PeftModel using the files saved to `output_dir` after training.
# NOTE(review): `model` is the same object that was handed to SFTTrainer. If the
# trainer injected LoRA layers into it in place, `trainer.model` may already be the
# trained PEFT model and this reload may be redundant -- confirm before relying on it.
model = PeftModel.from_pretrained(model, output_dir)

In [12]:
# Reuse the system prompt of the first training record for the inference test below.
system_message = dataset[0]['system']
print(system_message)

You are Keith Low, a 3rd-year computer science student. User will ask you a question. Your goal is to answer the question as faithfully as you can.


In [13]:
prompt = "So tell me about yourself"

# Two-turn conversation (system + user); the assistant turn is left open.
sys_content = {"role": "system", "content": f"{system_message}"}
user_content = {"role": "user", "content": f"{prompt}"}
chat = [sys_content, user_content]

# Same conversation rendered twice: once as text for inspection, once as a
# PyTorch tensor of token ids. add_generation_prompt appends the assistant header.
message = tokenizer.apply_chat_template(
    chat,
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tokenizer.apply_chat_template(
    chat,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)

print(message)
print(inputs)

<|im_start|>system
You are Keith Low, a 3rd-year computer science student. User will ask you a question. Your goal is to answer the question as faithfully as you can.<|im_end|>
<|im_start|>user
So tell me about yourself<|im_end|>
<|im_start|>assistant

tensor([[32001,  1587,    13,  1976,   460, 22759, 13091, 28725,   264, 28705,
         28770,  5240, 28733,  4395,  6074,  6691,  5716, 28723,  1247,   622,
          1460,   368,   264,  2996, 28723,  3604,  5541,   349,   298,  4372,
           272,  2996,   390,  7152,  3071,   390,   368,   541, 28723, 32000,
         32001,  2188,    13,  5142,  1912,   528,   684,  3936, 32000, 32001,
         13892,    13]])


In [None]:
# Upload the model to a private Hub repository named after output_dir.
# `token=True` replaces the deprecated `use_auth_token=True`; it uses the
# credential stored by the login() call earlier in the notebook.
model.push_to_hub(
    output_dir,
    token=True,
    commit_message="basic training",
    private=True,
)