In [1]:
# %pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
# %pip install -q datasets bitsandbytes einops wandb

In [1]:
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
from typing import List
import torch
from torch import cuda, bfloat16
from datasets import load_dataset
import os
 
import matplotlib.pyplot as plt
import matplotlib as mpl
# import seaborn as sns
from pylab import rcParams
# from trl import SFTTrainer
 
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

In [2]:
model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

bnb_config = transformers.BitsAndBytesConfig(
    load_in_8bit=True,
)


# Need auth token for these
hf_auth = os.environ.get('hf_token')

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

model.eval()
print(f"Model loaded on {device}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded on cuda:0


In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
 
# tokenizer.pad_token_id = (0)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [4]:
data = load_dataset("json", data_files="/home/hb/5G_spec_knowledge.json")
data["train"]

Downloading and preparing dataset json/default to /home/hb/.cache/huggingface/datasets/json/default-4c3743e548ed87ee/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/hb/.cache/huggingface/datasets/json/default-4c3743e548ed87ee/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['instruction', 'output'],
    num_rows: 99979
})

In [5]:
# CUTOFF_LEN = None

# def generate_prompt(data_point):
#     return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.  # noqa: E501
# ### Instruction:
# {data_point["instruction"]}
# ### Input:
# {data_point["input"]}
# ### Response:
# {data_point["output"]}"""
 
 
# def tokenize(prompt, add_eos_token=True):
#     result = tokenizer(
#         prompt,
#         truncation=True,
#         max_length=CUTOFF_LEN,
#         padding=False,
#         return_tensors=None,
#     )
#     if (
#         result["input_ids"][-1] != tokenizer.eos_token_id
#         and len(result["input_ids"]) < CUTOFF_LEN
#         and add_eos_token
#     ):
#         result["input_ids"].append(tokenizer.eos_token_id)
#         result["attention_mask"].append(1)
 
#     result["labels"] = result["input_ids"].copy()
 
#     return result
 
# def generate_and_tokenize_prompt(data_point):
#     full_prompt = generate_prompt(data_point)
#     tokenized_full_prompt = tokenize(full_prompt)
#     return tokenized_full_prompt

In [9]:
CUTOFF_LEN = 2048

def generate_prompt(data_point):
    return f"""Below is an instruction that describes a task, paired with an output that provides the completion of the task.  # noqa: E501
### Instruction:
{data_point["instruction"]}
### Response:
{data_point["output"]}"""

def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < CUTOFF_LEN
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt


In [10]:
train_val = data["train"].train_test_split(
    test_size=9900, shuffle=True, seed=42
)
train_data = (
    train_val["train"].map(generate_and_tokenize_prompt)
)
val_data = (
    train_val["test"].map(generate_and_tokenize_prompt)
)

Loading cached split indices for dataset at /home/hb/.cache/huggingface/datasets/json/default-4c3743e548ed87ee/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-f816a8b94b6cc55e.arrow and /home/hb/.cache/huggingface/datasets/json/default-4c3743e548ed87ee/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-4f83e7ef859965f1.arrow


Map:   0%|          | 0/90079 [00:00<?, ? examples/s]

Map:   0%|          | 0/9900 [00:00<?, ? examples/s]

In [16]:
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64
 
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

output_dir = "hyonbo/llama_5G_spec"
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
optim = "paged_adamw_32bit"
save_steps = 500
logging_steps = 200
learning_rate = 1e-4
max_grad_norm = 0.3
max_steps = 10000
warmup_ratio = 0.05
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    # save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

In [17]:
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, return_tensors="pt", padding=True
)

In [18]:
from trl import SFTTrainer

max_seq_length = None

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    peft_config=peft_config,
    dataset_text_field="output",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)



Map:   0%|          | 0/90079 [00:00<?, ? examples/s]

Map:   0%|          | 0/9900 [00:00<?, ? examples/s]

In [19]:
trainer.train()



Step,Training Loss
200,3.0523
400,2.4988
600,2.4331
800,2.2975
1000,2.247
1200,2.1573
1400,2.1735
1600,2.0359
1800,2.105
2000,1.9617




TrainOutput(global_step=10000, training_loss=1.8433364318847656, metrics={'train_runtime': 20411.4372, 'train_samples_per_second': 1.96, 'train_steps_per_second': 0.49, 'total_flos': 3.941392648546222e+17, 'train_loss': 1.8433364318847656, 'epoch': 0.44})

In [20]:
new_model = "llama_5G_spec_10K"

trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

('llama_5G_spec_10K/tokenizer_config.json',
 'llama_5G_spec_10K/special_tokens_map.json',
 'llama_5G_spec_10K/tokenizer.model',
 'llama_5G_spec_10K/added_tokens.json',
 'llama_5G_spec_10K/tokenizer.json')

# Saving the model

In [1]:
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
import torch

model_id = 'meta-llama/Llama-2-7b-chat-hf'
new_model = "llama_5G_spec"

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map='auto',
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
model.push_to_hub('hyonbokan/llama_5G_spec_2K')
tokenizer.push_to_hub('hyonbokan/llama_5G_spec_2K')

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hyonbokan/llama_5G_spec_2K/commit/526ea76f411fc4a7ffeb37bf542b3cf57ea059ff', commit_message='Upload tokenizer', commit_description='', oid='526ea76f411fc4a7ffeb37bf542b3cf57ea059ff', pr_url=None, pr_revision=None, pr_num=None)