# Load and Merge Model

### If Using Colab: Connect to Google Drive

In [None]:
# Colab only: mount Google Drive at /content/gdrive. Skip this cell on a GPU server.
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
# Colab only: switch the working directory to the project folder on the mounted drive.
%cd /content/gdrive/MyDrive/LLMStudy/

### Install Packages

#### If using Colab: run the install cells every time
#### If using a GPU server: run the install cells only the first time

In [None]:
# Install the libraries used by this notebook; every version is pinned explicitly.
%pip install transformers==4.46.3
%pip install datasets==3.2.0
%pip install accelerate==1.2.1
%pip install peft==0.14.0
%pip install trl==0.12.2
%pip install bitsandbytes==0.45.0

In [None]:
%pip install huggingface_hub

#### Check Cuda & Torch

In [1]:
import torch
# True when PyTorch can use a CUDA device in this runtime.
res = torch.cuda.is_available()
print(res)

True


### import packages

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format



### Log in to Hugging Face API

In [3]:
# Never store the access token in the notebook: read it from the HF_TOKEN environment
# variable, or ask for it interactively (the input is not echoed and not saved in the file).
# NOTE(review): any token that was ever committed in plain text in this cell should be
# revoked and regenerated in the Hugging Face account settings.
import os
from getpass import getpass

from huggingface_hub import login

API_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

# Register the token so that later hub downloads and uploads are authenticated.
login(token=API_TOKEN)

### model id

In [4]:
# Identifiers of the models used in this notebook
# Base model on the Hugging Face hub that the adapter is attached to and merged into
base_model = "Bllossom/llama-3.2-Korean-Bllossom-3B"

# Fine-tuned model name
# NOTE(review): this value is reassigned to "rssaem/llama-3.2-3b-bts" further down, before
# it is first used — consider keeping a single definition.
new_model = "llama-3.2-3b-bts"

# Load base and new model

In [5]:
# Tokenizer that ships with the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Base causal LM, loaded in float16; device placement is delegated to device_map="auto"
BaseModel = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
    return_dict=True,
)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Make sure the tokenizer, the model config and the generation config all agree on one
# padding token id.
if tokenizer.pad_token_id is None:
    # No dedicated pad token: fall back to the tokenizer's end-of-sequence token.
    tokenizer.pad_token_id = tokenizer.eos_token_id
if BaseModel.config.pad_token_id is None:
    # Use the tokenizer's id rather than config.eos_token_id: the config value may be a
    # list of several ids, whereas pad_token_id has to be a single integer.
    BaseModel.config.pad_token_id = tokenizer.pad_token_id
if BaseModel.generation_config.pad_token_id is None:
    # generate() reads the generation config, not the model config; without this it warns
    # "Setting `pad_token_id` to `eos_token_id`" on every call.
    BaseModel.generation_config.pad_token_id = tokenizer.pad_token_id

# Merge Model (Base Model + LoRA PEFT-Trained Adapter)

In [8]:
# Number of rows (token ids) in the input embedding matrix of the freshly loaded base model
BaseModel.model.embed_tokens.weight.shape[0]

128256

In [None]:
# Apply TRL's chat format to the base model and tokenizer (this cell does not merge the adapter).
# The existing chat template is cleared first, then setup_chat_format returns the model and tokenizer.
# NOTE(review): this cell has no execution count, so it appears to have been skipped in the
# recorded run — confirm whether it is meant to run before the resize below.
# NOTE(review): `BaseModel_chat` is not referenced again in this notebook.
tokenizer.chat_template = None
BaseModel_chat, tokenizer = setup_chat_format(BaseModel, tokenizer)


In [None]:
# Number of rows in the input embedding matrix at this point of the notebook
BaseModel.model.embed_tokens.weight.shape[0]

In [9]:
# Resize the embedding matrix to 128257 rows (one more than the 128256 reported above).
# NOTE(review): presumably this matches the vocabulary size the adapter was trained with —
# confirm, and consider deriving the number instead of hard-coding it.
BaseModel.resize_token_embeddings(128257)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(128257, 3072)

In [11]:
# Confirm the row count of the input embedding matrix after the resize
BaseModel.model.embed_tokens.weight.shape[0]

128257

In [12]:

# Hub repository that holds the fine-tuned adapter
new_model = "rssaem/llama-3.2-3b-bts"

# Attach the adapter weights to the base model
PeftModelBaseNew = PeftModel.from_pretrained(model=BaseModel, model_id=new_model)


In [13]:
# Merge the adapter into the base model; the result is what gets tested, saved and pushed below.
MergeModel = PeftModelBaseNew.merge_and_unload()

In [None]:
# Show which token id the tokenizer currently uses for padding.
print(tokenizer.pad_token_id)

### Verify that the adapter-merged model works (어댑터 연결 모델 정상 작동 확인)

In [14]:
def generate_response(messages, model, return_full_text=True):
    """Generate a sampled chat completion for ``messages`` with ``model``.

    Uses the notebook-level ``tokenizer`` to render the chat template, tokenize the
    prompt and decode the result.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts forming the conversation.
        model: Model exposing ``generate`` and ``device``.
        return_full_text: When True (default, the original behavior) the decoded text
            contains the prompt followed by the answer. When False only the newly
            generated tokens are decoded.

    Returns:
        The decoded text with special tokens removed.
    """
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # The rendered chat template is expected to already contain the special tokens it
    # needs, so do not let the tokenizer add them a second time (avoids a duplicated
    # begin-of-text token). Tensors go to the model's own device instead of a fixed "cuda".
    inputs = tokenizer(
        prompt,
        return_tensors='pt',
        padding=True,
        truncation=True,
        add_special_tokens=False,
    ).to(model.device)

    # Stop on either of the two end markers.
    terminators = [
        tokenizer.convert_tokens_to_ids("<|end_of_text|>"),
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        eos_token_id=terminators,
        # Passing the pad id explicitly avoids the "Setting `pad_token_id`" warning.
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )

    generated_ids = outputs[0]
    if not return_full_text:
        # Drop the prompt tokens so that only the model's answer is decoded.
        prompt_length = inputs["input_ids"].shape[-1]
        generated_ids = generated_ids[prompt_length:]

    return tokenizer.decode(generated_ids, skip_special_tokens=True)

# Smoke-test prompt (Korean): "What is the concept of BTS's music?"
instruction = "BTS 음악의 컨셉은 무엇입니까?"
messages = [{"role": "user", "content": instruction}]

# Run the merged model on a single-turn conversation and display the decoded text
generate_response(messages, MergeModel)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"system\n\nCutting Knowledge Date: December 2023\nToday Date: 16 Dec 2024\n\nYou are a helpful AI assistant. Please answer the user's questions kindly. 당신은 유능한 AI 어시스턴트 입니다. 사용자의 질문에 대해 친절하게 답변해주세요.user\n\nBTS 음악의 컨셉은 무엇입니까?assistant\n\nBTS는 다양한 음악 컨셉을 통해 팬들과 상호작용하며, 음악적, 사회적, 문화적 소통을 통해 다양한 주제를 다루고 있습니다. 일반적으로 BTS의 음악 컨셉은 다음과 같은 요소를 포함합니다:\n\n1. **정체성과 상호작용**: 팬들과의 상호작용을 통해 팬클럽 문화를 강화하고, 팬들과의 소통을 통해 음악적 영감을 얻습니다.\n2. **정치적, 사회적 주제**: 정치적, 사회적 문제에 대한 소통과 참여를 통해 팬들과 함께 문제를 해결하는 방안을 모색합니다.\n3. **문화적 다양성과 inclusivity**: 다양한 문화적 배경을 가진 팬들과의 상호작용을 통해 문화적 다양성을 강조하고, inclusivity를 실현합니다.\n4. **개인적, 심리적 주제**: 개인의 감정, 경험, 생각을 담아내며, 팬들과 공감대를 형성합니다.\n5. **미학적, 시각적 요소**: 음악 비디오, 앨범 커버, 라이브 공연 등 미적 요소로 음악적 완성도를 높입니다."

# Push Model to Hugging Face

In [15]:
# Name used for the merged model; the variant with the "rssaem/" prefix is kept commented out.
#hf_merged_model = "rssaem/Llama3_2_MergeTest"
hf_merged_model = "Llama3_2_MergeTest"

# Save the merged model and the tokenizer under that name
# (the recorded output lists files under Llama3_2_MergeTest/).
MergeModel.save_pretrained(hf_merged_model)
tokenizer.save_pretrained(hf_merged_model)

('Llama3_2_MergeTest/tokenizer_config.json',
 'Llama3_2_MergeTest/special_tokens_map.json',
 'Llama3_2_MergeTest/tokenizer.json')

In [16]:
# Upload the merged model and the tokenizer to the Hugging Face Hub under the same name
# (the recorded run ended up in the rssaem/Llama3_2_MergeTest repository).
# NOTE(review): use_temp_dir=False presumably reuses the local folder saved above — confirm.
MergeModel.push_to_hub(hf_merged_model, use_temp_dir=False)
tokenizer.push_to_hub(hf_merged_model, use_temp_dir=False)

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rssaem/Llama3_2_MergeTest/commit/d88bab42d59c540aaea24a4085b2ba6e6ed5271a', commit_message='Upload tokenizer', commit_description='', oid='d88bab42d59c540aaea24a4085b2ba6e6ed5271a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rssaem/Llama3_2_MergeTest', endpoint='https://huggingface.co', repo_type='model', repo_id='rssaem/Llama3_2_MergeTest'), pr_revision=None, pr_num=None)