## Merge base model and LoRA adapter for Llama 2 13B

Make sure you pick an instance type with enough memory. Llama 2 13B needs about 26 GB of memory to perform the merge.

In [None]:
!pip install -Uq peft==0.4.0
!pip install -Uq bitsandbytes==0.40.2
!pip install -Uq sentencepiece

### > Setup

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

### > Download a LoRA adapter as an example

In [None]:
from huggingface_hub import snapshot_download

# Hub repository (and branch) of the example LoRA adapter, plus the local
# folder it is downloaded into. `lora_local_dir` is read again by the merge cell.
lora_adapter_id, revision = "Mikael110/llama-2-13b-guanaco-qlora", "main"
lora_local_dir = "lora-adapter"

# Symlinks are disabled for the local directory; the call's return value is
# left as the cell's last expression so it is displayed.
snapshot_download(
    repo_id=lora_adapter_id,
    revision=revision,
    local_dir=lora_local_dir,
    local_dir_use_symlinks=False,
)

### Merge the model with LoRA weights

Save the combined model and tokenizer

In [None]:
model_name = "NousResearch/Llama-2-13b-hf"

# Load the base model in FP16 (no quantization config is passed) so the LoRA
# weights can be merged into it.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Attach the adapter stored in `lora_local_dir`, then fold it into the base
# weights and unload the PEFT wrapper.
model = PeftModel.from_pretrained(base_model, lora_local_dir)
model = model.merge_and_unload()

# NOTE: despite the "4bit" in the directory name, nothing in this cell
# quantizes the model; the weights are saved in the FP16 dtype loaded above.
save_dir = "merged-4bit"
model.save_pretrained(save_dir, safe_serialization=True, max_shard_size="2GB")

# Reload the base model's tokenizer so it can be saved next to the merged weights.
# trust_remote_code is deliberately not enabled: the model above is loaded from
# the same repo without it, and enabling it would allow Python code shipped in
# the Hub repository to execute locally.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer.save_pretrained(save_dir)

### > Upload the combined model to S3

In [None]:
import sagemaker
bucket = sagemaker.Session().default_bucket()
prefix  = f"{model_name}/models"
model_data_s3_location = f"s3://{bucket}/{prefix}"
!cd {save_dir} && aws s3 cp --recursive . {model_data_s3_location}

### > Store the parameters in the environment for downstream processes

In [None]:
# Persist these variables with IPython's %store magic so that another
# notebook/kernel can restore them later with `%store -r`.
# S3 URI the merged model and tokenizer were uploaded to.
%store model_data_s3_location
# Hugging Face Hub id of the base model used for the merge.
%store model_name