In [None]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
FOLDERNAME = "Colab\ Notebooks/fetch-data"
%cd drive/MyDrive/$FOLDERNAME

In [None]:
# Optional dependency install, left disabled — presumably because the runtime
# already provides these packages (TODO confirm). Uncomment on a fresh environment.
# !pip install -q transformers peft bitsandbytes accelerate llama-cpp-python

## Merge

In [None]:
%%bash
# Log the Hugging Face CLI in so later cells can download from the Hub with the
# stored token. The token is read from the kernel environment rather than being
# pasted into the notebook: set it beforehand (e.g. `%env HF_TOKEN=...` or from
# Colab secrets). The cell aborts with a clear message when it is missing.
: "${HF_TOKEN:?HF_TOKEN is not set; define it in the kernel environment before running this cell}"
huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential

[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.


Token is valid (permission: fineGrained).
The token `llama-2` has been saved to /root/.cache/huggingface/stored_tokens
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Identifiers shared by the steps of this cell.
BASE_MODEL_ID = "google/gemma-2b"
ADAPTER_DIR = "lora_gemma2_resume"
MERGED_DIR = "gemma2_merged"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)

# Load the base model in float16; device_map="auto" lets the loader decide
# device placement (the GPU when one is available).
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
    offload_folder=None,
    offload_state_dict=False,
)

# Attach the LoRA adapter stored in ADAPTER_DIR and switch to inference mode.
peft_model = PeftModel.from_pretrained(base, ADAPTER_DIR)
peft_model.eval()

# Fold the LoRA weights into the base weights and drop the adapter wrappers.
merged = peft_model.merge_and_unload()

# Persist the merged weights together with the tokenizer so MERGED_DIR is a
# self-contained checkpoint; previously only the weights were written and the
# tokenizer files had to be copied in by later cells.
merged.save_pretrained(MERGED_DIR)
tokenizer.save_pretrained(MERGED_DIR)

## GGUF

In [None]:
# Refresh the package index and install CMake, which the llama.cpp build cells below invoke.
!apt-get update && apt-get install -y cmake

In [None]:
!git clone https://github.com/ggml-org/llama.cpp.git

In [None]:
%cd llama.cpp

In [None]:
!cmake -B build -S .

In [None]:
!cmake --build build -- -j$(nproc)

In [None]:
%cd ..

In [None]:
!pip install -q huggingface_hub

In [None]:
from huggingface_hub import hf_hub_download
import shutil, os

os.makedirs("gemma2_merged", exist_ok=True)

# Copy the base model's tokenizer files next to the merged weights.
# Each file is handled independently: a failure is reported and the loop
# continues with the next file (best effort).
for fn in ["tokenizer.model", "tokenizer.json"]:
    try:
        src = hf_hub_download(
            repo_id="google/gemma-2b",
            filename=fn,
            repo_type="model",
            # `token=True` uses the locally stored login token. It replaces the
            # deprecated `use_auth_token`; where a release rejects the old
            # keyword, the TypeError was swallowed by the handler below and
            # both downloads were skipped.
            token=True,
        )
        shutil.copy(src, "gemma2_merged/")
        print(f"✔ copied {fn}")
    except Exception as e:
        print(f"✘ {fn} not found or error: {e}")

In [None]:
# Copy tokenizer/vocabulary files from a local gemma2_original directory into gemma2_merged.
# NOTE(review): no cell visible in this notebook creates gemma2_original — presumably a
# local copy of the base model; confirm it exists before relying on this cell.
!cp gemma2_original/tokenizer.model      gemma2_merged/
!cp gemma2_original/tokenizer.json       gemma2_merged/
# vocab.json and merges.txt are treated as optional: error output is discarded
# and the command's exit status is forced to success.
!cp gemma2_original/vocab.json           gemma2_merged/ 2>/dev/null || true
!cp gemma2_original/merges.txt           gemma2_merged/ 2>/dev/null || true

In [None]:
!python3 llama.cpp/convert_hf_to_gguf.py \
  gemma2_merged \
  --outfile gemma2_merged-q8_0.gguf \
  --outtype q8_0