In [None]:
# Quantization methods available: q2_k, q3_k_l, q3_k_m, q3_k_s, q4_0, q4_1, q4_k_m, q4_k_s, q5_0, q5_1, q5_k_m, q5_k_s, q6_k, q8_0
# HF_TOKEN holds your [Hugging Face access token](https://huggingface.co/settings/tokens);
# it is passed as `token=` when the repo is created and the files are uploaded.
# HF_TOKEN must be a write token if you want to upload models to Hugging Face.

# Hub repository to convert; the segment after the last "/" is reused as the
# local output folder name and as the prefix of every generated file.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_NAME = MODEL_ID.rsplit("/", 1)[-1]

# Comma-separated list of methods to build. Spaces are removed before the
# string is split, so the result is a plain list of method names.
QUANTIZATION_METHODS = "q4_k_m, q5_k_m, q6_k".replace(" ", "").split(",")

# Account that will own the "<MODEL_NAME>-GGUF" repo, and the token used to push to it.
HF_USERNAME = ""
HF_TOKEN = ""

In [None]:
!pip install -U "huggingface_hub[cli]"
!pip install sentencepiece

from huggingface_hub import notebook_login
notebook_login()

In [None]:
from huggingface_hub import snapshot_download

# Local folder that receives the files of MODEL_ID.
base_model = "./original_model/"

# Fetch the repository snapshot into base_model, with symlinks disabled.
snapshot_download(
    repo_id=MODEL_ID,
    local_dir=base_model,
    local_dir_use_symlinks=False,
)

In [None]:
# Install llama.cpp
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make
!pip install -r llama.cpp/requirements.txt

# Convert to fp16
fp16 = f"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin"
!python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16} # --vocab-type bpe # uncomment vocab-type fro llama models

# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
    qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
    !./llama.cpp/quantize {fp16} {qtype} {method}

In [None]:
# optional step, upload quantized model to HF

from huggingface_hub import create_repo, HfApi

# Fail early with a clear message: an empty username would yield the invalid
# repo id "/<MODEL_NAME>-GGUF".
if not HF_USERNAME:
    raise ValueError("Set HF_USERNAME in the configuration cell before uploading.")

api = HfApi()

# Target repository, shared by the create and upload calls.
repo_id = f"{HF_USERNAME}/{MODEL_NAME}-GGUF"

# An empty HF_TOKEN is passed as None so huggingface_hub falls back to the
# credentials saved by notebook_login() instead of sending an empty token.
hf_token = HF_TOKEN or None

# Create empty repo (no-op if it already exists)
create_repo(
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True,
    token=hf_token,
)

# Upload gguf files only; the fp16 .bin intermediate in the same folder is skipped
api.upload_folder(
    folder_path=MODEL_NAME,
    repo_id=repo_id,
    allow_patterns=["*.gguf"],
    token=hf_token,
)