## GGUF TEST

```bash
# Download a GGUF build of Codestral-22B and serve it through vLLM's
# OpenAI-compatible API server.
#
# Required environment variables (must be set before running):
#   HF_MODEL_NAME      Hugging Face repo id of the form "<owner>/<repo>"
#   LIBRARY_BASE_PATH  base directory holding models/ and tabbyAPI/
#   MAX_CONTEXT_LEN    value passed to --max-model-len
: "${HF_MODEL_NAME:?HF_MODEL_NAME must be set}"
: "${LIBRARY_BASE_PATH:?LIBRARY_BASE_PATH must be set}"
: "${MAX_CONTEXT_LEN:?MAX_CONTEXT_LEN must be set}"

# Strip everything up to and including the first "/" of the repo id.
# This is derived BEFORE the downloads because the download directory depends
# on it; otherwise the files would land in a different directory than the one
# MODEL_PATH points to.
SERVED_MODEL_NAME="${HF_MODEL_NAME#*/}"
MODEL_DIR="$LIBRARY_BASE_PATH/models/$SERVED_MODEL_NAME"
MODEL_PATH="$MODEL_DIR/Codestral-22B-v0.1-Q8_0.gguf"

# Download the model
huggingface-cli download "$HF_MODEL_NAME" --repo-type model --revision main --local-dir "$MODEL_DIR" --include "Codestral-22B-v0.1-Q8_0.gguf"
huggingface-cli download "$HF_MODEL_NAME" --repo-type model --revision main --local-dir "$MODEL_DIR" --include "Codestral-22B-v0.1.imatrix"


# Start the server
# Only the first line of the nvidia-smi output is kept as the GPU count.
GPU_COUNT=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits | head -n 1)

python -m vllm.entrypoints.openai.api_server \
    --host 0.0.0.0 \
    --port 8000 \
    --enable-prefix-caching \
    --gpu-memory-utilization 0.97 \
    --tensor-parallel-size "$GPU_COUNT" \
    --max-model-len "$MAX_CONTEXT_LEN" \
    --model "$MODEL_PATH" \
    --served-model-name "$SERVED_MODEL_NAME" \
    --num-scheduler-steps 8 \
    --use-v2-block-manager \
    --disable-log-stats \
    --chat-template "$LIBRARY_BASE_PATH/tabbyAPI/prompt_templates/codestral_gguf.jinja"
```

In [None]:
from huggingface_hub import hf_hub_download
from vllm import LLM, SamplingParams

def run_gguf_inference(model_path):
    """Load a GGUF model with vLLM and print the result of one chat prompt.

    Args:
        model_path: Value passed as ``model`` to ``vllm.LLM``; the caller in
            this cell supplies the path returned by ``hf_hub_download``.

    The tokenizer is taken from the "meta-llama/Meta-Llama-3.1-8B-Instruct"
    repository rather than from the GGUF file, and generation uses
    temperature 0 with at most 1000 new tokens. Each output object returned
    by ``llm.generate`` is printed as-is; nothing is returned.
    """
    # Indentation uses spaces only: the previous mix of tabs and spaces made
    # Python 3 reject this function with a TabError.
    llm = LLM(
        model=model_path,
        max_model_len=4096,
        tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct",
        tensor_parallel_size=1,
    )

    # Render the single-turn conversation to a prompt string (tokenize=False)
    # ending with the generation prompt, using the tokenizer's chat template.
    tokenizer = llm.get_tokenizer()
    conversations = tokenizer.apply_chat_template(
        [{'role': 'user', 'content': 'what is the future of AI?'}],
        tokenize=False,
        add_generation_prompt=True,
    )

    outputs = llm.generate(
        [conversations],
        SamplingParams(temperature=0, max_tokens=1000),
    )
    for output in outputs:
        print(output)


if __name__ == "__main__":
    # Fetch the Q2_K GGUF file of Llama 3.1 8B Instruct from the Hugging Face
    # Hub, then hand the download result to the inference helper.
    gguf_file = hf_hub_download(
        repo_id="bullerwins/Meta-Llama-3.1-8B-Instruct-GGUF",
        filename="Meta-Llama-3.1-8B-Instruct-Q2_K.gguf",
    )
    run_gguf_inference(gguf_file)

In [None]:
# Send one non-streaming chat completion request (system + user message)
# to the /v1/chat/completions endpoint of the served model.
curl -X POST \
    -H "Content-Type: application/json" \
    -d '{
  "model": "Codestral-22B-v0.1-GGUF",
  "messages": [
  {
      "role": "system",
      "content": "You are a helpful virtual coder assistant trained by OpenAI."
  },
  {
    "role": "user",
    "content": "Implement fibonacci series in python."
  }
  ], 
  "temperature": 0.8,
  "stream": false
}' \
    https://1x0y86v7sz5g5d-8000.proxy.runpod.net/v1/chat/completions