# 🧠 LLaMA 3 GPU Inference on RunPod
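Setup log for Week 2 – Day 1 of the LLM Evaluation Roadmap. The notebook builds llama.cpp with CUDA, downloads the Llama 3 8B Instruct GGUF model, serves it with llama-cpp-python, and evaluates it on `truthfulqa_gen` with lm-evaluation-harness.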

In [None]:
# ✅ 1. System setup (RunPod template: PyTorch 2.1 + CUDA 11.8)
# apt-get is used instead of apt: apt warns that its CLI is not stable for
# scripted (non-interactive) use, while apt-get is the supported scripting front end.
!apt-get update
!apt-get install -y cmake build-essential libcurl4-openssl-dev git-lfs wget
# %pip (not !pip) guarantees the packages are installed into the environment
# of the running kernel rather than whichever pip is first on PATH.
%pip install --upgrade pip
%pip install huggingface_hub

In [None]:
# ✅ 2. Clone and build llama.cpp with CUDA
# Start from a clean checkout: remove any earlier copy, clone directly into
# ~/llama.cpp, then create the out-of-tree build directory that the build
# cell changes into.
!rm -rf ~/llama.cpp
!git clone https://github.com/ggerganov/llama.cpp.git ~/llama.cpp
!mkdir -p ~/llama.cpp/build


In [None]:
# Configure and compile llama.cpp with the CUDA backend and the example binaries.
# The job count is bounded to the number of CPU cores: a bare `make -j` starts
# an unlimited number of parallel compile jobs, which can exhaust RAM on the
# CUDA kernels. `cmake --build` drives whichever generator was configured.
!cd ~/llama.cpp/build && cmake .. -DGGML_CUDA=on -DLLAMA_BUILD_EXAMPLES=on && cmake --build . --config Release -j "$(nproc)"

In [None]:
# ✅ 3. Authenticate with Hugging Face and download model
# The access token is taken from the HF_TOKEN environment variable (set it as
# a pod environment variable / secret before starting Jupyter) so that no
# credential is ever saved inside the notebook. A literal placeholder written
# with angle brackets would also be parsed by the shell as I/O redirection.
!huggingface-cli login --token "$HF_TOKEN"

# Download model (4.6GB) into llama.cpp directory
!mkdir -p ~/llama.cpp/models/llama-3
!huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF --include "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf" --local-dir ~/llama.cpp/models/llama-3


In [None]:
# ✅ 4. Run inference!
# Optional smoke test of the freshly built llama-cli binary: it loads the
# downloaded GGUF model (-m) and answers a single prompt (-p).
# NOTE(review): the command is disabled in the notebook as saved, so it does not
# run on "Run All"; the reason is not recorded here — confirm before re-enabling.
#!~/llama.cpp/build/bin/llama-cli -m ~/llama.cpp/models/llama-3/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf -p "What is the capital of France?" -n 128 --color

In [None]:
# Install lm-evaluation-harness from the upstream repository.
# The explicit %pip magic does not depend on IPython's automagic setting (a
# bare `pip ...` line is a syntax error when automagic is off) and installs
# into the environment of the running kernel.
%pip install "git+https://github.com/EleutherAI/lm-evaluation-harness.git"


In [None]:
# Rebuild llama-cpp-python (plus its server extra) from source with CUDA enabled.
# - GGML_CUDA is the current CMake switch, the same one used for the llama.cpp
#   build; the older LLAMA_CUDA spelling is deprecated upstream.
# - CMAKE_CUDA_ARCHITECTURES is the standard CMake variable for the target GPU
#   architecture. 86 targets compute capability 8.6 (e.g. RTX 30xx / A40);
#   change it if the pod uses a different GPU.
# - --no-binary is restricted to llama-cpp-python so only that package is
#   compiled; its dependencies still install from wheels. --no-cache-dir
#   prevents pip from reusing a previously built (possibly CPU-only) wheel.
!pip install -U sentencepiece
!pip uninstall -y llama-cpp-python
!CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=86" \
  pip install --no-cache-dir --no-binary llama-cpp-python "llama-cpp-python[server]"



In [None]:
# Start the llama-cpp-python server in the background and wait until it is
# reachable. A raw `python -m ...` line is not valid in a code cell, and a
# foreground server would block the kernel so the evaluation cell could never
# run. Server output goes to SERVER_LOG instead of the notebook.
import os
import socket
import subprocess
import sys
import time

SERVER_HOST = "127.0.0.1"
SERVER_PORT = 8000
SERVER_LOG = "llama_server.log"
STARTUP_TIMEOUT_S = 600  # generous: the model must be loaded onto the GPU first
MODEL_PATH = os.path.expanduser(
    "~/llama.cpp/models/llama-3/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"
)


def _server_port_open() -> bool:
    """Return True if something is accepting TCP connections on the server address."""
    try:
        socket.create_connection((SERVER_HOST, SERVER_PORT), timeout=1).close()
        return True
    except OSError:
        return False


if _server_port_open():
    # Re-running the cell must not spawn a second server on the same port.
    print(f"Server already listening on {SERVER_HOST}:{SERVER_PORT}; not starting another.")
else:
    # The child process keeps its own copy of the log file descriptor, so the
    # parent's handle can be closed as soon as the process has been spawned.
    with open(SERVER_LOG, "w") as log_file:
        server_proc = subprocess.Popen(
            [
                sys.executable, "-m", "llama_cpp.server",
                "--model", MODEL_PATH,
                "--n_gpu_layers", "-1",
                "--host", SERVER_HOST,
                "--port", str(SERVER_PORT),
                "--verbose", "1",
            ],
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )

    deadline = time.monotonic() + STARTUP_TIMEOUT_S
    while not _server_port_open():
        if server_proc.poll() is not None:
            raise RuntimeError(
                f"llama_cpp.server exited with code {server_proc.returncode}; see {SERVER_LOG}"
            )
        if time.monotonic() >= deadline:
            server_proc.terminate()
            raise TimeoutError(
                f"llama_cpp.server did not open port {SERVER_PORT} within "
                f"{STARTUP_TIMEOUT_S}s; see {SERVER_LOG}"
            )
        time.sleep(1)
    print(f"Server ready on http://{SERVER_HOST}:{SERVER_PORT} (pid {server_proc.pid}).")


In [None]:
# Evaluate the locally served model with lm-evaluation-harness.
# The gguf backend talks to the server at base_url; the run covers the
# truthfulqa_gen task, capped by --limit, with a fixed seed, and writes
# results plus per-sample logs under results/.
!python -m lm_eval \
  --model gguf \
  --model_args "base_url=http://127.0.0.1:8000,temperature=0" \
  --tasks truthfulqa_gen \
  --seed 42 \
  --limit 100 \
  --log_samples \
  --write_out \
  --output_path results/llama3_truthfulqa_sampled.json \
  --verbosity INFO