In [1]:
import os, math, numpy as np
# Note: vLLM doesn't work on macOS - it requires CUDA on Linux
# Using transformers instead, which supports macOS with Metal acceleration

In [2]:
%%time
# vLLM doesn't work on macOS - using transformers + accelerate instead
# These work on macOS with Metal (Apple GPU) support
%pip install accelerate
# Optional: Install AutoAWQ for AWQ quantization support
# %pip install autoawq
%pip install grpcio==1.62.2
%pip install antlr4-python3-runtime==4.11.0

/Users/taha/Projects/llm_engineering/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.
/Users/taha/Projects/llm_engineering/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.
/Users/taha/Projects/llm_engineering/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.
CPU times: user 5.48 ms, sys: 12.8 ms, total: 18.3 ms
Wall time: 676 ms


## Note: vLLM Compatibility Issue

**vLLM doesn't work on macOS** because it requires:
- CUDA (NVIDIA GPU support)
- Linux operating system

### Alternatives:
1. **Use transformers + accelerate** (implemented below) - Works on macOS with Metal acceleration
2. **Use a Linux server with CUDA** - If you need vLLM's specific features
3. **Use cloud services** - Run vLLM on cloud GPU instances

### What changed:
- Replaced `vllm.LLM()` with `AutoModelForCausalLM` from transformers
- Uses Metal (Apple GPU) on macOS if available
- Supports AWQ quantization via AutoAWQ library
- Falls back to 4-bit quantization if AWQ not available


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# macOS-compatible alternative to vLLM
# Note: tensor_parallel_size not available on macOS (requires CUDA)
model_path = "/kaggle/input/bagel-v3-343"  # Update this path for your local model

# Check if Metal (Apple GPU) is available
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True
)

# Load model with AWQ quantization support
# For AWQ models, use AutoAWQ if available, otherwise use standard loading
try:
    from awq import AutoAWQForCausalLM
    print("Loading with AWQ quantization...")
    llm = AutoAWQForCausalLM.from_quantized(
        model_path,
        device_map="auto",
        trust_remote_code=True,
        max_model_len=1024,
    )
except ImportError:
    print("AWQ not available, loading with standard quantization...")
    # Fallback to standard loading with 4-bit quantization
    from transformers import BitsAndBytesConfig
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    llm = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16 if device == "mps" else torch.float32,
        max_length=1024,
    )

print("Model loaded successfully!")


Using device: mps


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/kaggle/input/bagel-v3-343'. Use `repo_type` argument if needed.

In [None]:
# Text generation function (similar to vLLM's interface)
def generate(prompts, max_tokens=256, temperature=0.7, top_p=0.9):
    """
    Generate text from prompts (similar to vLLM interface)
    
    Args:
        prompts: Single string or list of strings
        max_tokens: Maximum tokens to generate
        temperature: Sampling temperature
        top_p: Top-p sampling parameter
    """
    if isinstance(prompts, str):
        prompts = [prompts]
    
    results = []
    for prompt in prompts:
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt")
        
        # Move to appropriate device
        if hasattr(llm, 'device'):
            inputs = {k: v.to(llm.device) for k, v in inputs.items()}
        elif device != "cpu":
            inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Generate
        with torch.no_grad():
            outputs = llm.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        
        # Decode
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        results.append(generated_text)
    
    return results[0] if len(results) == 1 else results

# Example usage:
# output = generate("Hello, how are you?", max_tokens=50)
# print(output)


In [None]:
import pandas as pd
VALIDATE = 128

test = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/test.csv") 
if len(test)==3:
    test = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/train.csv")
    test = test.iloc[:VALIDATE]
print( test.shape )
test.head(1)

In [None]:
from typing import Any, Dict, List
from transformers import LogitsProcessor
import torch

choices = ["A","B","tie"]

KEEP = []
for x in choices:
    c = tokenizer.encode(x,add_special_tokens=False)[0]
    KEEP.append(c)
print(f"Force predictions to be tokens {KEEP} which are {choices}.")

class DigitLogitsProcessor(LogitsProcessor):
    def __init__(self, tokenizer):
        self.allowed_ids = KEEP
        
    def __call__(self, input_ids: List[int], scores: torch.Tensor) -> torch.Tensor:
        scores[self.allowed_ids] += 100
        return scores

In [None]:
sys_prompt = """Please read the following prompt and two responses. Determine which response is better.
If the responses are relatively the same, respond with 'tie'. Otherwise respond with 'A' or 'B' to indicate which is better."""

In [None]:
SS = "#"*25 + "\n"


In [None]:
all_prompts = []
for index,row in test.iterrows():
    
    a = " ".join(eval(row.prompt, {"null": ""}))
    b = " ".join(eval(row.response_a, {"null": ""}))
    c = " ".join(eval(row.response_b, {"null": ""}))
    
    prompt = f"{SS}PROMPT: "+a+f"\n\n{SS}RESPONSE A: "+b+f"\n\n{SS}RESPONSE B: "+c+"\n\n"
    
    formatted_sample = sys_prompt + "\n\n" + prompt
    
    all_prompts.append( formatted_sample )