In [None]:
import os, math, numpy as np
# Note: vLLM doesn't work on macOS - it requires CUDA on Linux
# Using transformers instead, which supports macOS with Metal acceleration

In [None]:
%%time
# Install the extra packages this notebook needs; %%time reports how long the
# installs take.
# vLLM doesn't work on macOS - using transformers + accelerate instead
# These work on macOS with Metal (Apple GPU) support
# NOTE(review): `transformers` and `torch` are imported by later cells but are
# not installed here - presumably they are preinstalled in the kernel; confirm.
# NOTE(review): `accelerate` is unpinned, so a fresh run may resolve a different
# version than the one this notebook was developed against.
%pip install accelerate
# Optional: Install AutoAWQ for AWQ quantization support
# %pip install autoawq
# NOTE(review): nothing visible in this notebook imports grpcio or antlr4
# directly; these exact pins presumably work around a transitive dependency
# conflict - confirm they are still required.
%pip install grpcio==1.62.2
%pip install antlr4-python3-runtime==4.11.0

Collecting vllm==0.4.2
  Downloading vllm-0.4.2.tar.gz (588 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m588.8/588.8 kB[0m [31m4.0 MB/s[0m  [33m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[18 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "/Users/taha/.pyenv/versions/3.12.6/lib/python3.12/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py", line 389, in <module>
  [31m   [0m     main()
  [31m   [0m   File "/Users/taha/.pyenv/versions/3.12.6/lib/python3.12/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py", line 373, in main
  [31m   [0m     json_out["return_val"] = hook(**hook_input["kwargs"])
 

## Note: vLLM Compatibility Issue

**vLLM doesn't work on macOS** because it requires:
- CUDA (NVIDIA GPU support)
- Linux operating system

### Alternatives:
1. **Use transformers + accelerate** (implemented below) - Works on macOS with Metal acceleration
2. **Use a Linux server with CUDA** - If you need vLLM's specific features
3. **Use cloud services** - Run vLLM on cloud GPU instances

### What changed:
- Replaced `vllm.LLM()` with `AutoModelForCausalLM` from transformers
- Uses Metal (Apple GPU) on macOS if available
- Supports AWQ quantization via the AutoAWQ library
- Falls back to 4-bit quantization (bitsandbytes, via `BitsAndBytesConfig`) if the AutoAWQ library is not installed


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# macOS-compatible alternative to vLLM.
# vLLM engine arguments (tensor_parallel_size, max_model_len, ...) have no
# counterpart in the loaders used here, so none of them are passed below;
# output length is bounded per call through `max_new_tokens` at generation time.
model_path = "/kaggle/input/bagel-v3-343"  # Update this path for your local model

# Pick the device that input tensors are moved to when the loaded model does
# not expose its own `.device`: CUDA first (the default model path points at a
# Kaggle dataset), then Metal (Apple GPU), otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load tokenizer.
# NOTE: trust_remote_code=True lets the checkpoint run its own Python code while
# loading - only point `model_path` at checkpoints you trust.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True
)

# Load the model, preferring the most memory-efficient loader that works here:
#   1. AutoAWQ, when the `awq` package is importable;
#   2. 4-bit bitsandbytes quantization through transformers;
#   3. plain (unquantized) weights when bitsandbytes cannot be used, which is
#      typically the case on macOS / CPU-only hosts.
# NOTE(review): if `model_path` holds an AWQ-quantized checkpoint, steps 2 and 3
# presumably cannot load it without AutoAWQ installed - confirm for your model.
try:
    from awq import AutoAWQForCausalLM
    print("Loading with AWQ quantization...")
    llm = AutoAWQForCausalLM.from_quantized(
        model_path,
        device_map="auto",
        trust_remote_code=True,
    )
except ImportError:
    print("AWQ not available, loading with standard quantization...")
    from transformers import BitsAndBytesConfig

    # Keyword arguments shared by the quantized and unquantized attempts.
    # NOTE(review): `max_length` is kept from the original call; it looks like a
    # generation default rather than a loading option - confirm it is needed.
    load_kwargs = dict(
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16 if device == "mps" else torch.float32,
        max_length=1024,
    )
    try:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        llm = AutoModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=quantization_config,
            **load_kwargs,
        )
    except (ImportError, RuntimeError) as quant_error:
        # bitsandbytes is missing or has no usable backend on this machine:
        # report why and load the weights without quantization instead of
        # failing the whole cell.
        print(f"4-bit quantization unavailable ({quant_error}); loading unquantized weights...")
        llm = AutoModelForCausalLM.from_pretrained(
            model_path,
            **load_kwargs,
        )

print("Model loaded successfully!")


ModuleNotFoundError: No module named 'vllm'

In [None]:
# Text generation function (similar to vLLM's interface)
def generate(prompts, max_tokens=256, temperature=0.7, top_p=0.9):
    """
    Generate text from prompts (similar to vLLM interface)

    Relies on the notebook-level `tokenizer`, `llm` and `device` objects
    created by the model-loading cell.

    Args:
        prompts: Single string or list of strings
        max_tokens: Maximum number of new tokens to generate per prompt
        temperature: Sampling temperature; a value of 0 (or None) switches to
            greedy decoding instead of sampling
        top_p: Top-p sampling parameter (only used when sampling)

    Returns:
        A single decoded string when `prompts` is a string, otherwise a list
        with one decoded string per prompt (also for one-element and empty
        lists).
        NOTE(review): the whole output sequence is decoded, so for decoder-only
        models the text presumably starts with the prompt itself - confirm.
    """
    # Remember the input kind so the return shape mirrors it; deciding by
    # result count would hand a bare string back to a batch caller whose batch
    # happens to contain exactly one prompt.
    single_prompt = isinstance(prompts, str)
    if single_prompt:
        prompts = [prompts]

    # A non-positive temperature means "deterministic" in vLLM-style APIs, but
    # sampling needs a strictly positive temperature, so decode greedily then.
    if temperature is not None and temperature > 0:
        decoding_kwargs = {
            "do_sample": True,
            "temperature": temperature,
            "top_p": top_p,
        }
    else:
        decoding_kwargs = {"do_sample": False}

    results = []
    for prompt in prompts:
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt")

        # Move the tensors next to the model: prefer the model's own device,
        # fall back to the notebook-level `device` for wrappers without one.
        if hasattr(llm, 'device'):
            inputs = {k: v.to(llm.device) for k, v in inputs.items()}
        elif device != "cpu":
            inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate without tracking gradients (inference only)
        with torch.no_grad():
            outputs = llm.generate(
                **inputs,
                max_new_tokens=max_tokens,
                pad_token_id=tokenizer.eos_token_id,
                **decoding_kwargs,
            )

        # Decode the first (only) returned sequence
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        results.append(generated_text)

    return results[0] if single_prompt else results

# Example usage:
# output = generate("Hello, how are you?", max_tokens=50)
# print(output)
