# Run LLMs locally

We will be using the `transformers` library to run LLMs locally with PyTorch as backend.  
This notebook will require a GPU with at least 12GB of VRAM.  
It is recommended to use Google Colab with at least a T4 GPU runtime.


This notebook will cover the following topics:
- Running LLMs locally
- VRAM limitations
- Quantization
- Using multimodal models
- Structured outputs with `outlines`

In [None]:
!pip install transformers hf_transfer torch accelerate bitsandbytes pillow

# - transformers - high-level API for working with models
# - hf_transfer - allows for faster downloads from Hugging Face
# - torch - Backend library for transformers
# - accelerate - library for distributed inference / training - allows us to use device="auto"
# - bitsandbytes - allows for quantization of models on the fly
# - pillow - allows for image processing

# enable hf_transfer for faster downloads
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" 

## Loading and using text models

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

### Loading the model and tokenizer

In [3]:
model_id = "Qwen/Qwen2.5-1.5B-Instruct"

# Load the weights onto the first GPU; torch_dtype='auto' lets transformers
# pick the dtype instead of us hard-coding one.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map="cuda:0")

# A tokenizer holds no tensors, so it takes neither a dtype nor a device.
tokenizer = AutoTokenizer.from_pretrained(model_id)


#### Define a message to send to the model

In [None]:
# The conversation so far: a single user turn.
messages = [{"role": "user", "content": "When was Alan Turing born?"}]

# Render the conversation with the tokenizer's chat template as plain text,
# ending with the generation prompt so the model continues with its answer.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)

# Tokenize the rendered prompt as a batch of one and move it to the model's device.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
model_inputs

#### Generate the output

In [None]:
# Seed PyTorch's RNG so the sampled output is reproducible.
torch.manual_seed(0)

# Sample up to 512 new tokens at a low temperature, without gradient tracking.
with torch.inference_mode():
    batched_output_ids = model.generate(**model_inputs, max_new_tokens=512, do_sample=True, temperature=0.2)

# Each output sequence starts with its input ids; slice them off so only the
# newly generated tokens remain.
generated_ids = [
    sequence_ids[len(prompt_ids):]
    for prompt_ids, sequence_ids in zip(model_inputs.input_ids, batched_output_ids)
]

# Decode the batch and keep its single entry.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response



#### Unload Model from VRAM

In [7]:
def print_mem_stats():
    """Print the GPU memory allocated and reserved by PyTorch, in MB (1 MB = 1e6 bytes)."""
    import torch

    bytes_per_mb = 1e6
    allocated_mb = torch.cuda.memory_allocated() / bytes_per_mb
    reserved_mb = torch.cuda.memory_reserved() / bytes_per_mb

    print(f"GPU memory allocated: {allocated_mb:.2f} MB")
    print(f"GPU memory cached: {reserved_mb:.2f} MB")

def reclaim_memory():
    """Run the garbage collector and empty PyTorch's CUDA cache, three times over.

    Each pass collects every GC generation (0, 1 and 2) and then calls
    `torch.cuda.empty_cache()`.
    """
    import gc
    import torch

    passes = 3
    generations = (0, 1, 2)

    for _ in range(passes):
        for generation in generations:
            gc.collect(generation)
        torch.cuda.empty_cache()


In [8]:
# Drop the notebook's references to the model and tokenizer, then collect
# garbage and empty the CUDA cache.
del model, tokenizer

reclaim_memory()

### Loading Gated Models from Hugging Face

Some models on Hugging Face are gated and require a token to be downloaded.  

Before downloading the model, you need to agree to the model author's terms of use.

<div>
<img src="./.images/hf_accept_model_tos.png" alt="Hugging Face Gated Model" width="500"/>
</div>

Now, create a new read-only token at [Hugging Face](https://huggingface.co/settings/tokens/new?tokenType=read) and store it in a secure place.  
Then, set it in the `HF_TOKEN` environment variable.

In [None]:
import os
from getpass import getpass

# getpass() does not echo what is typed, so the token is not written into the
# cell output (which would be saved together with the notebook).
print("Please fill in your Hugging Face token below")
# strip() removes whitespace that is easily picked up when pasting the token.
os.environ["HF_TOKEN"] = getpass("Hugging Face token:").strip()

#### Loading the gated model

We will be loading `meta-llama/Llama-3.1-8B-Instruct`, so make sure you navigate to https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct and accept the terms of use before proceeding.

In [None]:
model_id = "meta-llama/Llama-3.1-8B-Instruct"

# using device_map="auto" will automatically shard the model on the GPU + CPU memory thanks to accelerate
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map="auto")

# A tokenizer holds no tensors, so it takes neither a dtype nor a device map.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# A single user turn to send to the model.
messages = [{"role": "user", "content": "What is your name?"}]

# Render the chat template as text, ending with the generation prompt.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize as a batch of one and move the tensors to the model's device.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Generate up to 512 new tokens without gradient tracking.
with torch.inference_mode():
    batched_output_ids = model.generate(**model_inputs, max_new_tokens=512)

# Slice the input ids off each output sequence so only the new tokens remain.
generated_ids = [
    sequence_ids[len(prompt_ids):]
    for prompt_ids, sequence_ids in zip(model_inputs.input_ids, batched_output_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)


#### Streaming the generation

In [None]:
# Stream the generation to stdout through a TextStreamer.
from transformers import TextStreamer

streamer = TextStreamer(tokenizer)

# Reuse the prompt tokenized in the previous cell (`model_inputs`).
with torch.inference_mode():
    model.generate(
        **model_inputs,
        max_new_tokens=512,
        streamer=streamer,
    )

# The streamer is only needed for this one call.
del streamer

In [None]:
from transformers import TextIteratorStreamer
from threading import Thread

# TextIteratorStreamer exposes the generated text as an iterator, so generation
# runs in a background thread while this thread consumes the text.
streamer = TextIteratorStreamer(tokenizer)


def _generate_in_background(**generate_kwargs):
    """Run `model.generate` under inference mode inside the worker thread."""
    # inference_mode is thread-local: it must be entered in the thread that
    # actually generates. Wrapping `thread.start()` in the main thread has no
    # effect on the worker.
    with torch.inference_mode():
        model.generate(**generate_kwargs)


thread = Thread(
    target=_generate_in_background,
    # Reuse the prompt tokenized earlier (`model_inputs`) and add the streaming options.
    kwargs=dict(model_inputs, streamer=streamer, max_new_tokens=512),
)
thread.start()

# Print each chunk as it arrives while also accumulating the full text.
generated_text = ""
for new_text in streamer:
    generated_text += new_text
    print(new_text, end="", flush=True)

# The iterator is exhausted once generation ends; wait for the worker to exit.
thread.join()

del streamer
del thread

In [None]:
# Report GPU memory usage while the model is still loaded.
print_mem_stats()

# Drop the references, then collect garbage and empty the CUDA cache.
del model, tokenizer
reclaim_memory()

### Quantization

Quantization is a technique to reduce the model size and improve the inference speed.

Generating with the previous model might have been slow because it might not have fit in the GPU VRAM, and so had to be sharded across GPU, CPU, and possibly disk.

To easily estimate the memory footprint of the model, multiply the size of the model's data type (in bytes per parameter) by the number of parameters.  
For example, `Llama-3.1-8B-Instruct` has a data type of `float16` (which is 2 bytes per parameter) and 8B parameters, so the memory footprint is 2 bytes/param * 8B params ~= 16GB.

Let's use `bitsandbytes` to quantize the model on the fly while loading it.

See the [Quantization documentation](https://huggingface.co/docs/transformers/main/en/quantization/overview) for more information.


In [16]:
from transformers import BitsAndBytesConfig

# One bitsandbytes configuration per precision; each is passed to
# `from_pretrained(..., quantization_config=...)` in the cells below.
quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
quantization_config_4bit = BitsAndBytesConfig(load_in_4bit=True)

In [None]:
# Load the model with 8-bit quantization
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config_8bit, device_map="cuda:0")
# A tokenizer holds no tensors, so it takes no device map.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Report the GPU memory used by the 8-bit model.
print_mem_stats()

# Unload again so the next cell starts from a clean GPU.
del model
del tokenizer
reclaim_memory()


In [None]:
# Load the model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config_4bit, device_map="cuda:0")
# A tokenizer holds no tensors, so it takes no device map.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Report the GPU memory used by the 4-bit model.
print_mem_stats()

# Unload again so the next cell starts from a clean GPU.
del model
del tokenizer
reclaim_memory()

Notice how the memory footprint is reduced when using quantization, but it's not a perfectly linear scale, as the model has to retain some unquantized parameters for operational reasons.

`INT8` quantization:  
**Expected:** `1byte/param * 8B params = 8GB`  
**Actual:** `~9GB`  

`INT4` quantization:  
**Expected:** `0.5byte/param * 8B params = 4GB`  
**Actual:** `~6GB`  

Quantization is a trade-off between model size, model quality and inference speed.  

If the model fits in VRAM, it will be faster to run inference without quantization.  
Quantization will allow you to run inference with a smaller model size, but can be slower, depending on the hardware, model architecture, and model sharding on multiple devices.  

Quantizing to 8-bits usually does not come at a significant quality loss for the model.  
Quantizing to 4-bits can come at a larger quality loss, but can still be useful if you need to run the model on a low resource device.

Let's measure how quantization affects inference speed.

In [18]:
from time import perf_counter
from contextlib import contextmanager

@contextmanager
def catchtime(message: str = "Time"):
    """Time the body of a `with` block and print the duration.

    Yields a zero-argument callable returning the elapsed seconds. It returns
    0 while the block is still running and the measured duration once the
    block has finished normally. On normal exit, prints
    ``"<message>: <seconds> seconds"`` with three decimals; nothing is printed
    if the block raises.
    """
    started = finished = perf_counter()

    def elapsed():
        # Reads `finished` at call time, so it sees the value set after the block.
        return finished - started

    yield elapsed
    finished = perf_counter()
    print(f'{message}: {elapsed():.3f} seconds')

In [None]:
# Baseline: the model without quantization, sharded automatically across devices.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto', device_map="auto")
# A tokenizer holds no tensors, so it takes neither a dtype nor a device map.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Time five generations of the prompt tokenized earlier in the notebook (`model_inputs`).
with catchtime('Time without quantization'):
    with torch.inference_mode():
        for _ in range(5):
            model.generate(**model_inputs, max_new_tokens=512)

# Unload so the next measurement starts from a clean GPU.
del model
del tokenizer
reclaim_memory()


In [None]:
# Same measurement with the model quantized to 8 bits on the first GPU.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config_8bit, device_map="cuda:0")
# A tokenizer holds no tensors, so it takes no device map.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Time five generations of the prompt tokenized earlier in the notebook (`model_inputs`).
with catchtime('Time with 8-bit quantization'):
    with torch.inference_mode():
        for _ in range(5):
            model.generate(**model_inputs, max_new_tokens=512)

# Unload so the next measurement starts from a clean GPU.
del model
del tokenizer
reclaim_memory()

In [None]:
# Same measurement with the model quantized to 4 bits on the first GPU.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config_4bit, device_map="cuda:0")
# A tokenizer holds no tensors, so it takes no device map.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Time five generations of the prompt tokenized earlier in the notebook (`model_inputs`).
with catchtime('Time with 4-bit quantization'):
    with torch.inference_mode():
        for _ in range(5):
            model.generate(**model_inputs, max_new_tokens=512)

# Unload to free the GPU for the following sections.
del model
del tokenizer
reclaim_memory()

## Loading and using multimodal models

We will be using `Qwen/Qwen2-VL-7B-Instruct` for this example.  
This model is a multimodal model that can understand images as well as text.  

The model is 8.3B parameters, requiring ~16.6GB of VRAM to load.  
Since we are constrained by VRAM, we will be using a pre-quantized version of the model from `Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8`


In [None]:
# install the optimum and auto-gptq libraries
# quantized qwen model depends on them
!pip install -q optimum auto-gptq

In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Pre-quantized (GPTQ Int8) checkpoint of Qwen2-VL-7B-Instruct.
model_id = "Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8"

# Load the model onto the first GPU, together with its processor.
model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, torch_dtype='auto', device_map="cuda:0")
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
import requests
from PIL import Image
from IPython.display import Image as IPImage, display


image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"

# Fail fast with a clear HTTP error (or a timeout) instead of handing an error
# page to PIL or hanging forever on a stalled connection.
image_response = requests.get(image_url, stream=True, timeout=30)
image_response.raise_for_status()
image = Image.open(image_response.raw)

# Preview the image scaled to a height of 300px, preserving the aspect ratio.
display(image.resize((int(image.width / image.height * 300), 300)))

In [None]:
# Scale the image to a height of 300px, preserving the aspect ratio.
resized_image = image.resize((int(image.width / image.height * 300), 300))
display(resized_image)

# One user turn made of an image placeholder followed by the text prompt.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "If I had to write a haiku for this one, it would be: "}
    ]}
]

input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

# Process text and image together and move the tensors to the model's device.
inputs = processor(
    text=[input_text], images=[resized_image], padding=True, return_tensors="pt"
).to(model.device)

# Generate without gradient tracking, like the text-only cells above.
with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=128)

# Decode only the newly generated tokens by skipping the input ids.
print(processor.decode(output[0][len(inputs['input_ids'][0]):], skip_special_tokens=True))

In [26]:
# Drop the model, the processor's tokenizer and the processor itself (in that
# order), then collect garbage and empty the CUDA cache.
del model, processor.tokenizer, processor
reclaim_memory()

## Structured outputs

When generating the next token, models have a vector of probabilities representing the likelihood of each token in the vocabulary to be the next token.  
We will use a library that constrains the generation process to produce valid JSON output.

We will be using the [`outlines`](https://github.com/dottxt-ai/outlines/) library to constrain the generation process.

In [None]:
# there's currently a bug in the outlines library 
# during the installation process it assumes that rust is already installed
# in colab and other linux environments this is not the case
# so we need to install rust first
import os
!curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
!source $HOME/.cargo/env
os.environ['PATH'] += f':{os.environ["HOME"]}/.cargo/bin'
!echo $PATH
!pip install "outlines==0.1.0"

In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Checkpoint used for the structured-output example.
model_id = "Qwen/Qwen2-VL-7B-Instruct"

# Load the model onto the first GPU, together with its processor.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="cuda:0",
)
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
from PIL import Image
import requests
from IPython.display import display

image_url = "https://storage.googleapis.com/generativeai-downloads/images/jetpack.jpg"

# Fail fast with a clear HTTP error (or a timeout) instead of handing an error
# page to PIL or hanging forever on a stalled connection.
image_response = requests.get(image_url, stream=True, timeout=30)
image_response.raise_for_status()
image = Image.open(image_response.raw)

# Preview the image scaled to a height of 400px, preserving the aspect ratio.
display(image.resize((int(image.width / image.height * 400), 400)))

In [3]:
from outlines.processors.structured import JSONLogitsProcessor
from outlines.models.transformers import TransformerTokenizer
from pydantic import BaseModel, Field
import json

# create a Pydantic schema for the output
# NOTE: the JSON schema below is derived from these class definitions; it is both
# embedded in the prompt (`schema_text`) and used to constrain decoding, so edits
# to names or descriptions here change what the model is asked to produce.
# One entry of `ImageDetails.features`.
class BackpackFeatures(BaseModel):
    feature: str = Field(description="A feature of the backpack")

# Top-level object the model has to return: a title plus a list of features.
class ImageDetails(BaseModel):
    title: str = Field(description="The title of the image")
    features: list[BackpackFeatures] = Field(description="A list of features of the backpack")

# convert the schema to a JSON schema
# `schema` is passed to the logits processor below; `schema_text` is its JSON
# string form, kept for embedding in the prompt.
schema = ImageDetails.model_json_schema()
schema_text = json.dumps(schema)

# create a JSON logits processor
# It is built from the schema and the processor's tokenizer (wrapped in outlines'
# TransformerTokenizer) and is passed to `model.generate` via `logits_processor`.
json_logits_processor = JSONLogitsProcessor(schema, TransformerTokenizer(processor.tokenizer))

In [None]:
from pprint import pprint

# Scale the image to a height of 400px, preserving the aspect ratio.
resized_image = image.resize((int(image.width / image.height * 400), 400))
display(resized_image)

# One user turn: the image, plus instructions that embed the JSON schema.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": f"""
Describe the image in detail. 

Respond in valid JSON, formatted following the schema: {schema_text}
""".strip()}
    ]}
]

input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

# Process text and image together and move the tensors to the model's device.
inputs = processor(
    text=[input_text], images=[resized_image], padding=True, return_tensors="pt"
).to(model.device)

# Generate with the JSON logits processor attached, without gradient tracking
# (like the text-only cells above).
with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=512, logits_processor=[json_logits_processor])

# Decode only the newly generated tokens, then parse them as JSON.
# NOTE(review): json.loads will raise if generation stops at max_new_tokens
# before the JSON document is complete.
json_text = processor.decode(output[0][len(inputs['input_ids'][0]):], skip_special_tokens=True)
json_data = json.loads(json_text)

pprint(json_data)