In [None]:
import gc
import os
import time
from getpass import getpass

import torch
import vllm
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Load tokenizer and model

### Using LLaMA 3

#### Conditions

Llama 3 can be used for commercial products, but there are certain requirements and restrictions to follow:

1. **Attribution**:  
   You must provide a clear and prominent acknowledgment, such as "Built with Meta Llama 3," in all relevant user interfaces, documentation, and webpages.

2. **User Threshold**:  
   If your product or service utilizing Llama 3 exceeds **700 million monthly active users**, you must obtain a separate, specific license from Meta.

3. **Restrictions on Enhancing Other Models**:  
   Llama 3 materials or outputs cannot be used to improve or train any other large language models outside the Llama family.

4. **Compliance**:  
   Users must ensure compliance with applicable laws and regulations, such as GDPR and trade compliance laws.

In conclusion, Llama 3 offers great flexibility for commercial applications, provided you adhere to these licensing terms and restrictions. See [Llama 3 Overview](https://ai.meta.com/static-resource/july-responsible-use-guide).


#### Steps

1. **Apply for model access**: Visit [Llama-3-8B-Instruct on Hugging Face](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) to request access to the model. Please note that it may take a few days for your application to be approved. Once approved, you will see the following message on the website:

    * **Gated model** You have been granted access to this model
   
2. **Create your Hugging Face Access Key**: Go to your [Hugging Face settings](https://huggingface.co/settings/tokens) to create an access token. When creating the token, ensure you check the box:

    * `Read access to contents of all public gated repos you can access` under **Permissions**.

3. **Provide your Hugging Face Access Key**: Once you have your access token, paste it below to authenticate the notebook with Hugging Face.


In [None]:
# Never store the access token in the notebook: read it from the HF_TOKEN
# environment variable, or prompt for it without echoing it to the output.
# NOTE(review): a token was previously hardcoded in this cell; treat it as
# leaked and revoke it in the Hugging Face token settings.
hf_access_key = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(hf_access_key)

### Quantization

Suppose you have access to **16 GB of GPU memory**, which is insufficient to load the entire LLaMA model at once. To complete inference, Hugging Face will dynamically move parts of the model onto the GPU during runtime, which will cause the inference to become **extremely slow**.

To address this limitation, we **quantize** the model to 4-bit weights using the `bitsandbytes` library (integrated with Hugging Face Transformers). This significantly reduces GPU memory consumption, so the entire model fits in GPU memory and inference runs without moving weights between devices.


In [None]:
# Checkpoint to load -- any compatible Hugging Face model id can be used here
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# 4-bit quantization settings handed to `from_pretrained`
quantization_config = BitsAndBytesConfig(
    # Set to `False` to turn quantization off
    load_in_4bit=True,
    # NF4 ("normal float 4") quantization type
    bnb_4bit_quant_type="nf4",
    # Optional double quantization
    bnb_4bit_use_double_quant=True,
    # Run computations in float16 for faster inference
    bnb_4bit_compute_dtype=torch.float16,
)

Now, let's load the tokenizer and model.

In [None]:
# Tokenizer for the selected checkpoint (fast implementation requested)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Causal LM weights, loaded with `quantization_config`; `device_map="auto"`
# lets the library decide where each module is placed
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
    device_map="auto",
)

Let’s check the GPU information and verify that all parts of the model are loaded onto the GPU.

In [None]:
def info_gpu():
    """Print the total, allocated and free memory (in GB) of CUDA device 0.

    "Free" is computed as total minus the memory currently allocated by
    PyTorch tensors on that device.
    """
    bytes_per_gb = 1024 ** 3
    total_gb = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    allocated_gb = torch.cuda.memory_allocated(0) / bytes_per_gb

    report = (
        ("Total", total_gb),
        ("Allocated", allocated_gb),
        ("Free", total_gb - allocated_gb),
    )
    for label, gigabytes in report:
        print(f"{label} GPU memory: {gigabytes:.2f} GB")


# Check GPU info
info_gpu()

# Summarise where the model's parameters live. Printing one line per parameter
# tensor floods the output, so group the parameter names by device and report
# the count for each device instead.
params_by_device = {}
for name, param in model.named_parameters():
    params_by_device.setdefault(str(param.device), []).append(name)

for device, names in params_by_device.items():
    print(f"{len(names)} parameter tensors are on device: {device}")

Let's try the model.

In [None]:
# Generate chat response
def generate_response(prompt, max_length=30, temperature=0.2, num_beams=1):
    """Generate a completion for a single prompt with the global `model`.

    Args:
        prompt: Text to continue.
        max_length: Maximum total number of tokens (prompt + generated).
        temperature: Sampling temperature. A value of 0 (or `None`) selects
            deterministic decoding instead of sampling.
        num_beams: Number of beams for beam search.

    Returns:
        The decoded output sequence (special tokens removed). The sequence
        returned by `model.generate` starts with the prompt tokens, so the
        prompt text is part of the returned string.
    """
    # Put the inputs on the model's device instead of assuming "cuda"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # `temperature` is only used when sampling. Enable sampling explicitly
    # rather than relying on the checkpoint's default generation config, and
    # fall back to non-sampling decoding when no positive temperature is given.
    if temperature is not None and temperature > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temperature}
    else:
        sampling_kwargs = {"do_sample": False}

    # Generate the output ids
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs["attention_mask"],  # Avoid warning
        pad_token_id=tokenizer.eos_token_id,  # Avoid warning
        max_length=max_length,  # Length of prompt + generation
        num_beams=num_beams,  # Number of beams
        **sampling_kwargs,
    )

    # Decode the output ids to a string
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Try it
generate_response("Hi mate!")

### Key parameters in `model.generate()`

There are numerous arguments for `model.generate()`. Below, we highlight the most useful ones.

#### **Length Control**
- **`max_length`**: Defines the maximum number of tokens to generate, including both the input and generated tokens. The model will stop once this limit is reached.
- **`min_length`**: Sets the minimum number of tokens that must be generated before stopping. This ensures that the output doesn't stop too early.
- **`eos_token_id`**: The ID of the end-of-sequence (EOS) token. The generation will stop once the model generates this token, marking the end of the sequence.

#### **Diversity and Quality**
- **`temperature`**: Controls the randomness of predictions. Lower values (e.g., 0.7) make the model more deterministic, while higher values (e.g., 1.0 or above) increase randomness, making the outputs more diverse. It only takes effect when sampling is enabled (`do_sample=True`).
- **`top_k`**: Limits the next token selection to the top `k` most likely tokens. A higher value allows for more variety in the generated text, while a lower value makes it more deterministic.
- **`top_p` (nucleus sampling)**: Limits token selection to the smallest set of most likely tokens whose cumulative probability reaches `p`. For example, `top_p=0.9` samples only from the tokens that together make up the top 90% of the probability mass, promoting diverse but controlled generation.
- **`do_sample`**: Enables random sampling of tokens instead of greedy decoding (which selects the highest-probability token). This is essential for generating diverse outputs.
- **`num_beams`**: The number of beams for beam search. Higher values explore more possibilities during generation, leading to better outputs but at the cost of increased computation.

![beam](https://raw.githubusercontent.com/patrickvonplaten/scientific_images/master/beam_search.png)

#### **Repetition and Token Constraints**
- **`repetition_penalty`**: Penalizes repeated tokens, discouraging the model from generating repetitive sequences. A value greater than 1.0 reduces the likelihood of repeating the same token.
- **`no_repeat_ngram_size`**: Prevents repetition of n-grams of a specified size. For example, `no_repeat_ngram_size=3` ensures that trigrams do not repeat in the generated output.

#### **Output Control**
- **`num_return_sequences`**: The number of different sequences to generate. For example, `num_return_sequences=3` generates three separate outputs from the same prompt.


### Batch inference

Now, we enhance our `generate_response()` function to support batch inference, which is crucial for serving multiple users in production environments. Additionally, we add functionality to measure the runtime of the inference process.

In [None]:
def generate_response(prompts, max_length=30, temperature=0.2, num_beams=1, measure_time=False):
    """Generate completions for one prompt or a batch of prompts with the global `model`.

    Args:
        prompts: A single prompt string or a list of prompt strings.
        max_length: Maximum total number of tokens per sequence (prompt + generated).
        temperature: Sampling temperature. A value of 0 (or `None`) selects
            deterministic decoding instead of sampling.
        num_beams: Number of beams for beam search.
        measure_time: If True, also return the wall-clock time spent in
            `model.generate()`.

    Returns:
        A string if `prompts` was a string, otherwise a list of strings (one
        per prompt, in input order). Each string is the decoded output
        sequence, which starts with the prompt. If `measure_time` is True,
        a `(responses, runtime_seconds)` tuple is returned instead.
    """
    # Remember how the caller passed the input: a one-element list must still
    # come back as a list, so `len(prompts) == 1` is not a valid check.
    is_single_prompt = isinstance(prompts, str)
    if is_single_prompt:
        prompts = [prompts]  # Convert single prompt to a list for batch processing

    # Tokenize the batch of prompts (padding requires `tokenizer.pad_token` to be set)
    # and put the inputs on the model's device instead of assuming "cuda"
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)

    # `temperature` is only used when sampling. Enable sampling explicitly
    # rather than relying on the checkpoint's default generation config, and
    # fall back to non-sampling decoding when no positive temperature is given.
    if temperature is not None and temperature > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temperature}
    else:
        sampling_kwargs = {"do_sample": False}

    # Time only model.generate(); perf_counter is monotonic and high resolution
    start_time = time.perf_counter() if measure_time else None

    # Generate responses for the batch
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs["attention_mask"],  # Avoid warning
        pad_token_id=tokenizer.eos_token_id,  # Avoid warning
        max_length=max_length,  # Length of prompt + generation
        num_beams=num_beams,  # Number of beams
        **sampling_kwargs,
    )

    runtime = time.perf_counter() - start_time if measure_time else None

    # Decode the batch of generated outputs
    responses = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

    # A single string in -> a single string out
    if is_single_prompt:
        responses = responses[0]

    # Return both the response and runtime if time measurement was requested
    return (responses, runtime) if measure_time else responses

For batched inference, padding is necessary to ensure that all input sequences in a batch are of the same length. This allows the model to process multiple inputs in parallel. To set up padding correctly, we need to configure the tokenizer to handle padding. Specifically, we can assign a padding token (typically the `eos_token`) and set the padding side to ensure proper alignment of input sequences.


In [None]:
# Set the pad token to the eos token 
tokenizer.pad_token = tokenizer.eos_token

# Set padding side to left for decoder-only models
tokenizer.padding_side = "left"

Now let's create a multi-input task, and measure its runtime.

In [None]:
# Batch input
many_prompts = [
    "Once upon a time in a distant land...",
    "Explain the theory of relativity in simple terms.",
    "What is the capital of France?",
    "Generate a story about a robot in the future.",
    "How do you bake a chocolate cake?"
]

# Batch inference. `generate_response` takes the prompts as its first argument
# and uses the global `model`; passing the model positionally would bind it to
# `prompts` and the prompt list to `max_length`.
many_responses, t = generate_response(many_prompts, measure_time=True)

# Print results
for res in many_responses:
    print("---------------")
    print(res)
print("\n\nRuntime before acceleration:", t)

# Accelerate inference

In this section, we utilize `vLLM` to optimize inference through paged attention, as described in [this paper](https://arxiv.org/abs/2309.06180). 

Before proceeding, we release the GPU memory by deleting the previous model, ensuring efficient usage of resources on a small GPU.


In [None]:
# Delete the model and free its GPU memory
try:
    del model  # Drop the notebook's reference to the Hugging Face model
except NameError:
    pass  # Already deleted, e.g. when this cell is re-run

# Collect garbage first so that objects no longer referenced (including their
# CUDA tensors) are actually released
gc.collect()
# Wait for all pending CUDA operations to finish before releasing memory
torch.cuda.synchronize()
# Clear the CUDA memory cache; this must come after the tensors have been
# released, otherwise their memory is still in use and cannot be freed
torch.cuda.empty_cache()
# Check GPU info
info_gpu()

In [None]:
def generate_response(prompts, max_length=30, temperature=0.2, measure_time=False):
    # Check if input is a single prompt or batch of prompts
    if isinstance(prompts, str):
        prompts = [prompts]  # Convert single prompt to a list for batch processing

    # Tokenize the batch of prompts (single or multiple)
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")

    # Start time measurement for model.generate() if requested
    start_time = time.time() if measure_time else None

    # Generate responses for the batch
    outputs = model.generate(
        prompts=None,
        prompt_token_ids=inputs.input_ids,
        sampling_params = vllm.SamplingParams(max_tokens=max_length, temperature=temperature),
    )

    # Measure time after generation
    if measure_time:
        end_time = time.time()
        runtime = end_time - start_time
    else:
        runtime = None

    # Decode the batch of generated outputs
    responses = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

    # If the original input was a single prompt, return a single string, not a list
    if len(prompts) == 1:
        responses = responses[0]

    # Return both the response and runtime if time measurement was requested
    return (responses, runtime) if measure_time else responses

In [None]:
# Batch inference
many_responses, t = generate_response(model_vllm, many_prompts, measure_time=True)

# Print results
for res in many_responses:
    print("---------------")
    print(res)
print("\n\nRuntime after acceleration:", t)