<a href="https://colab.research.google.com/github/jlonge4/gen_ai_utils/blob/main/vllm_vs_hf_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install vllm flash-attn triton peft --upgrade

In [1]:
from typing import Dict, Optional
import torch
from peft import PeftConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

In [2]:
# Hugging Face Hub repo id of the judge checkpoint loaded by load_model().
model_id = "grounded-ai/phi3.5-hallucination-judge-merge"
# When True, load_model() adds an 8-bit BitsAndBytesConfig to the model kwargs.
quantization = False

In [16]:
def load_model():
    """Load the judge model and its tokenizer from the Hugging Face Hub.

    Reads the notebook-level ``model_id`` and ``quantization`` settings.

    Returns:
        tuple: ``(base_model, tokenizer)`` as returned by
        ``AutoModelForCausalLM.from_pretrained`` and
        ``AutoTokenizer.from_pretrained``.
    """
    compute_dtype = (
        torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    )

    # FlashAttention-2 kernels need an Ampere-or-newer GPU (compute capability
    # >= 8.0). bf16 support is not a reliable proxy for that: recent PyTorch
    # can report bf16 as supported on older GPUs, which would select
    # flash_attention_2 on hardware that cannot run it.
    flash_capable = (
        torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
    )
    attn_implementation = "flash_attention_2" if flash_capable else "sdpa"

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Echo the chosen attention backend so it is visible in the cell output.
    print(attn_implementation)

    model_kwargs = {
        "attn_implementation": attn_implementation,
        "torch_dtype": compute_dtype,
    }
    if quantization:
        model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)

    base_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
    return base_model, tokenizer

# Load the Hugging Face model and tokenizer once; later cells reuse both.
base_model, tokenizer = load_model()

flash_attention_2


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Log in to the Hugging Face Hub from inside the notebook (renders a widget).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
from vllm import SamplingParams, LLM

# One row per evaluation case: (reference knowledge, user query, model response).
_examples = [
    (
        "The chicken crossed the road to get to the other side",
        "Why did the chicken cross the road?",
        "To get to the other side",  # Grounded answer
    ),
    (
        "The apple mac has the best hardware",
        "What computer has the best display?",
        # The reference only speaks about hardware in general, while the
        # question asks specifically about the display.
        "Apple mac",
    ),
    (
        "The cat is hungry",
        "What pet does the context reference?",
        "Cat",  # Grounded answer
    ),
]
# Unzip the rows into the three parallel lists used by the rest of the notebook.
references, queries, responses = (list(column) for column in zip(*_examples))
# Low-temperature sampling restricted to the top 3 tokens; only 2 tokens are
# generated because the prompt asks for a single-word "yes"/"no" verdict.
sampling_params = SamplingParams(temperature=0.01, max_tokens=2, top_k=3)

In [9]:
def formatting_func(query, response, reference=""):
  """Build the hallucination-judge prompt for one example.

  Args:
    query: The user input that was given to the evaluated model.
    response: The evaluated model's answer.
    reference: Optional grounding knowledge. When it is empty or ``None`` the
      ``[Knowledge]`` line is left out of the prompt.

  Returns:
    The prompt string, including the user / end / assistant chat markers.
  """
  # NOTE(review): the separator after the knowledge text indents the following
  # "[User Input]" line by four spaces instead of two. It is kept byte-for-byte
  # because changing prompt whitespace may change the model's answers.
  knowledge_line = f"[Knowledge]: {reference}\n    " if reference else ""
  # Named `prompt` (not `input`) so the built-in input() is not shadowed.
  prompt = f"""
  <|user|>
  Your job is to evaluate whether a machine learning model has hallucinated or not.
  A hallucination occurs when the response is coherent but factually incorrect or nonsensical
  outputs that are not grounded in the provided context/knowledge.
  You are given the following information:
  ####INFO####
  {knowledge_line}[User Input]: {query}
  [Model Response]: {response}
  ####END INFO####
  Based on the information provided is the model output a hallucination? Respond with only "yes" or "no"

  You must only reply with a single word: either "yes", or "no".
  <|end|>
  <|assistant|>
  """
  return prompt

In [10]:
# Build one judge prompt per (reference, query, response) triple.
# A comprehension keeps the loop variables out of the notebook namespace, so
# nothing named `input` ends up shadowing the built-in input().
prompts = [
    formatting_func(query, response, reference)
    for reference, query, response in zip(references, queries, responses)
]

In [10]:
from pprint import pprint
# Inspect the second prompt (the hardware/display example) as rendered.
pprint(prompts[1])

('\n'
 '  <|user|>\n'
 '  Your job is to evaluate whether a machine learning model has hallucinated '
 'or not.\n'
 '  A hallucination occurs when the response is coherent but factually '
 'incorrect or nonsensical\n'
 '  outputs that are not grounded in the provided context/knowledge.\n'
 '  You are given the following information:\n'
 '  ####INFO####\n'
 '  [Knowledge]: The apple mac has the best hardware\n'
 '    [User Input]: What computer has the best display?\n'
 '  [Model Response]: Apple mac\n'
 '  ####END INFO####\n'
 '  Based on the information provided is the model output a hallucination? '
 'Respond with only "yes" or "no"\n'
 '\n'
 '  You must only reply with a single word: either "yes", or "no".\n'
 '  <|end|>\n'
 '  <|assistant|>\n'
 '  ')


In [23]:
# Number of tokens the Hugging Face tokenizer produces for the first prompt.
len(tokenizer.encode(prompts[0]))

178

In [None]:
# torch.cuda.empty_cache()
# import gc
# gc.collect()

In [7]:
from vllm.lora.request import LoRARequest

# Reuse model_id from the configuration cell so vLLM and the Hugging Face
# pipeline always load the same checkpoint.
llm = LLM(model=model_id, gpu_memory_utilization=0.85, max_model_len=7000)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


INFO 10-10 00:09:20 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='grounded-ai/phi3.5-hallucination-judge-merge', speculative_config=None, tokenizer='grounded-ai/phi3.5-hallucination-judge-merge', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=7000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=grounded-ai/phi3.5-hallucination-judge-merge, use_v2_block_manager

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 10-10 00:09:22 model_runner.py:1014] Starting to load model grounded-ai/phi3.5-hallucination-judge-merge...
INFO 10-10 00:09:22 selector.py:240] Cannot use FlashAttention-2 backend due to sliding window.
INFO 10-10 00:09:22 selector.py:116] Using XFormers backend.
INFO 10-10 00:09:22 weight_utils.py:242] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 10-10 00:09:25 model_runner.py:1025] Loading model weights took 7.1659 GB
INFO 10-10 00:09:28 gpu_executor.py:122] # GPU blocks: 1874, # CPU blocks: 682
INFO 10-10 00:09:30 model_runner.py:1329] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 10-10 00:09:30 model_runner.py:1333] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 10-10 00:09:58 model_runner.py:1456] Graph capturing finished in 28 secs.


In [8]:
import time

# perf_counter() is a monotonic high-resolution clock, which suits a
# sub-second elapsed-time measurement better than the wall clock time.time().
start_time = time.perf_counter()
outputs = llm.generate(
    prompts,
    sampling_params,
)
print(f"Time taken without prefix: {time.perf_counter() - start_time}")

Processed prompts: 100%|██████████| 3/3 [00:00<00:00, 17.25it/s, est. speed input: 2928.68 toks/s, output: 34.52 toks/s]

Time taken without prefix: 0.1825881004333496





# The vLLM-wrapped model makes inaccurate predictions compared to the vanilla HF pipeline

In [9]:
# Print the text of the first completion returned for each prompt.
for output in outputs:
    print(f"Generated text: {output.outputs[0].text!r}")

Generated text: ' no'
Generated text: ' no'
Generated text: ' no'


# Hugging Face pipeline implementation

In [13]:
def run_model(input) -> str:
  """Judge a single prompt with the Hugging Face text-generation pipeline.

  Args:
    input: Prompt text; it is sent as the content of one "user" chat message.

  Returns:
    The generated text with surrounding whitespace removed, lower-cased.
  """
  # A fresh pipeline is wrapped around the already-loaded model on every call.
  generator = pipeline(
      "text-generation", model=base_model, tokenizer=tokenizer, device='cuda'
  )

  # NOTE(review): prompts built by formatting_func already contain
  # <|user|>/<|assistant|> markers. Passing them as chat messages presumably
  # makes the pipeline apply the tokenizer's chat template on top, whereas
  # vLLM receives the raw string. Confirm this difference is intended.
  chat = [dict(role="user", content=input)]

  result = generator(
      chat,
      max_new_tokens=2,
      return_full_text=False,
      temperature=0.01,
      do_sample=True,
  )

  # Ask PyTorch to release its cached CUDA memory after each call.
  torch.cuda.empty_cache()

  first_candidate = result[0]
  return first_candidate["generated_text"].strip().lower()

In [17]:
# First example: chicken / road (grounded answer).
run_model(prompts[0])

'no'

In [15]:
# Second example: the reference mentions hardware, the question asks about the display.
run_model(prompts[1])

'yes'

In [18]:
# Third example: cat (grounded answer).
run_model(prompts[2])

'no'

# These are the expected and accurate predictions