# vLLM Chat

vLLM can be deployed as a server that mimics the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications that use the OpenAI API: the server can be queried in the same format as the OpenAI API.

This notebook covers how to get started with vLLM chat models using LangChain's `ChatOpenAI` **as is**, as well as how to do structured generation with `ChatVLLMOpenAI`.

We assume you already have a vLLM server running. See the [vLLM documentation on deploying with Docker](https://docs.vllm.ai/en/latest/serving/deploying_with_docker.html) for instructions on how to deploy a vLLM server.

In [10]:
import json
import os

from langchain_community.chat_models import ChatVLLMOpenAI
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

In [11]:
# Base URL of the vLLM OpenAI-compatible inference server.
# Set the VLLM_BASE_URL environment variable to point the notebook at your own
# deployment; the fallback is the address this notebook was recorded against.
inference_server_url = os.environ.get("VLLM_BASE_URL", "http://10.1.0.255:8000/v1")

# Chat model client used by every cell below.
# NOTE(review): temperature=0 is presumably chosen so the recorded outputs are
# as reproducible as the server allows — confirm.
llm = ChatVLLMOpenAI(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    openai_api_base=inference_server_url,
    temperature=0,
)

In [3]:
from typing import Literal

from langchain_core.pydantic_v1 import BaseModel, Field

# Example pydantic model, used below as the target schema for structured
# output and for tool calling.
# NOTE(review): documented with `#` comments rather than a class docstring,
# since a docstring may be picked up as the schema description that is sent to
# the model — confirm before adding one.
class CityModel(BaseModel):
    # Every field passes `...` as its default, i.e. it is declared required,
    # and carries a human-readable description alongside its type.
    name: str = Field(..., description="Name of the city")
    population: int = Field(
        ..., description="Population of the city measured in number of inhabitants"
    )
    country: str = Field(..., description="Country of the city")
    # Restricted to exactly these two string labels; there is no label for a
    # population of exactly one million.
    population_category: Literal[">1M", "<1M"] = Field(
        ..., description="Population category of the city"
    )

In [10]:
# Build a runnable that returns a CityModel, passing a population hint through
# the `instructions` argument, then ask the question.
hinted_city_llm = llm.with_structured_output(
    CityModel,
    instructions="Paris has 10 million citizens",
)
hinted_city_llm.invoke("What is the capital of France?")

CityModel(name='What is the capital of France?Paris has 10 million citizens', population=10000000, country='France', population_category='>1M')

In [12]:
# Same question, this time with an empty `instructions` string.
plain_city_llm = llm.with_structured_output(CityModel, instructions="")
plain_city_llm.invoke("What is the capital of France?")

CityModel(name='Paris', population=2140000, country='France', population_category='>1M')

In [11]:
# Expose CityModel as a tool and name it in `tool_choice`.
# (`json` is already imported in the notebook's first cell, so it is not
# re-imported here.)
tool_llm = llm.bind_tools([CityModel], tool_choice="CityModel")
res = tool_llm.invoke("What is the capital of France?\nParis has 10 million citizens")

# Round-trip the message through its JSON form to display it as a plain dict.
json.loads(res.json())

{'content': '',
 'additional_kwargs': {'tool_calls': [{'id': 'chatcmpl-tool-b9d7f5a2d56a4968bd0ce428e2444ceb',
    'function': {'arguments': '{"name":"What is the capital of France?","population":10000000,"country":"France","population_category":">1M"}',
     'name': 'CityModel'},
    'type': 'function'}]},
 'response_metadata': {'token_usage': {'completion_tokens': 31,
   'prompt_tokens': 23,
   'total_tokens': 54},
  'model_name': 'meta-llama/Meta-Llama-3-8B-Instruct',
  'system_fingerprint': None,
  'finish_reason': 'stop',
  'logprobs': None},
 'type': 'ai',
 'name': None,
 'id': 'run-bcc2eed6-80eb-43a5-8330-0282a715e8f6-0',
 'example': False,
 'tool_calls': [{'name': 'CityModel',
   'args': {'name': 'What is the capital of France?',
    'population': 10000000,
    'country': 'France',
    'population_category': '>1M'},
   'id': 'chatcmpl-tool-b9d7f5a2d56a4968bd0ce428e2444ceb'}],
 'invalid_tool_calls': [],
 'usage_metadata': {'input_tokens': 23,
  'output_tokens': 31,
  'total_toke

In [14]:
# Prompt describing the JSON object we want back; sent verbatim, including the
# leading and trailing newlines.
json_prompt = """
Create a json object with the following fields:
- name: Paris
- population: 10 million
- country: France
"""

# Bind the `json_object` response format to the model, then print the raw
# content of the reply.
tool_llm = llm.bind(response_format={"type": "json_object"})
res = tool_llm.invoke(json_prompt)
print(res.content)

{ 
"name" 	: 	"Paris" 	, 
"population" 	: 	10000000 	, 
"country" 	: 	"France" 
}
