From d00c69dad63cbb1a53ec4fe62ed54d33fef65dc0 Mon Sep 17 00:00:00 2001 From: Mason Daugherty Date: Sun, 16 Nov 2025 01:01:05 -0500 Subject: [PATCH 1/4] oss(py): update openai chat page --- src/oss/python/integrations/chat/openai.mdx | 107 ++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/src/oss/python/integrations/chat/openai.mdx b/src/oss/python/integrations/chat/openai.mdx index 1e4b7326e0..c0e276b200 100644 --- a/src/oss/python/integrations/chat/openai.mdx +++ b/src/oss/python/integrations/chat/openai.mdx @@ -89,6 +89,14 @@ llm = ChatOpenAI( See the @[`ChatOpenAI`] API Reference for the full set of available model parameters. + + **Token parameter deprecation** + + OpenAI deprecated `max_tokens` in favor of `max_completion_tokens` in September 2024. While `max_tokens` is still supported for backwards compatibility, it's automatically converted to `max_completion_tokens` internally. + + +--- + ## Invocation ```python @@ -115,6 +123,8 @@ print(ai_msg.text) J'adore la programmation. ``` +--- + ## Streaming usage metadata OpenAI's Chat Completions API does not stream token usage statistics by default (see API reference [here](https://platform.openai.com/docs/api-reference/completions/create#completions-create-stream_options)). @@ -127,6 +137,8 @@ from langchain_openai import ChatOpenAI llm = ChatOpenAI(model="gpt-4.1-mini", stream_usage=True) # [!code highlight] ``` +--- + ## Using with Azure OpenAI @@ -222,6 +234,8 @@ When using an async callable for the API key, you must use async methods (`ainvo +--- + ## Tool calling OpenAI has a [tool calling](https://platform.openai.com/docs/guides/function-calling) (we use "tool calling" and "function calling" interchangeably here) API that lets you describe tools and their arguments, and have the model return a JSON object with a tool to invoke and the inputs to that tool. tool-calling is extremely useful for building tool-using chains and agents, and for getting structured outputs from models more generally. @@ -463,6 +477,8 @@ Name: do_math +--- + ## Responses API @@ -1066,6 +1082,16 @@ for block in response.content_blocks: The user is asking about 3 raised to the power of 3. That's a pretty simple calculation! I know that 3^3 equals 27, so I can say, "3 to the power of 3 equals 27." I might also include a quick explanation that it's 3 multiplied by itself three times: 3 × 3 × 3 = 27. So, the answer is definitely 27. ``` + + **Troubleshooting: Empty responses from reasoning models** + + If you're getting empty responses from reasoning models like `gpt-5-nano`, this is likely due to restrictive token limits. The model uses tokens for internal reasoning and may not have any left for the final output. + + Set `max_tokens=None` or increase the token limit to allow sufficient tokens for both reasoning and output generation. + + +--- + ## Fine-tuning You can call fine-tuned OpenAI models by passing in your corresponding `modelName` parameter. @@ -1084,6 +1110,8 @@ fine_tuned_model.invoke(messages) AIMessage(content="J'adore la programmation.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 8, 'prompt_tokens': 31, 'total_tokens': 39}, 'model_name': 'ft:gpt-3.5-turbo-0613:langchain::7qTVM5AR', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-0f39b30e-c56e-4f3b-af99-5c948c984146-0', usage_metadata={'input_tokens': 31, 'output_tokens': 8, 'total_tokens': 39}) ``` +--- + ## Multimodal Inputs (images, PDFs, audio) OpenAI has models that support multimodal inputs. 
 You can pass in images, PDFs, or audio to these models. For more information on how to do this in LangChain, head to the [multimodal inputs](/oss/langchain/messages#multimodal) docs.
@@ -1196,6 +1224,8 @@ content_block = {
 ```
 
+---
+
 ## Predicted output
 
 
@@ -1268,6 +1298,7 @@ public class User
 ```
 
 Note that currently predictions are billed as additional tokens and may increase your usage and costs in exchange for this reduced latency.
+---
 
 ## Audio Generation (Preview)
 
@@ -1326,6 +1357,82 @@ history = [
 second_output_message = llm.invoke(history)
 ```
 
+---
+
+## Prompt caching
+
+OpenAI's [prompt caching](https://platform.openai.com/docs/guides/prompt-caching) feature automatically caches prompts of 1024 tokens or longer to reduce costs and improve response times. This feature is enabled for all recent models (`gpt-4o` and newer).
+
+### Basic usage
+
+Use the `prompt_cache_key` parameter to optimize cache hit rates:
+
+```python
+from langchain_openai import ChatOpenAI
+
+llm = ChatOpenAI(model="gpt-4o")
+
+# Use a cache key for repeated prompts
+messages = [
+    ("system", "You are a helpful assistant that translates English to French."),
+    ("human", "I love programming."),
+]
+
+response = llm.invoke(
+    messages,
+    prompt_cache_key="translation-assistant-v1"
+)
+
+# Check cache usage
+cache_read_tokens = response.usage_metadata["input_token_details"]["cache_read"]
+print(f"Cached tokens used: {cache_read_tokens}")
+```
+
+Cache hits require the prompt prefix to match exactly.
+
+### Cache key strategies
+
+You can use different cache key strategies based on your application's needs:
+
+```python
+# Static cache keys for consistent prompt templates
+customer_response = llm.invoke(
+    messages,
+    prompt_cache_key="customer-support-v1"
+)
+
+support_response = llm.invoke(
+    messages,
+    prompt_cache_key="internal-support-v1"
+)
+
+# Dynamic cache keys based on context
+user_type = "premium"
+cache_key = f"assistant-{user_type}-v1"
+response = llm.invoke(messages, prompt_cache_key=cache_key)
+```
+
+### Model-level caching
+
+You can also set a default cache key at the model level using `model_kwargs`:
+
+```python
+llm = ChatOpenAI(
+    model="gpt-4o-mini",
+    model_kwargs={"prompt_cache_key": "default-cache-v1"}
+)
+
+# Uses default cache key
+response1 = llm.invoke(messages)
+
+# Override with specific cache key
+response2 = llm.invoke(messages, prompt_cache_key="override-cache-v1")
+```
+
+---
+
 ## Flex processing
 
 OpenAI offers a variety of [service tiers](https://platform.openai.com/docs/guides/flex-processing). The "flex" tier offers cheaper pricing for requests, with the trade-off that responses may take longer and resources might not always be available. This approach is best suited for non-critical tasks, including model testing, data enhancement, or jobs that can be run asynchronously.
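
The prompt caching section added above can be exercised end to end with a short script. This is a minimal sketch that assumes only what the section itself uses (`langchain_openai.ChatOpenAI`, the `prompt_cache_key` parameter, and the `usage_metadata` field on the returned `AIMessage`); the model name, cache key, and two-call loop are illustrative. Since `usage_metadata` is a `TypedDict`, the cached-token count is read with key access rather than attribute access, and because OpenAI only caches prompts of 1024 tokens or longer, a short prompt like this one will simply report zero.

```python
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")  # illustrative model choice

messages = [
    {"role": "system", "content": "You are a helpful assistant that translates English to French."},
    {"role": "user", "content": "I love programming."},
]

# The first call primes OpenAI's cache; a repeat call with the same prompt
# prefix and cache key may then report cached input tokens.
for attempt in (1, 2):
    response = llm.invoke(messages, prompt_cache_key="translation-assistant-v1")
    usage = response.usage_metadata or {}
    # usage_metadata is a TypedDict, so read it with key access, not attributes.
    details = usage.get("input_token_details", {})
    # Prompts under 1024 tokens are not cached, so 0 is expected for this example.
    print(f"Call {attempt}: cached input tokens = {details.get('cache_read', 0)}")
```

Reusing the same `prompt_cache_key` for requests that share a long, stable prefix (for example, a large system prompt) is what makes cache hits likely in practice.
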
From 79ff577de1baddd179070212655ac9c3fabe09e4 Mon Sep 17 00:00:00 2001
From: Mason Daugherty
Date: Mon, 17 Nov 2025 08:12:09 -0600
Subject: [PATCH 2/4] Update src/oss/python/integrations/chat/openai.mdx

Co-authored-by: ccurme
---
 src/oss/python/integrations/chat/openai.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/oss/python/integrations/chat/openai.mdx b/src/oss/python/integrations/chat/openai.mdx
index c0e276b200..67be93f497 100644
--- a/src/oss/python/integrations/chat/openai.mdx
+++ b/src/oss/python/integrations/chat/openai.mdx
@@ -1363,9 +1363,9 @@ second_output_message = llm.invoke(history)
 
 OpenAI's [prompt caching](https://platform.openai.com/docs/guides/prompt-caching) feature automatically caches prompts of 1024 tokens or longer to reduce costs and improve response times. This feature is enabled for all recent models (`gpt-4o` and newer).
 
-### Basic usage
+### Manual caching
 
-Use the `prompt_cache_key` parameter to optimize cache hit rates:
+You can use the `prompt_cache_key` parameter to influence OpenAI's caching and optimize cache hit rates:
 
 ```python
 from langchain_openai import ChatOpenAI

From aa46a496f4a8fc2e9aa83c6f625bec8343e55bbf Mon Sep 17 00:00:00 2001
From: Mason Daugherty
Date: Mon, 17 Nov 2025 08:12:44 -0600
Subject: [PATCH 3/4] Update src/oss/python/integrations/chat/openai.mdx

Co-authored-by: ccurme
---
 src/oss/python/integrations/chat/openai.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/oss/python/integrations/chat/openai.mdx b/src/oss/python/integrations/chat/openai.mdx
index 67be93f497..b9b7a2c960 100644
--- a/src/oss/python/integrations/chat/openai.mdx
+++ b/src/oss/python/integrations/chat/openai.mdx
@@ -1374,8 +1374,8 @@ llm = ChatOpenAI(model="gpt-4o")
 
 # Use a cache key for repeated prompts
 messages = [
-    ("system", "You are a helpful assistant that translates English to French."),
-    ("human", "I love programming."),
+    {"role": "system", "content": "You are a helpful assistant that translates English to French."},
+    {"role": "user", "content": "I love programming."},
 ]
 
 response = llm.invoke(

From abc073a7059fb0073d68753f8676f6c06447ef96 Mon Sep 17 00:00:00 2001
From: Mason Daugherty
Date: Sat, 22 Nov 2025 02:37:05 -0500
Subject: [PATCH 4/4] Update src/oss/python/integrations/chat/openai.mdx

---
 src/oss/python/integrations/chat/openai.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/oss/python/integrations/chat/openai.mdx b/src/oss/python/integrations/chat/openai.mdx
index b9b7a2c960..3145b1dc45 100644
--- a/src/oss/python/integrations/chat/openai.mdx
+++ b/src/oss/python/integrations/chat/openai.mdx
@@ -1087,7 +1087,7 @@ The user is asking about 3 raised to the power of 3. That's a pretty simple calc
 
 If you're getting empty responses from reasoning models like `gpt-5-nano`, this is likely due to restrictive token limits. The model uses tokens for internal reasoning and may not have any left for the final output.
 
-Set `max_tokens=None` or increase the token limit to allow sufficient tokens for both reasoning and output generation.
+Ensure `max_tokens` is set to `None` or increase the token limit to allow sufficient tokens for both reasoning and output generation.
 
 ---
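
To make the recommendation in PATCH 4 concrete, here is a minimal sketch of the two configurations it describes, assuming `langchain_openai.ChatOpenAI` and the `gpt-5-nano` model name already used on the page; the specific limit of 50 tokens is illustrative. Because `max_tokens` is converted to `max_completion_tokens`, the limit covers both the model's internal reasoning and the visible answer, which is why a tight value can yield an empty response.

```python
from langchain_openai import ChatOpenAI

# A tight limit is shared by hidden reasoning tokens and the visible answer,
# so it can be exhausted before any output is produced (the empty-response symptom).
constrained = ChatOpenAI(model="gpt-5-nano", max_tokens=50)

# Leaving the limit unset (None is the default) or raising it leaves room
# for both the internal reasoning and the final output.
relaxed = ChatOpenAI(model="gpt-5-nano", max_tokens=None)

for llm in (constrained, relaxed):
    reply = llm.invoke("What is 3 to the power of 3?")
    print(f"max_tokens={llm.max_tokens}: {reply.text!r}")
```

Leaving `max_tokens` unset sends no limit to the API, so the model can spend what it needs on reasoning and still return an answer.
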