From ea86a167753e8d3b4fd2382841f6bb47d78d6ee1 Mon Sep 17 00:00:00 2001 From: jamesrochabrun Date: Fri, 14 Nov 2025 11:20:39 -0800 Subject: [PATCH 1/4] Add image input support to Realtime API conversation items MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extend Content type to support both text and image inputs - Add image case with base64 data URL format - Support data:image/{format};base64,{bytes} format - Add flexible Item initializer for multi-content messages 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- ...OpenAIRealtimeConversationItemCreate.swift | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeConversationItemCreate.swift b/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeConversationItemCreate.swift index 55fbb580..2bdef268 100644 --- a/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeConversationItemCreate.swift +++ b/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeConversationItemCreate.swift @@ -30,7 +30,12 @@ extension OpenAIRealtimeConversationItemCreate { public init(role: String, text: String) { self.role = role - content = [.init(text: text)] + content = [.text(text)] + } + + public init(role: String, content: [Content]) { + self.role = role + self.content = content } } } @@ -38,12 +43,26 @@ extension OpenAIRealtimeConversationItemCreate { // MARK: - OpenAIRealtimeConversationItemCreate.Item.Content extension OpenAIRealtimeConversationItemCreate.Item { - public struct Content: Encodable { - public let type = "input_text" - public let text: String + public enum Content: Encodable { + case text(String) + case image(String) // base64 data URL: "data:image/{format};base64,{bytes}" + + public func encode(to encoder: Encoder) throws { + var container = encoder.container(keyedBy: CodingKeys.self) + switch self { + case .text(let text): + try container.encode("input_text", forKey: .type) + try container.encode(text, forKey: .text) + case .image(let imageUrl): + try container.encode("input_image", forKey: .type) + try container.encode(imageUrl, forKey: .imageUrl) + } + } - public init(text: String) { - self.text = text + private enum CodingKeys: String, CodingKey { + case type + case text + case imageUrl = "image_url" } } } From 12a8dc21427686f1130039257c598faeb2da57d0 Mon Sep 17 00:00:00 2001 From: jamesrochabrun Date: Fri, 14 Nov 2025 11:51:04 -0800 Subject: [PATCH 2/4] Add MCP server support to Realtime API session configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add RealtimeTool enum supporting both function and MCP tools - Rename Tool to FunctionTool to avoid naming conflicts - Support Tool.MCPTool from shared Tool enum - Enable MCP server integration in Realtime sessions 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../OpenAIRealtimeSessionConfiguration.swift | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift b/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift index ae8f65e0..e853f8b8 100644 --- a/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift +++ b/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift @@ -21,7 +21,7 @@ public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable { outputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil, speed: Float? = 1.0, temperature: Double? = nil, - tools: [OpenAIRealtimeSessionConfiguration.Tool]? = nil, + tools: [OpenAIRealtimeSessionConfiguration.RealtimeTool]? = nil, toolChoice: OpenAIRealtimeSessionConfiguration.ToolChoice? = nil, turnDetection: OpenAIRealtimeSessionConfiguration.TurnDetection? = nil, voice: String? = nil) @@ -130,8 +130,8 @@ public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable { /// Sampling temperature for the model. public let temperature: Double? - /// Tools (functions) available to the model. - public let tools: [Tool]? + /// Tools (functions and MCP servers) available to the model. + public let tools: [RealtimeTool]? /// How the model chooses tools. Options are "auto", "none", "required", or specify a function. public let toolChoice: ToolChoice? @@ -191,10 +191,10 @@ extension OpenAIRealtimeSessionConfiguration { } } -// MARK: OpenAIRealtimeSessionConfiguration.Tool +// MARK: OpenAIRealtimeSessionConfiguration.FunctionTool extension OpenAIRealtimeSessionConfiguration { - public struct Tool: Encodable, Sendable { + public struct FunctionTool: Encodable, Sendable { /// The description of the function public let description: String @@ -215,6 +215,25 @@ extension OpenAIRealtimeSessionConfiguration { } } +// MARK: OpenAIRealtimeSessionConfiguration.RealtimeTool + +extension OpenAIRealtimeSessionConfiguration { + /// Represents a tool that can be either a function or an MCP server + public enum RealtimeTool: Encodable, Sendable { + case function(FunctionTool) + case mcp(Tool.MCPTool) + + public func encode(to encoder: Encoder) throws { + switch self { + case .function(let tool): + try tool.encode(to: encoder) + case .mcp(let mcpTool): + try mcpTool.encode(to: encoder) + } + } + } +} + // MARK: OpenAIRealtimeSessionConfiguration.TurnDetection extension OpenAIRealtimeSessionConfiguration { From da4200f71efdac230c1f650ea8c3af26a506e32d Mon Sep 17 00:00:00 2001 From: jamesrochabrun Date: Fri, 14 Nov 2025 14:34:33 -0800 Subject: [PATCH 3/4] Add MCP message types and detailed error logging for Realtime API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add mcpListToolsInProgress, mcpListToolsCompleted, mcpListToolsFailed message cases - Implement comprehensive MCP error logging with full JSON payload inspection - Extract error details from nested and top-level fields (message, code, reason) - Add debug logging for MCP tool discovery lifecycle This enables proper MCP (Model Context Protocol) server integration diagnostics and helps identify authentication and configuration issues. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../Realtime/OpenAIRealtimeSession.swift | 36 +++++++++++++++++++ .../Realtime/OpenAIRealtimeMessage.swift | 5 +++ 2 files changed, 41 insertions(+) diff --git a/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift b/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift index 4e309b9c..50578586 100644 --- a/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift +++ b/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift @@ -211,6 +211,42 @@ open class OpenAIRealtimeSession { continuation?.yield(.inputAudioTranscriptionCompleted(transcript)) } + // MCP (Model Context Protocol) message types + case "mcp_list_tools.in_progress": + logger.debug("MCP: Tool discovery in progress") + continuation?.yield(.mcpListToolsInProgress) + + case "mcp_list_tools.completed": + logger.debug("MCP: Tool discovery completed") + if let tools = json["tools"] as? [String: Any] { + continuation?.yield(.mcpListToolsCompleted(tools)) + } else { + continuation?.yield(.mcpListToolsCompleted([:])) + } + + case "mcp_list_tools.failed": + logger.error("MCP: Tool discovery failed") + logger.error("Full JSON payload: \(String(describing: json))") + + let errorDetails = json["error"] as? [String: Any] + let errorMessage = errorDetails?["message"] as? String + let errorCode = errorDetails?["code"] as? String + + // Also check for top-level error fields + let topLevelMessage = json["message"] as? String + let topLevelCode = json["code"] as? String + let topLevelReason = json["reason"] as? String + + let finalMessage = errorMessage ?? topLevelMessage ?? topLevelReason ?? "Unknown MCP error" + let finalCode = errorCode ?? topLevelCode + let fullError = finalCode != nil ? "[\(finalCode!)] \(finalMessage)" : finalMessage + + logger.error("MCP Error: \(fullError)") + logger.error("Error details: \(String(describing: errorDetails))") + logger.error("Top-level fields: message=\(String(describing: topLevelMessage)), code=\(String(describing: topLevelCode)), reason=\(String(describing: topLevelReason))") + + continuation?.yield(.mcpListToolsFailed(fullError)) + default: // Log unhandled message types for debugging logger.debug("Unhandled message type: \(messageType)") diff --git a/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift b/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift index f4dd2f56..2545ee35 100644 --- a/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift +++ b/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift @@ -21,4 +21,9 @@ public enum OpenAIRealtimeMessage: Sendable { case inputAudioBufferTranscript(String) // "input_audio_buffer.transcript" case inputAudioTranscriptionDelta(String) // "conversation.item.input_audio_transcription.delta" case inputAudioTranscriptionCompleted(String) // "conversation.item.input_audio_transcription.completed" + + // MCP (Model Context Protocol) messages + case mcpListToolsInProgress // "mcp_list_tools.in_progress" + case mcpListToolsCompleted([String: Any]) // "mcp_list_tools.completed" with tools data + case mcpListToolsFailed(String?) // "mcp_list_tools.failed" with error details } From 8098d510b5a1c613ea6a4ef819027d1d9a224d8d Mon Sep 17 00:00:00 2001 From: jamesrochabrun Date: Fri, 14 Nov 2025 16:31:14 -0800 Subject: [PATCH 4/4] Fix SwiftFormat lint issues --- Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift | 4 +++- .../Realtime/OpenAIRealtimeConversationItemCreate.swift | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift b/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift index 50578586..bba3befb 100644 --- a/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift +++ b/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift @@ -243,7 +243,9 @@ open class OpenAIRealtimeSession { logger.error("MCP Error: \(fullError)") logger.error("Error details: \(String(describing: errorDetails))") - logger.error("Top-level fields: message=\(String(describing: topLevelMessage)), code=\(String(describing: topLevelCode)), reason=\(String(describing: topLevelReason))") + logger + .error( + "Top-level fields: message=\(String(describing: topLevelMessage)), code=\(String(describing: topLevelCode)), reason=\(String(describing: topLevelReason))") continuation?.yield(.mcpListToolsFailed(fullError)) diff --git a/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeConversationItemCreate.swift b/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeConversationItemCreate.swift index 2bdef268..5f8ef77b 100644 --- a/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeConversationItemCreate.swift +++ b/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeConversationItemCreate.swift @@ -53,6 +53,7 @@ extension OpenAIRealtimeConversationItemCreate.Item { case .text(let text): try container.encode("input_text", forKey: .type) try container.encode(text, forKey: .text) + case .image(let imageUrl): try container.encode("input_image", forKey: .type) try container.encode(imageUrl, forKey: .imageUrl)