diff --git a/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift b/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift
index cd01b504..c9b59c68 100644
--- a/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift
+++ b/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift
@@ -62,7 +62,7 @@ class MicrophonePCMSampleVendorAE: MicrophonePCMSampleVendor {
     logger.debug("MicrophonePCMSampleVendorAE is being freed")
   }
 
-  public func start() throws -> AsyncStream<AVAudioPCMBuffer> {
+  func start() throws -> AsyncStream<AVAudioPCMBuffer> {
     guard let desiredTapFormat = AVAudioFormat(
       commonFormat: .pcmFormatInt16,
@@ -87,14 +87,10 @@ class MicrophonePCMSampleVendorAE: MicrophonePCMSampleVendor {
     return AsyncStream { [weak self] continuation in
       guard let this = self else { return }
       this.continuation = continuation
-      this.inputNode.installTap(onBus: 0, bufferSize: targetBufferSize, format: desiredTapFormat) { [weak this] sampleBuffer, _ in
-        if let accumulatedBuffer = this?.microphonePCMSampleVendorCommon.resampleAndAccumulate(sampleBuffer) {
-          // If the buffer has accumulated to a sufficient level, give it back to the caller
-          Task { @RealtimeActor in
-            this?.continuation?.yield(accumulatedBuffer)
-          }
-        }
-      }
+      this.installTapNonIsolated(
+        inputNode: this.inputNode,
+        bufferSize: targetBufferSize,
+        format: desiredTapFormat)
     }
   }
 
@@ -111,5 +107,22 @@ class MicrophonePCMSampleVendorAE: MicrophonePCMSampleVendor {
   private let microphonePCMSampleVendorCommon = MicrophonePCMSampleVendorCommon()
   private var continuation: AsyncStream<AVAudioPCMBuffer>.Continuation?
 
+  private nonisolated func installTapNonIsolated(
+    inputNode: AVAudioInputNode,
+    bufferSize: AVAudioFrameCount,
+    format: AVAudioFormat)
+  {
+    inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: format) { [weak self] sampleBuffer, _ in
+      guard let self else { return }
+      Task { await self.processBuffer(sampleBuffer) }
+    }
+  }
+
+  private func processBuffer(_ buffer: AVAudioPCMBuffer) {
+    if let accumulatedBuffer = microphonePCMSampleVendorCommon.resampleAndAccumulate(buffer) {
+      continuation?.yield(accumulatedBuffer)
+    }
+  }
+
 }
 #endif
diff --git a/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift b/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift
index 1a119634..cdd1ab08 100644
--- a/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift
+++ b/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift
@@ -273,6 +273,62 @@ open class OpenAIRealtimeSession {
         logger.warning("Received response.done with unexpected format")
       }
 
+    case "response.text.delta":
+      if let delta = json["delta"] as? String {
+        continuation?.yield(.responseTextDelta(delta))
+      }
+
+    case "response.text.done":
+      if let text = json["text"] as? String {
+        continuation?.yield(.responseTextDone(text))
+      }
+
+    case "response.output_item.added":
+      if
+        let item = json["item"] as? [String: Any],
+        let itemId = item["id"] as? String,
+        let type = item["type"] as? String
+      {
+        continuation?.yield(.responseOutputItemAdded(itemId: itemId, type: type))
+      }
+
+    case "response.output_item.done":
+      if
+        let item = json["item"] as? [String: Any],
+        let itemId = item["id"] as? String,
+        let type = item["type"] as? String
+      {
+        let content = item["content"] as? [[String: Any]]
+        continuation?.yield(.responseOutputItemDone(itemId: itemId, type: type, content: content))
+      }
+
+    case "response.content_part.added":
+      if
+        let part = json["part"] as? [String: Any],
+        let type = part["type"] as? String
+      {
+        continuation?.yield(.responseContentPartAdded(type: type))
+      }
+
+    case "response.content_part.done":
+      if
+        let part = json["part"] as? [String: Any],
+        let type = part["type"] as? String
+      {
+        let text = part["text"] as? String
+        continuation?.yield(.responseContentPartDone(type: type, text: text))
+      }
+
+    case "conversation.item.created":
+      if
+        let item = json["item"] as? [String: Any],
+        let itemId = item["id"] as? String,
+        let type = item["type"] as? String
+      {
+        let role = item["role"] as? String
+        continuation?.yield(.conversationItemCreated(itemId: itemId, type: type, role: role))
+      }
+
     default:
       // Log unhandled message types with more detail for debugging
       logger.warning("⚠️ Unhandled message type: \(messageType)")
diff --git a/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift b/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift
index e853f8b8..a5e3fc81 100644
--- a/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift
+++ b/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift
@@ -11,7 +11,6 @@
 /// Realtime session configuration
 /// https://platform.openai.com/docs/api-reference/realtime-client-events/session/update#realtime-client-events/session/update-session
 public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable {
-
   public init(
     inputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil,
     inputAudioTranscription: OpenAIRealtimeSessionConfiguration.InputAudioTranscription? = nil,
@@ -41,7 +40,6 @@ public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable {
   }
 
   public enum ToolChoice: Encodable, Sendable {
-
     /// The model will not call any tool and instead generates a message.
     /// This is the default when no tools are present in the request body
     case none
@@ -73,22 +71,14 @@ public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable {
       case .specific(let functionName):
         var container = encoder.container(keyedBy: RootKey.self)
         try container.encode("function", forKey: .type)
-        var functionContainer = container.nestedContainer(
-          keyedBy: FunctionKey.self,
-          forKey: .function)
-        try functionContainer.encode(functionName, forKey: .name)
+        try container.encode(functionName, forKey: .name)
       }
     }
 
     private enum RootKey: CodingKey {
      case type
-     case function
-    }
-
-    private enum FunctionKey: CodingKey {
      case name
     }
-
   }
 
   /// The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
@@ -157,7 +147,6 @@ public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable {
     case turnDetection = "turn_detection"
     case voice
   }
-
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.InputAudioTranscription
@@ -238,7 +227,6 @@ extension OpenAIRealtimeSessionConfiguration {
 
 extension OpenAIRealtimeSessionConfiguration {
   public struct TurnDetection: Encodable, Sendable {
-
     public init(
       type: DetectionType)
     {
@@ -270,7 +258,6 @@ extension OpenAIRealtimeSessionConfiguration {
       case type
       case eagerness
     }
-
   }
 }
@@ -321,6 +308,5 @@ extension OpenAIRealtimeSessionConfiguration.TurnDetection {
       case medium
       case high
     }
-
   }
 }
diff --git a/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift b/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift
index cda21a98..bb551a9c 100644
--- a/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift
+++ b/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift
@@ -29,4 +29,19 @@ public enum OpenAIRealtimeMessage: Sendable {
   /// Response completion with potential errors
   case responseDone(status: String, statusDetails: [String: Any]?) // "response.done"
+
+  // Text streaming (for text-only responses)
+  case responseTextDelta(String) // "response.text.delta"
+  case responseTextDone(String) // "response.text.done"
+
+  // Output item lifecycle
+  case responseOutputItemAdded(itemId: String, type: String) // "response.output_item.added"
+  case responseOutputItemDone(itemId: String, type: String, content: [[String: Any]]?) // "response.output_item.done"
+
+  // Content part lifecycle
+  case responseContentPartAdded(type: String) // "response.content_part.added"
+  case responseContentPartDone(type: String, text: String?) // "response.content_part.done"
+
+  /// Conversation item
+  case conversationItemCreated(itemId: String, type: String, role: String?) // "conversation.item.created"
 }