From 3001aa5bd70a91cfaeeffaa216af0b3fbb52c481 Mon Sep 17 00:00:00 2001 From: Hubert Date: Sat, 28 Feb 2026 14:13:21 +0800 Subject: [PATCH 01/11] feat: add PaddleOCR as VLM provider with fast/precise modes - Add PaddleOCRVLMProvider implementing VLMProviderProtocol - Support two modes: - Fast mode: paddleocr ocr command (~1s) - Precise mode: paddleocr doc_parser VL-1.5 (~12s) - Add PaddleOCRMode enum with settings UI - Add cloud API configuration options (baseURL, API key) - Fix JSON parsing for numpy arrays and float formats - Update localization (en/zh-Hans) - Fix screen recording permission check using SCShareableContent Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- .../Features/Capture/CaptureManager.swift | 30 ++- .../Features/Settings/EngineSettingsTab.swift | 253 +++++++++++++----- .../Features/Settings/SettingsViewModel.swift | 87 ++++-- ScreenTranslate/Models/AppSettings.swift | 58 ++++ ScreenTranslate/Models/VLMProviderType.swift | 14 +- .../Resources/en.lproj/Localizable.strings | 15 ++ .../zh-Hans.lproj/Localizable.strings | 15 ++ .../Services/PaddleOCREngine.swift | 237 +++++++++++++--- .../Services/PaddleOCRVLMProvider.swift | 189 +++++++++++++ .../Services/ScreenCoderEngine.swift | 2 + 10 files changed, 766 insertions(+), 134 deletions(-) create mode 100644 ScreenTranslate/Services/PaddleOCRVLMProvider.swift diff --git a/ScreenTranslate/Features/Capture/CaptureManager.swift b/ScreenTranslate/Features/Capture/CaptureManager.swift index 814059f..f093b85 100644 --- a/ScreenTranslate/Features/Capture/CaptureManager.swift +++ b/ScreenTranslate/Features/Capture/CaptureManager.swift @@ -43,9 +43,27 @@ actor CaptureManager { // MARK: - Permission Handling /// Checks if the app has screen recording permission. - /// Uses CGPreflightScreenCaptureAccess() which does NOT trigger dialog. - /// - Returns: True if permission is granted + /// Uses SCShareableContent to actually verify permission works (not just cached status). + /// - Returns: True if permission is granted and functional var hasPermission: Bool { + get async { + // Quick check first + guard CGPreflightScreenCaptureAccess() else { + return false + } + // Actually verify by trying to get shareable content + do { + _ = try await SCShareableContent.current + return true + } catch { + return false + } + } + } + + /// Synchronous permission check using only CGPreflightScreenCaptureAccess. + /// Use only when async check is not possible. + var hasPermissionSync: Bool { CGPreflightScreenCaptureAccess() } @@ -70,8 +88,8 @@ actor CaptureManager { isCapturing = true defer { isCapturing = false } - // Check permission - guard hasPermission else { + // Check permission using async method + guard await hasPermission else { throw ScreenTranslateError.permissionDenied } @@ -139,8 +157,8 @@ actor CaptureManager { isCapturing = true defer { isCapturing = false } - // Check permission - guard hasPermission else { + // Check permission using async method + guard await hasPermission else { throw ScreenTranslateError.permissionDenied } diff --git a/ScreenTranslate/Features/Settings/EngineSettingsTab.swift b/ScreenTranslate/Features/Settings/EngineSettingsTab.swift index 7b7b4d1..fee7765 100644 --- a/ScreenTranslate/Features/Settings/EngineSettingsTab.swift +++ b/ScreenTranslate/Features/Settings/EngineSettingsTab.swift @@ -37,98 +37,231 @@ struct VLMConfigurationSection: View { } } .pickerStyle(.segmented) - .frame(maxWidth: 300) + .frame(maxWidth: 400) } + } - GridRow { - Text(localized("settings.vlm.apiKey")) - .foregroundStyle(.secondary) - .gridColumnAlignment(.trailing) - HStack { - if showAPIKey { - TextField("", text: $viewModel.vlmAPIKey) - .textFieldStyle(.roundedBorder) - } else { - SecureField("", text: $viewModel.vlmAPIKey) - .textFieldStyle(.roundedBorder) + // PaddleOCR specific section + if viewModel.vlmProvider == .paddleocr { + PaddleOCRStatusSection(viewModel: viewModel) + } else { + // Standard VLM configuration for API-based providers + Grid(alignment: .leading, horizontalSpacing: 16, verticalSpacing: 12) { + GridRow { + Text(localized("settings.vlm.apiKey")) + .foregroundStyle(.secondary) + .gridColumnAlignment(.trailing) + HStack { + if showAPIKey { + TextField("", text: $viewModel.vlmAPIKey) + .textFieldStyle(.roundedBorder) + } else { + SecureField("", text: $viewModel.vlmAPIKey) + .textFieldStyle(.roundedBorder) + } + Button { + showAPIKey.toggle() + } label: { + Image(systemName: showAPIKey ? "eye.slash" : "eye") + } + .buttonStyle(.borderless) } - Button { - showAPIKey.toggle() - } label: { - Image(systemName: showAPIKey ? "eye.slash" : "eye") + .frame(maxWidth: 300) + } + + if !viewModel.vlmProvider.requiresAPIKey { + GridRow { + Color.clear.gridCellUnsizedAxes([.horizontal, .vertical]) + Text(localized("settings.vlm.apiKey.optional")) + .font(.caption) + .foregroundStyle(.secondary) } - .buttonStyle(.borderless) } - .frame(maxWidth: 300) - } - if !viewModel.vlmProvider.requiresAPIKey { GridRow { - Color.clear.gridCellUnsizedAxes([.horizontal, .vertical]) - Text(localized("settings.vlm.apiKey.optional")) - .font(.caption) + Text(localized("settings.vlm.baseURL")) .foregroundStyle(.secondary) + .gridColumnAlignment(.trailing) + TextField("", text: $viewModel.vlmBaseURL) + .textFieldStyle(.roundedBorder) + .frame(maxWidth: 300) + } + + GridRow { + Text(localized("settings.vlm.model")) + .foregroundStyle(.secondary) + .gridColumnAlignment(.trailing) + TextField("", text: $viewModel.vlmModelName) + .textFieldStyle(.roundedBorder) + .frame(maxWidth: 300) + } + } + + Text(viewModel.vlmProvider.providerDescription) + .font(.caption) + .foregroundStyle(.secondary) + + // Test API Connection Button + HStack { + Button { + viewModel.testVLMAPI() + } label: { + HStack(spacing: 6) { + if viewModel.isTestingVLM { + ProgressView() + .controlSize(.small) + } + Image(systemName: "bolt.fill") + Text(localized("settings.vlm.test.button")) + } + } + .buttonStyle(.bordered) + .controlSize(.small) + .disabled(viewModel.isTestingVLM) + + Spacer() + + if let result = viewModel.vlmTestResult { + HStack(spacing: 4) { + Image(systemName: viewModel.vlmTestSuccess ? "checkmark.circle.fill" : "xmark.circle.fill") + .foregroundStyle(viewModel.vlmTestSuccess ? Color.green : Color.red) + Text(result) + .font(.caption) + .foregroundStyle(viewModel.vlmTestSuccess ? .secondary : Color.red) + .lineLimit(2) + } + } + } + .padding(.top, 8) + } + } + .padding() + .background(Color(.controlBackgroundColor)) + .cornerRadius(8) + } +} + +// MARK: - PaddleOCR Status Section + +struct PaddleOCRStatusSection: View { + @Bindable var viewModel: SettingsViewModel + + var body: some View { + VStack(alignment: .leading, spacing: 12) { + // Status + HStack { + Image(systemName: viewModel.isPaddleOCRInstalled ? "checkmark.circle.fill" : "exclamationmark.triangle.fill") + .foregroundStyle(viewModel.isPaddleOCRInstalled ? .green : .orange) + + if viewModel.isPaddleOCRInstalled { + Text(localized("settings.paddleocr.ready")) + .foregroundStyle(.secondary) + if let version = viewModel.paddleOCRVersion, !version.isEmpty { + Text("(\(version))") + .font(.caption) + .foregroundStyle(.tertiary) } + } else { + Text(localized("settings.paddleocr.not.installed.message")) + .foregroundStyle(.secondary) } + } + // Mode selection + Grid(alignment: .leading, horizontalSpacing: 16, verticalSpacing: 12) { GridRow { - Text(localized("settings.vlm.baseURL")) + Text(localized("settings.paddleocr.mode")) .foregroundStyle(.secondary) .gridColumnAlignment(.trailing) - TextField("", text: $viewModel.vlmBaseURL) - .textFieldStyle(.roundedBorder) - .frame(maxWidth: 300) + Picker("", selection: $viewModel.paddleOCRMode) { + ForEach(PaddleOCRMode.allCases, id: \.self) { mode in + VStack(alignment: .leading) { + Text(mode.localizedName) + }.tag(mode) + } + } + .pickerStyle(.segmented) + .frame(maxWidth: 300) } + // Mode description GridRow { - Text(localized("settings.vlm.model")) + Color.clear.gridCellUnsizedAxes([.horizontal, .vertical]) + Text(viewModel.paddleOCRMode.description) + .font(.caption) + .foregroundStyle(.tertiary) + } + + // Cloud API toggle + GridRow { + Text(localized("settings.paddleocr.useCloud")) .foregroundStyle(.secondary) .gridColumnAlignment(.trailing) - TextField("", text: $viewModel.vlmModelName) - .textFieldStyle(.roundedBorder) - .frame(maxWidth: 300) + Toggle("", isOn: $viewModel.paddleOCRUseCloud) + .toggleStyle(.checkbox) + } + + // Cloud API settings (only show when useCloud is true) + if viewModel.paddleOCRUseCloud { + GridRow { + Text(localized("settings.paddleocr.cloudBaseURL")) + .foregroundStyle(.secondary) + .gridColumnAlignment(.trailing) + TextField("", text: $viewModel.paddleOCRCloudBaseURL) + .textFieldStyle(.roundedBorder) + .frame(maxWidth: 300) + } + + GridRow { + Text(localized("settings.paddleocr.cloudAPIKey")) + .foregroundStyle(.secondary) + .gridColumnAlignment(.trailing) + SecureField("", text: $viewModel.paddleOCRCloudAPIKey) + .textFieldStyle(.roundedBorder) + .frame(maxWidth: 300) + } } } - Text(viewModel.vlmProvider.providerDescription) + // Description + Text(localized("settings.paddleocr.description")) .font(.caption) - .foregroundStyle(.secondary) + .foregroundStyle(.tertiary) - // Test API Connection Button - HStack { - Button { - viewModel.testVLMAPI() - } label: { - HStack(spacing: 6) { - if viewModel.isTestingVLM { + // Install instructions or button + if !viewModel.isPaddleOCRInstalled { + VStack(alignment: .leading, spacing: 8) { + if viewModel.isInstallingPaddleOCR { + HStack { ProgressView() .controlSize(.small) + Text(localized("settings.paddleocr.installing")) + .foregroundStyle(.secondary) } - Image(systemName: "bolt.fill") - Text(localized("settings.vlm.test.button")) - } - } - .buttonStyle(.bordered) - .controlSize(.small) - .disabled(viewModel.isTestingVLM) + } else { + HStack(spacing: 12) { + Button(localized("settings.paddleocr.install.button")) { + viewModel.installPaddleOCR() + } + .buttonStyle(.bordered) + .controlSize(.small) - Spacer() + Button(localized("settings.paddleocr.copy.command.button")) { + viewModel.copyPaddleOCRInstallCommand() + } + .buttonStyle(.borderless) + .controlSize(.small) + } - if let result = viewModel.vlmTestResult { - HStack(spacing: 4) { - Image(systemName: viewModel.vlmTestSuccess ? "checkmark.circle.fill" : "xmark.circle.fill") - .foregroundStyle(viewModel.vlmTestSuccess ? Color.green : Color.red) - Text(result) - .font(.caption) - .foregroundStyle(viewModel.vlmTestSuccess ? .secondary : Color.red) - .lineLimit(2) + if let error = viewModel.paddleOCRInstallError { + Text(error) + .font(.caption) + .foregroundStyle(.red) + } } } } - .padding(.top, 8) } - .padding() - .background(Color(.controlBackgroundColor)) - .cornerRadius(8) + .padding(.top, 8) } } diff --git a/ScreenTranslate/Features/Settings/SettingsViewModel.swift b/ScreenTranslate/Features/Settings/SettingsViewModel.swift index 22e3832..b29c2f7 100644 --- a/ScreenTranslate/Features/Settings/SettingsViewModel.swift +++ b/ScreenTranslate/Features/Settings/SettingsViewModel.swift @@ -100,6 +100,32 @@ final class SettingsViewModel { /// PaddleOCR version if installed var paddleOCRVersion: String? + // MARK: - PaddleOCR Settings + + /// PaddleOCR mode: fast or precise + var paddleOCRMode: PaddleOCRMode { + get { settings.paddleOCRMode } + set { settings.paddleOCRMode = newValue } + } + + /// Whether to use cloud API + var paddleOCRUseCloud: Bool { + get { settings.paddleOCRUseCloud } + set { settings.paddleOCRUseCloud = newValue } + } + + /// Cloud API base URL + var paddleOCRCloudBaseURL: String { + get { settings.paddleOCRCloudBaseURL } + set { settings.paddleOCRCloudBaseURL = newValue } + } + + /// Cloud API key + var paddleOCRCloudAPIKey: String { + get { settings.paddleOCRCloudAPIKey } + set { settings.paddleOCRCloudAPIKey = newValue } + } + // MARK: - VLM Test State /// Whether VLM API test is in progress @@ -375,34 +401,31 @@ final class SettingsViewModel { // Check folder access permission by testing if we can write to the save location hasFolderAccessPermission = checkFolderAccess(to: saveLocation) - // Check screen recording permission - // Try CGPreflightScreenCaptureAccess first, then fallback to window count check - hasScreenRecordingPermission = checkScreenRecordingPermission() - - isCheckingPermissions = false + // Check screen recording permission using ScreenCaptureKit + Task { + let granted = await checkScreenRecordingPermission() + await MainActor.run { + self.hasScreenRecordingPermission = granted + self.isCheckingPermissions = false + } + } } - /// Checks screen recording permission using multiple methods for reliability - private func checkScreenRecordingPermission() -> Bool { - // Method 1: CGPreflightScreenCaptureAccess (may not work in all cases) - if CGPreflightScreenCaptureAccess() { - return true + /// Checks screen recording permission using ScreenCaptureKit for reliable detection + private func checkScreenRecordingPermission() async -> Bool { + // First do a quick check with CGPreflightScreenCaptureAccess + if !CGPreflightScreenCaptureAccess() { + return false } - - // Method 2: Check if we can see windows from other apps - // If we have permission, we should see windows from other apps - let windowList = CGWindowListCopyWindowInfo([.optionOnScreenOnly], kCGNullWindowID) as? [[String: Any]] ?? [] - let ownPID = ProcessInfo.processInfo.processIdentifier - - // Count windows from other processes - let otherAppWindows = windowList.filter { window in - guard let ownerPID = window[kCGWindowOwnerPID as String] as? Int32 else { return false } - return ownerPID != ownPID + + // Verify by actually trying to get shareable content + // This ensures permission is truly granted (not just cached) + do { + _ = try await SCShareableContent.current + return true + } catch { + return false } - - // If we can see windows from other apps, we likely have permission - // (There should be at least a few windows from Finder, Dock, etc.) - return otherAppWindows.count > 3 } /// Checks if we have write access to the specified folder @@ -486,8 +509,8 @@ final class SettingsViewModel { switch type { case .screenRecording: - // Use CGPreflightScreenCaptureAccess to check without triggering dialog - let granted = CGPreflightScreenCaptureAccess() + // Use the same reliable check method + let granted = await checkScreenRecordingPermission() if granted { hasScreenRecordingPermission = true permissionCheckTask = nil @@ -871,6 +894,18 @@ final class SettingsViewModel { return try await testClaudeConnection(baseURL: baseURL, apiKey: apiKey, modelName: modelName) case .ollama: return try await testOllamaConnection(baseURL: baseURL, modelName: modelName) + case .paddleocr: + return try await testPaddleOCRConnection() + } + } + + /// Tests PaddleOCR availability + private func testPaddleOCRConnection() async throws -> (success: Bool, message: String) { + let isAvailable = await PaddleOCREngine.shared.isAvailable + if isAvailable { + return (true, "PaddleOCR is ready") + } else { + throw VLMProviderError.invalidConfiguration("PaddleOCR is not installed") } } diff --git a/ScreenTranslate/Models/AppSettings.swift b/ScreenTranslate/Models/AppSettings.swift index 359ef81..e39a1fa 100644 --- a/ScreenTranslate/Models/AppSettings.swift +++ b/ScreenTranslate/Models/AppSettings.swift @@ -2,6 +2,30 @@ import Foundation import SwiftUI import os +/// PaddleOCR mode selection +enum PaddleOCRMode: String, Codable, CaseIterable, Sendable { + case fast = "fast" + case precise = "precise" + + var localizedName: String { + switch self { + case .fast: + return NSLocalizedString("settings.paddleocr.mode.fast", comment: "Fast mode") + case .precise: + return NSLocalizedString("settings.paddleocr.mode.precise", comment: "Precise mode") + } + } + + var description: String { + switch self { + case .fast: + return NSLocalizedString("settings.paddleocr.mode.fast.description", comment: "~1s, uses groupIntoLines") + case .precise: + return NSLocalizedString("settings.paddleocr.mode.precise.description", comment: "~12s, VL-1.5 model") + } + } +} + /// User preferences persisted across sessions via UserDefaults. /// All properties automatically sync to UserDefaults with the `ScreenTranslate.` prefix. @MainActor @@ -58,6 +82,11 @@ final class AppSettings { static let sceneBindings = prefix + "sceneBindings" static let parallelEngines = prefix + "parallelEngines" static let compatibleProviderConfigs = prefix + "compatibleProviderConfigs" + // PaddleOCR Configuration + static let paddleOCRMode = prefix + "paddleOCRMode" + static let paddleOCRUseCloud = prefix + "paddleOCRUseCloud" + static let paddleOCRCloudBaseURL = prefix + "paddleOCRCloudBaseURL" + static let paddleOCRCloudAPIKey = prefix + "paddleOCRCloudAPIKey" } // MARK: - Properties @@ -262,6 +291,28 @@ final class AppSettings { didSet { saveCompatibleConfigs() } } + // MARK: - PaddleOCR Configuration + + /// PaddleOCR mode: fast (ocr command) or precise (doc_parser VL-1.5) + var paddleOCRMode: PaddleOCRMode { + didSet { save(paddleOCRMode.rawValue, forKey: Keys.paddleOCRMode) } + } + + /// Whether to use cloud API instead of local CLI + var paddleOCRUseCloud: Bool { + didSet { save(paddleOCRUseCloud, forKey: Keys.paddleOCRUseCloud) } + } + + /// Cloud API base URL (for third-party PaddleOCR cloud services) + var paddleOCRCloudBaseURL: String { + didSet { save(paddleOCRCloudBaseURL, forKey: Keys.paddleOCRCloudBaseURL) } + } + + /// Cloud API key + var paddleOCRCloudAPIKey: String { + didSet { save(paddleOCRCloudAPIKey, forKey: Keys.paddleOCRCloudAPIKey) } + } + // MARK: - Initialization private init() { @@ -358,6 +409,13 @@ final class AppSettings { parallelEngines = Self.loadParallelEngines() compatibleProviderConfigs = Self.loadCompatibleConfigs() + // Load PaddleOCR configuration + paddleOCRMode = defaults.string(forKey: Keys.paddleOCRMode) + .flatMap { PaddleOCRMode(rawValue: $0) } ?? .fast + paddleOCRUseCloud = defaults.object(forKey: Keys.paddleOCRUseCloud) as? Bool ?? false + paddleOCRCloudBaseURL = defaults.string(forKey: Keys.paddleOCRCloudBaseURL) ?? "" + paddleOCRCloudAPIKey = defaults.string(forKey: Keys.paddleOCRCloudAPIKey) ?? "" + Logger.settings.info("ScreenCapture launched - settings loaded from: \(loadedLocation.path)") } diff --git a/ScreenTranslate/Models/VLMProviderType.swift b/ScreenTranslate/Models/VLMProviderType.swift index e4cc19a..bf35e38 100644 --- a/ScreenTranslate/Models/VLMProviderType.swift +++ b/ScreenTranslate/Models/VLMProviderType.swift @@ -12,6 +12,7 @@ enum VLMProviderType: String, CaseIterable, Sendable, Codable, Identifiable { case openai = "openai" case claude = "claude" case ollama = "ollama" + case paddleocr = "paddleocr" var id: String { rawValue } @@ -24,6 +25,8 @@ enum VLMProviderType: String, CaseIterable, Sendable, Codable, Identifiable { return NSLocalizedString("vlm.provider.claude", comment: "Claude") case .ollama: return NSLocalizedString("vlm.provider.ollama", comment: "Ollama") + case .paddleocr: + return NSLocalizedString("vlm.provider.paddleocr", comment: "PaddleOCR") } } @@ -45,6 +48,11 @@ enum VLMProviderType: String, CaseIterable, Sendable, Codable, Identifiable { "vlm.provider.ollama.description", comment: "Local Ollama server" ) + case .paddleocr: + return NSLocalizedString( + "vlm.provider.paddleocr.description", + comment: "Local OCR engine (free, offline)" + ) } } @@ -57,6 +65,8 @@ enum VLMProviderType: String, CaseIterable, Sendable, Codable, Identifiable { return "https://api.anthropic.com/v1" case .ollama: return "http://localhost:11434" + case .paddleocr: + return "" } } @@ -69,6 +79,8 @@ enum VLMProviderType: String, CaseIterable, Sendable, Codable, Identifiable { return "claude-sonnet-4-20250514" case .ollama: return "llava" + case .paddleocr: + return "" } } @@ -77,7 +89,7 @@ enum VLMProviderType: String, CaseIterable, Sendable, Codable, Identifiable { switch self { case .openai, .claude: return true - case .ollama: + case .ollama, .paddleocr: return false } } diff --git a/ScreenTranslate/Resources/en.lproj/Localizable.strings b/ScreenTranslate/Resources/en.lproj/Localizable.strings index 619dd4e..dd7c874 100644 --- a/ScreenTranslate/Resources/en.lproj/Localizable.strings +++ b/ScreenTranslate/Resources/en.lproj/Localizable.strings @@ -581,6 +581,19 @@ "settings.paddleocr.install.hint" = "Requires Python 3 and pip installed on your system."; "settings.paddleocr.copy.command" = "Copy Command"; "settings.paddleocr.refresh" = "Refresh Status"; +"settings.paddleocr.ready" = "PaddleOCR is ready"; +"settings.paddleocr.not.installed.message" = "PaddleOCR is not installed"; +"settings.paddleocr.description" = "PaddleOCR is a local OCR engine. It's free, works offline, and doesn't require an API key."; +"settings.paddleocr.install.button" = "Install PaddleOCR"; +"settings.paddleocr.copy.command.button" = "Copy Install Command"; +"settings.paddleocr.mode" = "Mode"; +"settings.paddleocr.mode.fast" = "Fast"; +"settings.paddleocr.mode.precise" = "Precise"; +"settings.paddleocr.mode.fast.description" = "~1s, fast OCR with line grouping"; +"settings.paddleocr.mode.precise.description" = "~12s, VL-1.5 model with higher accuracy"; +"settings.paddleocr.useCloud" = "Use Cloud API"; +"settings.paddleocr.cloudBaseURL" = "Cloud API URL"; +"settings.paddleocr.cloudAPIKey" = "API Key"; /* ======================================== @@ -601,9 +614,11 @@ "vlm.provider.openai" = "OpenAI"; "vlm.provider.claude" = "Claude"; "vlm.provider.ollama" = "Ollama"; +"vlm.provider.paddleocr" = "PaddleOCR"; "vlm.provider.openai.description" = "OpenAI GPT-4 Vision API"; "vlm.provider.claude.description" = "Anthropic Claude Vision API"; "vlm.provider.ollama.description" = "Local Ollama server"; +"vlm.provider.paddleocr.description" = "Local OCR engine (free, offline)"; /* ======================================== diff --git a/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings b/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings index bcb5432..f573774 100644 --- a/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings +++ b/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings @@ -581,6 +581,19 @@ "settings.paddleocr.install.hint" = "需要在系统上安装 Python 3 和 pip。"; "settings.paddleocr.copy.command" = "复制命令"; "settings.paddleocr.refresh" = "刷新状态"; +"settings.paddleocr.ready" = "PaddleOCR 已就绪"; +"settings.paddleocr.not.installed.message" = "PaddleOCR 未安装"; +"settings.paddleocr.description" = "PaddleOCR 是本地 OCR 引擎。免费、离线可用,无需 API 密钥。"; +"settings.paddleocr.install.button" = "安装 PaddleOCR"; +"settings.paddleocr.copy.command.button" = "复制安装命令"; +"settings.paddleocr.mode" = "模式"; +"settings.paddleocr.mode.fast" = "快速"; +"settings.paddleocr.mode.precise" = "精确"; +"settings.paddleocr.mode.fast.description" = "~1秒,快速 OCR 并自动合并行"; +"settings.paddleocr.mode.precise.description" = "~12秒,VL-1.5 模型,更高精度"; +"settings.paddleocr.useCloud" = "使用云端 API"; +"settings.paddleocr.cloudBaseURL" = "云端 API 地址"; +"settings.paddleocr.cloudAPIKey" = "API 密钥"; /* ======================================== @@ -601,9 +614,11 @@ "vlm.provider.openai" = "OpenAI"; "vlm.provider.claude" = "Claude"; "vlm.provider.ollama" = "Ollama"; +"vlm.provider.paddleocr" = "PaddleOCR"; "vlm.provider.openai.description" = "OpenAI GPT-4 Vision API"; "vlm.provider.claude.description" = "Anthropic Claude Vision API"; "vlm.provider.ollama.description" = "本地 Ollama 服务器"; +"vlm.provider.paddleocr.description" = "本地 OCR 引擎(免费、离线可用)"; /* ======================================== diff --git a/ScreenTranslate/Services/PaddleOCREngine.swift b/ScreenTranslate/Services/PaddleOCREngine.swift index 96dff27..a4e6569 100644 --- a/ScreenTranslate/Services/PaddleOCREngine.swift +++ b/ScreenTranslate/Services/PaddleOCREngine.swift @@ -42,12 +42,28 @@ actor PaddleOCREngine { /// Detection model type var detectionModel: DetectionModel + /// OCR mode: fast (ocr command) or precise (doc_parser VL-1.5) + var mode: PaddleOCRMode + + /// Whether to use cloud API + var useCloud: Bool + + /// Cloud API base URL + var cloudBaseURL: String + + /// Cloud API key + var cloudAPIKey: String + static let `default` = Configuration( languages: [.chinese, .english], minimumConfidence: 0.0, useGPU: false, useDirectionClassify: true, - detectionModel: .default + detectionModel: .default, + mode: .fast, + useCloud: false, + cloudBaseURL: "", + cloudAPIKey: "" ) } @@ -140,7 +156,7 @@ actor PaddleOCREngine { let result = try await executePaddleOCR(arguments: arguments) // Parse output - let observations = try parsePaddleOCROutput(result, imageSize: CGSize(width: image.width, height: image.height)) + let observations = try parsePaddleOCROutput(result, imageSize: CGSize(width: image.width, height: image.height), mode: config.mode) // Filter by confidence let filteredTexts = observations.filter { $0.confidence >= config.minimumConfidence } @@ -203,18 +219,25 @@ actor PaddleOCREngine { /// Builds command line arguments for PaddleOCR private func buildArguments(config: Configuration, imagePath: String) -> [String] { - var args = [ - "ocr", - "-i", imagePath, - "--lang", "ch" - ] - - if config.useGPU { - args.append("--device") - args.append("gpu") + switch config.mode { + case .fast: + // Fast mode: use ocr command (~1s) + let langCode = config.languages.contains(.chinese) ? "ch" : "en" + return [ + "ocr", + "-i", imagePath, + "--lang", langCode, + "--use_angle_cls", config.useDirectionClassify ? "true" : "false" + ] + case .precise: + // Precise mode: use doc_parser with VL-1.5 (~12s) + return [ + "doc_parser", + "-i", imagePath, + "--pipeline_version", "v1.5", + "--device", config.useGPU ? "gpu" : "cpu" + ] } - - return args } /// Executes PaddleOCR with the given arguments @@ -298,8 +321,8 @@ actor PaddleOCREngine { return nil } - /// Parses PaddleOCR JSON output into OCRText observations - private func parsePaddleOCROutput(_ output: String, imageSize: CGSize) throws -> [OCRText] { + /// Parses PaddleOCR output into OCRText observations + private func parsePaddleOCROutput(_ output: String, imageSize: CGSize, mode: PaddleOCRMode) throws -> [OCRText] { var observations: [OCRText] = [] guard let startIndex = output.firstIndex(of: "{"), @@ -310,36 +333,150 @@ actor PaddleOCREngine { let jsonLike = String(output[startIndex...endIndex]) let cleanedJson = convertPythonDictToJson(jsonLike) - + Logger.ocr.debug("Cleaned JSON: \(cleanedJson.prefix(500))") - guard let jsonData = cleanedJson.data(using: .utf8), - let json = try? JSONSerialization.jsonObject(with: jsonData) as? [String: Any], - let res = json["res"] as? [String: Any] else { - Logger.ocr.error("Failed to parse JSON") + guard let jsonData = cleanedJson.data(using: .utf8) else { + Logger.ocr.error("Failed to convert cleaned JSON to data") + return observations + } + + // Try to parse JSON and log detailed error + var json: [String: Any]? + do { + json = try JSONSerialization.jsonObject(with: jsonData) as? [String: Any] + } catch { + Logger.ocr.error("JSON parse error: \(error.localizedDescription)") + // Log the problematic JSON (last 1000 chars to find the issue) + if let jsonStr = String(data: jsonData, encoding: .utf8) { + Logger.ocr.error("JSON end portion: ...\(jsonStr.suffix(500))") + } + return observations + } + + guard let json = json else { + Logger.ocr.error("Failed to parse JSON as dictionary") + return observations + } + + guard let res = json["res"] as? [String: Any] else { + Logger.ocr.error("No 'res' key in JSON. Keys: \(json.keys.joined(separator: ", "))") return observations } + switch mode { + case .fast: + // Fast mode: parse rec_texts format + observations = try parseFastModeOutput(res: res, imageSize: imageSize) + case .precise: + // Precise mode: parse doc_parser output format: parsing_res_list + observations = try parsePreciseModeOutput(res: res, imageSize: imageSize) + } + + return observations + } + + /// Parse fast mode output (ocr command) + private func parseFastModeOutput(res: [String: Any], imageSize: CGSize) throws -> [OCRText] { + var observations: [OCRText] = [] + + // Fast mode output has parallel arrays: rec_texts, rec_scores, rec_boxes guard let recTexts = res["rec_texts"] as? [String] else { - Logger.ocr.error("No rec_texts found") + Logger.ocr.error("No rec_texts found in fast mode output. Keys: \(res.keys.joined(separator: ", "))") return observations } - - let recScores = res["rec_scores"] as? [Double] ?? [] - let recBoxes = res["rec_boxes"] as? [[Int]] ?? [] - - Logger.ocr.info("Found \(recTexts.count) texts, \(recBoxes.count) boxes") + + // Get rec_boxes and rec_scores (optional) + let recBoxes = res["rec_boxes"] as? [[Double]] + let recScores = res["rec_scores"] as? [Double] + + Logger.ocr.info("Found \(recTexts.count) text blocks from fast mode") for (index, text) in recTexts.enumerated() { - let confidence = index < recScores.count ? Float(recScores[index]) : 0.5 - + guard !text.isEmpty else { continue } + + // Get bounding box from rec_boxes (format: [[x1, y1, x2, y2], ...]) + var boundingBox: CGRect + if let boxes = recBoxes, index < boxes.count { + let box = boxes[index] + if box.count >= 4 { + let x = CGFloat(box[0]) + let y = CGFloat(box[1]) + let x2 = CGFloat(box[2]) + let y2 = CGFloat(box[3]) + boundingBox = CGRect( + x: x / imageSize.width, + y: y / imageSize.height, + width: (x2 - x) / imageSize.width, + height: (y2 - y) / imageSize.height + ) + } else { + boundingBox = CGRect(x: 0, y: CGFloat(index) * 0.1, width: 1, height: 0.1) + } + } else { + // Fallback: stack vertically + boundingBox = CGRect(x: 0, y: CGFloat(index) * 0.1, width: 1, height: 0.1) + } + + // Get confidence from rec_scores + let confidence: Float + if let scores = recScores, index < scores.count { + confidence = Float(scores[index]) + } else { + confidence = 0.9 + } + + let observation = OCRText( + text: text, + boundingBox: boundingBox, + confidence: confidence + ) + observations.append(observation) + Logger.ocr.debug("Fast mode block: '\(text)', box: \(String(describing: boundingBox))") + } + + return observations + } + + /// Parse precise mode output (doc_parser VL-1.5) + private func parsePreciseModeOutput(res: [String: Any], imageSize: CGSize) throws -> [OCRText] { + var observations: [OCRText] = [] + + // Log all keys in res for debugging + Logger.ocr.info("Precise mode res keys: \(res.keys.joined(separator: ", "))") + + guard let parsingResList = res["parsing_res_list"] as? [[String: Any]] else { + Logger.ocr.error("No parsing_res_list found in res. Available keys: \(res.keys.joined(separator: ", "))") + // Try to log the raw res for debugging + if let resData = try? JSONSerialization.data(withJSONObject: res), + let resStr = String(data: resData, encoding: .utf8) { + Logger.ocr.debug("Raw res content: \(resStr.prefix(1000))") + } + return observations + } + + Logger.ocr.info("Found \(parsingResList.count) blocks from doc_parser") + + for (index, block) in parsingResList.enumerated() { + guard let text = block["block_content"] as? String else { + continue + } + + // Skip non-text blocks (charts, seals, images, etc.) + if let label = block["block_label"] as? String { + let skipLabels = ["chart", "seal", "image", "table", "figure"] + if skipLabels.contains(where: { label.lowercased().contains($0) }) { + Logger.ocr.debug("Skipping non-text block: \(label)") + continue + } + } + var boundingBox: CGRect - if index < recBoxes.count && recBoxes[index].count >= 4 { - let box = recBoxes[index] - let x = CGFloat(box[0]) - let y = CGFloat(box[1]) - let x2 = CGFloat(box[2]) - let y2 = CGFloat(box[3]) + if let bbox = block["block_bbox"] as? [Double], bbox.count >= 4 { + let x = CGFloat(bbox[0]) + let y = CGFloat(bbox[1]) + let x2 = CGFloat(bbox[2]) + let y2 = CGFloat(bbox[3]) boundingBox = CGRect( x: x / imageSize.width, y: y / imageSize.height, @@ -349,14 +486,17 @@ actor PaddleOCREngine { } else { boundingBox = CGRect(x: 0, y: CGFloat(index) * 0.1, width: 1, height: 0.1) } - + + // doc_parser doesn't provide confidence scores per block, use default + let confidence: Float = 0.9 + let observation = OCRText( text: text, boundingBox: boundingBox, confidence: confidence ) observations.append(observation) - Logger.ocr.debug("Text: '\(text)', box: \(String(describing: boundingBox)), confidence: \(confidence)") + Logger.ocr.debug("Block: '\(text)', box: \(String(describing: boundingBox))") } return observations @@ -371,6 +511,13 @@ actor PaddleOCREngine { result = convertNumpyArraysToJson(result) + // Fix float format: "8." -> "8.0", "-5." -> "-5.0" (valid JSON) + let floatPattern = #"(-?\d+)\.\s*([,\]\}])"# + if let regex = try? NSRegularExpression(pattern: floatPattern) { + let range = NSRange(result.startIndex..., in: result) + result = regex.stringByReplacingMatches(in: result, options: [], range: range, withTemplate: "$1.0$2") + } + return result } @@ -409,26 +556,34 @@ actor PaddleOCREngine { private func extractArrayContent(from arrayContent: String) -> String { var content = arrayContent - + + // Remove shape and dtype info if let shapeRange = content.range(of: ", shape=") { content = String(content[.. ScreenAnalysisResult { + // Check availability + guard await PaddleOCREngine.shared.isAvailable else { + throw VLMProviderError.invalidConfiguration( + "PaddleOCR is not installed. Install it using: pip3 install paddleocr paddlepaddle" + ) + } + + // Build configuration from AppSettings + let config = await buildConfiguration() + + // Perform OCR using PaddleOCREngine with settings + let ocrResult = try await PaddleOCREngine.shared.recognize(image, config: config) + + // Convert OCRResult to ScreenAnalysisResult + return convertToScreenAnalysisResult(ocrResult, mode: config.mode) + } + + // MARK: - Private Methods + + @MainActor + private func buildConfiguration() -> PaddleOCREngine.Configuration { + let settings = AppSettings.shared + var config = PaddleOCREngine.Configuration.default + config.mode = settings.paddleOCRMode + config.useCloud = settings.paddleOCRUseCloud + config.cloudBaseURL = settings.paddleOCRCloudBaseURL + config.cloudAPIKey = settings.paddleOCRCloudAPIKey + return config + } + + private func convertToScreenAnalysisResult(_ ocrResult: OCRResult, mode: PaddleOCRMode) -> ScreenAnalysisResult { + // For precise mode (doc_parser), the output is already in block format, no need to group + // For fast mode (ocr command), we need to group into lines + let segments: [TextSegment] + switch mode { + case .precise: + // Precise mode: already in block format, convert directly + segments = ocrResult.observations.map { observation in + TextSegment( + text: observation.text, + boundingBox: observation.boundingBox, + confidence: observation.confidence + ) + } + case .fast: + // Fast mode: group into lines based on vertical position + let lines = groupIntoLines(ocrResult.observations, imageSize: ocrResult.imageSize) + segments = lines.map { line -> TextSegment in + TextSegment( + text: line.text, + boundingBox: line.boundingBox, + confidence: line.confidence + ) + } + } + + return ScreenAnalysisResult( + segments: segments, + imageSize: ocrResult.imageSize + ) + } + + /// Groups OCR texts into lines based on vertical position overlap + private func groupIntoLines(_ observations: [OCRText], imageSize: CGSize) -> [MergedLine] { + guard !observations.isEmpty else { return [] } + + // Sort by Y position (top to bottom), then by X position (left to right) + let sortedObservations = observations.sorted { a, b in + let yTolerance = min(a.boundingBox.height, b.boundingBox.height) * 0.5 + if abs(a.boundingBox.minY - b.boundingBox.minY) > yTolerance { + return a.boundingBox.minY < b.boundingBox.minY + } + return a.boundingBox.minX < b.boundingBox.minX + } + + var lines: [MergedLine] = [] + var currentLine: MergedLine? + + for observation in sortedObservations { + if let line = currentLine { + // Check if this observation is on the same line (Y position overlap) + let yOverlap = max(0, + min(line.boundingBox.maxY, observation.boundingBox.maxY) - + max(line.boundingBox.minY, observation.boundingBox.minY) + ) + let minHeight = min(line.boundingBox.height, observation.boundingBox.height) + + // If there's significant Y overlap, add to current line + if yOverlap > minHeight * 0.3 { + currentLine = line.merged(with: observation) + } else { + // Start a new line + lines.append(line) + currentLine = MergedLine(from: observation) + } + } else { + currentLine = MergedLine(from: observation) + } + } + + // Don't forget the last line + if let line = currentLine { + lines.append(line) + } + + return lines + } +} + +/// Helper struct to merge OCR texts into lines +private struct MergedLine { + let text: String + let boundingBox: CGRect + let confidence: Float + + init(text: String, boundingBox: CGRect, confidence: Float) { + self.text = text + self.boundingBox = boundingBox + self.confidence = confidence + } + + init(from observation: OCRText) { + self.text = observation.text + self.boundingBox = observation.boundingBox + self.confidence = observation.confidence + } + + func merged(with other: OCRText) -> MergedLine { + // Combine texts with space + let combinedText = text + " " + other.text + + // Merge bounding boxes + let mergedBox = boundingBox.union(other.boundingBox) + + // Average confidence weighted by text length + let totalLength = text.count + other.text.count + let weightedConfidence = ( + Float(text.count) * confidence + + Float(other.text.count) * other.confidence + ) / Float(totalLength) + + return MergedLine( + text: combinedText, + boundingBox: mergedBox, + confidence: weightedConfidence + ) + } +} diff --git a/ScreenTranslate/Services/ScreenCoderEngine.swift b/ScreenTranslate/Services/ScreenCoderEngine.swift index 20025d1..88e9cef 100644 --- a/ScreenTranslate/Services/ScreenCoderEngine.swift +++ b/ScreenTranslate/Services/ScreenCoderEngine.swift @@ -161,6 +161,8 @@ actor ScreenCoderEngine { return ClaudeVLMProvider(configuration: configuration) case .ollama: return OllamaVLMProvider(configuration: configuration) + case .paddleocr: + return PaddleOCRVLMProvider() } } From e4cae4c3a7ccb7d24b18cf8e3060b9819c0397e2 Mon Sep 17 00:00:00 2001 From: Hubert Date: Sat, 28 Feb 2026 14:26:37 +0800 Subject: [PATCH 02/11] fix: address code review feedback on PR #49 - Remove redundant raw values from PaddleOCRMode enum - Add PaddleOCR settings reset in resetToDefaults() - Remove redundant MainActor.run wrapper in SettingsViewModel - Add static defaultBaseURL constant to avoid force unwrap - Add CJK-aware separator in merged(with:) for proper spacing - Replace window counting heuristic with async SCShareableContent check in OnboardingViewModel Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- .../Onboarding/OnboardingViewModel.swift | 38 ++++++++---------- .../Features/Settings/SettingsViewModel.swift | 6 +-- ScreenTranslate/Models/AppSettings.swift | 9 ++++- .../Services/PaddleOCRVLMProvider.swift | 40 ++++++++++++++++--- 4 files changed, 60 insertions(+), 33 deletions(-) diff --git a/ScreenTranslate/Features/Onboarding/OnboardingViewModel.swift b/ScreenTranslate/Features/Onboarding/OnboardingViewModel.swift index f36ba12..622ffe7 100644 --- a/ScreenTranslate/Features/Onboarding/OnboardingViewModel.swift +++ b/ScreenTranslate/Features/Onboarding/OnboardingViewModel.swift @@ -144,30 +144,26 @@ final class OnboardingViewModel { func checkPermissions() { hasAccessibilityPermission = AccessibilityPermissionChecker.hasPermission - // Check screen recording permission using multiple methods for reliability - hasScreenRecordingPermission = checkScreenRecordingPermission() + // Check screen recording permission using async method + Task { + hasScreenRecordingPermission = await checkScreenRecordingPermission() + } } - /// Checks screen recording permission using multiple methods for reliability - private func checkScreenRecordingPermission() -> Bool { - // Method 1: CGPreflightScreenCaptureAccess (may not work in all cases) - if CGPreflightScreenCaptureAccess() { - return true + /// Checks screen recording permission using ScreenCaptureKit for reliable detection + private func checkScreenRecordingPermission() async -> Bool { + // First do a quick check with CGPreflightScreenCaptureAccess + if !CGPreflightScreenCaptureAccess() { + return false } - // Method 2: Check if we can see windows from other apps - // If we have permission, we should see windows from other apps - let windowList = CGWindowListCopyWindowInfo([.optionOnScreenOnly], kCGNullWindowID) as? [[String: Any]] ?? [] - let ownPID = ProcessInfo.processInfo.processIdentifier - - // Count windows from other processes - let otherAppWindows = windowList.filter { window in - guard let ownerPID = window[kCGWindowOwnerPID as String] as? Int32 else { return false } - return ownerPID != ownPID + // Verify by actually trying to get shareable content + do { + _ = try await SCShareableContent.current + return true + } catch { + return false } - - // If we can see windows from other apps, we likely have permission - return otherAppWindows.count > 3 } /// Requests screen recording permission @@ -234,8 +230,8 @@ final class OnboardingViewModel { switch type { case .screenRecording: - // Use multiple methods to check permission without triggering dialog - let granted = checkScreenRecordingPermission() + // Use async ScreenCaptureKit check for reliable detection + let granted = await checkScreenRecordingPermission() if granted { hasScreenRecordingPermission = true permissionCheckTask = nil diff --git a/ScreenTranslate/Features/Settings/SettingsViewModel.swift b/ScreenTranslate/Features/Settings/SettingsViewModel.swift index b29c2f7..42197f9 100644 --- a/ScreenTranslate/Features/Settings/SettingsViewModel.swift +++ b/ScreenTranslate/Features/Settings/SettingsViewModel.swift @@ -404,10 +404,8 @@ final class SettingsViewModel { // Check screen recording permission using ScreenCaptureKit Task { let granted = await checkScreenRecordingPermission() - await MainActor.run { - self.hasScreenRecordingPermission = granted - self.isCheckingPermissions = false - } + self.hasScreenRecordingPermission = granted + self.isCheckingPermissions = false } } diff --git a/ScreenTranslate/Models/AppSettings.swift b/ScreenTranslate/Models/AppSettings.swift index e39a1fa..918e382 100644 --- a/ScreenTranslate/Models/AppSettings.swift +++ b/ScreenTranslate/Models/AppSettings.swift @@ -4,8 +4,8 @@ import os /// PaddleOCR mode selection enum PaddleOCRMode: String, Codable, CaseIterable, Sendable { - case fast = "fast" - case precise = "precise" + case fast + case precise var localizedName: String { switch self { @@ -458,6 +458,11 @@ final class AppSettings { onboardingCompleted = false translateAndInsertSourceLanguage = .auto translateAndInsertTargetLanguage = nil + // Reset PaddleOCR settings + paddleOCRMode = .fast + paddleOCRUseCloud = false + paddleOCRCloudBaseURL = "" + paddleOCRCloudAPIKey = "" // Reset multi-engine configuration - directly create defaults, don't load from persistence engineSelectionMode = .primaryWithFallback var defaultConfigs: [TranslationEngineType: TranslationEngineConfig] = [:] diff --git a/ScreenTranslate/Services/PaddleOCRVLMProvider.swift b/ScreenTranslate/Services/PaddleOCRVLMProvider.swift index eedd809..0db5b4e 100644 --- a/ScreenTranslate/Services/PaddleOCRVLMProvider.swift +++ b/ScreenTranslate/Services/PaddleOCRVLMProvider.swift @@ -19,13 +19,16 @@ struct PaddleOCRVLMProvider: VLMProvider, Sendable { /// Empty configuration (PaddleOCR doesn't need API keys or URLs) let configuration: VLMProviderConfiguration + /// Default base URL for local PaddleOCR (not used, but required by protocol) + private static let defaultBaseURL = URL(string: "http://localhost")! + // MARK: - Initialization init() { // Create an empty configuration for PaddleOCR self.configuration = VLMProviderConfiguration( apiKey: "", - baseURL: URL(string: "http://localhost")!, + baseURL: Self.defaultBaseURL, modelName: "paddleocr" ) } @@ -167,23 +170,48 @@ private struct MergedLine { } func merged(with other: OCRText) -> MergedLine { - // Combine texts with space - let combinedText = text + " " + other.text - + // Combine texts with appropriate separator for CJK vs non-CJK + let separator = Self.separator(for: text, and: other.text) + let combinedText = text + separator + other.text + // Merge bounding boxes let mergedBox = boundingBox.union(other.boundingBox) - + // Average confidence weighted by text length let totalLength = text.count + other.text.count let weightedConfidence = ( Float(text.count) * confidence + Float(other.text.count) * other.confidence ) / Float(totalLength) - + return MergedLine( text: combinedText, boundingBox: mergedBox, confidence: weightedConfidence ) } + + /// Returns appropriate separator between two text segments based on CJK detection + private static func separator(for first: String, and second: String) -> String { + let firstIsCJK = isCJKText(first) + let secondIsCJK = isCJKText(second) + // No space between CJK characters, space otherwise + return (firstIsCJK && secondIsCJK) ? "" : " " + } + + /// Checks if text contains CJK (Chinese/Japanese/Korean) characters + private static func isCJKText(_ text: String) -> Bool { + guard let firstChar = text.first else { return false } + let scalar = firstChar.unicodeScalars.first?.value ?? 0 + // CJK Unified Ideographs: U+4E00-U+9FFF + // CJK Unified Ideographs Extension A: U+3400-U+4DBF + // Hiragana: U+3040-U+309F + // Katakana: U+30A0-U+30FF + // Hangul Syllables: U+AC00-U+D7AF + return (0x4E00...0x9FFF).contains(scalar) || + (0x3400...0x4DBF).contains(scalar) || + (0x3040...0x309F).contains(scalar) || + (0x30A0...0x30FF).contains(scalar) || + (0xAC00...0xD7AF).contains(scalar) + } } From aeb3182ad269caa4403085b8871938c7c0a0531e Mon Sep 17 00:00:00 2001 From: Hubert Date: Sat, 28 Feb 2026 14:44:08 +0800 Subject: [PATCH 03/11] fix: address second round code review feedback - testPaddleOCRConnection: check cloud mode first with URL/API validation - checkPermissions: use permissionCheckTask to avoid race conditions - paddleOCRCloudAPIKey: store in Keychain instead of UserDefaults - analyze(image:): check PaddleOCREngine availability only for local mode - isAvailable: mode-aware check (cloud checks URL, local checks installation) - weightedConfidence: handle divide-by-zero edge case Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- .../Features/Settings/SettingsViewModel.swift | 51 ++++++++- ScreenTranslate/Models/AppSettings.swift | 49 +++++++- .../Services/PaddleOCRVLMProvider.swift | 43 ++++--- .../Services/Security/KeychainService.swift | 107 ++++++++++++++++++ 4 files changed, 231 insertions(+), 19 deletions(-) diff --git a/ScreenTranslate/Features/Settings/SettingsViewModel.swift b/ScreenTranslate/Features/Settings/SettingsViewModel.swift index 42197f9..723b130 100644 --- a/ScreenTranslate/Features/Settings/SettingsViewModel.swift +++ b/ScreenTranslate/Features/Settings/SettingsViewModel.swift @@ -402,10 +402,14 @@ final class SettingsViewModel { hasFolderAccessPermission = checkFolderAccess(to: saveLocation) // Check screen recording permission using ScreenCaptureKit - Task { + // Cancel any existing task to avoid race conditions + permissionCheckTask?.cancel() + + permissionCheckTask = Task { let granted = await checkScreenRecordingPermission() self.hasScreenRecordingPermission = granted self.isCheckingPermissions = false + permissionCheckTask = nil } } @@ -897,13 +901,54 @@ final class SettingsViewModel { } } - /// Tests PaddleOCR availability + /// Tests PaddleOCR availability - checks cloud mode first, then local private func testPaddleOCRConnection() async throws -> (success: Bool, message: String) { + let settings = AppSettings.shared + + // If cloud mode is enabled, test cloud connectivity first + if settings.paddleOCRUseCloud { + let cloudBaseURL = settings.paddleOCRCloudBaseURL.trimmingCharacters(in: .whitespaces) + guard !cloudBaseURL.isEmpty, + let url = URL(string: cloudBaseURL) else { + throw VLMProviderError.invalidConfiguration("PaddleOCR cloud base URL is not configured") + } + + // Test cloud API connectivity with a simple request + var request = URLRequest(url: url) + request.timeoutInterval = 10 + + // Add API key if configured + let apiKey = settings.paddleOCRCloudAPIKey.trimmingCharacters(in: .whitespaces) + if !apiKey.isEmpty { + request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization") + } + + do { + let (_, response) = try await URLSession.shared.data(for: request) + guard let httpResponse = response as? HTTPURLResponse else { + throw VLMProviderError.invalidResponse("Invalid HTTP response from PaddleOCR cloud") + } + switch httpResponse.statusCode { + case 200, 404: // 404 is acceptable - means server is reachable + return (true, "PaddleOCR cloud is reachable") + case 401, 403: + throw VLMProviderError.authenticationFailed + default: + throw VLMProviderError.invalidResponse("HTTP \(httpResponse.statusCode)") + } + } catch let error as VLMProviderError { + throw error + } catch { + throw VLMProviderError.invalidConfiguration("PaddleOCR cloud is not reachable: \(error.localizedDescription)") + } + } + + // Local mode - check if PaddleOCR is installed let isAvailable = await PaddleOCREngine.shared.isAvailable if isAvailable { return (true, "PaddleOCR is ready") } else { - throw VLMProviderError.invalidConfiguration("PaddleOCR is not installed") + throw VLMProviderError.invalidConfiguration("PaddleOCR is not installed. Install it using: pip3 install paddleocr paddlepaddle") } } diff --git a/ScreenTranslate/Models/AppSettings.swift b/ScreenTranslate/Models/AppSettings.swift index 918e382..5fd3a6f 100644 --- a/ScreenTranslate/Models/AppSettings.swift +++ b/ScreenTranslate/Models/AppSettings.swift @@ -1,6 +1,7 @@ import Foundation import SwiftUI import os +import Security /// PaddleOCR mode selection enum PaddleOCRMode: String, Codable, CaseIterable, Sendable { @@ -308,9 +309,18 @@ final class AppSettings { didSet { save(paddleOCRCloudBaseURL, forKey: Keys.paddleOCRCloudBaseURL) } } - /// Cloud API key + /// Cloud API key (stored securely in Keychain, not UserDefaults) var paddleOCRCloudAPIKey: String { - didSet { save(paddleOCRCloudAPIKey, forKey: Keys.paddleOCRCloudAPIKey) } + didSet { + // Save to Keychain asynchronously + Task.detached { + do { + try await KeychainService.shared.savePaddleOCRCredentials(apiKey: self.paddleOCRCloudAPIKey) + } catch { + Logger.settings.error("Failed to save PaddleOCR cloud API key to Keychain: \(error)") + } + } + } } // MARK: - Initialization @@ -414,7 +424,9 @@ final class AppSettings { .flatMap { PaddleOCRMode(rawValue: $0) } ?? .fast paddleOCRUseCloud = defaults.object(forKey: Keys.paddleOCRUseCloud) as? Bool ?? false paddleOCRCloudBaseURL = defaults.string(forKey: Keys.paddleOCRCloudBaseURL) ?? "" - paddleOCRCloudAPIKey = defaults.string(forKey: Keys.paddleOCRCloudAPIKey) ?? "" + + // Load PaddleOCR cloud API key from Keychain (secure storage) + paddleOCRCloudAPIKey = Self.loadPaddleOCRAPIKeyFromKeychain() Logger.settings.info("ScreenCapture launched - settings loaded from: \(loadedLocation.path)") } @@ -463,6 +475,10 @@ final class AppSettings { paddleOCRUseCloud = false paddleOCRCloudBaseURL = "" paddleOCRCloudAPIKey = "" + // Delete PaddleOCR cloud API key from Keychain + Task.detached { + try? await KeychainService.shared.deletePaddleOCRCredentials() + } // Reset multi-engine configuration - directly create defaults, don't load from persistence engineSelectionMode = .primaryWithFallback var defaultConfigs: [TranslationEngineType: TranslationEngineConfig] = [:] @@ -516,6 +532,33 @@ final class AppSettings { return try? JSONDecoder().decode(CodableColor.self, from: data) } + // MARK: - Keychain Helpers + + /// Load PaddleOCR cloud API key from Keychain synchronously + private static func loadPaddleOCRAPIKeyFromKeychain() -> String { + let service = "com.screentranslate.credentials" + let account = "paddleocr_cloud" + + let query: [String: Any] = [ + kSecClass as String: kSecClassGenericPassword, + kSecAttrService as String: service, + kSecAttrAccount as String: account, + kSecReturnData as String: true, + kSecMatchLimit as String: kSecMatchLimitOne + ] + + var result: CFTypeRef? + let status = SecItemCopyMatching(query as CFDictionary, &result) + + guard status == errSecSuccess, + let data = result as? Data, + let credentials = try? JSONDecoder().decode(StoredCredentials.self, from: data) else { + return "" + } + + return credentials.apiKey + } + // MARK: - Multi-Engine Persistence Helpers private func saveEngineConfigs() { diff --git a/ScreenTranslate/Services/PaddleOCRVLMProvider.swift b/ScreenTranslate/Services/PaddleOCRVLMProvider.swift index 0db5b4e..3862de3 100644 --- a/ScreenTranslate/Services/PaddleOCRVLMProvider.swift +++ b/ScreenTranslate/Services/PaddleOCRVLMProvider.swift @@ -37,21 +37,32 @@ struct PaddleOCRVLMProvider: VLMProvider, Sendable { var isAvailable: Bool { get async { - await PaddleOCREngine.shared.isAvailable + // Check settings to determine mode + let useCloud = await MainActor.run { AppSettings.shared.paddleOCRUseCloud } + if useCloud { + // Cloud mode is available if base URL is configured + let baseURL = await MainActor.run { AppSettings.shared.paddleOCRCloudBaseURL } + return !baseURL.trimmingCharacters(in: .whitespaces).isEmpty + } else { + // Local mode requires PaddleOCR to be installed + return await PaddleOCREngine.shared.isAvailable + } } } func analyze(image: CGImage) async throws -> ScreenAnalysisResult { - // Check availability - guard await PaddleOCREngine.shared.isAvailable else { - throw VLMProviderError.invalidConfiguration( - "PaddleOCR is not installed. Install it using: pip3 install paddleocr paddlepaddle" - ) - } - - // Build configuration from AppSettings + // Build configuration from AppSettings first let config = await buildConfiguration() + // Check local availability only for local mode + if !config.useCloud { + guard await PaddleOCREngine.shared.isAvailable else { + throw VLMProviderError.invalidConfiguration( + "PaddleOCR is not installed. Install it using: pip3 install paddleocr paddlepaddle" + ) + } + } + // Perform OCR using PaddleOCREngine with settings let ocrResult = try await PaddleOCREngine.shared.recognize(image, config: config) @@ -179,10 +190,16 @@ private struct MergedLine { // Average confidence weighted by text length let totalLength = text.count + other.text.count - let weightedConfidence = ( - Float(text.count) * confidence + - Float(other.text.count) * other.confidence - ) / Float(totalLength) + let weightedConfidence: Float + if totalLength == 0 { + // Edge case: both texts are empty, use average of confidences + weightedConfidence = (confidence + other.confidence) / 2.0 + } else { + weightedConfidence = ( + Float(text.count) * confidence + + Float(other.text.count) * other.confidence + ) / Float(totalLength) + } return MergedLine( text: combinedText, diff --git a/ScreenTranslate/Services/Security/KeychainService.swift b/ScreenTranslate/Services/Security/KeychainService.swift index 937ea42..4215af6 100644 --- a/ScreenTranslate/Services/Security/KeychainService.swift +++ b/ScreenTranslate/Services/Security/KeychainService.swift @@ -308,6 +308,113 @@ actor KeychainService { logger.info("Deleted all credentials") } + + // MARK: - PaddleOCR Cloud Methods + + /// Save PaddleOCR cloud API key + /// - Parameter apiKey: The API key to store + func savePaddleOCRCredentials(apiKey: String) throws { + let account = "paddleocr_cloud" + + let credentials = StoredCredentials(apiKey: apiKey) + + guard let encodedData = try? JSONEncoder().encode(credentials) else { + throw KeychainError.invalidData + } + + let query: [String: Any] = [ + kSecClass as String: kSecClassGenericPassword, + kSecAttrService as String: service, + kSecAttrAccount as String: account + ] + + // Check if item exists and update it, or add new if not found + let status = SecItemCopyMatching(query as CFDictionary, nil) + if status == errSecSuccess { + // Item exists - update it + let updateQuery: [String: Any] = [ + kSecValueData as String: encodedData, + kSecAttrAccessible as String: kSecAttrAccessibleWhenUnlocked + ] + let updateStatus = SecItemUpdate(query as CFDictionary, updateQuery as CFDictionary) + guard updateStatus == errSecSuccess else { + logger.error("Failed to update PaddleOCR cloud credentials: \(updateStatus)") + throw KeychainError.unexpectedStatus(updateStatus) + } + logger.info("Updated PaddleOCR cloud credentials") + } else if status == errSecItemNotFound { + // Item doesn't exist - add new + let addQuery: [String: Any] = [ + kSecClass as String: kSecClassGenericPassword, + kSecAttrService as String: service, + kSecAttrAccount as String: account, + kSecValueData as String: encodedData, + kSecAttrAccessible as String: kSecAttrAccessibleWhenUnlocked + ] + let addStatus = SecItemAdd(addQuery as CFDictionary, nil) + guard addStatus == errSecSuccess else { + logger.error("Failed to save PaddleOCR cloud credentials: \(addStatus)") + throw KeychainError.unexpectedStatus(addStatus) + } + logger.info("Saved PaddleOCR cloud credentials") + } else { + logger.error("Failed to check PaddleOCR cloud credentials: \(status)") + throw KeychainError.unexpectedStatus(status) + } + } + + /// Retrieve stored PaddleOCR cloud API key + /// - Returns: The stored API key, or nil if not found + func getPaddleOCRCredentials() -> String? { + let account = "paddleocr_cloud" + + let query: [String: Any] = [ + kSecClass as String: kSecClassGenericPassword, + kSecAttrService as String: service, + kSecAttrAccount as String: account, + kSecReturnData as String: true, + kSecMatchLimit as String: kSecMatchLimitOne + ] + + var result: CFTypeRef? + let status = SecItemCopyMatching(query as CFDictionary, &result) + + guard status == errSecSuccess else { + if status == errSecItemNotFound { + logger.debug("No PaddleOCR cloud credentials found") + return nil + } + logger.error("Failed to retrieve PaddleOCR cloud credentials: \(status)") + return nil + } + + guard let data = result as? Data else { + return nil + } + + let credentials = try? JSONDecoder().decode(StoredCredentials.self, from: data) + return credentials?.apiKey + } + + /// Delete stored PaddleOCR cloud credentials + func deletePaddleOCRCredentials() throws { + let account = "paddleocr_cloud" + + let query: [String: Any] = [ + kSecClass as String: kSecClassGenericPassword, + kSecAttrService as String: service, + kSecAttrAccount as String: account + ] + + let status = SecItemDelete(query as CFDictionary) + + guard status == errSecSuccess || status == errSecItemNotFound else { + logger.error("Failed to delete PaddleOCR cloud credentials: \(status)") + throw KeychainError.unexpectedStatus(status) + } + + logger.info("Deleted PaddleOCR cloud credentials") + } } // MARK: - Stored Credentials From ee868a53920ff83b991c9add0fb8aea453939092 Mon Sep 17 00:00:00 2001 From: Hubert Date: Sat, 28 Feb 2026 15:04:42 +0800 Subject: [PATCH 04/11] fix: address third round code review feedback - Fix concurrency violation: capture paddleOCRCloudAPIKey before Task.detached - Extract Keychain constants: add serviceIdentifier and paddleOCRAccount to KeychainService - Fix CJK separator: check last char of first string and first char of second string - Update AppSettings to use shared KeychainService constants Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- ScreenTranslate/Models/AppSettings.swift | 12 +++++------ .../Services/PaddleOCRVLMProvider.swift | 21 ++++++++++++------- .../Services/Security/KeychainService.swift | 14 +++++++++---- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/ScreenTranslate/Models/AppSettings.swift b/ScreenTranslate/Models/AppSettings.swift index 5fd3a6f..9c020d7 100644 --- a/ScreenTranslate/Models/AppSettings.swift +++ b/ScreenTranslate/Models/AppSettings.swift @@ -312,10 +312,12 @@ final class AppSettings { /// Cloud API key (stored securely in Keychain, not UserDefaults) var paddleOCRCloudAPIKey: String { didSet { + // Capture the value on the actor before spawning detached task + let capturedKey = paddleOCRCloudAPIKey // Save to Keychain asynchronously Task.detached { do { - try await KeychainService.shared.savePaddleOCRCredentials(apiKey: self.paddleOCRCloudAPIKey) + try await KeychainService.shared.savePaddleOCRCredentials(apiKey: capturedKey) } catch { Logger.settings.error("Failed to save PaddleOCR cloud API key to Keychain: \(error)") } @@ -536,13 +538,11 @@ final class AppSettings { /// Load PaddleOCR cloud API key from Keychain synchronously private static func loadPaddleOCRAPIKeyFromKeychain() -> String { - let service = "com.screentranslate.credentials" - let account = "paddleocr_cloud" - + // Use shared constants from KeychainService let query: [String: Any] = [ kSecClass as String: kSecClassGenericPassword, - kSecAttrService as String: service, - kSecAttrAccount as String: account, + kSecAttrService as String: KeychainService.serviceIdentifier, + kSecAttrAccount as String: KeychainService.paddleOCRAccount, kSecReturnData as String: true, kSecMatchLimit as String: kSecMatchLimitOne ] diff --git a/ScreenTranslate/Services/PaddleOCRVLMProvider.swift b/ScreenTranslate/Services/PaddleOCRVLMProvider.swift index 3862de3..5f08bb6 100644 --- a/ScreenTranslate/Services/PaddleOCRVLMProvider.swift +++ b/ScreenTranslate/Services/PaddleOCRVLMProvider.swift @@ -209,17 +209,24 @@ private struct MergedLine { } /// Returns appropriate separator between two text segments based on CJK detection + /// Checks the last character of the first string and the first character of the second string private static func separator(for first: String, and second: String) -> String { - let firstIsCJK = isCJKText(first) - let secondIsCJK = isCJKText(second) + // Check last character of first string and first character of second string + // This handles mixed-content cases like "Hello世界" correctly + guard let firstLast = first.last, + let secondFirst = second.first else { + return " " // Default to space if either string is empty + } + + let firstLastIsCJK = isCJKChar(firstLast) + let secondFirstIsCJK = isCJKChar(secondFirst) // No space between CJK characters, space otherwise - return (firstIsCJK && secondIsCJK) ? "" : " " + return (firstLastIsCJK && secondFirstIsCJK) ? "" : " " } - /// Checks if text contains CJK (Chinese/Japanese/Korean) characters - private static func isCJKText(_ text: String) -> Bool { - guard let firstChar = text.first else { return false } - let scalar = firstChar.unicodeScalars.first?.value ?? 0 + /// Checks if a character is CJK (Chinese/Japanese/Korean) + private static func isCJKChar(_ char: Character) -> Bool { + let scalar = char.unicodeScalars.first?.value ?? 0 // CJK Unified Ideographs: U+4E00-U+9FFF // CJK Unified Ideographs Extension A: U+3400-U+4DBF // Hiragana: U+3040-U+309F diff --git a/ScreenTranslate/Services/Security/KeychainService.swift b/ScreenTranslate/Services/Security/KeychainService.swift index 4215af6..0ae63b7 100644 --- a/ScreenTranslate/Services/Security/KeychainService.swift +++ b/ScreenTranslate/Services/Security/KeychainService.swift @@ -17,11 +17,17 @@ actor KeychainService { static let shared = KeychainService() /// Service identifier for Keychain items - private let service = "com.screentranslate.credentials" + static let serviceIdentifier = "com.screentranslate.credentials" + + /// PaddleOCR cloud account identifier + static let paddleOCRAccount = "paddleocr_cloud" /// Logger instance private let logger = Logger(subsystem: Bundle.main.bundleIdentifier ?? "ScreenTranslate", category: "KeychainService") + /// Internal service property for instance methods + private var service: String { Self.serviceIdentifier } + private init() {} // MARK: - Public API @@ -314,7 +320,7 @@ actor KeychainService { /// Save PaddleOCR cloud API key /// - Parameter apiKey: The API key to store func savePaddleOCRCredentials(apiKey: String) throws { - let account = "paddleocr_cloud" + let account = Self.paddleOCRAccount let credentials = StoredCredentials(apiKey: apiKey) @@ -366,7 +372,7 @@ actor KeychainService { /// Retrieve stored PaddleOCR cloud API key /// - Returns: The stored API key, or nil if not found func getPaddleOCRCredentials() -> String? { - let account = "paddleocr_cloud" + let account = Self.paddleOCRAccount let query: [String: Any] = [ kSecClass as String: kSecClassGenericPassword, @@ -398,7 +404,7 @@ actor KeychainService { /// Delete stored PaddleOCR cloud credentials func deletePaddleOCRCredentials() throws { - let account = "paddleocr_cloud" + let account = Self.paddleOCRAccount let query: [String: Any] = [ kSecClass as String: kSecClassGenericPassword, From ef4545e5c7d25a4915a02e161c084139f6aba3db Mon Sep 17 00:00:00 2001 From: Hubert Date: Sat, 28 Feb 2026 15:31:24 +0800 Subject: [PATCH 05/11] feat: add MLX-VLM inference framework support for Apple Silicon - Add paddleOCRUseMLXVLM, paddleOCRMLXVLMServerURL, paddleOCRMLXVLMModelName settings - Update PaddleOCREngine.Configuration with MLX-VLM fields - Add --vl_rec_backend, --vl_rec_server_url, --vl_rec_api_model_name args - Add UI settings (visible when precise mode and not using cloud) - Add English and Chinese localizations Usage: 1. Install: pip install "mlx-vlm>=0.3.11" 2. Start server: mlx_vlm.server --port 8111 3. Enable in Settings > PaddleOCR > Use MLX-VLM Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- .../Features/Settings/EngineSettingsTab.swift | 34 +++++++++++++++++++ .../Features/Settings/SettingsViewModel.swift | 18 ++++++++++ ScreenTranslate/Models/AppSettings.swift | 28 +++++++++++++++ .../Resources/en.lproj/Localizable.strings | 3 ++ .../zh-Hans.lproj/Localizable.strings | 3 ++ .../Services/PaddleOCREngine.swift | 29 ++++++++++++++-- .../Services/PaddleOCRVLMProvider.swift | 3 ++ 7 files changed, 115 insertions(+), 3 deletions(-) diff --git a/ScreenTranslate/Features/Settings/EngineSettingsTab.swift b/ScreenTranslate/Features/Settings/EngineSettingsTab.swift index fee7765..660e10b 100644 --- a/ScreenTranslate/Features/Settings/EngineSettingsTab.swift +++ b/ScreenTranslate/Features/Settings/EngineSettingsTab.swift @@ -221,6 +221,40 @@ struct PaddleOCRStatusSection: View { .frame(maxWidth: 300) } } + + // MLX-VLM settings (only show when mode is precise and not using cloud) + if viewModel.paddleOCRMode == .precise && !viewModel.paddleOCRUseCloud { + Divider() + .gridCellUnsizedAxes(.horizontal) + + GridRow { + Text(localized("settings.paddleocr.useMLXVLM")) + .foregroundStyle(.secondary) + .gridColumnAlignment(.trailing) + Toggle("", isOn: $viewModel.paddleOCRUseMLXVLM) + .toggleStyle(.checkbox) + } + + if viewModel.paddleOCRUseMLXVLM { + GridRow { + Text(localized("settings.paddleocr.mlxVLMServerURL")) + .foregroundStyle(.secondary) + .gridColumnAlignment(.trailing) + TextField("", text: $viewModel.paddleOCRMLXVLMServerURL) + .textFieldStyle(.roundedBorder) + .frame(maxWidth: 300) + } + + GridRow { + Text(localized("settings.paddleocr.mlxVLMModelName")) + .foregroundStyle(.secondary) + .gridColumnAlignment(.trailing) + TextField("", text: $viewModel.paddleOCRMLXVLMModelName) + .textFieldStyle(.roundedBorder) + .frame(maxWidth: 300) + } + } + } } // Description diff --git a/ScreenTranslate/Features/Settings/SettingsViewModel.swift b/ScreenTranslate/Features/Settings/SettingsViewModel.swift index 723b130..dba7269 100644 --- a/ScreenTranslate/Features/Settings/SettingsViewModel.swift +++ b/ScreenTranslate/Features/Settings/SettingsViewModel.swift @@ -126,6 +126,24 @@ final class SettingsViewModel { set { settings.paddleOCRCloudAPIKey = newValue } } + /// Whether to use MLX-VLM inference framework + var paddleOCRUseMLXVLM: Bool { + get { settings.paddleOCRUseMLXVLM } + set { settings.paddleOCRUseMLXVLM = newValue } + } + + /// MLX-VLM server URL + var paddleOCRMLXVLMServerURL: String { + get { settings.paddleOCRMLXVLMServerURL } + set { settings.paddleOCRMLXVLMServerURL = newValue } + } + + /// MLX-VLM model name + var paddleOCRMLXVLMModelName: String { + get { settings.paddleOCRMLXVLMModelName } + set { settings.paddleOCRMLXVLMModelName = newValue } + } + // MARK: - VLM Test State /// Whether VLM API test is in progress diff --git a/ScreenTranslate/Models/AppSettings.swift b/ScreenTranslate/Models/AppSettings.swift index 9c020d7..1833182 100644 --- a/ScreenTranslate/Models/AppSettings.swift +++ b/ScreenTranslate/Models/AppSettings.swift @@ -88,6 +88,10 @@ final class AppSettings { static let paddleOCRUseCloud = prefix + "paddleOCRUseCloud" static let paddleOCRCloudBaseURL = prefix + "paddleOCRCloudBaseURL" static let paddleOCRCloudAPIKey = prefix + "paddleOCRCloudAPIKey" + // MLX-VLM Configuration (for Apple Silicon optimization) + static let paddleOCRUseMLXVLM = prefix + "paddleOCRUseMLXVLM" + static let paddleOCRMLXVLMServerURL = prefix + "paddleOCRMLXVLMServerURL" + static let paddleOCRMLXVLMModelName = prefix + "paddleOCRMLXVLMModelName" } // MARK: - Properties @@ -325,6 +329,21 @@ final class AppSettings { } } + /// Whether to use MLX-VLM inference framework (Apple Silicon optimization) + var paddleOCRUseMLXVLM: Bool { + didSet { save(paddleOCRUseMLXVLM, forKey: Keys.paddleOCRUseMLXVLM) } + } + + /// MLX-VLM server URL (default: http://localhost:8111) + var paddleOCRMLXVLMServerURL: String { + didSet { save(paddleOCRMLXVLMServerURL, forKey: Keys.paddleOCRMLXVLMServerURL) } + } + + /// MLX-VLM model name (default: PaddlePaddle/PaddleOCR-VL-1.5) + var paddleOCRMLXVLMModelName: String { + didSet { save(paddleOCRMLXVLMModelName, forKey: Keys.paddleOCRMLXVLMModelName) } + } + // MARK: - Initialization private init() { @@ -430,6 +449,11 @@ final class AppSettings { // Load PaddleOCR cloud API key from Keychain (secure storage) paddleOCRCloudAPIKey = Self.loadPaddleOCRAPIKeyFromKeychain() + // Load MLX-VLM configuration + paddleOCRUseMLXVLM = defaults.object(forKey: Keys.paddleOCRUseMLXVLM) as? Bool ?? false + paddleOCRMLXVLMServerURL = defaults.string(forKey: Keys.paddleOCRMLXVLMServerURL) ?? "http://localhost:8111" + paddleOCRMLXVLMModelName = defaults.string(forKey: Keys.paddleOCRMLXVLMModelName) ?? "PaddlePaddle/PaddleOCR-VL-1.5" + Logger.settings.info("ScreenCapture launched - settings loaded from: \(loadedLocation.path)") } @@ -481,6 +505,10 @@ final class AppSettings { Task.detached { try? await KeychainService.shared.deletePaddleOCRCredentials() } + // Reset MLX-VLM settings + paddleOCRUseMLXVLM = false + paddleOCRMLXVLMServerURL = "http://localhost:8111" + paddleOCRMLXVLMModelName = "PaddlePaddle/PaddleOCR-VL-1.5" // Reset multi-engine configuration - directly create defaults, don't load from persistence engineSelectionMode = .primaryWithFallback var defaultConfigs: [TranslationEngineType: TranslationEngineConfig] = [:] diff --git a/ScreenTranslate/Resources/en.lproj/Localizable.strings b/ScreenTranslate/Resources/en.lproj/Localizable.strings index dd7c874..0f2fe2f 100644 --- a/ScreenTranslate/Resources/en.lproj/Localizable.strings +++ b/ScreenTranslate/Resources/en.lproj/Localizable.strings @@ -594,6 +594,9 @@ "settings.paddleocr.useCloud" = "Use Cloud API"; "settings.paddleocr.cloudBaseURL" = "Cloud API URL"; "settings.paddleocr.cloudAPIKey" = "API Key"; +"settings.paddleocr.useMLXVLM" = "Use MLX-VLM (Apple Silicon)"; +"settings.paddleocr.mlxVLMServerURL" = "MLX-VLM Server URL"; +"settings.paddleocr.mlxVLMModelName" = "Model Name"; /* ======================================== diff --git a/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings b/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings index f573774..715e2ef 100644 --- a/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings +++ b/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings @@ -594,6 +594,9 @@ "settings.paddleocr.useCloud" = "使用云端 API"; "settings.paddleocr.cloudBaseURL" = "云端 API 地址"; "settings.paddleocr.cloudAPIKey" = "API 密钥"; +"settings.paddleocr.useMLXVLM" = "使用 MLX-VLM (Apple Silicon)"; +"settings.paddleocr.mlxVLMServerURL" = "MLX-VLM 服务地址"; +"settings.paddleocr.mlxVLMModelName" = "模型名称"; /* ======================================== diff --git a/ScreenTranslate/Services/PaddleOCREngine.swift b/ScreenTranslate/Services/PaddleOCREngine.swift index a4e6569..2b3df1a 100644 --- a/ScreenTranslate/Services/PaddleOCREngine.swift +++ b/ScreenTranslate/Services/PaddleOCREngine.swift @@ -54,6 +54,15 @@ actor PaddleOCREngine { /// Cloud API key var cloudAPIKey: String + /// Whether to use MLX-VLM inference framework (Apple Silicon optimization) + var useMLXVLM: Bool + + /// MLX-VLM server URL + var mlxVLMServerURL: String + + /// MLX-VLM model name + var mlxVLMModelName: String + static let `default` = Configuration( languages: [.chinese, .english], minimumConfidence: 0.0, @@ -63,7 +72,10 @@ actor PaddleOCREngine { mode: .fast, useCloud: false, cloudBaseURL: "", - cloudAPIKey: "" + cloudAPIKey: "", + useMLXVLM: false, + mlxVLMServerURL: "http://localhost:8111", + mlxVLMModelName: "PaddlePaddle/PaddleOCR-VL-1.5" ) } @@ -230,13 +242,24 @@ actor PaddleOCREngine { "--use_angle_cls", config.useDirectionClassify ? "true" : "false" ] case .precise: - // Precise mode: use doc_parser with VL-1.5 (~12s) - return [ + // Precise mode: use doc_parser with VL-1.5 + var args = [ "doc_parser", "-i", imagePath, "--pipeline_version", "v1.5", "--device", config.useGPU ? "gpu" : "cpu" ] + + // Add MLX-VLM backend arguments if enabled + if config.useMLXVLM { + args += [ + "--vl_rec_backend", "mlx-vlm-server", + "--vl_rec_server_url", config.mlxVLMServerURL, + "--vl_rec_api_model_name", config.mlxVLMModelName + ] + } + + return args } } diff --git a/ScreenTranslate/Services/PaddleOCRVLMProvider.swift b/ScreenTranslate/Services/PaddleOCRVLMProvider.swift index 5f08bb6..ff8af1f 100644 --- a/ScreenTranslate/Services/PaddleOCRVLMProvider.swift +++ b/ScreenTranslate/Services/PaddleOCRVLMProvider.swift @@ -80,6 +80,9 @@ struct PaddleOCRVLMProvider: VLMProvider, Sendable { config.useCloud = settings.paddleOCRUseCloud config.cloudBaseURL = settings.paddleOCRCloudBaseURL config.cloudAPIKey = settings.paddleOCRCloudAPIKey + config.useMLXVLM = settings.paddleOCRUseMLXVLM + config.mlxVLMServerURL = settings.paddleOCRMLXVLMServerURL + config.mlxVLMModelName = settings.paddleOCRMLXVLMModelName return config } From dff6c28625749fa3ac9e7377fd2f9b44c7491948 Mon Sep 17 00:00:00 2001 From: Hubert Date: Sat, 28 Feb 2026 15:45:49 +0800 Subject: [PATCH 06/11] feat: add MLX-VLM server status detection - Add isMLXVLMServerRunning and isCheckingMLXVLMServer state - Add checkMLXVLMServerStatus() method to verify server connectivity - Add UI with status indicator and refresh button - Add English and Chinese localizations for status messages Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- .../Features/Settings/EngineSettingsTab.swift | 35 +++++++++++++++ .../Features/Settings/SettingsViewModel.swift | 43 +++++++++++++++++++ .../Resources/en.lproj/Localizable.strings | 4 ++ .../zh-Hans.lproj/Localizable.strings | 4 ++ 4 files changed, 86 insertions(+) diff --git a/ScreenTranslate/Features/Settings/EngineSettingsTab.swift b/ScreenTranslate/Features/Settings/EngineSettingsTab.swift index 660e10b..b70ff2b 100644 --- a/ScreenTranslate/Features/Settings/EngineSettingsTab.swift +++ b/ScreenTranslate/Features/Settings/EngineSettingsTab.swift @@ -233,9 +233,44 @@ struct PaddleOCRStatusSection: View { .gridColumnAlignment(.trailing) Toggle("", isOn: $viewModel.paddleOCRUseMLXVLM) .toggleStyle(.checkbox) + .onChange(of: viewModel.paddleOCRUseMLXVLM) { _, newValue in + if newValue { + viewModel.checkMLXVLMServerStatus() + } + } } if viewModel.paddleOCRUseMLXVLM { + // MLX-VLM server status + GridRow { + Text(localized("settings.paddleocr.mlxVLMStatus")) + .foregroundStyle(.secondary) + .gridColumnAlignment(.trailing) + HStack { + if viewModel.isCheckingMLXVLMServer { + ProgressView() + .controlSize(.small) + Text(localized("settings.paddleocr.mlxVLMChecking")) + .foregroundStyle(.secondary) + } else { + Image(systemName: viewModel.isMLXVLMServerRunning ? "checkmark.circle.fill" : "xmark.circle.fill") + .foregroundStyle(viewModel.isMLXVLMServerRunning ? .green : .red) + Text(viewModel.isMLXVLMServerRunning + ? localized("settings.paddleocr.mlxVLMRunning") + : localized("settings.paddleocr.mlxVLMNotRunning")) + .foregroundStyle(.secondary) + } + + Button { + viewModel.checkMLXVLMServerStatus() + } label: { + Image(systemName: "arrow.clockwise") + } + .buttonStyle(.borderless) + .controlSize(.small) + } + } + GridRow { Text(localized("settings.paddleocr.mlxVLMServerURL")) .foregroundStyle(.secondary) diff --git a/ScreenTranslate/Features/Settings/SettingsViewModel.swift b/ScreenTranslate/Features/Settings/SettingsViewModel.swift index dba7269..be7fa01 100644 --- a/ScreenTranslate/Features/Settings/SettingsViewModel.swift +++ b/ScreenTranslate/Features/Settings/SettingsViewModel.swift @@ -144,6 +144,12 @@ final class SettingsViewModel { set { settings.paddleOCRMLXVLMModelName = newValue } } + /// Whether MLX-VLM server is running + var isMLXVLMServerRunning: Bool = false + + /// Whether MLX-VLM server check is in progress + var isCheckingMLXVLMServer: Bool = false + // MARK: - VLM Test State /// Whether VLM API test is in progress @@ -842,6 +848,43 @@ final class SettingsViewModel { NSPasteboard.general.setString(command, forType: .string) } + // MARK: - MLX-VLM Server Management + + func checkMLXVLMServerStatus() { + guard paddleOCRUseMLXVLM else { return } + + isCheckingMLXVLMServer = true + + Task.detached { [serverURL = paddleOCRMLXVLMServerURL] in + var isRunning = false + + do { + guard let url = URL(string: serverURL) else { + await MainActor.run { + self.isMLXVLMServerRunning = false + self.isCheckingMLXVLMServer = false + } + return + } + + // Try to connect to the server with a short timeout + let request = URLRequest(url: url, timeoutInterval: 3.0) + let (_, response) = try await URLSession.shared.data(for: request) + + if let httpResponse = response as? HTTPURLResponse { + isRunning = httpResponse.statusCode < 500 + } + } catch { + isRunning = false + } + + await MainActor.run { + self.isMLXVLMServerRunning = isRunning + self.isCheckingMLXVLMServer = false + } + } + } + // MARK: - VLM API Test /// Tests the VLM API connectivity with current configuration diff --git a/ScreenTranslate/Resources/en.lproj/Localizable.strings b/ScreenTranslate/Resources/en.lproj/Localizable.strings index 0f2fe2f..165c4e8 100644 --- a/ScreenTranslate/Resources/en.lproj/Localizable.strings +++ b/ScreenTranslate/Resources/en.lproj/Localizable.strings @@ -597,6 +597,10 @@ "settings.paddleocr.useMLXVLM" = "Use MLX-VLM (Apple Silicon)"; "settings.paddleocr.mlxVLMServerURL" = "MLX-VLM Server URL"; "settings.paddleocr.mlxVLMModelName" = "Model Name"; +"settings.paddleocr.mlxVLMStatus" = "Server Status"; +"settings.paddleocr.mlxVLMChecking" = "Checking..."; +"settings.paddleocr.mlxVLMRunning" = "Running"; +"settings.paddleocr.mlxVLMNotRunning" = "Not Running"; /* ======================================== diff --git a/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings b/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings index 715e2ef..9928f73 100644 --- a/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings +++ b/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings @@ -597,6 +597,10 @@ "settings.paddleocr.useMLXVLM" = "使用 MLX-VLM (Apple Silicon)"; "settings.paddleocr.mlxVLMServerURL" = "MLX-VLM 服务地址"; "settings.paddleocr.mlxVLMModelName" = "模型名称"; +"settings.paddleocr.mlxVLMStatus" = "服务状态"; +"settings.paddleocr.mlxVLMChecking" = "检测中..."; +"settings.paddleocr.mlxVLMRunning" = "运行中"; +"settings.paddleocr.mlxVLMNotRunning" = "未运行"; /* ======================================== From 5d6d5d09e1830eb7114fccf36826835cdaf97721 Mon Sep 17 00:00:00 2001 From: Hubert Date: Sat, 28 Feb 2026 15:58:10 +0800 Subject: [PATCH 07/11] fix: auto-check MLX-VLM server status on settings load - Add MLX-VLM check in refreshPaddleOCRStatus() - Add onAppear handler in PaddleOCRStatusSection Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- ScreenTranslate/Features/Settings/EngineSettingsTab.swift | 6 ++++++ ScreenTranslate/Features/Settings/SettingsViewModel.swift | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ScreenTranslate/Features/Settings/EngineSettingsTab.swift b/ScreenTranslate/Features/Settings/EngineSettingsTab.swift index b70ff2b..1636104 100644 --- a/ScreenTranslate/Features/Settings/EngineSettingsTab.swift +++ b/ScreenTranslate/Features/Settings/EngineSettingsTab.swift @@ -332,5 +332,11 @@ struct PaddleOCRStatusSection: View { } } .padding(.top, 8) + .onAppear { + // Auto-check MLX-VLM server status when section appears + if viewModel.paddleOCRUseMLXVLM { + viewModel.checkMLXVLMServerStatus() + } + } } } diff --git a/ScreenTranslate/Features/Settings/SettingsViewModel.swift b/ScreenTranslate/Features/Settings/SettingsViewModel.swift index be7fa01..8bab126 100644 --- a/ScreenTranslate/Features/Settings/SettingsViewModel.swift +++ b/ScreenTranslate/Features/Settings/SettingsViewModel.swift @@ -785,7 +785,7 @@ final class SettingsViewModel { func refreshPaddleOCRStatus() { PaddleOCRChecker.resetCache() PaddleOCRChecker.checkAvailabilityAsync() - + Task { for _ in 0..<20 { try? await Task.sleep(for: .milliseconds(250)) @@ -797,6 +797,11 @@ final class SettingsViewModel { isPaddleOCRInstalled = PaddleOCRChecker.isAvailable paddleOCRVersion = PaddleOCRChecker.version paddleOCRInstallError = nil + + // Auto-check MLX-VLM server status if enabled + if paddleOCRUseMLXVLM { + checkMLXVLMServerStatus() + } } } } From 61cf6e36f9d2d6b7122f753be8fb792921f3abb7 Mon Sep 17 00:00:00 2001 From: Hubert Date: Sat, 28 Feb 2026 16:47:15 +0800 Subject: [PATCH 08/11] fix: correct localization key names for error messages Keys should not include format specifier %@ Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- ScreenTranslate/Resources/en.lproj/Localizable.strings | 6 +++--- ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ScreenTranslate/Resources/en.lproj/Localizable.strings b/ScreenTranslate/Resources/en.lproj/Localizable.strings index 165c4e8..d71daac 100644 --- a/ScreenTranslate/Resources/en.lproj/Localizable.strings +++ b/ScreenTranslate/Resources/en.lproj/Localizable.strings @@ -675,9 +675,9 @@ "translationFlow.error.title.translation" = "Translation Failed"; "translationFlow.error.title.rendering" = "Rendering Failed"; "translationFlow.error.unknown" = "An unknown error occurred."; -"translationFlow.error.analysis %@" = "Analysis failed: %@"; -"translationFlow.error.translation %@" = "Translation failed: %@"; -"translationFlow.error.rendering %@" = "Rendering failed: %@"; +"translationFlow.error.analysis" = "Analysis failed: %@"; +"translationFlow.error.translation" = "Translation failed: %@"; +"translationFlow.error.rendering" = "Rendering failed: %@"; "translationFlow.error.cancelled" = "Translation was cancelled."; "translationFlow.error.noTextFound" = "No text found in the selected area."; "translationFlow.error.translation.engine" = "Translation Engine"; diff --git a/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings b/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings index 9928f73..2ae5f3c 100644 --- a/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings +++ b/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings @@ -675,9 +675,9 @@ "translationFlow.error.title.translation" = "翻译失败"; "translationFlow.error.title.rendering" = "渲染失败"; "translationFlow.error.unknown" = "发生未知错误。"; -"translationFlow.error.analysis %@" = "分析失败:%@"; -"translationFlow.error.translation %@" = "翻译失败:%@"; -"translationFlow.error.rendering %@" = "渲染失败:%@"; +"translationFlow.error.analysis" = "分析失败:%@"; +"translationFlow.error.translation" = "翻译失败:%@"; +"translationFlow.error.rendering" = "渲染失败:%@"; "translationFlow.error.cancelled" = "翻译已取消。"; "translationFlow.error.noTextFound" = "选中区域未找到文字。"; "translationFlow.error.translation.engine" = "翻译引擎"; From 2e1c0d1f1756e3ed1f0175ccfd133151a8b12b6f Mon Sep 17 00:00:00 2001 From: Hubert Date: Sat, 28 Feb 2026 18:59:43 +0800 Subject: [PATCH 09/11] feat: add local VL model directory support for PaddleOCR native backend - Add localVLModelDir configuration to PaddleOCREngine.Configuration - Support native backend with --vl_rec_model_dir when MLX-VLM is disabled - Add UI input field for local model directory (shows when MLX-VLM unchecked in precise mode) - Fix error message to show correct model info based on provider type - Fix localization key from error.ocr.recognition.failed to error.ocr.failed - Add localization strings for local model directory in EN/CN Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- .../Features/Settings/EngineSettingsTab.swift | 15 +++++++++++++++ .../Features/Settings/SettingsViewModel.swift | 6 ++++++ .../TranslationFlowController.swift | 11 ++++++++++- ScreenTranslate/Models/AppSettings.swift | 8 ++++++++ .../Resources/en.lproj/Localizable.strings | 2 ++ .../Resources/zh-Hans.lproj/Localizable.strings | 2 ++ ScreenTranslate/Services/PaddleOCREngine.swift | 16 +++++++++++++--- .../Services/PaddleOCRVLMProvider.swift | 1 + 8 files changed, 57 insertions(+), 4 deletions(-) diff --git a/ScreenTranslate/Features/Settings/EngineSettingsTab.swift b/ScreenTranslate/Features/Settings/EngineSettingsTab.swift index 1636104..b7f42bd 100644 --- a/ScreenTranslate/Features/Settings/EngineSettingsTab.swift +++ b/ScreenTranslate/Features/Settings/EngineSettingsTab.swift @@ -288,6 +288,21 @@ struct PaddleOCRStatusSection: View { .textFieldStyle(.roundedBorder) .frame(maxWidth: 300) } + } else { + // Local model directory for native backend (when not using MLX-VLM) + GridRow { + Text(localized("settings.paddleocr.localVLModelDir")) + .foregroundStyle(.secondary) + .gridColumnAlignment(.trailing) + VStack(alignment: .leading, spacing: 4) { + TextField("", text: $viewModel.paddleOCRLocalVLModelDir) + .textFieldStyle(.roundedBorder) + .frame(maxWidth: 300) + Text(localized("settings.paddleocr.localVLModelDir.hint")) + .font(.caption) + .foregroundStyle(.tertiary) + } + } } } } diff --git a/ScreenTranslate/Features/Settings/SettingsViewModel.swift b/ScreenTranslate/Features/Settings/SettingsViewModel.swift index 8bab126..68572dc 100644 --- a/ScreenTranslate/Features/Settings/SettingsViewModel.swift +++ b/ScreenTranslate/Features/Settings/SettingsViewModel.swift @@ -144,6 +144,12 @@ final class SettingsViewModel { set { settings.paddleOCRMLXVLMModelName = newValue } } + /// Local VL model directory (for native backend) + var paddleOCRLocalVLModelDir: String { + get { settings.paddleOCRLocalVLModelDir } + set { settings.paddleOCRLocalVLModelDir = newValue } + } + /// Whether MLX-VLM server is running var isMLXVLMServerRunning: Bool = false diff --git a/ScreenTranslate/Features/TranslationFlow/TranslationFlowController.swift b/ScreenTranslate/Features/TranslationFlow/TranslationFlowController.swift index ff447c1..80a055c 100644 --- a/ScreenTranslate/Features/TranslationFlow/TranslationFlowController.swift +++ b/ScreenTranslate/Features/TranslationFlow/TranslationFlowController.swift @@ -347,7 +347,16 @@ final class TranslationFlowController { if case .analysisFailure = error { let settings = AppSettings.shared errorDetails += "\n\nProvider: \(settings.vlmProvider.localizedName)" - errorDetails += "\nModel: \(settings.vlmModelName)" + // Show appropriate model info based on provider type + switch settings.vlmProvider { + case .paddleocr: + if settings.paddleOCRUseMLXVLM { + errorDetails += "\nModel: \(settings.paddleOCRMLXVLMModelName)" + } + // For local/cloud PaddleOCR modes, model info is not applicable + default: + errorDetails += "\nModel: \(settings.vlmModelName)" + } } // Add provider info for translation errors diff --git a/ScreenTranslate/Models/AppSettings.swift b/ScreenTranslate/Models/AppSettings.swift index 1833182..da06600 100644 --- a/ScreenTranslate/Models/AppSettings.swift +++ b/ScreenTranslate/Models/AppSettings.swift @@ -92,6 +92,7 @@ final class AppSettings { static let paddleOCRUseMLXVLM = prefix + "paddleOCRUseMLXVLM" static let paddleOCRMLXVLMServerURL = prefix + "paddleOCRMLXVLMServerURL" static let paddleOCRMLXVLMModelName = prefix + "paddleOCRMLXVLMModelName" + static let paddleOCRLocalVLModelDir = prefix + "paddleOCRLocalVLModelDir" } // MARK: - Properties @@ -344,6 +345,11 @@ final class AppSettings { didSet { save(paddleOCRMLXVLMModelName, forKey: Keys.paddleOCRMLXVLMModelName) } } + /// Local VL model directory (for native backend without MLX-VLM server) + var paddleOCRLocalVLModelDir: String { + didSet { save(paddleOCRLocalVLModelDir, forKey: Keys.paddleOCRLocalVLModelDir) } + } + // MARK: - Initialization private init() { @@ -453,6 +459,7 @@ final class AppSettings { paddleOCRUseMLXVLM = defaults.object(forKey: Keys.paddleOCRUseMLXVLM) as? Bool ?? false paddleOCRMLXVLMServerURL = defaults.string(forKey: Keys.paddleOCRMLXVLMServerURL) ?? "http://localhost:8111" paddleOCRMLXVLMModelName = defaults.string(forKey: Keys.paddleOCRMLXVLMModelName) ?? "PaddlePaddle/PaddleOCR-VL-1.5" + paddleOCRLocalVLModelDir = defaults.string(forKey: Keys.paddleOCRLocalVLModelDir) ?? "" Logger.settings.info("ScreenCapture launched - settings loaded from: \(loadedLocation.path)") } @@ -509,6 +516,7 @@ final class AppSettings { paddleOCRUseMLXVLM = false paddleOCRMLXVLMServerURL = "http://localhost:8111" paddleOCRMLXVLMModelName = "PaddlePaddle/PaddleOCR-VL-1.5" + paddleOCRLocalVLModelDir = "" // Reset multi-engine configuration - directly create defaults, don't load from persistence engineSelectionMode = .primaryWithFallback var defaultConfigs: [TranslationEngineType: TranslationEngineConfig] = [:] diff --git a/ScreenTranslate/Resources/en.lproj/Localizable.strings b/ScreenTranslate/Resources/en.lproj/Localizable.strings index d71daac..75838f2 100644 --- a/ScreenTranslate/Resources/en.lproj/Localizable.strings +++ b/ScreenTranslate/Resources/en.lproj/Localizable.strings @@ -601,6 +601,8 @@ "settings.paddleocr.mlxVLMChecking" = "Checking..."; "settings.paddleocr.mlxVLMRunning" = "Running"; "settings.paddleocr.mlxVLMNotRunning" = "Not Running"; +"settings.paddleocr.localVLModelDir" = "Local Model Directory"; +"settings.paddleocr.localVLModelDir.hint" = "Path to local PaddleOCR-VL model (e.g. ~/.paddlex/official_models/PaddleOCR-VL-1.5)"; /* ======================================== diff --git a/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings b/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings index 2ae5f3c..54813f8 100644 --- a/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings +++ b/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings @@ -601,6 +601,8 @@ "settings.paddleocr.mlxVLMChecking" = "检测中..."; "settings.paddleocr.mlxVLMRunning" = "运行中"; "settings.paddleocr.mlxVLMNotRunning" = "未运行"; +"settings.paddleocr.localVLModelDir" = "本地模型目录"; +"settings.paddleocr.localVLModelDir.hint" = "本地 PaddleOCR-VL 模型路径(如 ~/.paddlex/official_models/PaddleOCR-VL-1.5)"; /* ======================================== diff --git a/ScreenTranslate/Services/PaddleOCREngine.swift b/ScreenTranslate/Services/PaddleOCREngine.swift index 2b3df1a..9992f0f 100644 --- a/ScreenTranslate/Services/PaddleOCREngine.swift +++ b/ScreenTranslate/Services/PaddleOCREngine.swift @@ -63,6 +63,9 @@ actor PaddleOCREngine { /// MLX-VLM model name var mlxVLMModelName: String + /// Local VL model directory (for native backend) + var localVLModelDir: String + static let `default` = Configuration( languages: [.chinese, .english], minimumConfidence: 0.0, @@ -75,7 +78,8 @@ actor PaddleOCREngine { cloudAPIKey: "", useMLXVLM: false, mlxVLMServerURL: "http://localhost:8111", - mlxVLMModelName: "PaddlePaddle/PaddleOCR-VL-1.5" + mlxVLMModelName: "PaddlePaddle/PaddleOCR-VL-1.5", + localVLModelDir: "" ) } @@ -250,13 +254,19 @@ actor PaddleOCREngine { "--device", config.useGPU ? "gpu" : "cpu" ] - // Add MLX-VLM backend arguments if enabled + // Choose backend: MLX-VLM server or native (local model) if config.useMLXVLM { args += [ "--vl_rec_backend", "mlx-vlm-server", "--vl_rec_server_url", config.mlxVLMServerURL, "--vl_rec_api_model_name", config.mlxVLMModelName ] + } else if !config.localVLModelDir.isEmpty { + // Use native backend with local model + args += [ + "--vl_rec_backend", "native", + "--vl_rec_model_dir", config.localVLModelDir + ] } return args @@ -661,7 +671,7 @@ enum PaddleOCREngineError: LocalizedError, Sendable { ) case .recognitionFailed: return NSLocalizedString( - "error.ocr.recognition.failed", + "error.ocr.failed", comment: "Text recognition failed" ) case .invalidOutput: diff --git a/ScreenTranslate/Services/PaddleOCRVLMProvider.swift b/ScreenTranslate/Services/PaddleOCRVLMProvider.swift index ff8af1f..0917dad 100644 --- a/ScreenTranslate/Services/PaddleOCRVLMProvider.swift +++ b/ScreenTranslate/Services/PaddleOCRVLMProvider.swift @@ -83,6 +83,7 @@ struct PaddleOCRVLMProvider: VLMProvider, Sendable { config.useMLXVLM = settings.paddleOCRUseMLXVLM config.mlxVLMServerURL = settings.paddleOCRMLXVLMServerURL config.mlxVLMModelName = settings.paddleOCRMLXVLMModelName + config.localVLModelDir = settings.paddleOCRLocalVLModelDir return config } From 7a6f100e41a77d96daa9eeeafb3d25aa6aa410af Mon Sep 17 00:00:00 2001 From: Hubert Date: Sat, 28 Feb 2026 19:21:27 +0800 Subject: [PATCH 10/11] refactor: address code review nitpicks for PaddleOCR integration - Fix ScreenDetector.hasPermission to use SCShareableContent.current - Fix health-check to only accept 2xx responses (not <500) - Add error logging for keychain deletion failures - Remove no-op replacingOccurrences in PaddleOCREngine - Add tilde expansion for localVLModelDir path - Localize PaddleOCR not installed error message - Extend isCJKChar with CJK punctuation, fullwidth forms, extensions B-F - Extract saveCredentialsInternal helper to eliminate duplicate keychain logic Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- .omd/project-memory.json | 244 ++++++++++++++++++ .../0c203f54-c10d-4417-8115-005c18e9036b.json | 8 + .../Features/Capture/ScreenDetector.swift | 25 +- .../Features/Settings/SettingsViewModel.swift | 2 +- ScreenTranslate/Models/AppSettings.swift | 6 +- .../Resources/en.lproj/Localizable.strings | 1 + .../zh-Hans.lproj/Localizable.strings | 1 + .../Services/PaddleOCREngine.swift | 7 +- .../Services/PaddleOCRVLMProvider.swift | 36 ++- .../Services/Security/KeychainService.swift | 127 +++------ 10 files changed, 336 insertions(+), 121 deletions(-) create mode 100644 .omd/project-memory.json create mode 100644 .omd/sessions/0c203f54-c10d-4417-8115-005c18e9036b.json diff --git a/.omd/project-memory.json b/.omd/project-memory.json new file mode 100644 index 0000000..984d440 --- /dev/null +++ b/.omd/project-memory.json @@ -0,0 +1,244 @@ +{ + "version": "1.0.0", + "lastScanned": 1772269310504, + "projectRoot": "/Users/hubo/.superset/worktrees/screentranslate/featadd-translate-engine", + "techStack": { + "languages": [], + "frameworks": [], + "packageManager": null, + "runtime": null + }, + "build": { + "buildCommand": null, + "testCommand": null, + "lintCommand": null, + "devCommand": null, + "scripts": {} + }, + "conventions": { + "namingStyle": null, + "importStyle": null, + "testPattern": null, + "fileOrganization": null + }, + "structure": { + "isMonorepo": false, + "workspaces": [], + "mainDirectories": [ + "docs" + ], + "gitBranches": { + "defaultBranch": "main", + "branchingStrategy": null + } + }, + "customNotes": [], + "directoryMap": { + "Build": { + "path": "Build", + "purpose": "Build output", + "fileCount": 1, + "lastAccessed": 1772269310491, + "keyFiles": [] + }, + "ScreenTranslate": { + "path": "ScreenTranslate", + "purpose": null, + "fileCount": 0, + "lastAccessed": 1772269310492, + "keyFiles": [] + }, + "ScreenTranslate.xcodeproj": { + "path": "ScreenTranslate.xcodeproj", + "purpose": null, + "fileCount": 1, + "lastAccessed": 1772269310492, + "keyFiles": [ + "project.pbxproj" + ] + }, + "ScreenTranslateTests": { + "path": "ScreenTranslateTests", + "purpose": null, + "fileCount": 5, + "lastAccessed": 1772269310493, + "keyFiles": [ + "KeyboardShortcutTests.swift", + "README.md", + "ScreenTranslateErrorTests.swift", + "ShortcutRecordingTypeTests.swift", + "TextTranslationErrorTests.swift" + ] + }, + "docs": { + "path": "docs", + "purpose": "Documentation", + "fileCount": 6, + "lastAccessed": 1772269310493, + "keyFiles": [ + "README.md", + "api-reference.md", + "architecture.md", + "components.md", + "developer-guide.md" + ] + }, + "skills": { + "path": "skills", + "purpose": null, + "fileCount": 0, + "lastAccessed": 1772269310494, + "keyFiles": [] + }, + "tasks": { + "path": "tasks", + "purpose": null, + "fileCount": 6, + "lastAccessed": 1772269310494, + "keyFiles": [ + "prd-.md", + "prd-macos-screentranslate.md", + "prd-screencoder-kiss-translator.md", + "prd-screencoder.md", + "prd-text-translation.json" + ] + }, + "ScreenTranslate/App": { + "path": "ScreenTranslate/App", + "purpose": "Application code", + "fileCount": 2, + "lastAccessed": 1772269310495, + "keyFiles": [ + "AppDelegate.swift", + "ScreenTranslateApp.swift" + ] + }, + "ScreenTranslate/Models": { + "path": "ScreenTranslate/Models", + "purpose": "Data models", + "fileCount": 23, + "lastAccessed": 1772269310495, + "keyFiles": [ + "Annotation.swift", + "AppLanguage.swift", + "AppSettings.swift" + ] + }, + "ScreenTranslate/Services": { + "path": "ScreenTranslate/Services", + "purpose": "Business logic services", + "fileCount": 26, + "lastAccessed": 1772269310495, + "keyFiles": [ + "AccessibilityPermissionChecker.swift", + "AppleTranslationProvider.swift", + "ClaudeVLMProvider.swift" + ] + } + }, + "hotPaths": [ + { + "path": "ScreenTranslate/Services/PaddleOCREngine.swift", + "accessCount": 17, + "lastAccessed": 1772277198204, + "type": "file" + }, + { + "path": "ScreenTranslate/Services/Security/KeychainService.swift", + "accessCount": 14, + "lastAccessed": 1772277575721, + "type": "file" + }, + { + "path": "ScreenTranslate/Models/AppSettings.swift", + "accessCount": 13, + "lastAccessed": 1772277135092, + "type": "directory" + }, + { + "path": "ScreenTranslate/Resources/en.lproj/Localizable.strings", + "accessCount": 6, + "lastAccessed": 1772277251555, + "type": "file" + }, + { + "path": "ScreenTranslate/Services/PaddleOCRVLMProvider.swift", + "accessCount": 6, + "lastAccessed": 1772277354512, + "type": "directory" + }, + { + "path": "ScreenTranslate/Features/Settings/SettingsViewModel.swift", + "accessCount": 5, + "lastAccessed": 1772277092578, + "type": "directory" + }, + { + "path": "ScreenTranslate/Resources", + "accessCount": 4, + "lastAccessed": 1772271181502, + "type": "directory" + }, + { + "path": "ScreenTranslate/Features/Settings/EngineSettingsTab.swift", + "accessCount": 4, + "lastAccessed": 1772274724758, + "type": "directory" + }, + { + "path": "ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings", + "accessCount": 4, + "lastAccessed": 1772277283269, + "type": "file" + }, + { + "path": "ScreenTranslate/Features/TranslationFlow/TranslationFlowController.swift", + "accessCount": 3, + "lastAccessed": 1772270086752, + "type": "file" + }, + { + "path": "ScreenTranslate", + "accessCount": 3, + "lastAccessed": 1772271133277, + "type": "directory" + }, + { + "path": "ScreenTranslate/Features/Capture/ScreenDetector.swift", + "accessCount": 3, + "lastAccessed": 1772277067534, + "type": "directory" + }, + { + "path": "ScreenTranslate/Features/Capture/CaptureManager.swift", + "accessCount": 2, + "lastAccessed": 1772277030873, + "type": "directory" + }, + { + "path": "", + "accessCount": 1, + "lastAccessed": 1772269949125, + "type": "directory" + }, + { + "path": "ScreenTranslate/Errors/ScreenTranslateError.swift", + "accessCount": 1, + "lastAccessed": 1772269960584, + "type": "file" + }, + { + "path": "ScreenTranslate/Models/VLMProviderType.swift", + "accessCount": 1, + "lastAccessed": 1772269960585, + "type": "file" + }, + { + "path": "ScreenTranslate/Services/ScreenCoderEngine.swift", + "accessCount": 1, + "lastAccessed": 1772270035460, + "type": "file" + } + ], + "userDirectives": [] +} \ No newline at end of file diff --git a/.omd/sessions/0c203f54-c10d-4417-8115-005c18e9036b.json b/.omd/sessions/0c203f54-c10d-4417-8115-005c18e9036b.json new file mode 100644 index 0000000..666c2c8 --- /dev/null +++ b/.omd/sessions/0c203f54-c10d-4417-8115-005c18e9036b.json @@ -0,0 +1,8 @@ +{ + "session_id": "0c203f54-c10d-4417-8115-005c18e9036b", + "ended_at": "2026-02-28T09:01:44.054Z", + "reason": "other", + "agents_spawned": 0, + "agents_completed": 0, + "modes_used": [] +} \ No newline at end of file diff --git a/ScreenTranslate/Features/Capture/ScreenDetector.swift b/ScreenTranslate/Features/Capture/ScreenDetector.swift index c65d166..d8d9ffa 100644 --- a/ScreenTranslate/Features/Capture/ScreenDetector.swift +++ b/ScreenTranslate/Features/Capture/ScreenDetector.swift @@ -122,16 +122,27 @@ actor ScreenDetector { } /// Checks if the app has screen recording permission. - /// Uses CGPreflightScreenCaptureAccess() which does NOT trigger system dialog. - /// This API is deprecated in macOS 15 but still works correctly. + /// Uses SCShareableContent to actually verify permission works (not just cached status). /// - Parameter silent: If true, suppresses logging (default: true) /// - Returns: True if permission is granted func hasPermission(silent: Bool = true) async -> Bool { - // Use CGPreflightScreenCaptureAccess - does NOT trigger dialog - let granted = CGPreflightScreenCaptureAccess() - cachedPermissionStatus = granted - if !silent { print("[ScreenDetector] Permission check: \(granted ? "granted" : "denied")") } - return granted + // Quick check first using CGPreflightScreenCaptureAccess + guard CGPreflightScreenCaptureAccess() else { + cachedPermissionStatus = false + if !silent { print("[ScreenDetector] Permission check: denied (CGPreflight)") } + return false + } + // Actually verify by trying to get shareable content + do { + _ = try await SCShareableContent.current + cachedPermissionStatus = true + if !silent { print("[ScreenDetector] Permission check: granted") } + return true + } catch { + cachedPermissionStatus = false + if !silent { print("[ScreenDetector] Permission check: denied (SCShareableContent)") } + return false + } } /// Forces a fresh permission check (clears cache) diff --git a/ScreenTranslate/Features/Settings/SettingsViewModel.swift b/ScreenTranslate/Features/Settings/SettingsViewModel.swift index 68572dc..fa4d25e 100644 --- a/ScreenTranslate/Features/Settings/SettingsViewModel.swift +++ b/ScreenTranslate/Features/Settings/SettingsViewModel.swift @@ -883,7 +883,7 @@ final class SettingsViewModel { let (_, response) = try await URLSession.shared.data(for: request) if let httpResponse = response as? HTTPURLResponse { - isRunning = httpResponse.statusCode < 500 + isRunning = (200...299).contains(httpResponse.statusCode) } } catch { isRunning = false diff --git a/ScreenTranslate/Models/AppSettings.swift b/ScreenTranslate/Models/AppSettings.swift index da06600..43dc65c 100644 --- a/ScreenTranslate/Models/AppSettings.swift +++ b/ScreenTranslate/Models/AppSettings.swift @@ -510,7 +510,11 @@ final class AppSettings { paddleOCRCloudAPIKey = "" // Delete PaddleOCR cloud API key from Keychain Task.detached { - try? await KeychainService.shared.deletePaddleOCRCredentials() + do { + try await KeychainService.shared.deletePaddleOCRCredentials() + } catch { + Logger.settings.error("Failed to delete PaddleOCR credentials from keychain: \(error.localizedDescription)") + } } // Reset MLX-VLM settings paddleOCRUseMLXVLM = false diff --git a/ScreenTranslate/Resources/en.lproj/Localizable.strings b/ScreenTranslate/Resources/en.lproj/Localizable.strings index 75838f2..2977c6f 100644 --- a/ScreenTranslate/Resources/en.lproj/Localizable.strings +++ b/ScreenTranslate/Resources/en.lproj/Localizable.strings @@ -603,6 +603,7 @@ "settings.paddleocr.mlxVLMNotRunning" = "Not Running"; "settings.paddleocr.localVLModelDir" = "Local Model Directory"; "settings.paddleocr.localVLModelDir.hint" = "Path to local PaddleOCR-VL model (e.g. ~/.paddlex/official_models/PaddleOCR-VL-1.5)"; +"error.paddleocr.notInstalled" = "PaddleOCR is not installed. Install it using: pip3 install paddleocr paddlepaddle"; /* ======================================== diff --git a/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings b/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings index 54813f8..bee7b0c 100644 --- a/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings +++ b/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings @@ -603,6 +603,7 @@ "settings.paddleocr.mlxVLMNotRunning" = "未运行"; "settings.paddleocr.localVLModelDir" = "本地模型目录"; "settings.paddleocr.localVLModelDir.hint" = "本地 PaddleOCR-VL 模型路径(如 ~/.paddlex/official_models/PaddleOCR-VL-1.5)"; +"error.paddleocr.notInstalled" = "PaddleOCR 未安装。请使用以下命令安装:pip3 install paddleocr paddlepaddle"; /* ======================================== diff --git a/ScreenTranslate/Services/PaddleOCREngine.swift b/ScreenTranslate/Services/PaddleOCREngine.swift index 9992f0f..3a3a043 100644 --- a/ScreenTranslate/Services/PaddleOCREngine.swift +++ b/ScreenTranslate/Services/PaddleOCREngine.swift @@ -263,9 +263,11 @@ actor PaddleOCREngine { ] } else if !config.localVLModelDir.isEmpty { // Use native backend with local model + // Expand tilde in path (e.g., ~/.paddlex -> /Users/xxx/.paddlex) + let expandedPath = NSString(string: config.localVLModelDir).expandingTildeInPath args += [ "--vl_rec_backend", "native", - "--vl_rec_model_dir", config.localVLModelDir + "--vl_rec_model_dir", expandedPath ] } @@ -612,8 +614,7 @@ actor PaddleOCREngine { } content = content.replacingOccurrences(of: "[,", with: "[") content = content.replacingOccurrences(of: ",]", with: "]") - // Handle edge case of empty nested arrays - content = content.replacingOccurrences(of: "[]", with: "[]") + // Handle edge case of empty nested arrays - return early return content } diff --git a/ScreenTranslate/Services/PaddleOCRVLMProvider.swift b/ScreenTranslate/Services/PaddleOCRVLMProvider.swift index 0917dad..5d44146 100644 --- a/ScreenTranslate/Services/PaddleOCRVLMProvider.swift +++ b/ScreenTranslate/Services/PaddleOCRVLMProvider.swift @@ -58,7 +58,7 @@ struct PaddleOCRVLMProvider: VLMProvider, Sendable { if !config.useCloud { guard await PaddleOCREngine.shared.isAvailable else { throw VLMProviderError.invalidConfiguration( - "PaddleOCR is not installed. Install it using: pip3 install paddleocr paddlepaddle" + NSLocalizedString("error.paddleocr.notInstalled", comment: "PaddleOCR not installed error") ) } } @@ -230,16 +230,28 @@ private struct MergedLine { /// Checks if a character is CJK (Chinese/Japanese/Korean) private static func isCJKChar(_ char: Character) -> Bool { - let scalar = char.unicodeScalars.first?.value ?? 0 - // CJK Unified Ideographs: U+4E00-U+9FFF - // CJK Unified Ideographs Extension A: U+3400-U+4DBF - // Hiragana: U+3040-U+309F - // Katakana: U+30A0-U+30FF - // Hangul Syllables: U+AC00-U+D7AF - return (0x4E00...0x9FFF).contains(scalar) || - (0x3400...0x4DBF).contains(scalar) || - (0x3040...0x309F).contains(scalar) || - (0x30A0...0x30FF).contains(scalar) || - (0xAC00...0xD7AF).contains(scalar) + // Check all unicode scalars to handle surrogate pairs correctly + for scalar in char.unicodeScalars { + let value = scalar.value + // CJK Unified Ideographs: U+4E00-U+9FFF + // CJK Unified Ideographs Extension A: U+3400-U+4DBF + // Hiragana: U+3040-U+309F + // Katakana: U+30A0-U+30FF + // Hangul Syllables: U+AC00-U+D7AF + // CJK Symbols and Punctuation: U+3000-U+303F + // Fullwidth Forms: U+FF00-U+FFEF + // CJK Extension B-F: U+20000-U+2FA1F + if (0x4E00...0x9FFF).contains(value) || + (0x3400...0x4DBF).contains(value) || + (0x3040...0x309F).contains(value) || + (0x30A0...0x30FF).contains(value) || + (0xAC00...0xD7AF).contains(value) || + (0x3000...0x303F).contains(value) || + (0xFF00...0xFFEF).contains(value) || + (0x20000...0x2FA1F).contains(value) { + return true + } + } + return false } } diff --git a/ScreenTranslate/Services/Security/KeychainService.swift b/ScreenTranslate/Services/Security/KeychainService.swift index 0ae63b7..de5a649 100644 --- a/ScreenTranslate/Services/Security/KeychainService.swift +++ b/ScreenTranslate/Services/Security/KeychainService.swift @@ -48,6 +48,19 @@ actor KeychainService { additional: additionalData ) + try saveCredentialsInternal( + credentials: credentials, + account: engine.rawValue, + label: engine.rawValue + ) + } + + /// Internal helper for saving credentials to keychain + /// - Parameters: + /// - credentials: The credentials to save + /// - account: The account identifier for the keychain item + /// - label: A descriptive label for logging + private func saveCredentialsInternal(credentials: StoredCredentials, account: String, label: String) throws { guard let encodedData = try? JSONEncoder().encode(credentials) else { throw KeychainError.invalidData } @@ -55,7 +68,7 @@ actor KeychainService { let query: [String: Any] = [ kSecClass as String: kSecClassGenericPassword, kSecAttrService as String: service, - kSecAttrAccount as String: engine.rawValue + kSecAttrAccount as String: account ] // Check if item exists and update it, or add new if not found @@ -68,27 +81,27 @@ actor KeychainService { ] let updateStatus = SecItemUpdate(query as CFDictionary, updateQuery as CFDictionary) guard updateStatus == errSecSuccess else { - logger.error("Failed to update credentials for \(engine.rawValue): \(updateStatus)") + logger.error("Failed to update credentials for \(label): \(updateStatus)") throw KeychainError.unexpectedStatus(updateStatus) } - logger.info("Updated credentials for \(engine.rawValue)") + logger.info("Updated credentials for \(label)") } else if status == errSecItemNotFound { // Item doesn't exist - add new let addQuery: [String: Any] = [ kSecClass as String: kSecClassGenericPassword, kSecAttrService as String: service, - kSecAttrAccount as String: engine.rawValue, + kSecAttrAccount as String: account, kSecValueData as String: encodedData, kSecAttrAccessible as String: kSecAttrAccessibleWhenUnlocked ] let addStatus = SecItemAdd(addQuery as CFDictionary, nil) guard addStatus == errSecSuccess else { - logger.error("Failed to save credentials for \(engine.rawValue): \(addStatus)") + logger.error("Failed to save credentials for \(label): \(addStatus)") throw KeychainError.unexpectedStatus(addStatus) } - logger.info("Saved credentials for \(engine.rawValue)") + logger.info("Saved credentials for \(label)") } else { - logger.error("Failed to check credentials for \(engine.rawValue): \(status)") + logger.error("Failed to check credentials for \(label): \(status)") throw KeychainError.unexpectedStatus(status) } } @@ -183,50 +196,11 @@ actor KeychainService { /// - compatibleId: The compatible engine identifier (e.g., "custom:0", "custom:1") func saveCredentials(apiKey: String, forCompatibleId compatibleId: String) throws { let credentials = StoredCredentials(apiKey: apiKey) - - guard let encodedData = try? JSONEncoder().encode(credentials) else { - throw KeychainError.invalidData - } - - let query: [String: Any] = [ - kSecClass as String: kSecClassGenericPassword, - kSecAttrService as String: service, - kSecAttrAccount as String: compatibleId - ] - - // Check if item exists and update it, or add new if not found - let status = SecItemCopyMatching(query as CFDictionary, nil) - if status == errSecSuccess { - // Item exists - update it - let updateQuery: [String: Any] = [ - kSecValueData as String: encodedData, - kSecAttrAccessible as String: kSecAttrAccessibleWhenUnlocked - ] - let updateStatus = SecItemUpdate(query as CFDictionary, updateQuery as CFDictionary) - guard updateStatus == errSecSuccess else { - logger.error("Failed to update credentials for \(compatibleId): \(updateStatus)") - throw KeychainError.unexpectedStatus(updateStatus) - } - logger.info("Updated credentials for compatible engine \(compatibleId)") - } else if status == errSecItemNotFound { - // Item doesn't exist - add new - let addQuery: [String: Any] = [ - kSecClass as String: kSecClassGenericPassword, - kSecAttrService as String: service, - kSecAttrAccount as String: compatibleId, - kSecValueData as String: encodedData, - kSecAttrAccessible as String: kSecAttrAccessibleWhenUnlocked - ] - let addStatus = SecItemAdd(addQuery as CFDictionary, nil) - guard addStatus == errSecSuccess else { - logger.error("Failed to save credentials for \(compatibleId): \(addStatus)") - throw KeychainError.unexpectedStatus(addStatus) - } - logger.info("Saved credentials for compatible engine \(compatibleId)") - } else { - logger.error("Failed to check credentials for \(compatibleId): \(status)") - throw KeychainError.unexpectedStatus(status) - } + try saveCredentialsInternal( + credentials: credentials, + account: compatibleId, + label: "compatible engine \(compatibleId)" + ) } /// Retrieve stored credentials for a compatible engine instance @@ -320,53 +294,12 @@ actor KeychainService { /// Save PaddleOCR cloud API key /// - Parameter apiKey: The API key to store func savePaddleOCRCredentials(apiKey: String) throws { - let account = Self.paddleOCRAccount - let credentials = StoredCredentials(apiKey: apiKey) - - guard let encodedData = try? JSONEncoder().encode(credentials) else { - throw KeychainError.invalidData - } - - let query: [String: Any] = [ - kSecClass as String: kSecClassGenericPassword, - kSecAttrService as String: service, - kSecAttrAccount as String: account - ] - - // Check if item exists and update it, or add new if not found - let status = SecItemCopyMatching(query as CFDictionary, nil) - if status == errSecSuccess { - // Item exists - update it - let updateQuery: [String: Any] = [ - kSecValueData as String: encodedData, - kSecAttrAccessible as String: kSecAttrAccessibleWhenUnlocked - ] - let updateStatus = SecItemUpdate(query as CFDictionary, updateQuery as CFDictionary) - guard updateStatus == errSecSuccess else { - logger.error("Failed to update PaddleOCR cloud credentials: \(updateStatus)") - throw KeychainError.unexpectedStatus(updateStatus) - } - logger.info("Updated PaddleOCR cloud credentials") - } else if status == errSecItemNotFound { - // Item doesn't exist - add new - let addQuery: [String: Any] = [ - kSecClass as String: kSecClassGenericPassword, - kSecAttrService as String: service, - kSecAttrAccount as String: account, - kSecValueData as String: encodedData, - kSecAttrAccessible as String: kSecAttrAccessibleWhenUnlocked - ] - let addStatus = SecItemAdd(addQuery as CFDictionary, nil) - guard addStatus == errSecSuccess else { - logger.error("Failed to save PaddleOCR cloud credentials: \(addStatus)") - throw KeychainError.unexpectedStatus(addStatus) - } - logger.info("Saved PaddleOCR cloud credentials") - } else { - logger.error("Failed to check PaddleOCR cloud credentials: \(status)") - throw KeychainError.unexpectedStatus(status) - } + try saveCredentialsInternal( + credentials: credentials, + account: Self.paddleOCRAccount, + label: "PaddleOCR cloud" + ) } /// Retrieve stored PaddleOCR cloud API key From a90a94edda4a2bdcf450708bc8a7caae54642fe2 Mon Sep 17 00:00:00 2001 From: Hubert Date: Mon, 2 Mar 2026 09:03:05 +0800 Subject: [PATCH 11/11] fix: correct minimum system version from 26.0 to 14.0 Sparkle was unable to detect updates because minimumSystemVersion was set to 26.0, which doesn't exist. Changed to macOS 14.0 (Sonoma). Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- .github/workflows/release.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 37f012e..b542e60 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -33,7 +33,7 @@ jobs: -destination 'platform=macOS' \ CODE_SIGN_IDENTITY="-" \ CODE_SIGN_STYLE=Automatic \ - MACOSX_DEPLOYMENT_TARGET=26.0 \ + MACOSX_DEPLOYMENT_TARGET=14.0 \ ONLY_ACTIVE_ARCH=NO \ clean build @@ -127,7 +127,7 @@ jobs: ${PUBDATE} ${BUILD_NUM} ${VERSION} - 26.0 + 14.0 ${ENCLOSURE} Latest release of ScreenTranslate

]]>