From affcc3a7126215ceb9faf2f17113d4582c16c40e Mon Sep 17 00:00:00 2001 From: Hubert Date: Wed, 4 Feb 2026 20:46:58 +0800 Subject: [PATCH] feat(ocr): integrate PaddleOCR with installation detection and multi-language support - Add PaddleOCR installation detection in onboarding and settings - Support pyenv/pip installed paddleocr via shell execution - Fix coordinate system for ScreenCaptureKit (use points not pixels) - Parse PaddleOCR output from stderr with proper JSON conversion - Add Chinese/English localization for PaddleOCR UI - Route OCR calls through OCRService based on user settings --- .../Features/Capture/CaptureManager.swift | 50 ++-- .../Features/Onboarding/OnboardingView.swift | 113 ++++++++- .../Onboarding/OnboardingViewModel.swift | 81 ++++++ .../Features/Preview/PreviewViewModel.swift | 4 +- .../Features/Settings/SettingsView.swift | 110 ++++++-- .../Features/Settings/SettingsViewModel.swift | 81 ++++++ ScreenTranslate/Models/OCREngineType.swift | 126 +++++---- .../Resources/en.lproj/Localizable.strings | 22 ++ .../zh-Hans.lproj/Localizable.strings | 22 ++ .../Services/OCREngineProtocol.swift | 14 +- .../Services/PaddleOCREngine.swift | 239 ++++++++++-------- 11 files changed, 642 insertions(+), 220 deletions(-) diff --git a/ScreenTranslate/Features/Capture/CaptureManager.swift b/ScreenTranslate/Features/Capture/CaptureManager.swift index 352f013..ef6598e 100644 --- a/ScreenTranslate/Features/Capture/CaptureManager.swift +++ b/ScreenTranslate/Features/Capture/CaptureManager.swift @@ -200,40 +200,46 @@ actor CaptureManager { // Configure capture for the full display first let filter = SCContentFilter(display: scDisplay, excludingWindows: []) - let config = createCaptureConfiguration(for: display) - - // Set source rect for region capture - // sourceRect must be in PIXEL coordinates (not normalized!) - // The rect is in points from SelectionOverlayWindow, convert to pixels - // IMPORTANT: Round to integers to avoid fractional pixel boundaries - // which cause ScreenCaptureKit to apply anti-aliasing/interpolation - let pixelX = round(rect.origin.x * display.scaleFactor) - let pixelY = round(rect.origin.y * display.scaleFactor) - let pixelWidth = round(rect.width * display.scaleFactor) - let pixelHeight = round(rect.height * display.scaleFactor) + let config = SCStreamConfiguration() + + // sourceRect is in POINTS (same coordinate system as display.frame) + // NOT in pixels! ScreenCaptureKit handles the scaling internally. + let clampedX = min(max(rect.origin.x, 0), display.frame.width - 1) + let clampedY = min(max(rect.origin.y, 0), display.frame.height - 1) + let clampedWidth = min(rect.width, display.frame.width - clampedX) + let clampedHeight = min(rect.height, display.frame.height - clampedY) let sourceRect = CGRect( - x: pixelX, - y: pixelY, - width: pixelWidth, - height: pixelHeight + x: clampedX, + y: clampedY, + width: clampedWidth, + height: clampedHeight ) + config.sourceRect = sourceRect + + // Output size should be in PIXELS for crisp capture + let outputWidth = Int(clampedWidth * display.scaleFactor) + let outputHeight = Int(clampedHeight * display.scaleFactor) + config.width = outputWidth + config.height = outputHeight + + // High quality settings + config.minimumFrameInterval = CMTime(value: 1, timescale: 1) + config.pixelFormat = kCVPixelFormatType_32BGRA + config.showsCursor = false + config.colorSpaceName = CGColorSpace.sRGB + #if DEBUG print("=== CAPTURE MANAGER DEBUG ===") print("[CAP-1] Input rect (points): \(rect)") print("[CAP-2] display.frame (points): \(display.frame)") print("[CAP-3] display.scaleFactor: \(display.scaleFactor)") - print("[CAP-4] sourceRect (pixels, rounded): \(sourceRect)") + print("[CAP-4] sourceRect (points, clamped): \(sourceRect)") + print("[CAP-5] outputSize (pixels): \(outputWidth)x\(outputHeight)") print("=== END CAPTURE MANAGER DEBUG ===") #endif - config.sourceRect = sourceRect - - // Adjust output size to match the region (use same rounded values) - config.width = Int(pixelWidth) - config.height = Int(pixelHeight) - // Perform capture with signpost for profiling os_signpost(.begin, log: Self.performanceLog, name: "RegionCapture", signpostID: Self.signpostID) let captureStartTime = CFAbsoluteTimeGetCurrent() diff --git a/ScreenTranslate/Features/Onboarding/OnboardingView.swift b/ScreenTranslate/Features/Onboarding/OnboardingView.swift index ff2b05f..61756de 100644 --- a/ScreenTranslate/Features/Onboarding/OnboardingView.swift +++ b/ScreenTranslate/Features/Onboarding/OnboardingView.swift @@ -258,19 +258,12 @@ struct OnboardingView: View { } VStack(alignment: .leading, spacing: 16) { - VStack(alignment: .leading, spacing: 8) { - Text(NSLocalizedString("onboarding.configuration.paddleocr", comment: "")) - .font(.headline) - Text(NSLocalizedString("onboarding.configuration.paddleocr.hint", comment: "")) - .font(.caption) - .foregroundStyle(.secondary) - TextField( - NSLocalizedString("onboarding.configuration.placeholder", comment: ""), - text: $viewModel.paddleOCRServerAddress - ) - .textFieldStyle(.roundedBorder) - } + // PaddleOCR Installation Section + paddleOCRConfigSection + + Divider() + // MTran Server Section VStack(alignment: .leading, spacing: 8) { Text(NSLocalizedString("onboarding.configuration.mtran", comment: "")) .font(.headline) @@ -284,6 +277,7 @@ struct OnboardingView: View { .textFieldStyle(.roundedBorder) } + // Translation Test Section VStack(alignment: .leading, spacing: 8) { Text(NSLocalizedString("onboarding.configuration.test", comment: "")) .font(.headline) @@ -341,6 +335,101 @@ struct OnboardingView: View { .padding(32) } + // MARK: - PaddleOCR Configuration Section + + private var paddleOCRConfigSection: some View { + VStack(alignment: .leading, spacing: 12) { + HStack { + Text(NSLocalizedString("onboarding.paddleocr.title", comment: "")) + .font(.headline) + + Spacer() + + // Installation status indicator + if viewModel.isPaddleOCRInstalled { + HStack(spacing: 4) { + Image(systemName: "checkmark.circle.fill") + .foregroundStyle(.green) + Text(NSLocalizedString("onboarding.paddleocr.installed", comment: "")) + .font(.caption) + .foregroundStyle(.green) + } + } else { + HStack(spacing: 4) { + Image(systemName: "xmark.circle.fill") + .foregroundStyle(.secondary) + Text(NSLocalizedString("onboarding.paddleocr.not.installed", comment: "")) + .font(.caption) + .foregroundStyle(.secondary) + } + } + } + + Text(NSLocalizedString("onboarding.paddleocr.description", comment: "")) + .font(.caption) + .foregroundStyle(.secondary) + + if !viewModel.isPaddleOCRInstalled { + // Installation options + VStack(alignment: .leading, spacing: 8) { + Text(NSLocalizedString("onboarding.paddleocr.install.hint", comment: "")) + .font(.caption) + .foregroundStyle(.secondary) + + HStack(spacing: 12) { + Button { + viewModel.installPaddleOCR() + } label: { + if viewModel.isInstallingPaddleOCR { + ProgressView() + .controlSize(.small) + .frame(width: 16, height: 16) + Text(NSLocalizedString("onboarding.paddleocr.installing", comment: "")) + } else { + Image(systemName: "arrow.down.circle") + Text(NSLocalizedString("onboarding.paddleocr.install", comment: "")) + } + } + .buttonStyle(.borderedProminent) + .disabled(viewModel.isInstallingPaddleOCR) + + Button { + viewModel.copyInstallCommand() + } label: { + Image(systemName: "doc.on.doc") + Text(NSLocalizedString("onboarding.paddleocr.copy.command", comment: "")) + } + .buttonStyle(.bordered) + + Button { + viewModel.refreshPaddleOCRStatus() + } label: { + Image(systemName: "arrow.clockwise") + } + .buttonStyle(.borderless) + .help(NSLocalizedString("onboarding.paddleocr.refresh", comment: "")) + } + + if let error = viewModel.paddleOCRInstallError { + Text(error) + .font(.caption) + .foregroundStyle(.red) + } + } + } else { + // PaddleOCR is installed - show version + if let version = viewModel.paddleOCRVersion { + Text(String(format: NSLocalizedString("onboarding.paddleocr.version", comment: ""), version)) + .font(.caption) + .foregroundStyle(.secondary) + } + } + } + .padding() + .background(Color(nsColor: .controlBackgroundColor)) + .cornerRadius(8) + } + // MARK: - Step 3: Complete private var completeStep: some View { diff --git a/ScreenTranslate/Features/Onboarding/OnboardingViewModel.swift b/ScreenTranslate/Features/Onboarding/OnboardingViewModel.swift index ad10bcb..2e195c1 100644 --- a/ScreenTranslate/Features/Onboarding/OnboardingViewModel.swift +++ b/ScreenTranslate/Features/Onboarding/OnboardingViewModel.swift @@ -40,6 +40,18 @@ final class OnboardingViewModel { /// Translation test success status var translationTestSuccess = false + /// Whether PaddleOCR is installed + var isPaddleOCRInstalled = false + + /// Whether PaddleOCR installation is in progress + var isInstallingPaddleOCR = false + + /// PaddleOCR installation error message + var paddleOCRInstallError: String? + + /// PaddleOCR version if installed + var paddleOCRVersion: String? + // MARK: - Computed Properties /// Whether we can move to the next step @@ -79,6 +91,7 @@ final class OnboardingViewModel { Task { await MainActor.run { checkPermissions() + refreshPaddleOCRStatus() } } } @@ -239,6 +252,74 @@ final class OnboardingViewModel { mtranServerURL = "" completeOnboarding() } + + // MARK: - PaddleOCR Management + + func refreshPaddleOCRStatus() { + PaddleOCRChecker.resetCache() + PaddleOCRChecker.checkAvailabilityAsync() + + Task { + for _ in 0..<20 { + try? await Task.sleep(for: .milliseconds(250)) + if PaddleOCRChecker.checkCompleted { + break + } + } + await MainActor.run { + isPaddleOCRInstalled = PaddleOCRChecker.isAvailable + paddleOCRVersion = PaddleOCRChecker.version + paddleOCRInstallError = nil + } + } + } + + func installPaddleOCR() { + isInstallingPaddleOCR = true + paddleOCRInstallError = nil + + Task.detached(priority: .userInitiated) { + let result = await self.runPipInstall() + await MainActor.run { + self.isInstallingPaddleOCR = false + if let error = result { + self.paddleOCRInstallError = error + } else { + self.refreshPaddleOCRStatus() + } + } + } + } + + private func runPipInstall() async -> String? { + let task = Process() + task.executableURL = URL(fileURLWithPath: "/usr/bin/env") + task.arguments = ["pip3", "install", "paddleocr", "paddlepaddle"] + + let stderrPipe = Pipe() + task.standardError = stderrPipe + task.standardOutput = Pipe() + + do { + try task.run() + task.waitUntilExit() + + if task.terminationStatus != 0 { + let stderrData = stderrPipe.fileHandleForReading.readDataToEndOfFile() + let stderr = String(data: stderrData, encoding: .utf8) ?? "Unknown error" + return stderr.isEmpty ? "Installation failed with exit code \(task.terminationStatus)" : stderr + } + return nil + } catch { + return error.localizedDescription + } + } + + func copyInstallCommand() { + let command = "pip3 install paddleocr paddlepaddle" + NSPasteboard.general.clearContents() + NSPasteboard.general.setString(command, forType: .string) + } } // MARK: - Notification Names diff --git a/ScreenTranslate/Features/Preview/PreviewViewModel.swift b/ScreenTranslate/Features/Preview/PreviewViewModel.swift index b545052..8bdf424 100644 --- a/ScreenTranslate/Features/Preview/PreviewViewModel.swift +++ b/ScreenTranslate/Features/Preview/PreviewViewModel.swift @@ -89,7 +89,7 @@ final class PreviewViewModel { private let recentCapturesStore: RecentCapturesStore @ObservationIgnored - private let ocrEngine = OCREngine.shared + private let ocrService = OCRService.shared @ObservationIgnored private let translationEngine = TranslationEngine.shared @@ -970,7 +970,7 @@ final class PreviewViewModel { defer { isPerformingOCR = false } do { - let result = try await ocrEngine.recognize( + let result = try await ocrService.recognize( image, languages: [.english, .chineseSimplified] ) diff --git a/ScreenTranslate/Features/Settings/SettingsView.swift b/ScreenTranslate/Features/Settings/SettingsView.swift index e73e834..d0f0ed8 100644 --- a/ScreenTranslate/Features/Settings/SettingsView.swift +++ b/ScreenTranslate/Features/Settings/SettingsView.swift @@ -546,42 +546,106 @@ private struct TextSizeSlider: View { // MARK: - OCR Engine Picker -/// Picker for selecting the OCR engine. private struct OCREnginePicker: View { @Bindable var viewModel: SettingsViewModel var body: some View { - Picker(L("settings.ocr.engine"), selection: $viewModel.ocrEngine) { - ForEach(OCREngineType.allCases, id: \.self) { engine in - VStack(alignment: .leading, spacing: 4) { - HStack { + VStack(alignment: .leading, spacing: 12) { + HStack { + Picker(L("settings.ocr.engine"), selection: $viewModel.ocrEngine) { + ForEach(OCREngineType.allCases, id: \.self) { engine in Text(engine.localizedName) - if !engine.isAvailable && engine == .paddleOCR { - Image(systemName: "exclamationmark.triangle") - .foregroundStyle(.orange) - .font(.caption) - } + .tag(engine) } - Text(engine.description) - .font(.caption) - .foregroundStyle(.secondary) } - .tag(engine) - .if(!engine.isAvailable && engine == .paddleOCR) { view in - view.foregroundStyle(.secondary) + .pickerStyle(.segmented) + + if viewModel.ocrEngine == .paddleOCR && !viewModel.isPaddleOCRInstalled { + Image(systemName: "exclamationmark.triangle.fill") + .foregroundStyle(.orange) } } + + if viewModel.ocrEngine == .paddleOCR { + paddleOCRStatusView + } } - .pickerStyle(.inline) - .onChange(of: viewModel.ocrEngine) { _, newValue in - // If user selects an unavailable engine, show warning and revert to Vision - // Use Task to avoid setting value during update - if !newValue.isAvailable { - Task { @MainActor in - viewModel.ocrEngine = .vision + } + + private var paddleOCRStatusView: some View { + VStack(alignment: .leading, spacing: 8) { + HStack { + if viewModel.isPaddleOCRInstalled { + Image(systemName: "checkmark.circle.fill") + .foregroundStyle(.green) + Text(L("settings.paddleocr.installed")) + .foregroundStyle(.secondary) + if let version = viewModel.paddleOCRVersion { + Text("(\(version))") + .font(.caption) + .foregroundStyle(.secondary) + } + } else { + Image(systemName: "xmark.circle.fill") + .foregroundStyle(.orange) + Text(L("settings.paddleocr.not.installed")) + .foregroundStyle(.secondary) } + + Spacer() + + Button { + viewModel.refreshPaddleOCRStatus() + } label: { + Image(systemName: "arrow.clockwise") + } + .buttonStyle(.borderless) + .help(L("settings.paddleocr.refresh")) + } + + if !viewModel.isPaddleOCRInstalled { + HStack(spacing: 8) { + Button { + viewModel.installPaddleOCR() + } label: { + if viewModel.isInstallingPaddleOCR { + ProgressView() + .controlSize(.small) + Text(L("settings.paddleocr.installing")) + } else { + Image(systemName: "arrow.down.circle") + Text(L("settings.paddleocr.install")) + } + } + .buttonStyle(.borderedProminent) + .controlSize(.small) + .disabled(viewModel.isInstallingPaddleOCR) + + Button { + viewModel.copyPaddleOCRInstallCommand() + } label: { + Image(systemName: "doc.on.doc") + Text(L("settings.paddleocr.copy.command")) + } + .buttonStyle(.bordered) + .controlSize(.small) + } + + if let error = viewModel.paddleOCRInstallError { + Text(error) + .font(.caption) + .foregroundStyle(.red) + .lineLimit(3) + } + + Text(L("settings.paddleocr.install.hint")) + .font(.caption) + .foregroundStyle(.secondary) } } + .padding(10) + .background(Color(nsColor: .controlBackgroundColor)) + .cornerRadius(8) } } diff --git a/ScreenTranslate/Features/Settings/SettingsViewModel.swift b/ScreenTranslate/Features/Settings/SettingsViewModel.swift index b23ee1b..815f90b 100644 --- a/ScreenTranslate/Features/Settings/SettingsViewModel.swift +++ b/ScreenTranslate/Features/Settings/SettingsViewModel.swift @@ -39,6 +39,18 @@ final class SettingsViewModel { /// Whether permission check is in progress var isCheckingPermissions: Bool = false + /// Whether PaddleOCR is installed + var isPaddleOCRInstalled: Bool = false + + /// Whether PaddleOCR installation is in progress + var isInstallingPaddleOCR: Bool = false + + /// PaddleOCR installation error message + var paddleOCRInstallError: String? + + /// PaddleOCR version if installed + var paddleOCRVersion: String? + // MARK: - Computed Properties (Bindings to AppSettings) /// Save location URL @@ -183,6 +195,7 @@ final class SettingsViewModel { init(settings: AppSettings = .shared, appDelegate: AppDelegate? = nil) { self.settings = settings self.appDelegate = appDelegate + refreshPaddleOCRStatus() } // MARK: - Permission Checking @@ -396,6 +409,74 @@ final class SettingsViewModel { errorMessage = message showErrorAlert = true } + + // MARK: - PaddleOCR Management + + func refreshPaddleOCRStatus() { + PaddleOCRChecker.resetCache() + PaddleOCRChecker.checkAvailabilityAsync() + + Task { + for _ in 0..<20 { + try? await Task.sleep(for: .milliseconds(250)) + if PaddleOCRChecker.checkCompleted { + break + } + } + await MainActor.run { + isPaddleOCRInstalled = PaddleOCRChecker.isAvailable + paddleOCRVersion = PaddleOCRChecker.version + paddleOCRInstallError = nil + } + } + } + + func installPaddleOCR() { + isInstallingPaddleOCR = true + paddleOCRInstallError = nil + + Task.detached(priority: .userInitiated) { + let result = await self.runPipInstall() + await MainActor.run { + self.isInstallingPaddleOCR = false + if let error = result { + self.paddleOCRInstallError = error + } else { + self.refreshPaddleOCRStatus() + } + } + } + } + + private func runPipInstall() async -> String? { + let task = Process() + task.executableURL = URL(fileURLWithPath: "/usr/bin/env") + task.arguments = ["pip3", "install", "paddleocr", "paddlepaddle"] + + let stderrPipe = Pipe() + task.standardError = stderrPipe + task.standardOutput = Pipe() + + do { + try task.run() + task.waitUntilExit() + + if task.terminationStatus != 0 { + let stderrData = stderrPipe.fileHandleForReading.readDataToEndOfFile() + let stderr = String(data: stderrData, encoding: .utf8) ?? "Unknown error" + return stderr.isEmpty ? "Installation failed with exit code \(task.terminationStatus)" : stderr + } + return nil + } catch { + return error.localizedDescription + } + } + + func copyPaddleOCRInstallCommand() { + let command = "pip3 install paddleocr paddlepaddle" + NSPasteboard.general.clearContents() + NSPasteboard.general.setString(command, forType: .string) + } } // MARK: - Preset Colors diff --git a/ScreenTranslate/Models/OCREngineType.swift b/ScreenTranslate/Models/OCREngineType.swift index 0bb7a1d..571dca3 100644 --- a/ScreenTranslate/Models/OCREngineType.swift +++ b/ScreenTranslate/Models/OCREngineType.swift @@ -50,73 +50,87 @@ enum OCREngineType: String, CaseIterable, Sendable, Codable { /// Helper to check if PaddleOCR is available on the system enum PaddleOCRChecker { - /// Cached availability status (nonisolated(unsafe) for singleton cache) - private nonisolated(unsafe) static var _isAvailable: Bool? = false + private nonisolated(unsafe) static var _isAvailable: Bool = false + private nonisolated(unsafe) static var _executablePath: String? + private nonisolated(unsafe) static var _version: String? + private nonisolated(unsafe) static var _checkCompleted: Bool = false + + static var isAvailable: Bool { _isAvailable } + static var executablePath: String? { _executablePath } + static var version: String? { _version } + static var checkCompleted: Bool { _checkCompleted } - /// Check if PaddleOCR command is available (returns cached value, never blocks) - static var isAvailable: Bool { - return _isAvailable ?? false - } - - /// Async check and cache PaddleOCR availability static func checkAvailabilityAsync() { - Task.detached(priority: .background) { - let result = await checkPaddleOCRAsync() - _isAvailable = result + Task.detached(priority: .userInitiated) { + let result = await performFullCheck() + _isAvailable = result.available + _executablePath = result.path + _version = result.version + _checkCompleted = true } } - /// Perform actual check for PaddleOCR availability (async, off main thread) - private static func checkPaddleOCRAsync() async -> Bool { + private static func performFullCheck() async -> (available: Bool, path: String?, version: String?) { await withCheckedContinuation { continuation in - DispatchQueue.global(qos: .background).async { - let task = Process() - task.executableURL = URL(fileURLWithPath: "/usr/bin/which") - task.arguments = ["paddleocr"] - - let pipe = Pipe() - task.standardOutput = pipe - task.standardError = Pipe() - - do { - try task.run() - task.waitUntilExit() - continuation.resume(returning: task.terminationStatus == 0) - } catch { - continuation.resume(returning: false) + DispatchQueue.global(qos: .userInitiated).async { + let possiblePaths = [ + "\(NSHomeDirectory())/.pyenv/shims/paddleocr", + "/usr/local/bin/paddleocr", + "/opt/homebrew/bin/paddleocr", + "\(NSHomeDirectory())/.local/bin/paddleocr" + ] + + print("[PaddleOCRChecker] Checking paths: \(possiblePaths)") + + for path in possiblePaths { + if FileManager.default.isExecutableFile(atPath: path) { + print("[PaddleOCRChecker] Found executable at: \(path)") + + let task = Process() + task.executableURL = URL(fileURLWithPath: path) + task.arguments = ["--version"] + task.environment = [ + "PATH": "\(NSHomeDirectory())/.pyenv/shims:/usr/local/bin:/usr/bin:/bin", + "HOME": NSHomeDirectory(), + "PYENV_ROOT": "\(NSHomeDirectory())/.pyenv", + "PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK": "True" + ] + + let pipe = Pipe() + task.standardOutput = pipe + task.standardError = pipe + + do { + try task.run() + task.waitUntilExit() + + let data = pipe.fileHandleForReading.readDataToEndOfFile() + let output = String(data: data, encoding: .utf8) ?? "" + print("[PaddleOCRChecker] Version output: \(output)") + + let versionLine = output.components(separatedBy: .newlines) + .first { $0.contains("paddleocr") }? + .trimmingCharacters(in: .whitespaces) + + print("[PaddleOCRChecker] Found: path=\(path), version=\(versionLine ?? "unknown")") + continuation.resume(returning: (true, path, versionLine)) + return + } catch { + print("[PaddleOCRChecker] Error running \(path): \(error)") + } + } } + + print("[PaddleOCRChecker] Not found in any known path") + continuation.resume(returning: (false, nil, nil)) } } } - /// Reset the cached availability check static func resetCache() { - _isAvailable = nil - } - - /// Get the PaddleOCR version if available - static var version: String? { - guard isAvailable else { return nil } - - let task = Process() - task.executableURL = URL(fileURLWithPath: "/usr/local/bin/paddleocr") - task.arguments = ["--version"] - - let pipe = Pipe() - task.standardOutput = pipe - task.standardError = pipe - - do { - try task.run() - task.waitUntilExit() - - if task.terminationStatus == 0, - let data = try? FileHandle(fileDescriptor: pipe.fileHandleForReading.fileDescriptor).readToEnd(), - let output = String(data: data, encoding: .utf8) { - return output.trimmingCharacters(in: .whitespacesAndNewlines) - } - } catch {} - - return nil + _isAvailable = false + _executablePath = nil + _version = nil + _checkCompleted = false } } diff --git a/ScreenTranslate/Resources/en.lproj/Localizable.strings b/ScreenTranslate/Resources/en.lproj/Localizable.strings index 7659ecf..ba70f68 100644 --- a/ScreenTranslate/Resources/en.lproj/Localizable.strings +++ b/ScreenTranslate/Resources/en.lproj/Localizable.strings @@ -403,6 +403,28 @@ "onboarding.continue" = "Continue"; "onboarding.next" = "Next"; "onboarding.skip" = "Skip"; +"onboarding.complete" = "Complete"; + +/* Onboarding - PaddleOCR */ +"onboarding.paddleocr.title" = "PaddleOCR (Optional)"; +"onboarding.paddleocr.description" = "Enhanced OCR engine for better text recognition accuracy, especially for Chinese."; +"onboarding.paddleocr.installed" = "Installed"; +"onboarding.paddleocr.not.installed" = "Not Installed"; +"onboarding.paddleocr.install" = "Install"; +"onboarding.paddleocr.installing" = "Installing..."; +"onboarding.paddleocr.install.hint" = "Requires Python 3 and pip. Run: pip3 install paddleocr paddlepaddle"; +"onboarding.paddleocr.copy.command" = "Copy Command"; +"onboarding.paddleocr.refresh" = "Refresh Status"; +"onboarding.paddleocr.version" = "Version: %@"; + +/* Settings - PaddleOCR */ +"settings.paddleocr.installed" = "Installed"; +"settings.paddleocr.not.installed" = "Not Installed"; +"settings.paddleocr.install" = "Install"; +"settings.paddleocr.installing" = "Installing..."; +"settings.paddleocr.install.hint" = "Requires Python 3 and pip installed on your system."; +"settings.paddleocr.copy.command" = "Copy Command"; +"settings.paddleocr.refresh" = "Refresh Status"; /* ======================================== diff --git a/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings b/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings index f8123b8..98eb964 100644 --- a/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings +++ b/ScreenTranslate/Resources/zh-Hans.lproj/Localizable.strings @@ -403,6 +403,28 @@ "onboarding.continue" = "继续"; "onboarding.next" = "下一步"; "onboarding.skip" = "跳过"; +"onboarding.complete" = "完成"; + +/* 引导 - PaddleOCR */ +"onboarding.paddleocr.title" = "PaddleOCR(可选)"; +"onboarding.paddleocr.description" = "增强型 OCR 引擎,文字识别更准确,尤其适合中文。"; +"onboarding.paddleocr.installed" = "已安装"; +"onboarding.paddleocr.not.installed" = "未安装"; +"onboarding.paddleocr.install" = "安装"; +"onboarding.paddleocr.installing" = "安装中..."; +"onboarding.paddleocr.install.hint" = "需要 Python 3 和 pip。执行命令:pip3 install paddleocr paddlepaddle"; +"onboarding.paddleocr.copy.command" = "复制命令"; +"onboarding.paddleocr.refresh" = "刷新状态"; +"onboarding.paddleocr.version" = "版本:%@"; + +/* 设置 - PaddleOCR */ +"settings.paddleocr.installed" = "已安装"; +"settings.paddleocr.not.installed" = "未安装"; +"settings.paddleocr.install" = "安装"; +"settings.paddleocr.installing" = "安装中..."; +"settings.paddleocr.install.hint" = "需要在系统上安装 Python 3 和 pip。"; +"settings.paddleocr.copy.command" = "复制命令"; +"settings.paddleocr.refresh" = "刷新状态"; /* ======================================== diff --git a/ScreenTranslate/Services/OCREngineProtocol.swift b/ScreenTranslate/Services/OCREngineProtocol.swift index 2275737..e7f9e66 100644 --- a/ScreenTranslate/Services/OCREngineProtocol.swift +++ b/ScreenTranslate/Services/OCREngineProtocol.swift @@ -53,17 +53,25 @@ actor OCRService { languages: Set ) async throws -> OCRResult { let engineType = await AppSettings.shared.ocrEngine + print("[OCRService] Engine type: \(engineType), image size: \(image.width)x\(image.height)") switch engineType { case .vision: + print("[OCRService] Using Vision engine") return try await visionEngine.recognize(image, languages: languages) case .paddleOCR: - guard await paddleOCREngine.isAvailable else { + print("[OCRService] Using PaddleOCR engine") + let isAvailable = await paddleOCREngine.isAvailable + print("[OCRService] PaddleOCR available: \(isAvailable)") + guard isAvailable else { + print("[OCRService] PaddleOCR not available, throwing error") throw OCREngineError.engineNotAvailable } - // Convert Vision languages to PaddleOCR languages let paddleLanguages = convertToPaddleOCRLanguages(languages) - return try await paddleOCREngine.recognize(image, languages: paddleLanguages) + print("[OCRService] PaddleOCR languages: \(paddleLanguages)") + let result = try await paddleOCREngine.recognize(image, languages: paddleLanguages) + print("[OCRService] PaddleOCR result: \(result.observations.count) observations") + return result } } diff --git a/ScreenTranslate/Services/PaddleOCREngine.swift b/ScreenTranslate/Services/PaddleOCREngine.swift index 0eb5240..ee64b8b 100644 --- a/ScreenTranslate/Services/PaddleOCREngine.swift +++ b/ScreenTranslate/Services/PaddleOCREngine.swift @@ -15,8 +15,10 @@ actor PaddleOCREngine { /// Whether PaddleOCR is available on the system var isAvailable: Bool { PaddleOCRChecker.isAvailable } - /// PaddleOCR executable path - private let executablePath = "/usr/local/bin/paddleocr" + /// Get executable path from checker + private var executablePath: String { + PaddleOCRChecker.executablePath ?? "/usr/local/bin/paddleocr" + } /// Maximum concurrent operations private var isProcessing = false @@ -202,25 +204,14 @@ actor PaddleOCREngine { /// Builds command line arguments for PaddleOCR private func buildArguments(config: Configuration, imagePath: String) -> [String] { var args = [ - "--image_path", imagePath, - "--use_angle_cls", config.useDirectionClassify ? "true" : "false", - "--lang", config.languages.map(\.rawValue).joined(separator: ",") + "ocr", + "-i", imagePath, + "--lang", "ch" ] if config.useGPU { - args.append("--use_gpu") - args.append("true") - } - - switch config.detectionModel { - case .default: - break - case .server: - args.append("--det_model_dir") - args.append("inference/ch_ppocr_server_v2.0_det/") - case .mobile: - args.append("--det_model_dir") - args.append("inference/ch_ppocr_mobile_v2.0_det/") + args.append("--device") + args.append("gpu") } return args @@ -228,9 +219,19 @@ actor PaddleOCREngine { /// Executes PaddleOCR with the given arguments private func executePaddleOCR(arguments: [String]) async throws -> String { + let fullCommand = "\(executablePath) \(arguments.joined(separator: " "))" + print("[PaddleOCREngine] Executing: \(fullCommand)") + let task = Process() - task.executableURL = URL(fileURLWithPath: executablePath) - task.arguments = arguments + task.executableURL = URL(fileURLWithPath: "/bin/zsh") + task.arguments = ["-c", fullCommand] + + task.environment = [ + "PATH": "\(NSHomeDirectory())/.pyenv/shims:\(NSHomeDirectory())/.pyenv/bin:/usr/local/bin:/usr/bin:/bin", + "HOME": NSHomeDirectory(), + "PYENV_ROOT": "\(NSHomeDirectory())/.pyenv", + "PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK": "True" + ] let stdoutPipe = Pipe() let stderrPipe = Pipe() @@ -239,121 +240,155 @@ actor PaddleOCREngine { do { try task.run() + print("[PaddleOCREngine] Process started, waiting...") task.waitUntilExit() - - let stdoutHandle = stdoutPipe.fileHandleForReading - let stderrHandle = stderrPipe.fileHandleForReading - - defer { - stdoutHandle.closeFile() - stderrHandle.closeFile() + print("[PaddleOCREngine] Process finished with exit code: \(task.terminationStatus)") + + let stdoutData = stdoutPipe.fileHandleForReading.readDataToEndOfFile() + let stderrData = stderrPipe.fileHandleForReading.readDataToEndOfFile() + var stdout = String(data: stdoutData, encoding: .utf8) ?? "" + let stderr = String(data: stderrData, encoding: .utf8) ?? "" + + // PaddleOCR outputs result to stderr, extract JSON from it + if stdout.isEmpty, let resultRange = stderr.range(of: "{'res':") { + let resultStart = stderr[resultRange.lowerBound...] + // Find the matching closing brace + if let jsonEnd = findMatchingBrace(in: String(resultStart)) { + stdout = String(resultStart.prefix(jsonEnd + 1)) + // Remove ANSI color codes + stdout = stdout.replacingOccurrences(of: "\u{001B}\\[[0-9;]*m", with: "", options: .regularExpression) + print("[PaddleOCREngine] Extracted result from stderr") + } } + + print("[PaddleOCREngine] output length: \(stdout.count)") + print("[PaddleOCREngine] output: \(stdout.prefix(1000))") let exitCode = task.terminationStatus - if exitCode != 0 { - let stderrData = stderrHandle.readDataToEndOfFile() - let stderr = String(data: stderrData, encoding: .utf8) ?? "Unknown error" - throw PaddleOCREngineError.recognitionFailed(underlying: stderr) + throw PaddleOCREngineError.recognitionFailed(underlying: stderr.isEmpty ? "Exit code \(exitCode)" : stderr) } - let stdoutData = stdoutHandle.readDataToEndOfFile() - guard let output = String(data: stdoutData, encoding: .utf8) else { + guard !stdout.isEmpty else { + print("[PaddleOCREngine] No result found in output") throw PaddleOCREngineError.invalidOutput } - return output + return stdout } catch let error as PaddleOCREngineError { throw error } catch { + print("[PaddleOCREngine] Error: \(error)") throw PaddleOCREngineError.recognitionFailed(underlying: error.localizedDescription) } } + + private func findMatchingBrace(in string: String) -> Int? { + var depth = 0 + for (index, char) in string.enumerated() { + if char == "{" { depth += 1 } + else if char == "}" { + depth -= 1 + if depth == 0 { return index } + } + } + return nil + } /// Parses PaddleOCR JSON output into OCRText observations private func parsePaddleOCROutput(_ output: String, imageSize: CGSize) throws -> [OCRText] { - // PaddleOCR outputs multiple lines with format: "text [[x1,y1],[x2,y2],...] confidence" var observations: [OCRText] = [] - let lines = output.components(separatedBy: .newlines) - - for line in lines where !line.isEmpty { - // Extract text, coordinates, and confidence using regex - let pattern = #"^(.+?)\s+\[\[.+?\]\]\s+(\d+\.\d+)"# - guard let regex = try? NSRegularExpression(pattern: pattern), - let match = regex.firstMatch(in: line, range: NSRange(line.startIndex..., in: line)), - match.numberOfRanges >= 3 else { - continue - } - // Extract text - if let textRange = Range(match.range(at: 1), in: line) { - let text = String(line[textRange]).trimmingCharacters(in: .whitespaces) - - // Extract confidence - if let confidenceRange = Range(match.range(at: 2), in: line), - let confidence = Float(String(line[confidenceRange])) { - // Parse bounding box coordinates - if let bbox = parseBoundingBox(from: line, imageSize: imageSize) { - let observation = OCRText( - text: text, - boundingBox: bbox, - confidence: confidence / 100.0 // Convert from percentage to 0-1 - ) - observations.append(observation) - } - } - } + guard let startIndex = output.firstIndex(of: "{"), + let endIndex = output.lastIndex(of: "}") else { + print("[PaddleOCREngine] No JSON found in output") + return observations } - return observations - } + let jsonLike = String(output[startIndex...endIndex]) + let cleanedJson = convertPythonDictToJson(jsonLike) + + print("[PaddleOCREngine] Cleaned JSON: \(cleanedJson.prefix(500))") - /// Parses bounding box coordinates from PaddleOCR output line - private func parseBoundingBox(from line: String, imageSize: CGSize) -> CGRect? { - // Extract coordinates: [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] - let coordPattern = #"\[\[.+?\]\]"# - guard let coordRegex = try? NSRegularExpression(pattern: coordPattern), - let coordMatch = coordRegex.firstMatch(in: line, range: NSRange(line.startIndex..., in: line)), - let coordRange = Range(coordMatch.range, in: line) else { - return nil + guard let jsonData = cleanedJson.data(using: .utf8), + let json = try? JSONSerialization.jsonObject(with: jsonData) as? [String: Any], + let res = json["res"] as? [String: Any] else { + print("[PaddleOCREngine] Failed to parse JSON") + return observations } - let coordString = String(line[coordRange]) - // Parse individual points - let pointPattern = #"\[(\d+),(\d+)\]"# - guard let pointRegex = try? NSRegularExpression(pattern: pointPattern) else { - return nil + guard let recTexts = res["rec_texts"] as? [String] else { + print("[PaddleOCREngine] No rec_texts found") + return observations } - - var points: [CGPoint] = [] - for match in pointRegex.matches(in: coordString, range: NSRange(coordString.startIndex..., in: coordString)) { - if match.numberOfRanges >= 3, - let xRange = Range(match.range(at: 1), in: coordString), - let yRange = Range(match.range(at: 2), in: coordString), - let x = Int(String(coordString[xRange])), - let y = Int(String(coordString[yRange])) { - points.append(CGPoint(x: x, y: y)) + + let recScores = res["rec_scores"] as? [Double] ?? [] + let recBoxes = res["rec_boxes"] as? [[Int]] ?? [] + + print("[PaddleOCREngine] Found \(recTexts.count) texts, \(recBoxes.count) boxes") + + for (index, text) in recTexts.enumerated() { + let confidence = index < recScores.count ? Float(recScores[index]) : 0.5 + + var boundingBox: CGRect + if index < recBoxes.count && recBoxes[index].count >= 4 { + let box = recBoxes[index] + let x = CGFloat(box[0]) + let y = CGFloat(box[1]) + let x2 = CGFloat(box[2]) + let y2 = CGFloat(box[3]) + boundingBox = CGRect( + x: x / imageSize.width, + y: y / imageSize.height, + width: (x2 - x) / imageSize.width, + height: (y2 - y) / imageSize.height + ) + } else { + boundingBox = CGRect(x: 0, y: CGFloat(index) * 0.1, width: 1, height: 0.1) } + + let observation = OCRText( + text: text, + boundingBox: boundingBox, + confidence: confidence + ) + observations.append(observation) + print("[PaddleOCREngine] Text: '\(text)', box: \(boundingBox), confidence: \(confidence)") } - guard points.count >= 4 else { return nil } + return observations + } - // Calculate bounding box from points - let xCoords = points.map(\.x) - let yCoords = points.map(\.y) + private func convertPythonDictToJson(_ pythonDict: String) -> String { + var result = pythonDict + result = result.replacingOccurrences(of: "None", with: "null") + result = result.replacingOccurrences(of: "True", with: "true") + result = result.replacingOccurrences(of: "False", with: "false") + result = result.replacingOccurrences(of: "'", with: "\"") + + let arrayPattern = #"array\([^)]*\)[^,}\]]*"# + if let regex = try? NSRegularExpression(pattern: arrayPattern, options: [.dotMatchesLineSeparators]) { + let range = NSRange(result.startIndex..., in: result) + result = regex.stringByReplacingMatches(in: result, range: range, withTemplate: "[]") + } + + let dtypePattern = #",?\s*dtype=[^\)]+\)"# + if let regex = try? NSRegularExpression(pattern: dtypePattern) { + let range = NSRange(result.startIndex..., in: result) + result = regex.stringByReplacingMatches(in: result, range: range, withTemplate: "") + } + + let shapePattern = #",?\s*shape=\([^\)]+\)"# + if let regex = try? NSRegularExpression(pattern: shapePattern) { + let range = NSRange(result.startIndex..., in: result) + result = regex.stringByReplacingMatches(in: result, range: range, withTemplate: "") + } - let minX = xCoords.min() ?? 0 - let maxX = xCoords.max() ?? 0 - let minY = yCoords.min() ?? 0 - let maxY = yCoords.max() ?? 0 + return result + } - // Convert to normalized coordinates (0-1) - return CGRect( - x: CGFloat(minX) / imageSize.width, - y: CGFloat(minY) / imageSize.height, - width: CGFloat(maxX - minX) / imageSize.width, - height: CGFloat(maxY - minY) / imageSize.height - ) + private func parseBoundingBox(from line: String, imageSize: CGSize) -> CGRect? { + nil } }