From 36238db2c491b27b8572c4ff7bbd7ee56e1adeaa Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 12 Dec 2025 15:33:37 +0000 Subject: [PATCH 1/5] Initial plan From 137924e52850605284a62c04088c13419d178bbe Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 12 Dec 2025 15:48:51 +0000 Subject: [PATCH 2/5] Add custom redaction rules (block words, dates, regex patterns) Co-authored-by: karant-dev <22682299+karant-dev@users.noreply.github.com> --- src/App.tsx | 26 +- src/components/Header.tsx | 30 +++ src/components/SettingsDropdown.tsx | 384 +++++++++++++++++++++++----- src/hooks/useDetectionSettings.ts | 104 +++++++- src/hooks/useOCR.ts | 15 +- src/types/index.ts | 10 + src/utils/datePatterns.ts | 192 ++++++++++++++ src/utils/ocr.ts | 83 +++++- 8 files changed, 769 insertions(+), 75 deletions(-) create mode 100644 src/utils/datePatterns.ts diff --git a/src/App.tsx b/src/App.tsx index 1e27530..7955a32 100644 --- a/src/App.tsx +++ b/src/App.tsx @@ -16,7 +16,22 @@ import { ImagePreviewModal } from './components/ImagePreviewModal'; // ============================================================================ function App() { // Detection Settings (persisted to localStorage) - const { settings, updateSetting, addToAllowlist, removeFromAllowlist, resetAllowlist } = useDetectionSettings(); + const { + settings, + updateSetting, + addToAllowlist, + removeFromAllowlist, + resetAllowlist, + addBlockWord, + removeBlockWord, + resetBlockWords, + addCustomDate, + removeCustomDate, + resetCustomDates, + addCustomRegex, + removeCustomRegex, + resetCustomRegex, + } = useDetectionSettings(); // Hooks const { @@ -132,6 +147,15 @@ function App() { onAddToAllowlist={addToAllowlist} onRemoveFromAllowlist={removeFromAllowlist} onResetAllowlist={resetAllowlist} + onAddBlockWord={addBlockWord} + onRemoveBlockWord={removeBlockWord} + onResetBlockWords={resetBlockWords} + onAddCustomDate={addCustomDate} + onRemoveCustomDate={removeCustomDate} + onResetCustomDates={resetCustomDates} + onAddCustomRegex={addCustomRegex} + onRemoveCustomRegex={removeCustomRegex} + onResetCustomRegex={resetCustomRegex} />
diff --git a/src/components/Header.tsx b/src/components/Header.tsx index 8f8f878..f24bdfd 100644 --- a/src/components/Header.tsx +++ b/src/components/Header.tsx @@ -7,6 +7,18 @@ interface HeaderProps { onAddToAllowlist?: (value: string) => void; onRemoveFromAllowlist?: (value: string) => void; onResetAllowlist?: () => void; + // Block Words + onAddBlockWord?: (word: string) => void; + onRemoveBlockWord?: (word: string) => void; + onResetBlockWords?: () => void; + // Custom Dates + onAddCustomDate?: (dateStr: string) => string | null; + onRemoveCustomDate?: (dateStr: string) => void; + onResetCustomDates?: () => void; + // Custom Regex + onAddCustomRegex?: (pattern: string, caseSensitive: boolean, label?: string) => string | null; + onRemoveCustomRegex?: (id: string) => void; + onResetCustomRegex?: () => void; } export function Header({ @@ -15,6 +27,15 @@ export function Header({ onAddToAllowlist, onRemoveFromAllowlist, onResetAllowlist, + onAddBlockWord, + onRemoveBlockWord, + onResetBlockWords, + onAddCustomDate, + onRemoveCustomDate, + onResetCustomDates, + onAddCustomRegex, + onRemoveCustomRegex, + onResetCustomRegex, }: HeaderProps) { return (
@@ -34,6 +55,15 @@ export function Header({ onAddToAllowlist={onAddToAllowlist} onRemoveFromAllowlist={onRemoveFromAllowlist} onResetAllowlist={onResetAllowlist} + onAddBlockWord={onAddBlockWord} + onRemoveBlockWord={onRemoveBlockWord} + onResetBlockWords={onResetBlockWords} + onAddCustomDate={onAddCustomDate} + onRemoveCustomDate={onRemoveCustomDate} + onResetCustomDates={onResetCustomDates} + onAddCustomRegex={onAddCustomRegex} + onRemoveCustomRegex={onRemoveCustomRegex} + onResetCustomRegex={onResetCustomRegex} />
diff --git a/src/components/SettingsDropdown.tsx b/src/components/SettingsDropdown.tsx index f222a33..70923fb 100644 --- a/src/components/SettingsDropdown.tsx +++ b/src/components/SettingsDropdown.tsx @@ -1,5 +1,5 @@ import { useState, useRef, useEffect } from 'react'; -import type { DetectionSettings } from '../types'; +import type { DetectionSettings, CustomRegexRule } from '../types'; type BooleanSettingKey = 'email' | 'ip' | 'creditCard' | 'secret' | 'pii'; @@ -9,6 +9,18 @@ interface SettingsDropdownProps { onAddToAllowlist?: (value: string) => void; onRemoveFromAllowlist?: (value: string) => void; onResetAllowlist?: () => void; + // Block Words + onAddBlockWord?: (word: string) => void; + onRemoveBlockWord?: (word: string) => void; + onResetBlockWords?: () => void; + // Custom Dates + onAddCustomDate?: (dateStr: string) => string | null; + onRemoveCustomDate?: (dateStr: string) => void; + onResetCustomDates?: () => void; + // Custom Regex + onAddCustomRegex?: (pattern: string, caseSensitive: boolean, label?: string) => string | null; + onRemoveCustomRegex?: (id: string) => void; + onResetCustomRegex?: () => void; } const SETTING_LABELS: Record = { @@ -19,15 +31,144 @@ const SETTING_LABELS: Record = { pii: 'PII (SSN, etc.)', }; +// Reusable Remove Button Component +function RemoveButton({ onClick, label }: { onClick: () => void; label: string }) { + return ( + + ); +} + +// Section Header Component +function SectionHeader({ title, subtitle, onReset }: { title: string; subtitle: string; onReset?: () => void }) { + return ( +
+
+
+

{title}

+

{subtitle}

+
+ {onReset && ( + + )} +
+
+ ); +} + +// String List Items Component +function StringListItems({ + items, + onRemove, + emptyMessage +}: { + items: string[]; + onRemove: (id: string) => void; + emptyMessage: string; +}) { + return ( +
+ {items.length > 0 ? ( + items.map((item) => ( +
+ + {item} + + onRemove(item)} label={`Remove ${item}`} /> +
+ )) + ) : ( +

{emptyMessage}

+ )} +
+ ); +} + +// Regex List Items Component +function RegexListItems({ + items, + onRemove, + emptyMessage +}: { + items: CustomRegexRule[]; + onRemove: (id: string) => void; + emptyMessage: string; +}) { + return ( +
+ {items.length > 0 ? ( + items.map((rule) => ( +
+
+ + {rule.label || rule.pattern} + + {rule.label && ( + + {rule.pattern} + + )} + {rule.caseSensitive && ( + Case sensitive + )} +
+ onRemove(rule.id)} label={`Remove ${rule.label || rule.pattern}`} /> +
+ )) + ) : ( +

{emptyMessage}

+ )} +
+ ); +} + export function SettingsDropdown({ settings, onUpdateSetting, onAddToAllowlist, onRemoveFromAllowlist, onResetAllowlist, + onAddBlockWord, + onRemoveBlockWord, + onResetBlockWords, + onAddCustomDate, + onRemoveCustomDate, + onResetCustomDates, + onAddCustomRegex, + onRemoveCustomRegex, + onResetCustomRegex, }: SettingsDropdownProps) { const [isOpen, setIsOpen] = useState(false); const [newAllowlistValue, setNewAllowlistValue] = useState(''); + const [newBlockWord, setNewBlockWord] = useState(''); + const [newCustomDate, setNewCustomDate] = useState(''); + const [customDateError, setCustomDateError] = useState(null); + const [newRegexPattern, setNewRegexPattern] = useState(''); + const [newRegexCaseSensitive, setNewRegexCaseSensitive] = useState(false); + const [newRegexLabel, setNewRegexLabel] = useState(''); + const [regexError, setRegexError] = useState(null); const dropdownRef = useRef(null); // Close dropdown when clicking outside @@ -44,6 +185,13 @@ export function SettingsDropdown({ } }, [isOpen]); + const handleKeyDown = (e: React.KeyboardEvent, action: () => void) => { + if (e.key === 'Enter') { + e.preventDefault(); + action(); + } + }; + const handleAddAllowlistEntry = () => { if (newAllowlistValue.trim() && onAddToAllowlist) { onAddToAllowlist(newAllowlistValue.trim()); @@ -51,10 +199,36 @@ export function SettingsDropdown({ } }; - const handleKeyDown = (e: React.KeyboardEvent) => { - if (e.key === 'Enter') { - e.preventDefault(); - handleAddAllowlistEntry(); + const handleAddBlockWord = () => { + if (newBlockWord.trim() && onAddBlockWord) { + onAddBlockWord(newBlockWord.trim()); + setNewBlockWord(''); + } + }; + + const handleAddCustomDate = () => { + if (newCustomDate.trim() && onAddCustomDate) { + const error = onAddCustomDate(newCustomDate.trim()); + if (error) { + setCustomDateError(error); + } else { + setNewCustomDate(''); + setCustomDateError(null); + } + } + }; + + const handleAddCustomRegex = () => { + if (newRegexPattern.trim() && onAddCustomRegex) { + const error = onAddCustomRegex(newRegexPattern.trim(), newRegexCaseSensitive, newRegexLabel.trim() || undefined); + if (error) { + setRegexError(error); + } else { + setNewRegexPattern(''); + setNewRegexLabel(''); + setNewRegexCaseSensitive(false); + setRegexError(null); + } } }; @@ -77,7 +251,7 @@ export function SettingsDropdown({ {isOpen && (
@@ -104,75 +278,151 @@ export function SettingsDropdown({
{/* Safe Values (Allowlist) Section */} -
-
-
-

Safe Values

-

Never redact these

+ +
+ {onAddToAllowlist && ( +
+
+ setNewAllowlistValue(e.target.value)} + onKeyDown={(e) => handleKeyDown(e, handleAddAllowlistEntry)} + placeholder="Add safe value..." + className="flex-1 px-2 py-1 text-sm bg-slate-800 border border-slate-700 rounded-lg text-white placeholder-slate-500 focus:outline-none focus:ring-1 focus:ring-cyan-500" + /> + +
- {onResetAllowlist && ( - - )} -
+ )} + onRemoveFromAllowlist?.(item)} + emptyMessage="No safe values defined" + /> +
+ + {/* Advanced Section Header */} +
+

Advanced

+

Custom redaction rules

+
+ + {/* Block Words Section */} + +
+ {onAddBlockWord && ( +
+
+ setNewBlockWord(e.target.value)} + onKeyDown={(e) => handleKeyDown(e, handleAddBlockWord)} + placeholder="e.g., Project Titan, John Doe" + className="flex-1 px-2 py-1 text-sm bg-slate-800 border border-slate-700 rounded-lg text-white placeholder-slate-500 focus:outline-none focus:ring-1 focus:ring-cyan-500" + /> + +
+
+ )} + onRemoveBlockWord?.(word)} + emptyMessage="No block words defined" + />
+ + {/* Custom Dates Section */} + +
+ {onAddCustomDate && ( +
+
+ { setNewCustomDate(e.target.value); setCustomDateError(null); }} + onKeyDown={(e) => handleKeyDown(e, handleAddCustomDate)} + placeholder="e.g., 1990-05-15, 05/15/1990" + className="flex-1 px-2 py-1 text-sm bg-slate-800 border border-slate-700 rounded-lg text-white placeholder-slate-500 focus:outline-none focus:ring-1 focus:ring-cyan-500" + /> + +
+ {customDateError &&

{customDateError}

} +
+ )} + onRemoveCustomDate?.(date)} + emptyMessage="No dates to redact" + /> +
+ + {/* Custom Regex Section */} +
- {/* Add new entry input */} - {onAddToAllowlist && ( -
+ {onAddCustomRegex && ( +
setNewAllowlistValue(e.target.value)} - onKeyDown={handleKeyDown} - placeholder="Add safe value..." - className="flex-1 px-2 py-1 text-sm bg-slate-800 border border-slate-700 rounded-lg text-white placeholder-slate-500 focus:outline-none focus:ring-1 focus:ring-cyan-500" + value={newRegexPattern} + onChange={(e) => { setNewRegexPattern(e.target.value); setRegexError(null); }} + onKeyDown={(e) => handleKeyDown(e, handleAddCustomRegex)} + placeholder="Regex pattern, e.g., INV-\d{4}" + className="w-full px-2 py-1 text-sm bg-slate-800 border border-slate-700 rounded-lg text-white placeholder-slate-500 focus:outline-none focus:ring-1 focus:ring-cyan-500 font-mono" /> - + setNewRegexLabel(e.target.value)} + placeholder="Label (optional)" + className="w-full mt-2 px-2 py-1 text-sm bg-slate-800 border border-slate-700 rounded-lg text-white placeholder-slate-500 focus:outline-none focus:ring-1 focus:ring-cyan-500" + /> +
+ + +
+ {regexError &&

{regexError}

}
)} - - {/* Allowlist entries */} -
- {settings.allowlist && settings.allowlist.length > 0 ? ( - settings.allowlist.map((item) => ( -
- - {item} - - {onRemoveFromAllowlist && ( - - )} -
- )) - ) : ( -

No safe values defined

- )} -
+ onRemoveCustomRegex?.(id)} + emptyMessage="No custom patterns defined" + />
)} diff --git a/src/hooks/useDetectionSettings.ts b/src/hooks/useDetectionSettings.ts index a55c7f4..d018e5a 100644 --- a/src/hooks/useDetectionSettings.ts +++ b/src/hooks/useDetectionSettings.ts @@ -1,6 +1,7 @@ import { useState, useCallback, useEffect } from 'react'; -import type { DetectionSettings } from '../types'; +import type { DetectionSettings, CustomRegexRule } from '../types'; import { DEFAULT_ALLOWLIST } from '../constants/config'; +import { validateRegex, parseDate } from '../utils/datePatterns'; const STORAGE_KEY = 'autoredact_detection_settings'; @@ -11,6 +12,9 @@ const DEFAULT_SETTINGS: DetectionSettings = { secret: true, pii: true, allowlist: DEFAULT_ALLOWLIST, + blockWords: [], + customDates: [], + customRegex: [], }; export function useDetectionSettings() { @@ -66,6 +70,92 @@ export function useDetectionSettings() { setSettings(DEFAULT_SETTINGS); }, []); + // Block Words Management + const addBlockWord = useCallback((word: string) => { + const trimmed = word.trim(); + if (!trimmed) return; + setSettings(prev => { + const lowerValue = trimmed.toLowerCase(); + const exists = prev.blockWords.some(item => item.toLowerCase() === lowerValue); + if (exists) return prev; + return { ...prev, blockWords: [...prev.blockWords, trimmed] }; + }); + }, []); + + const removeBlockWord = useCallback((word: string) => { + setSettings(prev => ({ + ...prev, + blockWords: prev.blockWords.filter(item => item.toLowerCase() !== word.toLowerCase()), + })); + }, []); + + const resetBlockWords = useCallback(() => { + setSettings(prev => ({ ...prev, blockWords: [] })); + }, []); + + // Custom Dates Management + const addCustomDate = useCallback((dateStr: string): string | null => { + const trimmed = dateStr.trim(); + if (!trimmed) return 'Date cannot be empty'; + + // Validate the date can be parsed + const parsed = parseDate(trimmed); + if (!parsed) { + return 'Invalid date format. Try: YYYY-MM-DD, MM/DD/YYYY, January 15, 2024'; + } + + setSettings(prev => { + const exists = prev.customDates.some(item => item === trimmed); + if (exists) return prev; + return { ...prev, customDates: [...prev.customDates, trimmed] }; + }); + return null; + }, []); + + const removeCustomDate = useCallback((dateStr: string) => { + setSettings(prev => ({ + ...prev, + customDates: prev.customDates.filter(item => item !== dateStr), + })); + }, []); + + const resetCustomDates = useCallback(() => { + setSettings(prev => ({ ...prev, customDates: [] })); + }, []); + + // Custom Regex Management + const addCustomRegex = useCallback((pattern: string, caseSensitive: boolean = false, label?: string): string | null => { + const trimmed = pattern.trim(); + const error = validateRegex(trimmed); + if (error) return error; + + const newRule: CustomRegexRule = { + id: `regex-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`, + pattern: trimmed, + caseSensitive, + label: label?.trim() || undefined, + }; + + setSettings(prev => { + // Check for duplicate patterns + const exists = prev.customRegex.some(r => r.pattern === trimmed); + if (exists) return prev; + return { ...prev, customRegex: [...prev.customRegex, newRule] }; + }); + return null; + }, []); + + const removeCustomRegex = useCallback((id: string) => { + setSettings(prev => ({ + ...prev, + customRegex: prev.customRegex.filter(rule => rule.id !== id), + })); + }, []); + + const resetCustomRegex = useCallback(() => { + setSettings(prev => ({ ...prev, customRegex: [] })); + }, []); + return { settings, updateSetting, @@ -73,5 +163,17 @@ export function useDetectionSettings() { removeFromAllowlist, resetAllowlist, resetSettings, + // Block Words + addBlockWord, + removeBlockWord, + resetBlockWords, + // Custom Dates + addCustomDate, + removeCustomDate, + resetCustomDates, + // Custom Regex + addCustomRegex, + removeCustomRegex, + resetCustomRegex, }; } diff --git a/src/hooks/useOCR.ts b/src/hooks/useOCR.ts index b212572..e85d6be 100644 --- a/src/hooks/useOCR.ts +++ b/src/hooks/useOCR.ts @@ -2,7 +2,7 @@ import { useState, useCallback } from 'react'; import { createWorker } from 'tesseract.js'; import type { DetectedItem, ProcessingState, DetectionSettings } from '../types'; import { SENSITIVE_PATTERNS } from '../constants/patterns'; -import { findMatches, filterAllowlistedMatches } from '../utils/ocr'; +import { findMatches, filterAllowlistedMatches, findBlockWordMatches, findCustomDateMatches, findCustomRegexMatches } from '../utils/ocr'; import { preprocessImage } from '../utils/canvas'; export function useOCR(detectionSettings: DetectionSettings) { @@ -153,6 +153,12 @@ export function useOCR(detectionSettings: DetectionSettings) { }); } + // Custom Rules: Block Words, Custom Dates, Custom Regex (all treated as 'pii' type) + const blockWordMatches = findBlockWordMatches(detectionSettings.blockWords || [], fullText, 'pii'); + const customDateMatches = findCustomDateMatches(detectionSettings.customDates || [], fullText, 'pii'); + const customRegexMatches = findCustomRegexMatches(detectionSettings.customRegex || [], fullText, 'pii'); + const customMatches = [...blockWordMatches, ...customDateMatches, ...customRegexMatches]; + // Apply allowlist filtering to all match types const allowlist = detectionSettings.allowlist || []; const filteredEmailMatches = filterAllowlistedMatches(emailMatches, allowlist); @@ -160,16 +166,17 @@ export function useOCR(detectionSettings: DetectionSettings) { const filteredCcMatches = filterAllowlistedMatches(ccMatches, allowlist); const filteredPiiMatches = filterAllowlistedMatches(piiMatches, allowlist); const filteredSecretMatches = filterAllowlistedMatches(secretMatches, allowlist); + const filteredCustomMatches = filterAllowlistedMatches(customMatches, allowlist); - const allMatches = [...filteredEmailMatches, ...filteredIpMatches, ...filteredCcMatches, ...filteredPiiMatches, ...filteredSecretMatches]; + const allMatches = [...filteredEmailMatches, ...filteredIpMatches, ...filteredCcMatches, ...filteredPiiMatches, ...filteredSecretMatches, ...filteredCustomMatches]; - // Update stats with Entity counts + // Update stats with Entity counts (custom matches added to PII count) setDetectionStats({ emails: filteredEmailMatches.length, ips: filteredIpMatches.length, creditCards: filteredCcMatches.length, secrets: filteredSecretMatches.length, - pii: filteredPiiMatches.length, + pii: filteredPiiMatches.length + filteredCustomMatches.length, }); // Use blocks for precise redaction with Positional Mapping diff --git a/src/types/index.ts b/src/types/index.ts index 9afa795..4b7edfd 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -36,6 +36,13 @@ export interface BatchProgress { isProcessing: boolean; } +export interface CustomRegexRule { + id: string; + pattern: string; + label?: string; + caseSensitive: boolean; +} + export interface DetectionSettings { email: boolean; ip: boolean; @@ -43,4 +50,7 @@ export interface DetectionSettings { secret: boolean; pii: boolean; allowlist: string[]; + blockWords: string[]; + customDates: string[]; + customRegex: CustomRegexRule[]; } diff --git a/src/utils/datePatterns.ts b/src/utils/datePatterns.ts new file mode 100644 index 0000000..fdeb1de --- /dev/null +++ b/src/utils/datePatterns.ts @@ -0,0 +1,192 @@ +/** + * Date Pattern Generator + * + * Parses a date string and generates regex patterns to match that date + * in multiple common formats. + */ + +interface ParsedDate { + year: number; + month: number; + day: number; +} + +// Month names for generating patterns +const MONTH_NAMES = [ + 'January', 'February', 'March', 'April', 'May', 'June', + 'July', 'August', 'September', 'October', 'November', 'December' +]; +const MONTH_ABBREV = [ + 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', + 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec' +]; + +/** + * Parse a date string into year, month, day components + * Supports formats: + * - ISO: 2024-01-15, 2024/01/15 + * - US: 01/15/2024, 01-15-2024 + * - EU: 15/01/2024, 15-01-2024, 15.01.2024 + * - Long: January 15, 2024 | 15 January 2024 + */ +export function parseDate(dateStr: string): ParsedDate | null { + const trimmed = dateStr.trim(); + + // Try ISO format: YYYY-MM-DD or YYYY/MM/DD + const isoMatch = trimmed.match(/^(\d{4})[-/](\d{1,2})[-/](\d{1,2})$/); + if (isoMatch) { + const [, year, month, day] = isoMatch; + const parsed = { year: parseInt(year, 10), month: parseInt(month, 10), day: parseInt(day, 10) }; + if (isValidDate(parsed)) return parsed; + } + + // Try US format: MM/DD/YYYY or MM-DD-YYYY + const usMatch = trimmed.match(/^(\d{1,2})[-/](\d{1,2})[-/](\d{4})$/); + if (usMatch) { + const [, month, day, year] = usMatch; + const parsed = { year: parseInt(year, 10), month: parseInt(month, 10), day: parseInt(day, 10) }; + if (isValidDate(parsed)) return parsed; + } + + // Try EU format: DD/MM/YYYY or DD-MM-YYYY or DD.MM.YYYY + const euMatch = trimmed.match(/^(\d{1,2})[-/.](\d{1,2})[-/.](\d{4})$/); + if (euMatch) { + const [, day, month, year] = euMatch; + const parsed = { year: parseInt(year, 10), month: parseInt(month, 10), day: parseInt(day, 10) }; + if (isValidDate(parsed)) return parsed; + } + + // Try long format: Month DD, YYYY or DD Month YYYY + const longMatch1 = trimmed.match(/^([A-Za-z]+)\s+(\d{1,2}),?\s+(\d{4})$/); + if (longMatch1) { + const [, monthName, day, year] = longMatch1; + const month = parseMonthName(monthName); + if (month > 0) { + const parsed = { year: parseInt(year, 10), month, day: parseInt(day, 10) }; + if (isValidDate(parsed)) return parsed; + } + } + + const longMatch2 = trimmed.match(/^(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})$/); + if (longMatch2) { + const [, day, monthName, year] = longMatch2; + const month = parseMonthName(monthName); + if (month > 0) { + const parsed = { year: parseInt(year, 10), month, day: parseInt(day, 10) }; + if (isValidDate(parsed)) return parsed; + } + } + + return null; +} + +function parseMonthName(name: string): number { + const lower = name.toLowerCase(); + for (let i = 0; i < MONTH_NAMES.length; i++) { + if (MONTH_NAMES[i].toLowerCase() === lower || MONTH_ABBREV[i].toLowerCase() === lower) { + return i + 1; + } + } + return 0; +} + +function isValidDate(date: ParsedDate): boolean { + const { year, month, day } = date; + if (month < 1 || month > 12) return false; + if (day < 1 || day > 31) return false; + if (year < 1900 || year > 2100) return false; + return true; +} + +/** + * Escape special regex characters in a string + */ +function escapeRegex(str: string): string { + return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +/** + * Generate regex patterns to match a date in multiple common formats + */ +export function generateDatePatterns(dateStr: string): RegExp[] { + const parsed = parseDate(dateStr); + if (!parsed) return []; + + const { year, month, day } = parsed; + const patterns: string[] = []; + + // Padding helpers + const mm = month.toString().padStart(2, '0'); + const dd = day.toString().padStart(2, '0'); + const m = month.toString(); + const d = day.toString(); + const yy = year.toString().slice(-2); + const yyyy = year.toString(); + + // Month names + const monthFull = MONTH_NAMES[month - 1]; + const monthAbbr = MONTH_ABBREV[month - 1]; + + // ISO: YYYY-MM-DD, YYYY/MM/DD + patterns.push(`${yyyy}[-/]${mm}[-/]${dd}`); + patterns.push(`${yyyy}[-/]${m}[-/]${d}`); + + // US: MM/DD/YYYY, MM-DD-YYYY, M/D/YYYY + patterns.push(`${mm}[-/]${dd}[-/]${yyyy}`); + patterns.push(`${m}[-/]${d}[-/]${yyyy}`); + patterns.push(`${mm}[-/]${dd}[-/]${yy}`); + patterns.push(`${m}[-/]${d}[-/]${yy}`); + + // EU: DD/MM/YYYY, DD-MM-YYYY, DD.MM.YYYY + patterns.push(`${dd}[-/.]${mm}[-/.]${yyyy}`); + patterns.push(`${d}[-/.]${m}[-/.]${yyyy}`); + patterns.push(`${dd}[-/.]${mm}[-/.]${yy}`); + patterns.push(`${d}[-/.]${m}[-/.]${yy}`); + + // Long format: Month DD, YYYY and DD Month YYYY (case-insensitive) + const monthPatternFull = `(?:${escapeRegex(monthFull)}|${escapeRegex(monthFull.toUpperCase())}|${escapeRegex(monthFull.toLowerCase())})`; + const monthPatternAbbr = `(?:${escapeRegex(monthAbbr)}\\.?|${escapeRegex(monthAbbr.toUpperCase())}\\.?|${escapeRegex(monthAbbr.toLowerCase())}\\.?)`; + const monthPattern = `(?:${monthPatternFull}|${monthPatternAbbr})`; + + // Month DD, YYYY + patterns.push(`${monthPattern}\\s+${dd},?\\s+${yyyy}`); + patterns.push(`${monthPattern}\\s+${d},?\\s+${yyyy}`); + + // DD Month YYYY + patterns.push(`${dd}\\s+${monthPattern}\\s+${yyyy}`); + patterns.push(`${d}\\s+${monthPattern}\\s+${yyyy}`); + + // Also add ordinal day formats (1st, 2nd, 3rd, etc.) + const ordinalSuffix = getOrdinalSuffix(day); + patterns.push(`${monthPattern}\\s+${d}${ordinalSuffix},?\\s+${yyyy}`); + patterns.push(`${d}${ordinalSuffix}\\s+${monthPattern}\\s+${yyyy}`); + + // Create regex objects with word boundaries + return patterns.map(p => new RegExp(`\\b${p}\\b`, 'gi')); +} + +function getOrdinalSuffix(day: number): string { + if (day >= 11 && day <= 13) return 'th'; + switch (day % 10) { + case 1: return 'st'; + case 2: return 'nd'; + case 3: return 'rd'; + default: return 'th'; + } +} + +/** + * Validate a regex pattern string + * Returns null if valid, error message if invalid + */ +export function validateRegex(pattern: string): string | null { + if (!pattern || pattern.trim() === '') { + return 'Pattern cannot be empty'; + } + try { + new RegExp(pattern); + return null; + } catch (e) { + return `Invalid regex: ${e instanceof Error ? e.message : 'Unknown error'}`; + } +} diff --git a/src/utils/ocr.ts b/src/utils/ocr.ts index 9a69898..9b681fd 100644 --- a/src/utils/ocr.ts +++ b/src/utils/ocr.ts @@ -3,6 +3,7 @@ import type { DetectedItem, DetectionSettings } from '../types'; import { SENSITIVE_PATTERNS } from '../constants/patterns'; import { DEFAULT_ALLOWLIST } from '../constants/config'; import { preprocessImage } from './canvas'; +import { generateDatePatterns } from './datePatterns'; // Helper: find all pattern matches with their positions export const findMatches = (pattern: RegExp, text: string, type: DetectedItem['type']): Array<{ text: string, type: DetectedItem['type'], index: number }> => { @@ -15,6 +16,72 @@ export const findMatches = (pattern: RegExp, text: string, type: DetectedItem['t return matches; }; +// Helper: find block word matches (case-insensitive by default) +export const findBlockWordMatches = ( + blockWords: string[], + text: string, + type: DetectedItem['type'] +): Array<{ text: string, type: DetectedItem['type'], index: number }> => { + const matches: Array<{ text: string, type: DetectedItem['type'], index: number }> = []; + for (const word of blockWords) { + if (!word.trim()) continue; + // Create a case-insensitive regex with word boundaries + const escapedWord = word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + const pattern = new RegExp(`\\b${escapedWord}\\b`, 'gi'); + let match; + while ((match = pattern.exec(text)) !== null) { + matches.push({ text: match[0], type, index: match.index }); + } + } + return matches; +}; + +// Helper: find custom date matches in multiple formats +export const findCustomDateMatches = ( + customDates: string[], + text: string, + type: DetectedItem['type'] +): Array<{ text: string, type: DetectedItem['type'], index: number }> => { + const matches: Array<{ text: string, type: DetectedItem['type'], index: number }> = []; + for (const dateStr of customDates) { + const patterns = generateDatePatterns(dateStr); + for (const pattern of patterns) { + let match; + while ((match = pattern.exec(text)) !== null) { + // Avoid duplicate matches at the same position + const exists = matches.some(m => m.index === match!.index && m.text === match![0]); + if (!exists) { + matches.push({ text: match[0], type, index: match.index }); + } + } + } + } + return matches; +}; + +// Helper: find custom regex matches +export const findCustomRegexMatches = ( + customRegex: DetectionSettings['customRegex'], + text: string, + type: DetectedItem['type'] +): Array<{ text: string, type: DetectedItem['type'], index: number }> => { + const matches: Array<{ text: string, type: DetectedItem['type'], index: number }> = []; + for (const rule of customRegex) { + try { + const flags = rule.caseSensitive ? 'g' : 'gi'; + const pattern = new RegExp(rule.pattern, flags); + let match; + while ((match = pattern.exec(text)) !== null) { + matches.push({ text: match[0], type, index: match.index }); + } + } catch { + // Skip invalid patterns (should be validated on input, but be safe) + console.warn(`Skipping invalid regex pattern: ${rule.pattern}`); + } + } + return matches; +}; + // Helper: check if a text matches any allowlisted value (case-insensitive) export const isAllowlisted = (text: string, allowlist: string[]): boolean => { const lowerText = text.toLowerCase(); @@ -37,6 +104,9 @@ const DEFAULT_DETECTION_SETTINGS: DetectionSettings = { secret: true, pii: true, allowlist: DEFAULT_ALLOWLIST, + blockWords: [], + customDates: [], + customRegex: [], }; interface ProcessImageOptions { @@ -154,6 +224,12 @@ export const processImageForBatch = async ( }); } + // 6. Custom Rules: Block Words, Custom Dates, Custom Regex (all treated as 'pii' type) + const blockWordMatches = findBlockWordMatches(detectionSettings.blockWords || [], fullText, 'pii'); + const customDateMatches = findCustomDateMatches(detectionSettings.customDates || [], fullText, 'pii'); + const customRegexMatches = findCustomRegexMatches(detectionSettings.customRegex || [], fullText, 'pii'); + const customMatches = [...blockWordMatches, ...customDateMatches, ...customRegexMatches]; + // Apply allowlist filtering to all match types const allowlist = detectionSettings.allowlist || []; const filteredEmailMatches = filterAllowlistedMatches(emailMatches, allowlist); @@ -161,8 +237,9 @@ export const processImageForBatch = async ( const filteredCcMatches = filterAllowlistedMatches(ccMatches, allowlist); const filteredPiiMatches = filterAllowlistedMatches(piiMatches, allowlist); const filteredSecretMatches = filterAllowlistedMatches(secretMatches, allowlist); + const filteredCustomMatches = filterAllowlistedMatches(customMatches, allowlist); - const allMatches = [...filteredEmailMatches, ...filteredIpMatches, ...filteredCcMatches, ...filteredPiiMatches, ...filteredSecretMatches]; + const allMatches = [...filteredEmailMatches, ...filteredIpMatches, ...filteredCcMatches, ...filteredPiiMatches, ...filteredSecretMatches, ...filteredCustomMatches]; console.log(`[Batch] Matches for ${file.name}:`, { emails: filteredEmailMatches.map(m => m.text), @@ -170,15 +247,17 @@ export const processImageForBatch = async ( creditCards: filteredCcMatches.map(m => m.text), secrets: filteredSecretMatches.map(m => m.text), pii: filteredPiiMatches.map(m => m.text), + custom: filteredCustomMatches.map(m => m.text), }); // Create Stats Breakdown (Count actual entities, not redaction boxes) + // Custom matches are included in PII count for simplicity const detectedBreakdown = { emails: filteredEmailMatches.length, ips: filteredIpMatches.length, creditCards: filteredCcMatches.length, secrets: filteredSecretMatches.length, - pii: filteredPiiMatches.length, + pii: filteredPiiMatches.length + filteredCustomMatches.length, }; const detectedCount = Object.values(detectedBreakdown).reduce((a, b) => a + b, 0); From bed88b5aa6d9d5b5be7808f113c7712206ad1f53 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 12 Dec 2025 15:53:36 +0000 Subject: [PATCH 3/5] Address code review feedback: add regex lastIndex reset and improve date validation Co-authored-by: karant-dev <22682299+karant-dev@users.noreply.github.com> --- src/utils/datePatterns.ts | 7 ++++++- src/utils/ocr.ts | 10 +++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/utils/datePatterns.ts b/src/utils/datePatterns.ts index fdeb1de..78e64b5 100644 --- a/src/utils/datePatterns.ts +++ b/src/utils/datePatterns.ts @@ -95,7 +95,12 @@ function isValidDate(date: ParsedDate): boolean { if (month < 1 || month > 12) return false; if (day < 1 || day > 31) return false; if (year < 1900 || year > 2100) return false; - return true; + + // Use native Date for proper validation (handles leap years, days per month) + const testDate = new Date(year, month - 1, day); + return testDate.getFullYear() === year && + testDate.getMonth() === month - 1 && + testDate.getDate() === day; } /** diff --git a/src/utils/ocr.ts b/src/utils/ocr.ts index 9b681fd..805d205 100644 --- a/src/utils/ocr.ts +++ b/src/utils/ocr.ts @@ -46,12 +46,15 @@ export const findCustomDateMatches = ( for (const dateStr of customDates) { const patterns = generateDatePatterns(dateStr); for (const pattern of patterns) { - let match; + pattern.lastIndex = 0; + let match: RegExpExecArray | null; while ((match = pattern.exec(text)) !== null) { // Avoid duplicate matches at the same position - const exists = matches.some(m => m.index === match!.index && m.text === match![0]); + const matchIndex = match.index; + const matchText = match[0]; + const exists = matches.some(m => m.index === matchIndex && m.text === matchText); if (!exists) { - matches.push({ text: match[0], type, index: match.index }); + matches.push({ text: matchText, type, index: matchIndex }); } } } @@ -70,6 +73,7 @@ export const findCustomRegexMatches = ( try { const flags = rule.caseSensitive ? 'g' : 'gi'; const pattern = new RegExp(rule.pattern, flags); + pattern.lastIndex = 0; let match; while ((match = pattern.exec(text)) !== null) { matches.push({ text: match[0], type, index: match.index }); From f7d639e4c4ae4d1a7f2d992befc1dfbded1fc216 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 12 Dec 2025 16:06:50 +0000 Subject: [PATCH 4/5] Fix over-redaction: only redact words that contain or match the sensitive text Co-authored-by: karant-dev <22682299+karant-dev@users.noreply.github.com> --- src/hooks/useOCR.ts | 14 ++++++++++++-- src/utils/ocr.ts | 13 ++++++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/hooks/useOCR.ts b/src/hooks/useOCR.ts index e85d6be..359acdf 100644 --- a/src/hooks/useOCR.ts +++ b/src/hooks/useOCR.ts @@ -201,11 +201,21 @@ export function useOCR(detectionSettings: DetectionSettings) { const wordStart = index; const wordEnd = index + wordText.length; - // Check overlap + // Check overlap with additional text-based validation const match = allMatches.find(m => { const mStart = m.index; const mEnd = m.index + m.text.length; - return wordStart < mEnd && wordEnd > mStart; + const hasPositionalOverlap = wordStart < mEnd && wordEnd > mStart; + + if (!hasPositionalOverlap) return false; + + // Additional text-based validation to avoid over-redaction: + // The word should contain the match text, or vice versa + const wordLower = wordText.toLowerCase(); + const matchLower = m.text.toLowerCase(); + const hasTextOverlap = wordLower.includes(matchLower) || matchLower.includes(wordLower); + + return hasTextOverlap; }); if (match) { diff --git a/src/utils/ocr.ts b/src/utils/ocr.ts index 805d205..f2180b1 100644 --- a/src/utils/ocr.ts +++ b/src/utils/ocr.ts @@ -285,10 +285,21 @@ export const processImageForBatch = async ( // Check for spatial overlap with any sensitive match // Overlap condition: (StartA < EndB) and (EndA > StartB) + // Additional check: word must actually contain/overlap with the match text const match = allMatches.find(m => { const matchStart = m.index; const matchEnd = m.index + m.text.length; - return wordStart < matchEnd && wordEnd > matchStart; + const hasPositionalOverlap = wordStart < matchEnd && wordEnd > matchStart; + + if (!hasPositionalOverlap) return false; + + // Additional text-based validation to avoid over-redaction: + // The word should contain the match text, or vice versa + const wordLower = wordText.toLowerCase(); + const matchLower = m.text.toLowerCase(); + const hasTextOverlap = wordLower.includes(matchLower) || matchLower.includes(wordLower); + + return hasTextOverlap; }); if (match) { From dca4867db129bc2436ce977c2b98600552197e53 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 12 Dec 2025 16:10:05 +0000 Subject: [PATCH 5/5] Refactor: extract hasValidOverlap helper to reduce code duplication Co-authored-by: karant-dev <22682299+karant-dev@users.noreply.github.com> --- src/hooks/useOCR.ts | 22 +++++----------------- src/utils/ocr.ts | 42 ++++++++++++++++++++++++------------------ 2 files changed, 29 insertions(+), 35 deletions(-) diff --git a/src/hooks/useOCR.ts b/src/hooks/useOCR.ts index 359acdf..2fe1209 100644 --- a/src/hooks/useOCR.ts +++ b/src/hooks/useOCR.ts @@ -2,7 +2,7 @@ import { useState, useCallback } from 'react'; import { createWorker } from 'tesseract.js'; import type { DetectedItem, ProcessingState, DetectionSettings } from '../types'; import { SENSITIVE_PATTERNS } from '../constants/patterns'; -import { findMatches, filterAllowlistedMatches, findBlockWordMatches, findCustomDateMatches, findCustomRegexMatches } from '../utils/ocr'; +import { findMatches, filterAllowlistedMatches, findBlockWordMatches, findCustomDateMatches, findCustomRegexMatches, hasValidOverlap } from '../utils/ocr'; import { preprocessImage } from '../utils/canvas'; export function useOCR(detectionSettings: DetectionSettings) { @@ -201,22 +201,10 @@ export function useOCR(detectionSettings: DetectionSettings) { const wordStart = index; const wordEnd = index + wordText.length; - // Check overlap with additional text-based validation - const match = allMatches.find(m => { - const mStart = m.index; - const mEnd = m.index + m.text.length; - const hasPositionalOverlap = wordStart < mEnd && wordEnd > mStart; - - if (!hasPositionalOverlap) return false; - - // Additional text-based validation to avoid over-redaction: - // The word should contain the match text, or vice versa - const wordLower = wordText.toLowerCase(); - const matchLower = m.text.toLowerCase(); - const hasTextOverlap = wordLower.includes(matchLower) || matchLower.includes(wordLower); - - return hasTextOverlap; - }); + // Check overlap using shared helper for validation + const match = allMatches.find(m => + hasValidOverlap(wordStart, wordEnd, wordText, m.index, m.index + m.text.length, m.text) + ); if (match) { const bbox = word.bbox; diff --git a/src/utils/ocr.ts b/src/utils/ocr.ts index f2180b1..f034fdc 100644 --- a/src/utils/ocr.ts +++ b/src/utils/ocr.ts @@ -100,6 +100,26 @@ export const filterAllowlistedMatches = ( return matches.filter(match => !isAllowlisted(match.text, allowlist)); }; +// Helper: check if word has valid overlap with a match (both positional and text-based) +export const hasValidOverlap = ( + wordStart: number, + wordEnd: number, + wordText: string, + matchStart: number, + matchEnd: number, + matchText: string +): boolean => { + // Check positional overlap: (StartA < EndB) and (EndA > StartB) + const hasPositionalOverlap = wordStart < matchEnd && wordEnd > matchStart; + if (!hasPositionalOverlap) return false; + + // Additional text-based validation to avoid over-redaction: + // The word should contain the match text, or vice versa + const wordLower = wordText.toLowerCase(); + const matchLower = matchText.toLowerCase(); + return wordLower.includes(matchLower) || matchLower.includes(wordLower); +}; + // Default settings for backward compatibility (all enabled) const DEFAULT_DETECTION_SETTINGS: DetectionSettings = { email: true, @@ -283,24 +303,10 @@ export const processImageForBatch = async ( const wordStart = index; const wordEnd = index + wordText.length; - // Check for spatial overlap with any sensitive match - // Overlap condition: (StartA < EndB) and (EndA > StartB) - // Additional check: word must actually contain/overlap with the match text - const match = allMatches.find(m => { - const matchStart = m.index; - const matchEnd = m.index + m.text.length; - const hasPositionalOverlap = wordStart < matchEnd && wordEnd > matchStart; - - if (!hasPositionalOverlap) return false; - - // Additional text-based validation to avoid over-redaction: - // The word should contain the match text, or vice versa - const wordLower = wordText.toLowerCase(); - const matchLower = m.text.toLowerCase(); - const hasTextOverlap = wordLower.includes(matchLower) || matchLower.includes(wordLower); - - return hasTextOverlap; - }); + // Check for spatial overlap with any sensitive match using helper + const match = allMatches.find(m => + hasValidOverlap(wordStart, wordEnd, wordText, m.index, m.index + m.text.length, m.text) + ); if (match) { detected.push({ text: wordText, type: match.type, bbox: word.bbox });