From 5489e4c5a2581de65bb9d67f83da3e10f2f515b9 Mon Sep 17 00:00:00 2001 From: FinleyGe Date: Tue, 11 Nov 2025 15:38:04 +0800 Subject: [PATCH] perf: docDiff --- .../packages/docDiff/src/diffAlgorithm.ts | 550 ++++++++++++++++++ modules/tool/packages/docDiff/src/index.ts | 207 ++----- .../packages/docDiff/src/textNormalizer.ts | 449 ++++++++++++++ .../docDiff/test/diffAlgorithm.test.ts | 398 +++++++++++++ .../test/integration-tolerance.test.ts | 91 +++ .../packages/docDiff/test/integration.test.ts | 70 +++ .../docDiff/test/lineBreakTolerance.test.ts | 393 +++++++++++++ .../docDiff/test/lineTolerance.test.ts | 378 ++++++++++++ 8 files changed, 2363 insertions(+), 173 deletions(-) create mode 100644 modules/tool/packages/docDiff/src/diffAlgorithm.ts create mode 100644 modules/tool/packages/docDiff/src/textNormalizer.ts create mode 100644 modules/tool/packages/docDiff/test/diffAlgorithm.test.ts create mode 100644 modules/tool/packages/docDiff/test/integration-tolerance.test.ts create mode 100644 modules/tool/packages/docDiff/test/integration.test.ts create mode 100644 modules/tool/packages/docDiff/test/lineBreakTolerance.test.ts create mode 100644 modules/tool/packages/docDiff/test/lineTolerance.test.ts diff --git a/modules/tool/packages/docDiff/src/diffAlgorithm.ts b/modules/tool/packages/docDiff/src/diffAlgorithm.ts new file mode 100644 index 00000000..f1d27303 --- /dev/null +++ b/modules/tool/packages/docDiff/src/diffAlgorithm.ts @@ -0,0 +1,550 @@ +// 定义换行容差选项 +export interface LineBreakToleranceOptions { + /** 是否启用换行容差逻辑 */ + enableLineBreakTolerance?: boolean; + /** 扫描范围(行数) */ + scanRange?: number; + /** 容差阈值 */ + toleranceThreshold?: number; +} + +// 定义段落差异类型 +export type DiffType = 'unchanged' | 'added' | 'removed' | 'modified'; + +export interface ParagraphDiff { + type: DiffType; + original?: string; + modified?: string; + lineNumber?: number; +} + +// 分割文档为行 +export function splitIntoLines(text: string): string[] { + return text.split('\n'); +} + +// 计算两个段的相似度(灵敏版本) +export function calculateSimilarity(text1: string, text2: string): number { + // 如果完全相同,直接返回1.0 + if (text1 === text2) return 1.0; + + // 计算编辑距离 + const distance = levenshteinDistance(text1, text2); + const maxLength = Math.max(text1.length, text2.length); + + if (maxLength === 0) return 1.0; + + // 转换为相似度(0-1之间) + const similarity = 1 - distance / maxLength; + + return similarity; +} + +// 计算编辑距离(Levenshtein距离) +function levenshteinDistance(str1: string, str2: string): number { + const matrix = []; + + for (let i = 0; i <= str2.length; i++) { + matrix[i] = [i]; + } + + for (let j = 0; j <= str1.length; j++) { + matrix[0][j] = j; + } + + for (let i = 1; i <= str2.length; i++) { + for (let j = 1; j <= str1.length; j++) { + if (str2.charAt(i - 1) === str1.charAt(j - 1)) { + matrix[i][j] = matrix[i - 1][j - 1]; + } else { + matrix[i][j] = Math.min( + matrix[i - 1][j - 1] + 1, // 替换 + matrix[i][j - 1] + 1, // 插入 + matrix[i - 1][j] + 1 // 删除 + ); + } + } + } + + return matrix[str2.length][str1.length]; +} + +// 判断是否为高相似度(应该视为修改) +export function isHighSimilarity(similarity: number): boolean { + // 相似度 > 0.7 且 < 1.0 视为高相似度,应该标记为修改 + return similarity > 0.7 && similarity < 1.0; +} + +// 判断是否为中等相似度(在LCS中考虑匹配) +export function isMediumSimilarity(similarity: number): boolean { + // 提高阈值:只有相似度 > 0.7 才在LCS中考虑为潜在匹配 + return similarity > 0.7; +} + +// 严格内容比较:用于完全相同的行 +export function isExactMatch(text1: string, text2: string): boolean { + return text1 === text2; +} + +// 寻找精确匹配行 +export function findExactMatch( + originalLine: string, + modifiedLines: string[], + startModIndex: number, + searchRange: number = 20 +): { matchIndex: number; found: boolean } { + // 向前搜索精确匹配 + for (let i = 0; i < Math.min(searchRange, modifiedLines.length - startModIndex); i++) { + if (isExactMatch(originalLine, modifiedLines[startModIndex + i])) { + return { matchIndex: i, found: true }; + } + } + + // 向后搜索精确匹配(如果可能) + for (let i = 1; i <= Math.min(searchRange, startModIndex); i++) { + if (isExactMatch(originalLine, modifiedLines[startModIndex - i])) { + return { matchIndex: -i, found: true }; // 负数表示向后搜索 + } + } + + return { matchIndex: -1, found: false }; +} + +// 构建相似度匹配矩阵(用于LCS算法) +export function buildMatchMatrix(originalLines: string[], modifiedLines: string[]): number[][] { + const matrix: number[][] = []; + + for (let i = 0; i <= originalLines.length; i++) { + matrix[i] = []; + for (let j = 0; j <= modifiedLines.length; j++) { + if (i === 0 || j === 0) { + matrix[i][j] = 0; + } else { + const similarity = calculateSimilarity(originalLines[i - 1], modifiedLines[j - 1]); + // 中等相似度以上视为潜在匹配 + if (isMediumSimilarity(similarity)) { + matrix[i][j] = matrix[i - 1][j - 1] + 1; + } else { + matrix[i][j] = Math.max(matrix[i - 1][j], matrix[i][j - 1]); + } + } + } + } + + return matrix; +} + +// 回溯相似度匹配矩阵,找到匹配的行对 +export function backtrackLCS( + matrix: number[][], + originalLines: string[], + modifiedLines: string[] +): { origIndices: number[]; modIndices: number[] } { + const origIndices: number[] = []; + const modIndices: number[] = []; + + let i = originalLines.length; + let j = modifiedLines.length; + + while (i > 0 && j > 0) { + const similarity = calculateSimilarity(originalLines[i - 1], modifiedLines[j - 1]); + + // 中等相似度以上且是匹配路径才视为匹配 + if (isMediumSimilarity(similarity) && matrix[i][j] === matrix[i - 1][j - 1] + 1) { + // 找到相似度匹配 + origIndices.unshift(i - 1); + modIndices.unshift(j - 1); + i--; + j--; + } else if (matrix[i - 1][j] >= matrix[i][j - 1]) { + i--; + } else { + j--; + } + } + + return { origIndices, modIndices }; +} + +// 灵敏文档对比算法:高相似度视为修改,低相似度视为删除+新增 +export function compareDocuments(originalText: string, modifiedText: string): ParagraphDiff[] { + const originalLines = splitIntoLines(originalText); + const modifiedLines = splitIntoLines(modifiedText); + + const diffs: ParagraphDiff[] = []; + let currentLineNumber = 1; + + // 使用相似度匹配LCS算法找到潜在的匹配行 + const matrix = buildMatchMatrix(originalLines, modifiedLines); + const { origIndices, modIndices } = backtrackLCS(matrix, originalLines, modifiedLines); + + // 添加虚拟的结束索引,便于处理 + origIndices.push(originalLines.length); + modIndices.push(modifiedLines.length); + + let origIndex = 0; + let modIndex = 0; + + // 处理每个匹配段之间的差异 + for (let matchIndex = 0; matchIndex < origIndices.length; matchIndex++) { + const matchOrigIndex = origIndices[matchIndex]; + const matchModIndex = modIndices[matchIndex]; + + // 处理当前匹配之前的差异区域 + while (origIndex < matchOrigIndex || modIndex < matchModIndex) { + // 如果原始文档已经处理完这段区域 + if (origIndex >= matchOrigIndex) { + // 这些都是新增的行 + while (modIndex < matchModIndex) { + const modifiedLine = modifiedLines[modIndex]; + diffs.push({ + type: 'added', + modified: modifiedLine, + lineNumber: currentLineNumber++ + }); + modIndex++; + } + break; + } + + // 如果修改后文档已经处理完这段区域 + if (modIndex >= matchModIndex) { + // 这些都是删除的行 + while (origIndex < matchOrigIndex) { + const originalLine = originalLines[origIndex]; + diffs.push({ + type: 'removed', + original: originalLine, + lineNumber: currentLineNumber++ + }); + origIndex++; + } + break; + } + + const originalLine = originalLines[origIndex]; + const modifiedLine = modifiedLines[modIndex]; + + // 计算相似度 + const similarity = calculateSimilarity(originalLine, modifiedLine); + + if (isHighSimilarity(similarity)) { + // 高相似度,视为修改 + diffs.push({ + type: 'modified', + original: originalLine, + modified: modifiedLine, + lineNumber: currentLineNumber++ + }); + origIndex++; + modIndex++; + } else { + // 低相似度,分别作为删除和新增处理 + diffs.push({ + type: 'removed', + original: originalLine, + lineNumber: currentLineNumber++ + }); + diffs.push({ + type: 'added', + modified: modifiedLine, + lineNumber: currentLineNumber++ + }); + origIndex++; + modIndex++; + } + } + + // 添加匹配的行 + if (matchIndex < origIndices.length - 1) { + // 只有在不是虚拟结束索引时才添加 + const originalLine = originalLines[matchOrigIndex]; + const modifiedLine = modifiedLines[matchModIndex]; + const similarity = calculateSimilarity(originalLine, modifiedLine); + + if (isExactMatch(originalLine, modifiedLine)) { + // 完全相同,视为未修改 + diffs.push({ + type: 'unchanged', + original: originalLine, + modified: modifiedLine, + lineNumber: currentLineNumber++ + }); + } else if (isHighSimilarity(similarity)) { + // 高相似度,视为修改 + diffs.push({ + type: 'modified', + original: originalLine, + modified: modifiedLine, + lineNumber: currentLineNumber++ + }); + } else { + // 中等相似度,视为未修改(这些在LCS中已经处理过了) + diffs.push({ + type: 'unchanged', + original: originalLine, + modified: modifiedLine, + lineNumber: currentLineNumber++ + }); + } + } + + origIndex = matchOrigIndex + 1; + modIndex = matchModIndex + 1; + } + + return diffs; +} + +// 换行容差比较函数 +export function compareWithLineBreakTolerance( + originalLine: string, + modifiedLine: string, + originalLines: string[], + modifiedLines: string[], + origIndex: number, + modIndex: number, + options: LineBreakToleranceOptions = {} +): boolean { + const { enableLineBreakTolerance = true, scanRange = 3, toleranceThreshold = 0.95 } = options; + + if (!enableLineBreakTolerance) { + return false; + } + + // 如果两行完全相同,直接返回 true + if (originalLine === modifiedLine) { + return true; + } + + // 扫描原始文档附近几行,合并后与修改文档比较 + for ( + let i = Math.max(0, origIndex - scanRange); + i <= Math.min(originalLines.length - 1, origIndex + scanRange); + i++ + ) { + for ( + let j = Math.max(0, modIndex - scanRange); + j <= Math.min(modifiedLines.length - 1, modIndex + scanRange); + j++ + ) { + // 跳过当前行本身的比较 + if (i === origIndex && j === modIndex) continue; + + // 合并原始文档的多行 + const originalSegment = originalLines + .slice(Math.min(origIndex, i), Math.max(origIndex, i) + 1) + .join('') + .replace(/\s+/g, '') // 移除所有空白字符 + .toLowerCase(); + + // 合并修改文档的多行 + const modifiedSegment = modifiedLines + .slice(Math.min(modIndex, j), Math.max(modIndex, j) + 1) + .join('') + .replace(/\s+/g, '') // 移除所有空白字符 + .toLowerCase(); + + // 如果合并后的内容完全相同,则认为是换行差异 + if (originalSegment === modifiedSegment && originalSegment.length > 0) { + return true; + } + + // 如果合并后的内容相似度很高,也考虑容差 + const similarity = calculateSimilarity(originalSegment, modifiedSegment); + if (similarity >= toleranceThreshold) { + return true; + } + } + } + + // 额外检查:扫描临近2行去掉换行符后的情况 + for ( + let i = Math.max(0, origIndex - 2); + i <= Math.min(originalLines.length - 1, origIndex + 2); + i++ + ) { + for ( + let j = Math.max(0, modIndex - 2); + j <= Math.min(modifiedLines.length - 1, modIndex + 2); + j++ + ) { + // 跳过完全相同的情况(已经处理过) + if (i === origIndex && j === modIndex) continue; + + // 检查去掉换行符后的多行组合 + const origSegment = originalLines + .slice(Math.min(origIndex, i), Math.max(origIndex, i) + 1) + .join('') // 去掉换行符 + .replace(/\s+/g, '') // 移除所有空白字符 + .toLowerCase(); + + const modSegment = modifiedLines + .slice(Math.min(modIndex, j), Math.max(modIndex, j) + 1) + .join('') // 去掉换行符 + .replace(/\s+/g, '') // 移除所有空白字符 + .toLowerCase(); + + if (origSegment === modSegment && origSegment.length > 0) { + return true; + } + } + } + + return false; +} + +// 带容差的文档比较函数 +export function compareDocumentsWithTolerance( + originalText: string, + modifiedText: string, + toleranceOptions?: LineBreakToleranceOptions +): ParagraphDiff[] { + const originalLines = splitIntoLines(originalText); + const modifiedLines = splitIntoLines(modifiedText); + + const diffs: ParagraphDiff[] = []; + let currentLineNumber = 1; + + // 使用相似度匹配LCS算法找到潜在的匹配行 + const matrix = buildMatchMatrix(originalLines, modifiedLines); + const { origIndices, modIndices } = backtrackLCS(matrix, originalLines, modifiedLines); + + // 添加虚拟的结束索引,便于处理 + origIndices.push(originalLines.length); + modIndices.push(modifiedLines.length); + + let origIndex = 0; + let modIndex = 0; + + // 处理每个匹配段之间的差异 + for (let matchIndex = 0; matchIndex < origIndices.length; matchIndex++) { + const matchOrigIndex = origIndices[matchIndex]; + const matchModIndex = modIndices[matchIndex]; + + // 处理当前匹配之前的差异区域 + while (origIndex < matchOrigIndex || modIndex < matchModIndex) { + // 如果原始文档已经处理完这段区域 + if (origIndex >= matchOrigIndex) { + // 这些都是新增的行 + while (modIndex < matchModIndex) { + const modifiedLine = modifiedLines[modIndex]; + diffs.push({ + type: 'added', + modified: modifiedLine, + lineNumber: currentLineNumber++ + }); + modIndex++; + } + break; + } + + // 如果修改后文档已经处理完这段区域 + if (modIndex >= matchModIndex) { + // 这些都是删除的行 + while (origIndex < matchOrigIndex) { + const originalLine = originalLines[origIndex]; + diffs.push({ + type: 'removed', + original: originalLine, + lineNumber: currentLineNumber++ + }); + origIndex++; + } + break; + } + + const originalLine = originalLines[origIndex]; + const modifiedLine = modifiedLines[modIndex]; + + // 计算相似度 + const similarity = calculateSimilarity(originalLine, modifiedLine); + + // 首先检查换行容差 + if ( + compareWithLineBreakTolerance( + originalLine, + modifiedLine, + originalLines, + modifiedLines, + origIndex, + modIndex, + toleranceOptions + ) + ) { + // 换行容差匹配成功,视为未修改 + diffs.push({ + type: 'unchanged', + original: originalLine, + modified: modifiedLine, + lineNumber: currentLineNumber++ + }); + origIndex++; + modIndex++; + } else if (isHighSimilarity(similarity)) { + // 高相似度,视为修改 + diffs.push({ + type: 'modified', + original: originalLine, + modified: modifiedLine, + lineNumber: currentLineNumber++ + }); + origIndex++; + modIndex++; + } else { + // 低相似度,分别作为删除和新增处理 + diffs.push({ + type: 'removed', + original: originalLine, + lineNumber: currentLineNumber++ + }); + diffs.push({ + type: 'added', + modified: modifiedLine, + lineNumber: currentLineNumber++ + }); + origIndex++; + modIndex++; + } + } + + // 添加匹配的行 + if (matchIndex < origIndices.length - 1) { + // 只有在不是虚拟结束索引时才添加 + const originalLine = originalLines[matchOrigIndex]; + const modifiedLine = modifiedLines[matchModIndex]; + const similarity = calculateSimilarity(originalLine, modifiedLine); + + if (similarity >= 1.0) { + // 完全相同,视为未修改 + diffs.push({ + type: 'unchanged', + original: originalLine, + modified: modifiedLine, + lineNumber: currentLineNumber++ + }); + } else if (isHighSimilarity(similarity)) { + // 高相似度,视为修改 + diffs.push({ + type: 'modified', + original: originalLine, + modified: modifiedLine, + lineNumber: currentLineNumber++ + }); + } else { + // 中等相似度,视为未修改(这些在LCS中已经处理过了) + diffs.push({ + type: 'unchanged', + original: originalLine, + modified: modifiedLine, + lineNumber: currentLineNumber++ + }); + } + } + + origIndex = matchOrigIndex + 1; + modIndex = matchModIndex + 1; + } + + return diffs; +} diff --git a/modules/tool/packages/docDiff/src/index.ts b/modules/tool/packages/docDiff/src/index.ts index bb7d9159..b733d771 100644 --- a/modules/tool/packages/docDiff/src/index.ts +++ b/modules/tool/packages/docDiff/src/index.ts @@ -1,10 +1,25 @@ import { uploadFile } from '@tool/utils/uploadFile'; import { z } from 'zod'; +import { + compareDocuments, + compareDocumentsWithTolerance, + type ParagraphDiff, + type LineBreakToleranceOptions +} from './diffAlgorithm'; +import { applyFullNormalization } from './textNormalizer'; export const InputType = z.object({ originalText: z.string().min(1, '原始文档内容不能为空'), modifiedText: z.string().min(1, '修改后文档内容不能为空'), - title: z.string().optional().default('文档对比报告') + title: z.string().optional().default('文档对比报告'), + // 换行容差选项 + lineTolerance: z + .object({ + enableLineBreakTolerance: z.boolean().optional().default(true), + scanRange: z.number().optional().default(3), + toleranceThreshold: z.number().optional().default(0.95) + }) + .optional() }); export const OutputType = z.object({ @@ -24,6 +39,8 @@ export type InputType = { originalText: string; modifiedText: string; title?: string; + // 换行容差选项 + lineTolerance?: LineBreakToleranceOptions; }; // 输出类型 @@ -37,177 +54,6 @@ export type OutputType = { }[]; }; -// 定义段落差异类型 -type DiffType = 'unchanged' | 'added' | 'removed' | 'modified'; - -interface ParagraphDiff { - type: DiffType; - original?: string; - modified?: string; - lineNumber?: number; -} - -// 分割文档为行 -function splitIntoLines(text: string): string[] { - return text.split('\n'); -} - -// 计算两个段的相似度 -function calculateSimilarity(text1: string, text2: string): number { - // 移除首尾空白字符 - const clean1 = text1.trim(); - const clean2 = text2.trim(); - - // 如果两行都为空,则完全相同 - if (!clean1 && !clean2) return 1.0; - if (!clean1 || !clean2) return 0.0; - - // 如果内容完全相同,直接返回1.0 - if (clean1 === clean2) return 1.0; - - // 移除所有空白字符并转换为小写进行比较 - const chars1 = clean1.replace(/\s+/g, '').toLowerCase(); - const chars2 = clean2.replace(/\s+/g, '').toLowerCase(); - - const longer = chars1.length > chars2.length ? chars1 : chars2; - const shorter = chars1.length > chars2.length ? chars2 : chars1; - - if (longer.length === 0) return 1.0; - - const matches = Array.from(longer).filter( - (char, index) => index < shorter.length && char === shorter[index] - ).length; - - return matches / longer.length; -} - -// 对比两个文档 -function compareDocuments(originalText: string, modifiedText: string): ParagraphDiff[] { - const originalLines = splitIntoLines(originalText); - const modifiedLines = splitIntoLines(modifiedText); - - const diffs: ParagraphDiff[] = []; - let origIndex = 0; - let modIndex = 0; - let currentLineNumber = 1; // 使用连续的行号 - - while (origIndex < originalLines.length || modIndex < modifiedLines.length) { - const originalLine = originalLines[origIndex] || ''; - const modifiedLine = modifiedLines[modIndex] || ''; - - // 如果其中一个文档已经处理完毕 - if (origIndex >= originalLines.length) { - // 只有修改后的文档有内容,这是新增行 - if (modifiedLine.trim()) { - // 只添加非空行 - diffs.push({ - type: 'added', - modified: modifiedLine, - lineNumber: currentLineNumber++ - }); - } - modIndex++; - continue; - } - - if (modIndex >= modifiedLines.length) { - // 只有原始文档有内容,这是删除行 - if (originalLine.trim()) { - // 只添加非空行 - diffs.push({ - type: 'removed', - original: originalLine, - lineNumber: currentLineNumber++ - }); - } - origIndex++; - continue; - } - - // 如果两行都是空的,跳过 - if (!originalLine.trim() && !modifiedLine.trim()) { - origIndex++; - modIndex++; - continue; - } - - // 计算行相似度 - const similarity = calculateSimilarity(originalLine, modifiedLine); - - if (similarity > 0.9) { - // 完全相同的行,标记为unchanged - diffs.push({ - type: 'unchanged', - original: originalLine, - modified: modifiedLine, - lineNumber: currentLineNumber++ - }); - origIndex++; - modIndex++; - } else if (similarity > 0.8) { - // 修改的行 - diffs.push({ - type: 'modified', - original: originalLine, - modified: modifiedLine, - lineNumber: currentLineNumber++ - }); - origIndex++; - modIndex++; - } else { - // 寻找最佳匹配 - let bestMatchIndex = -1; - let bestSimilarity = 0; - - for (let i = 0; i < Math.min(3, modifiedLines.length - modIndex); i++) { - const candidateSimilarity = calculateSimilarity(originalLine, modifiedLines[modIndex + i]); - if (candidateSimilarity > bestSimilarity) { - bestSimilarity = candidateSimilarity; - bestMatchIndex = i; - } - } - - if (bestSimilarity > 0.6) { - // 找到匹配,先添加新增的行 - for (let i = 0; i < bestMatchIndex; i++) { - const addedLine = modifiedLines[modIndex + i]; - if (addedLine.trim()) { - // 只添加非空行 - diffs.push({ - type: 'added', - modified: addedLine, - lineNumber: currentLineNumber++ - }); - } - } - - // 添加修改的行 - diffs.push({ - type: 'modified', - original: originalLine, - modified: modifiedLines[modIndex + bestMatchIndex], - lineNumber: currentLineNumber++ - }); - modIndex += bestMatchIndex + 1; - origIndex++; - } else { - // 没有找到匹配,可能是删除 - if (originalLine.trim()) { - // 只添加非空行 - diffs.push({ - type: 'removed', - original: originalLine, - lineNumber: currentLineNumber++ - }); - } - origIndex++; - } - } - } - - return diffs; -} - // 生成 HTML 报告 function generateHtmlReport(diffs: ParagraphDiff[], title: string): string { const timestamp = new Date().toLocaleString('zh-CN'); @@ -1291,7 +1137,22 @@ export async function tool(input: z.infer) { // Zod 会自动验证输入,如果验证失败会抛出错误 const validatedInput = InputType.parse(input); - const diffs = compareDocuments(validatedInput.originalText, validatedInput.modifiedText); + // 1. 文本标准化预处理(使用默认配置) + const normalizedOriginal = applyFullNormalization(validatedInput.originalText); + const normalizedModified = applyFullNormalization(validatedInput.modifiedText); + + // 2. 根据是否启用换行容差选择比较函数 + let diffs: ParagraphDiff[]; + if (validatedInput.lineTolerance?.enableLineBreakTolerance) { + diffs = compareDocumentsWithTolerance( + normalizedOriginal, + normalizedModified, + validatedInput.lineTolerance + ); + } else { + diffs = compareDocuments(normalizedOriginal, normalizedModified); + } + const html = generateHtmlReport(diffs, validatedInput.title); const uploadResult = await uploadFile({ diff --git a/modules/tool/packages/docDiff/src/textNormalizer.ts b/modules/tool/packages/docDiff/src/textNormalizer.ts new file mode 100644 index 00000000..2eb74b3d --- /dev/null +++ b/modules/tool/packages/docDiff/src/textNormalizer.ts @@ -0,0 +1,449 @@ +/** + * 文本标准化模块 + * 用于预处理文本,移除格式化语法和多余空格 + */ + +interface NormalizationOptions { + /** 是否移除 Markdown 格式化语法 */ + removeMarkdownFormatting?: boolean; + /** 是否保留表格格式 */ + preserveTables?: boolean; + /** 是否移除文本中间的多余空格 */ + removeExtraSpaces?: boolean; + /** 是否删除所有文本间的空格(更激进的处理,包括中英文间空格) */ + removeTextSpaces?: boolean; + /** 是否智能处理中英文混排空格(删除中英文间空格,保留英文单词内结构) */ + removeIntelligentSpaces?: boolean; + /** 是否将全角标点符号转换为半角 */ + convertPunctuation?: boolean; +} + +/** + * 标准化文本 + */ +function normalizeText(text: string, options: NormalizationOptions = {}): string { + const { + removeMarkdownFormatting = true, + preserveTables = true, + removeExtraSpaces = true, + removeTextSpaces = false, + removeIntelligentSpaces: enableIntelligentSpaces = false, + convertPunctuation = false + } = options; + + let result = text; + + // 标准化处理顺序: + // 1. 全角转半角(最先进行,避免影响后续格式识别) + if (convertPunctuation) { + result = convertFullWidthToHalfWidth(result); + } + + // 2. 合并多个空行(在格式处理前进行,避免空行影响格式识别) + result = mergeMultipleEmptyLines(result); + + // 3. 根据是否保留表格采用不同的处理策略 + if (preserveTables) { + // 保留表格:逐行处理,区分表格行和非表格行 + const lines = result.split('\n'); + const processedLines = lines.map((line) => { + if (isTableRow(line)) { + // 表格行:跳过 Markdown 处理,只处理空格 + return processTableRow(line, { + removeTextSpaces, + enableIntelligentSpaces, + removeExtraSpaces + }); + } else { + // 非表格行:应用完整处理流程 + let processedLine = line; + + // 先处理 Markdown 格式 + if (removeMarkdownFormatting) { + processedLine = removeMarkdownFormattingSyntax(processedLine); + } + + // 再处理空格 + processedLine = processSpaces(processedLine, { + removeTextSpaces, + enableIntelligentSpaces, + removeExtraSpaces + }); + + return processedLine; + } + }); + result = processedLines.join('\n'); + } else { + // 不保留表格:直接应用完整处理流程 + // 3. 处理 Markdown 格式(如果启用) + if (removeMarkdownFormatting) { + result = removeMarkdownFormattingSyntax(result); + } + + // 4. 最后处理空格(避免影响格式化识别) + result = processSpaces(result, { + removeTextSpaces, + enableIntelligentSpaces, + removeExtraSpaces + }); + } + + return result; +} + +/** + * 合并多个空行 + * 将连续的空行(2个或更多)合并为单个空行 + */ +function mergeMultipleEmptyLines(text: string): string { + return text.replace(/\n{3,}/g, '\n\n'); +} + +/** + * 处理空格的统一函数 + */ +function processSpaces( + text: string, + options: { + removeTextSpaces: boolean; + enableIntelligentSpaces: boolean; + removeExtraSpaces: boolean; + } +): string { + const { removeTextSpaces, enableIntelligentSpaces, removeExtraSpaces } = options; + + if (removeTextSpaces) { + return removeAllTextSpaces(text); + } else if (enableIntelligentSpaces) { + return removeIntelligentSpaces(text); + } else if (removeExtraSpaces) { + return removeExtraWhitespace(text); + } else { + return text; + } +} + +/** + * 处理表格行的空格,保留表格结构 + */ +function processTableRow( + line: string, + options: { + removeTextSpaces: boolean; + enableIntelligentSpaces: boolean; + removeExtraSpaces: boolean; + } +): string { + const { removeTextSpaces, enableIntelligentSpaces, removeExtraSpaces } = options; + + if (removeTextSpaces) { + // 删除所有空格,保留表格分隔符 + return line + .replace(/\s+/g, '') // 删除所有空格 + .replace(/\|\|/g, '|') // 修复可能连续的分隔符 + .replace(/^\||\|$/g, ''); // 删除首尾多余的分隔符 + } else if (enableIntelligentSpaces) { + // 智能处理表格单元格内的空格 + return line + .split('|') + .map((cell) => { + // eslint-disable-next-line no-control-regex + const hasNonEnglish = /[^\x00-\x7F]/.test(cell); + const isPureEnglish = /^[a-zA-Z0-9\s]*$/.test(cell.trim()); + + if (cell.trim() === '') return cell; // 空单元格 + + if (isPureEnglish) { + // 纯英文:保留单词间的单个空格 + return cell.replace(/\s+/g, ' ').trim(); + } else if (hasNonEnglish) { + // 包含中文:删除所有空格 + return cell.replace(/\s+/g, ''); + } else { + // 其他情况:删除多余空格 + return cell.replace(/\s+/g, ' ').trim(); + } + }) + .join('|'); + } else if (removeExtraSpaces) { + return removeExtraWhitespace(line); + } else { + return line; + } +} + +/** + * 判断是否是表格行 + */ +function isTableRow(line: string): boolean { + // 表格行的特征: + // 1. 包含管道符 | + // 2. 以 | 开头或包含 | | 模式(分隔行) + // 3. 不是普通文本中的单个 | + const trimmed = line.trim(); + return ( + trimmed.includes('|') && + (trimmed.startsWith('|') || + trimmed.includes('| |') || + /^[\s]*\|.*\|[\s]*$/.test(line) || + /^[\s]*\|[\s\-:]+\|[\s]*$/.test(line)) + ); +} + +/** + * 移除 Markdown 格式化语法(保留表格结构) + */ +function removeMarkdownFormattingSyntax(text: string): string { + // 1. 移除标题格式 + text = text.replace(/^(#{1,6})\s+/gm, ''); + + // 2. 移除加粗格式 **text** + text = text.replace(/\*\*([^*]+)\*\*/g, '$1'); + + // 3. 移除斜体格式 *text* 和 _text_ + text = text.replace(/\*([^*]+)\*/g, '$1'); + text = text.replace(/_([^_]+)_/g, '$1'); + + // 4. 移除删除线格式 ~~text~~ + text = text.replace(/~~([^~]+)~~/g, '$1'); + + // 5. 移除行内代码格式 `text` + text = text.replace(/`([^`]+)`/g, '$1'); + + // 6. 移除链接格式 [text](url) + text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1'); + + // 7. 移除引用格式 > + text = text.replace(/^[>\s]+/gm, ''); + + // 8. 移除列表标记(-、*、+、数字.) + text = text.replace(/^[\s]*[-*+]\s+/gm, ''); + text = text.replace(/^[\s]*\d+\.\s+/gm, ''); + + // 9. 移除代码块标记 ``` + text = text.replace(/```(\w+)?\s*([\s\S]*?)\s*```/g, '$2'); + + return text; +} + +/** + * 移除多余的空格 + */ +function removeExtraWhitespace(text: string): string { + // 1. 将多个连续空格替换为单个空格 + text = text.replace(/[ \t]+/g, ' '); + + // 2. 移除行首行尾空格 + text = text.replace(/^[ \t]+|[ \t]+$/gm, ''); + + // 3. 移除多余的换行符(保留空行结构) + text = text.replace(/\n{3,}/g, '\n\n'); + + return text; +} + +/** + * 删除所有文本间的空格(更激进的处理) + * 对于中英文混排,会删除所有空格包括中英文之间的空格 + */ +function removeAllTextSpaces(text: string): string { + // 保留表格结构,但删除表格内容中的空格 + const lines = text.split('\n'); + return lines + .map((line) => { + if (isTableRow(line)) { + // 表格行:保留表格分隔符,但删除单元格内容中的空格 + return line + .split('|') + .map((cell) => { + // 删除单元格内的所有空格,但保留基本结构 + return cell.replace(/\s+/g, ''); + }) + .join('|'); + } else { + // 普通行:删除所有空格(包括中英文之间的空格) + return line.replace(/\s+/g, ''); + } + }) + .join('\n'); +} + +/** + * 将全角标点符号转换为半角 + */ +function convertFullWidthToHalfWidth(text: string): string { + // 全角字符到半角字符的映射 + const fullWidthToHalfWidth = { + // 标点符号 + ',': ',', + '。': '.', + '!': '!', + '?': '?', + ';': ';', + ':': ':', + '(': '(', + ')': ')', + '【': '[', + '】': ']', + '{': '{', + '}': '}', + '"': '"', + "'": "'", + '《': '<', + '》': '>', + '〈': '<', + '〉': '>', + '…': '...', + '—': '-', + '——': '--', + '·': '.', + // 数字 + '0': '0', + '1': '1', + '2': '2', + '3': '3', + '4': '4', + '5': '5', + '6': '6', + '7': '7', + '8': '8', + '9': '9', + // 字母 + a: 'a', + b: 'b', + c: 'c', + d: 'd', + e: 'e', + f: 'f', + g: 'g', + h: 'h', + i: 'i', + j: 'j', + k: 'k', + l: 'l', + m: 'm', + n: 'n', + o: 'o', + p: 'p', + q: 'q', + r: 'r', + s: 's', + t: 't', + u: 'u', + v: 'v', + w: 'w', + x: 'x', + y: 'y', + z: 'z', + A: 'A', + B: 'B', + C: 'C', + D: 'D', + E: 'E', + F: 'F', + G: 'G', + H: 'H', + I: 'I', + J: 'J', + K: 'K', + L: 'L', + M: 'M', + N: 'N', + O: 'O', + P: 'P', + Q: 'Q', + R: 'R', + S: 'S', + T: 'T', + U: 'U', + V: 'V', + W: 'W', + X: 'X', + Y: 'Y', + Z: 'Z', + // 空格 + ' ': ' ' + }; + + // 使用正则表达式替换所有全角字符 + return text.replace(/[\uff00-\uffef]/g, (char) => { + return fullWidthToHalfWidth[char as keyof typeof fullWidthToHalfWidth] || char; + }); +} + +/** + * 智能处理中英文混排的空格 + * 保留必要的分隔,但删除多余的空格 + */ +function removeIntelligentSpaces(text: string): string { + const lines = text.split('\n'); + return lines + .map((line) => { + if (isTableRow(line)) { + // 表格行:保留表格结构,但智能处理单元格内容 + return line + .split('|') + .map((cell) => { + // 保留英文单词间的单个空格,删除其他多余空格 + const processedCell = cell + .replace(/\s+/g, ' ') // 多个空格合并为单个 + // 移除中英文之间的空格,但保留英文单词间的空格 + // eslint-disable-next-line no-control-regex + .replace(/([a-zA-Z]+)\s+([^\x00-\x7F]+)/g, '$1$2') // 英文后跟中文,移除空格 + // eslint-disable-next-line no-control-regex + .replace(/([^\x00-\x7F]+)\s+([a-zA-Z]+)/g, '$1$2'); // 中文后跟英文,移除空格 + + // 判断是否是纯英文内容 + // eslint-disable-next-line no-control-regex + const hasNonEnglish = /[^\x00-\x7F]/.test(processedCell); + + if (!hasNonEnglish) { + // 纯英文:保留单词间的单个空格 + return processedCell.replace(/\s+/g, ' ').trim(); + } else { + // 包含中文:删除所有剩余空格 + return processedCell.replace(/\s+/g, '').trim(); + } + }) + .join('|'); + } else { + // 普通行:智能处理空格 + const processedLine = line + .replace(/\s+/g, ' ') // 多个空格合并为单个 + // 移除中英文之间的空格,但保留英文单词间的空格 + // eslint-disable-next-line no-control-regex + .replace(/([a-zA-Z]+)\s+([^\x00-\x7F]+)/g, '$1$2') // 英文后跟中文,移除空格 + // eslint-disable-next-line no-control-regex + .replace(/([^\x00-\x7F]+)\s+([a-zA-Z]+)/g, '$1$2'); // 中文后跟英文,移除空格 + + // 判断是否是纯英文内容 + // eslint-disable-next-line no-control-regex + const hasNonEnglish = /[^\x00-\x7F]/.test(processedLine); + + if (!hasNonEnglish) { + // 纯英文:保留单词间的单个空格 + return processedLine.replace(/\s+/g, ' ').trim(); + } else { + // 包含中文:删除所有剩余空格 + return processedLine.replace(/\s+/g, '').trim(); + } + } + }) + .join('\n'); +} + +/** + * 应用完整的标准化流程 + */ +export function applyFullNormalization(text: string): string { + // 使用默认的标准化配置 + return normalizeText(text, { + removeMarkdownFormatting: true, + preserveTables: true, + removeExtraSpaces: true, + removeTextSpaces: false, + removeIntelligentSpaces: true, + convertPunctuation: true + }); +} diff --git a/modules/tool/packages/docDiff/test/diffAlgorithm.test.ts b/modules/tool/packages/docDiff/test/diffAlgorithm.test.ts new file mode 100644 index 00000000..4dc619ba --- /dev/null +++ b/modules/tool/packages/docDiff/test/diffAlgorithm.test.ts @@ -0,0 +1,398 @@ +import { describe, it, expect } from 'vitest'; +import { + calculateSimilarity, + isHighSimilarity, + isMediumSimilarity, + buildMatchMatrix, + backtrackLCS, + compareDocuments, + splitIntoLines +} from '../src/diffAlgorithm'; + +describe('灵敏相似度 Diff 算法核心功能测试', () => { + describe('calculateSimilarity', () => { + it('应该正确计算完全相同的文本相似度', () => { + expect(calculateSimilarity('hello', 'hello')).toBe(1.0); + expect(calculateSimilarity('相同内容', '相同内容')).toBe(1.0); + expect(calculateSimilarity('', '')).toBe(1.0); + }); + + it('应该正确计算空文本的相似度', () => { + expect(calculateSimilarity('hello', '')).toBe(0.0); + expect(calculateSimilarity('', 'world')).toBe(0.0); + }); + + it('应该对空格变化敏感', () => { + const sim = calculateSimilarity('hello world', 'hello world'); + expect(sim).toBeGreaterThan(0.9); // 多一个空格,相似度应该很高 + expect(sim).toBeLessThan(1.0); + }); + + it('应该对标点符号变化敏感', () => { + const sim = calculateSimilarity('你好,世界', '你好!世界'); + expect(sim).toBeGreaterThan(0.7); // 标点符号变化,相似度应该较高 + expect(sim).toBeLessThan(1.0); + }); + + it('应该对大小写变化敏感', () => { + const sim = calculateSimilarity('Hello', 'hello'); + expect(sim).toBeGreaterThan(0.7); // 大小写变化,相似度应该较高 + expect(sim).toBeLessThan(1.0); + }); + + it('应该正确计算大幅修改的相似度', () => { + const sim = calculateSimilarity('hello world', 'completely different'); + expect(sim).toBeLessThan(0.5); // 大幅修改,相似度应该较低 + }); + }); + + describe('isHighSimilarity 和 isMediumSimilarity', () => { + it('应该正确识别高相似度', () => { + expect(isHighSimilarity(0.8)).toBe(true); + expect(isHighSimilarity(0.71)).toBe(true); + expect(isHighSimilarity(0.7)).toBe(false); + expect(isHighSimilarity(1.0)).toBe(false); // 完全匹配是精确匹配,不是高相似度 + }); + + it('应该正确识别中等相似度', () => { + expect(isMediumSimilarity(0.6)).toBe(false); // 低于0.7阈值 + expect(isMediumSimilarity(0.51)).toBe(false); // 低于0.7阈值 + expect(isMediumSimilarity(0.5)).toBe(false); + expect(isMediumSimilarity(0.8)).toBe(true); + expect(isMediumSimilarity(1.0)).toBe(true); // 完全匹配也符合中等相似度 + }); + }); + + describe('splitIntoLines', () => { + it('应该正确分割文本行', () => { + const text = '第1行\n第2行\n第3行'; + const lines = splitIntoLines(text); + expect(lines).toEqual(['第1行', '第2行', '第3行']); + }); + + it('应该处理空行', () => { + const text = '第1行\n\n第3行'; + const lines = splitIntoLines(text); + expect(lines).toEqual(['第1行', '', '第3行']); + }); + }); + + describe('buildMatchMatrix', () => { + it('应该为空文档构建正确大小的矩阵', () => { + const originalLines: string[] = []; + const modifiedLines: string[] = []; + + const matrix = buildMatchMatrix(originalLines, modifiedLines); + + expect(matrix).toHaveLength(1); + expect(matrix[0]).toHaveLength(1); + expect(matrix[0][0]).toBe(0); + }); + + it('应该构建正确大小的矩阵', () => { + const originalLines = ['a', 'b']; + const modifiedLines = ['a', 'b', 'c']; + + const matrix = buildMatchMatrix(originalLines, modifiedLines); + + expect(matrix).toHaveLength(3); // originalLines.length + 1 + expect(matrix[0]).toHaveLength(4); // modifiedLines.length + 1 + }); + + it('应该正确识别高相似度的行', () => { + const originalLines = ['第1行', '第2行']; + const modifiedLines = ['第1行', '第2行']; + + const matrix = buildMatchMatrix(originalLines, modifiedLines); + + // 完全相同的行应该增加匹配计数 + expect(matrix[1][1]).toBe(1); + expect(matrix[2][2]).toBe(2); + }); + + it('应该识别中等相似度的行', () => { + const originalLines = ['hello world', 'test']; + const modifiedLines = ['hello world', 'test']; // 多一个空格 + + const matrix = buildMatchMatrix(originalLines, modifiedLines); + + // 高相似度的行应该增加匹配计数 + expect(matrix[1][1]).toBe(1); + expect(matrix[2][2]).toBe(2); + }); + + it('应该忽略低相似度的行', () => { + const originalLines = ['hello', 'test']; + const modifiedLines = ['completely different', 'test']; + + const matrix = buildMatchMatrix(originalLines, modifiedLines); + + // 低相似度的行不应该增加匹配计数 + expect(matrix[1][1]).toBe(0); + expect(matrix[2][2]).toBe(1); + }); + }); + + describe('backtrackLCS', () => { + it('应该正确回溯高相似度的行', () => { + const originalLines = ['第1行', '第2行', '第3行']; + const modifiedLines = ['第1行', '第2行', '第3行']; + + const matrix = buildMatchMatrix(originalLines, modifiedLines); + const { origIndices, modIndices } = backtrackLCS(matrix, originalLines, modifiedLines); + + expect(origIndices).toEqual([0, 1, 2]); + expect(modIndices).toEqual([0, 1, 2]); + }); + + it('应该处理中等相似度的匹配', () => { + const originalLines = ['hello world', '第2行']; + const modifiedLines = ['hello world', '第2行']; + + const matrix = buildMatchMatrix(originalLines, modifiedLines); + const { origIndices, modIndices } = backtrackLCS(matrix, originalLines, modifiedLines); + + // 第一行是高相似度,应该被匹配 + expect(origIndices).toContain(0); + expect(modIndices).toContain(0); + expect(origIndices).toContain(1); + expect(modIndices).toContain(1); + }); + }); +}); + +describe('灵敏文档对比算法测试', () => { + describe('开头插入行的处理', () => { + it('应该正确识别在开头插入的单行', () => { + const original = '第1行\n第2行\n第3行'; + const modified = '新插入行\n第1行\n第2行\n第3行'; + + const diffs = compareDocuments(original, modified); + + // 应该识别出新插入的行 + expect(diffs.some((diff) => diff.type === 'added' && diff.modified === '新插入行')).toBe( + true + ); + + // 后续行应该被正确识别为未修改 + const unchangedDiffs = diffs.filter((diff) => diff.type === 'unchanged'); + expect(unchangedDiffs.length).toBe(3); + }); + + it('应该正确处理开头插入多行的情况', () => { + const original = '第1行\n第2行'; + const modified = '插入行A\n插入行B\n第1行\n第2行'; + + const diffs = compareDocuments(original, modified); + + const addedDiffs = diffs.filter((diff) => diff.type === 'added'); + expect(addedDiffs.length).toBe(2); + + const unchangedDiffs = diffs.filter((diff) => diff.type === 'unchanged'); + expect(unchangedDiffs.length).toBe(2); + }); + }); + + describe('微小修改检测', () => { + it('应该将空格变化识别为修改', () => { + const original = 'Hello World'; + const modified = 'Hello World'; // 多一个空格 + + const diffs = compareDocuments(original, modified); + + // 应该识别为修改而不是删除+新增 + expect(diffs.length).toBe(1); + expect(diffs[0].type).toBe('modified'); + expect(diffs[0].original).toBe('Hello World'); + expect(diffs[0].modified).toBe('Hello World'); + }); + + it('应该将标点符号变化识别为修改', () => { + const original = '你好,世界'; + const modified = '你好!世界'; + + const diffs = compareDocuments(original, modified); + + // 应该识别为修改 + expect(diffs.length).toBe(1); + expect(diffs[0].type).toBe('modified'); + expect(diffs[0].original).toBe('你好,世界'); + expect(diffs[0].modified).toBe('你好!世界'); + }); + + it('应该将大小写变化识别为修改', () => { + const original = 'Hello World'; + const modified = 'hello world'; + + const diffs = compareDocuments(original, modified); + + // 应该识别为修改 + expect(diffs.length).toBe(1); + expect(diffs[0].type).toBe('modified'); + }); + }); + + describe('大幅修改检测', () => { + it('应该将完全不同的内容识别为删除+新增', () => { + const original = 'Hello World'; + const modified = 'Completely Different Text'; + + const diffs = compareDocuments(original, modified); + + // 应该识别为删除和新增 + expect(diffs.some((diff) => diff.type === 'removed')).toBe(true); + expect(diffs.some((diff) => diff.type === 'added')).toBe(true); + expect(diffs.some((diff) => diff.type === 'modified')).toBe(false); + }); + + it('应该正确处理内容完全不同的场景', () => { + const original = '第1行\n第2行'; + const modified = '完全不同的A行\n完全不同的B行'; + + const diffs = compareDocuments(original, modified); + + const removedCount = diffs.filter((diff) => diff.type === 'removed').length; + const addedCount = diffs.filter((diff) => diff.type === 'added').length; + const modifiedCount = diffs.filter((diff) => diff.type === 'modified').length; + + expect(removedCount).toBe(2); + expect(addedCount).toBe(2); + expect(modifiedCount).toBe(0); + }); + }); + + describe('中间插入行的处理', () => { + it('应该正确识别中间插入的行', () => { + const original = '第1行\n第2行\n第3行\n第4行'; + const modified = '第1行\n插入行A\n插入行B\n第2行\n第3行\n第4行'; + + const diffs = compareDocuments(original, modified); + + const addedDiffs = diffs.filter((diff) => diff.type === 'added'); + expect(addedDiffs.length).toBe(2); + + const unchangedDiffs = diffs.filter((diff) => diff.type === 'unchanged'); + expect(unchangedDiffs.length).toBe(4); + }); + }); + + describe('删除行的处理', () => { + it('应该正确识别删除的行', () => { + const original = '第1行\n要删除的行\n第3行'; + const modified = '第1行\n第3行'; + + const diffs = compareDocuments(original, modified); + + expect(diffs.some((diff) => diff.type === 'removed' && diff.original === '要删除的行')).toBe( + true + ); + + const unchangedDiffs = diffs.filter((diff) => diff.type === 'unchanged'); + expect(unchangedDiffs.length).toBe(2); + }); + }); + + describe('复杂场景的处理', () => { + it('应该正确处理各种修改类型混合的场景', () => { + const original = `第1行 +要删除的行 +第3行 +要微改的行 +第5行 +要大幅改的行`; + + const modified = `插入的新行 +第1行 +第3行 +要微改的行${' '} +第5行 +完全不同的行`; + + const diffs = compareDocuments(original, modified); + + const addedCount = diffs.filter((diff) => diff.type === 'added').length; + const removedCount = diffs.filter((diff) => diff.type === 'removed').length; + const modifiedCount = diffs.filter((diff) => diff.type === 'modified').length; + + expect(addedCount).toBe(2); // 插入的新行 + 完全不同的行 + expect(removedCount).toBe(2); // 要删除的行 + 要大幅改的行 + expect(modifiedCount).toBe(1); // 要微改的行(增加了空格) + }); + }); + + describe('边界情况处理', () => { + it('应该处理空文档对比', () => { + const original = ''; + const modified = '新文档内容'; + + const diffs = compareDocuments(original, modified); + + expect(diffs.some((diff) => diff.type === 'added')).toBe(true); + }); + + it('应该处理相同文档对比', () => { + const text = '第1行\n第2行\n第3行'; + + const diffs = compareDocuments(text, text); + + // 所有行应该都是未修改的 + expect(diffs.every((diff) => diff.type === 'unchanged')).toBe(true); + }); + + it('应该处理只有空行的文档', () => { + const original = '\n\n\n'; + const modified = '\n\n\n\n'; + + const diffs = compareDocuments(original, modified); + + // 应该能够处理而不出错 + expect(diffs.length).toBeGreaterThan(0); + }); + }); + + describe('性能测试', () => { + it('应该在合理时间内处理大文档', () => { + const largeOriginal = Array.from({ length: 500 }, (_, i) => `第${i + 1}行`).join('\n'); + const largeModified = largeOriginal + '\n新增的最后行'; + + const startTime = Date.now(); + const diffs = compareDocuments(largeOriginal, largeModified); + const endTime = Date.now(); + + expect(diffs.length).toBeGreaterThan(0); + expect(endTime - startTime).toBeLessThan(2000); // 应该在2秒内完成 + }); + }); + + describe('特殊字符处理', () => { + it('应该正确处理包含特殊字符的微小修改', () => { + const original = '包含特殊字符的文本: <>&"\''; + const modified = '包含特殊字符的文本: <>&"\' '; // 末尾多一个空格 + + const diffs = compareDocuments(original, modified); + + // 应该识别为修改 + expect(diffs.length).toBe(1); + expect(diffs[0].type).toBe('modified'); + }); + + it('应该正确处理Unicode字符', () => { + const original = '包含Unicode: 🚀 🌟 测试中文'; + const modified = '包含Unicode: 🎉 🌟 测试中文'; + + const diffs = compareDocuments(original, modified); + + // 应该识别为修改(emoji变化,但文本相似) + expect(diffs.some((diff) => diff.type === 'modified')).toBe(true); + }); + + it('应该处理不同语言的文本', () => { + const original = 'Hello world\n你好世界\nこんにちは'; + const modified = 'Hello world\n你好世界!\nこんにちは'; // 标点变化 + + const diffs = compareDocuments(original, modified); + + expect(diffs.some((diff) => diff.type === 'modified')).toBe(true); + }); + }); +}); diff --git a/modules/tool/packages/docDiff/test/integration-tolerance.test.ts b/modules/tool/packages/docDiff/test/integration-tolerance.test.ts new file mode 100644 index 00000000..8e890811 --- /dev/null +++ b/modules/tool/packages/docDiff/test/integration-tolerance.test.ts @@ -0,0 +1,91 @@ +import { describe, it, expect } from 'bun:test'; +import { compareDocumentsWithTolerance } from '../src/diffAlgorithm'; + +describe('增强的换行容差集成测试', () => { + it('应该处理OCR和docx之间的换行差异', () => { + const docxText = `这是完整的句子。 +这是另一个完整的句子,包含多个词语和标点符号。 +第三行也是完整的。`; + + const ocrText = `这是完整的 句子。 +这是另一个 完整的句子,包含多个词语 和 标点符号。 +第三行 也是 完整的。`; + + const diffs = compareDocumentsWithTolerance(docxText, ocrText, { + enableLineBreakTolerance: true, + scanRange: 2, + toleranceThreshold: 0.9 + }); + + console.log('Diff结果:'); + diffs.forEach((diff, index) => { + console.log(`${index + 1}. ${diff.type}: "${diff.original}" -> "${diff.modified}"`); + }); + + // 检查有多少行被识别为 unchanged + const unchangedDiffs = diffs.filter((diff) => diff.type === 'unchanged'); + + // 至少应该有一些行被识别为相同(考虑容差) + expect(unchangedDiffs.length).toBeGreaterThan(0); + }); + + it('应该处理文档开头和结尾的换行差异', () => { + // 测试文档开头 + const original1 = `第一行 +第二行`; + const modified1 = `第一行第二行`; + + const diffs1 = compareDocumentsWithTolerance(original1, modified1); + + // 测试文档结尾 + const original2 = `第一行 +第二行`; + const modified2 = `第一行 +第二行第三行`; + + const diffs2 = compareDocumentsWithTolerance(original2, modified2); + + // 两种情况都应该能正确处理 + expect(diffs1.length).toBeGreaterThan(0); + expect(diffs2.length).toBeGreaterThan(0); + + // 应该有较少的修改差异(由于换行容差) + const modifiedDiffs1 = diffs1.filter((diff) => diff.type === 'modified'); + const modifiedDiffs2 = diffs2.filter((diff) => diff.type === 'modified'); + + expect(modifiedDiffs1.length).toBeLessThan(3); + expect(modifiedDiffs2.length).toBeLessThan(3); + }); + + it('应该处理完整的OCR文档场景', () => { + const ocrText = `这 是 OCR 识 别 的 文本。 +第 二行 继续测试,有 额外 空格。 +这是 第三行,包 含全 角标点符号! +第 四行也是正 常内容。`; + + const cleanText = `这是OCR识别的文本。 +第二行继续测试,有额外空格。 +这是第三行,包含全角标点符号! +第四行也是正常内容。`; + + const diffs = compareDocumentsWithTolerance(ocrText, cleanText, { + enableLineBreakTolerance: true, + scanRange: 3, + toleranceThreshold: 0.95 + }); + + console.log('OCR场景测试结果:'); + diffs.forEach((diff, index) => { + console.log(`${index + 1}. ${diff.type}: "${diff.original}" -> "${diff.modified}"`); + }); + + // 统计不同类型的差异 + const unchangedDiffs = diffs.filter((diff) => diff.type === 'unchanged'); + const modifiedDiffs = diffs.filter((diff) => diff.type === 'modified'); + + console.log(`统计: ${unchangedDiffs.length}个相同, ${modifiedDiffs.length}个修改`); + + // 应该有一些相同的行(由于容差处理) + expect(unchangedDiffs.length).toBeGreaterThan(0); + }); +}); diff --git a/modules/tool/packages/docDiff/test/integration.test.ts b/modules/tool/packages/docDiff/test/integration.test.ts new file mode 100644 index 00000000..6dac9cd3 --- /dev/null +++ b/modules/tool/packages/docDiff/test/integration.test.ts @@ -0,0 +1,70 @@ +import { describe, it, expect } from 'vitest'; +import { tool } from '../src/index'; + +describe('docDiff 工具集成测试', () => { + it('应该正确处理开头插入行的场景', async () => { + const result = await tool({ + originalText: `第1行 +第2行 +第3行`, + modifiedText: `新插入的行 +第1行 +第2行 +第3行`, + title: '开头插入测试' + }); + + expect(result).toHaveProperty('htmlUrl'); + expect(result).toHaveProperty('diffs'); + expect(Array.isArray(result.diffs)).toBe(true); + + // 检查是否正确识别了新增的行 + const addedDiffs = result.diffs.filter((diff) => diff.type === 'added'); + expect(addedDiffs.length).toBe(1); + expect(addedDiffs[0].modified).toBe('新插入的行'); + }); + + it('应该正确处理复杂修改场景', async () => { + const result = await tool({ + originalText: `这是原始文档的第一行 +这是要修改的行 +这是第三行`, + modifiedText: `这是原始文档的第一行 +这是修改后的行 +这是新增的行 +这是第三行`, + title: '复杂场景测试' + }); + + expect(result).toHaveProperty('htmlUrl'); + expect(result.diffs.length).toBeGreaterThan(0); + + const types = result.diffs.map((diff) => diff.type); + // 严格模式下应该有新增、删除操作,但没有修改类型 + expect(types).toContain('added'); + expect(types).toContain('removed'); + expect(types).not.toContain('modified'); + }); + + it('应该能处理只有一行的文档对比', async () => { + const result = await tool({ + originalText: '单行内容', + modifiedText: '修改后的单行内容', + title: '单行文档测试' + }); + + expect(result).toHaveProperty('htmlUrl'); + expect(result.diffs.length).toBeGreaterThan(0); + }); + + it('应该能处理相同文档', async () => { + const result = await tool({ + originalText: '相同内容', + modifiedText: '相同内容', + title: '相同文档测试' + }); + + expect(result).toHaveProperty('htmlUrl'); + expect(result.diffs.length).toBe(0); + }); +}); diff --git a/modules/tool/packages/docDiff/test/lineBreakTolerance.test.ts b/modules/tool/packages/docDiff/test/lineBreakTolerance.test.ts new file mode 100644 index 00000000..63a542e9 --- /dev/null +++ b/modules/tool/packages/docDiff/test/lineBreakTolerance.test.ts @@ -0,0 +1,393 @@ +import { describe, it, expect } from 'bun:test'; +import { + compareWithLineBreakTolerance, + compareDocumentsWithTolerance, + type LineBreakToleranceOptions +} from '../src/diffAlgorithm'; + +describe('换行容差功能', () => { + const defaultOptions: LineBreakToleranceOptions = { + enableLineBreakTolerance: true, + scanRange: 3, + toleranceThreshold: 0.95 + }; + + describe('compareWithLineBreakTolerance', () => { + it('应该检测到完全相同的行', () => { + const originalLines = ['这是第一行', '这是第二行']; + const modifiedLines = ['这是第一行', '这是第二行']; + + const result = compareWithLineBreakTolerance( + '这是第一行', + '这是第一行', + originalLines, + modifiedLines, + 0, + 0, + defaultOptions + ); + + expect(result).toBe(true); + }); + + it('应该检测到换行差异:单行拆分为多行', () => { + const originalLines = ['这是一整行文本']; + const modifiedLines = ['这是一整行', '文本']; + + const result = compareWithLineBreakTolerance( + '这是一整行', + '这是一整行', + originalLines, + modifiedLines, + 0, + 0, + defaultOptions + ); + + expect(result).toBe(true); + }); + + it('应该检测到换行差异:多行合并为单行', () => { + const originalLines = ['这是第一行', '这是第二行']; + const modifiedLines = ['这是第一行这是第二行']; + + const result = compareWithLineBreakTolerance( + '这是第一行', + '这是第一行这是第二行', + originalLines, + modifiedLines, + 0, + 0, + defaultOptions + ); + + expect(result).toBe(true); + }); + + it('应该检测到换行差异:复杂的多行重组', () => { + const originalLines = [ + '函数的参数列表包括:', + 'name:用户名', + 'age:年龄', + 'email:邮箱地址' + ]; + const modifiedLines = ['函数的参数列表包括:name:用户名age:年龄email:邮箱地址']; + + const result = compareWithLineBreakTolerance( + '函数的参数列表包括:', + '函数的参数列表包括:name:用户名age:年龄email:邮箱地址', + originalLines, + modifiedLines, + 0, + 0, + defaultOptions + ); + + expect(result).toBe(true); + }); + + it('应该对高相似度的内容应用容差', () => { + const originalLines = ['这是原始文本内容']; + const modifiedLines = ['这是原始文本文内容']; // 少量差异 + + const result = compareWithLineBreakTolerance( + '这是原始文本内容', + '这是原始文本文内容', + originalLines, + modifiedLines, + 0, + 0, + defaultOptions + ); + + expect(result).toBe(true); // 相似度应该超过 0.95 阈值 + }); + + it('应该对差异过大的内容不应用容差', () => { + const originalLines = ['这是第一段内容']; + const modifiedLines = ['这是完全不同的第二段内容']; + + const result = compareWithLineBreakTolerance( + '这是第一段内容', + '这是完全不同的第二段内容', + originalLines, + modifiedLines, + 0, + 0, + defaultOptions + ); + + expect(result).toBe(false); + }); + + it('应该在禁用容差时返回 false', () => { + const originalLines = ['这是一整行文本']; + const modifiedLines = ['这是一整行', '文本']; + + const result = compareWithLineBreakTolerance( + '这是一整行', + '这是一整行', + originalLines, + modifiedLines, + 0, + 0, + { ...defaultOptions, enableLineBreakTolerance: false } + ); + + expect(result).toBe(false); + }); + + it('应该正确处理扫描范围限制', () => { + const originalLines = ['第1行', '第2行', '第3行', '第4行', '第5行']; + const modifiedLines = ['第1行', '第2行第3行第4行第5行']; + + const result = compareWithLineBreakTolerance( + '第2行', + '第2行第3行第4行第5行', + originalLines, + modifiedLines, + 1, // 从第2行开始 + 0, + { ...defaultOptions, scanRange: 2 } + ); + + expect(result).toBe(true); + }); + + it('应该处理大小写混合的内容', () => { + const originalLines = ['HelloWorld Test']; + const modifiedLines = ['hello', 'world', 'test']; + + const result = compareWithLineBreakTolerance( + 'HelloWorld Test', + 'hello', + originalLines, + modifiedLines, + 0, + 0, + defaultOptions + ); + + expect(result).toBe(true); // 转换为小写后应该匹配 + }); + }); + + describe('compareDocumentsWithTolerance', () => { + it('应该处理简单的换行差异', () => { + const original = `这是第一段文本。 +这是第二段文本。`; + const modified = `这是第一段文本。这是第二段文本。`; + + const diffs = compareDocumentsWithTolerance(original, modified, defaultOptions); + + // 应该识别为未修改(换行容差生效) + expect(diffs.length).toBe(2); + expect(diffs[0].type).toBe('unchanged'); + expect(diffs[1].type).toBe('unchanged'); + }); + + it('应该保持对真正修改的检测', () => { + const original = `这是原始文本。 +这是另一段文本。`; + const modified = `这是修改后的文本。 +这是另一段文本。`; + + const diffs = compareDocumentsWithTolerance(original, modified, defaultOptions); + + // 第一段应该被识别为修改(内容确实不同) + expect(diffs.some((diff) => diff.type === 'modified')).toBe(true); + expect(diffs.some((diff) => diff.original === '这是原始文本。')).toBe(true); + expect(diffs.some((diff) => diff.modified === '这是修改后的文本。')).toBe(true); + }); + + it('应该处理复杂的文档结构', () => { + const original = `标题 + +第一章 +这是第一章的内容。 + +第二章 +这是第二章的内容。 + +结论 +文档结束。`; + + const modified = `标题 + +第一章这是第一章的内容。 + +第二章这是第二章的内容。 + +结论文档结束。`; + + const diffs = compareDocumentsWithTolerance(original, modified, defaultOptions); + + // 应该检测到标题未修改 + expect(diffs.some((diff) => diff.type === 'unchanged' && diff.original === '标题')).toBe( + true + ); + + // 章节内容应该通过容差处理 + expect(diffs.filter((diff) => diff.type === 'modified').length).toBeLessThan(3); + }); + + it('应该处理混合换行和内容修改', () => { + const original = `第一段内容。 +第二段内容。 +第三段内容。`; + + const modified = `第一段修改后的内容。第二段内容。 +第三段内容。`; + + const diffs = compareDocumentsWithTolerance(original, modified, defaultOptions); + + // 应该检测到第一段的修改 + expect( + diffs.some( + (diff) => + diff.type === 'modified' && + diff.original === '第一段内容。' && + diff.modified === '第一段修改后的内容。第二段内容。' + ) + ).toBe(true); + }); + + it('应该在禁用容差时使用严格比较', () => { + const original = `这是第一行。 +这是第二行。`; + const modified = `这是第一行。这是第二行。`; + + const strictDiffs = compareDocumentsWithTolerance(original, modified, { + ...defaultOptions, + enableLineBreakTolerance: false + }); + + const tolerantDiffs = compareDocumentsWithTolerance(original, modified, { + ...defaultOptions, + enableLineBreakTolerance: true + }); + + // 禁用容差应该产生更多差异 + expect(strictDiffs.length).toBeGreaterThan(tolerantDiffs.length); + }); + + it('应该处理空内容', () => { + const original = ''; + const modified = ''; + + const diffs = compareDocumentsWithTolerance(original, modified, defaultOptions); + + expect(diffs.length).toBe(0); + }); + + it('应该处理单行文档', () => { + const original = '这是单行文本'; + const modified = '这是单行文本'; + + const diffs = compareDocumentsWithTolerance(original, modified, defaultOptions); + + expect(diffs.length).toBe(1); + expect(diffs[0].type).toBe('unchanged'); + }); + + it('应该处理大量换行符的情况', () => { + const original = `第一段 + + + +第二段`; + + const modified = `第一段 +第二段`; + + const diffs = compareDocumentsWithTolerance(original, modified, defaultOptions); + + // 应该正确处理多余的换行符 + expect(diffs.some((diff) => diff.type === 'unchanged' && diff.original === '第一段')).toBe( + true + ); + expect(diffs.some((diff) => diff.type === 'unchanged' && diff.original === '第二段')).toBe( + true + ); + }); + }); + + describe('性能测试', () => { + it('应该在合理时间内处理大文档', () => { + // 生成大文档 + const originalLines = []; + const modifiedLines = []; + + for (let i = 0; i < 100; i++) { + originalLines.push(`这是第${i}段文本内容。`); + if (i % 10 === 0) { + // 每10段合并一次 + modifiedLines.push(`这是第${i}段文本内容。这是第${i + 1}段文本内容。`); + i++; // 跳过下一个 + } else { + modifiedLines.push(`这是第${i}段文本内容。`); + } + } + + const original = originalLines.join('\n'); + const modified = modifiedLines.join('\n'); + + const startTime = Date.now(); + const diffs = compareDocumentsWithTolerance(original, modified, defaultOptions); + const endTime = Date.now(); + + // 应该在合理时间内完成(2秒以内) + expect(endTime - startTime).toBeLessThan(2000); + expect(diffs.length).toBeGreaterThan(0); + }); + }); + + describe('边界情况', () => { + it('应该处理完全相同的文档', () => { + const text = `这是相同的文档内容。 +没有任何差异。`; + + const diffs = compareDocumentsWithTolerance(text, text, defaultOptions); + + expect(diffs.every((diff) => diff.type === 'unchanged')).toBe(true); + }); + + it('应该处理完全不同的文档', () => { + const original = '这是原始文档'; + const modified = '这是完全不同的文档'; + + const diffs = compareDocumentsWithTolerance(original, modified, defaultOptions); + + // 应该检测到修改 + expect(diffs.some((diff) => diff.type === 'modified')).toBe(true); + }); + + it('应该处理只包含空格的行', () => { + const original = `第一行 + +第三行`; + const modified = `第一行第三行`; + + const diffs = compareDocumentsWithTolerance(original, modified, defaultOptions); + + // 空行应该被容差处理 + expect(diffs.some((diff) => diff.type === 'unchanged' && diff.original === '第一行')).toBe( + true + ); + expect(diffs.some((diff) => diff.type === 'unchanged' && diff.original === '第三行')).toBe( + true + ); + }); + + it('应该处理特殊字符', () => { + const original = `特殊字符:!@#$%^&*() +中文标点:,。!?`; + const modified = `特殊字符:!@#$%^&*()中文标点:,。!?`; + + const diffs = compareDocumentsWithTolerance(original, modified, defaultOptions); + + // 应该通过容差处理换行 + expect(diffs.length).toBeGreaterThanOrEqual(1); + }); + }); +}); diff --git a/modules/tool/packages/docDiff/test/lineTolerance.test.ts b/modules/tool/packages/docDiff/test/lineTolerance.test.ts new file mode 100644 index 00000000..5c6ad5df --- /dev/null +++ b/modules/tool/packages/docDiff/test/lineTolerance.test.ts @@ -0,0 +1,378 @@ +import { describe, it, expect } from 'bun:test'; +import { + compareWithLineBreakTolerance, + compareDocumentsWithTolerance, + type LineBreakToleranceOptions +} from '../src/diffAlgorithm'; +import type { ParagraphDiff } from '../src/diffAlgorithm'; + +describe('换行容差算法', () => { + describe('compareWithLineBreakTolerance', () => { + const defaultOptions: LineBreakToleranceOptions = { + enableLineBreakTolerance: true, + scanRange: 3, + toleranceThreshold: 0.95 + }; + + it('应该识别完全相同的行', () => { + const originalLine = '这是相同的文本'; + const modifiedLine = '这是相同的文本'; + const originalLines = ['这是相同的文本', '下一行']; + const modifiedLines = ['这是相同的文本', '下一行']; + + const result = compareWithLineBreakTolerance( + originalLine, + modifiedLine, + originalLines, + modifiedLines, + 0, + 0, + defaultOptions + ); + + expect(result).toBe(true); + }); + + it('应该识别换行差异(两行合并为一行)', () => { + const originalLines = ['第一行', '第二行', '第三行']; + const modifiedLines = ['第一行第二行', '第三行']; + + // 测试原始文档的第一行 vs 修改文档的第一行 + const result = compareWithLineBreakTolerance( + '第一行', + '第一行第二行', + originalLines, + modifiedLines, + 0, + 0, + defaultOptions + ); + + expect(result).toBe(true); + }); + + it('应该识别换行差异(一行拆分为两行)', () => { + const originalLines = ['第一行第二行', '第三行']; + const modifiedLines = ['第一行', '第二行', '第三行']; + + // 测试原始文档的第一行 vs 修改文档的第一行 + const result = compareWithLineBreakTolerance( + '第一行第二行', + '第一行', + originalLines, + modifiedLines, + 0, + 0, + defaultOptions + ); + + expect(result).toBe(true); + }); + + it('应该处理复杂的换行重组', () => { + const originalLines = ['这是第一段文本,', '内容比较长。', '这是第二段文本。']; + const modifiedLines = ['这是第一段文本,内容比较长。', '这是第二段文本。']; + + const result = compareWithLineBreakTolerance( + '这是第一段文本,', + '这是第一段文本,内容比较长。', + originalLines, + modifiedLines, + 0, + 0, + defaultOptions + ); + + expect(result).toBe(true); + }); + + it('应该处理空格差异的换行容差', () => { + const originalLines = ['单词1 单词2', '单词3']; + const modifiedLines = ['单词1', '单词2', '单词3']; + + const result = compareWithLineBreakTolerance( + '单词1 单词2', + '单词1', + originalLines, + modifiedLines, + 0, + 0, + defaultOptions + ); + + expect(result).toBe(true); + }); + + it('应该在禁用时返回 false', () => { + const originalLine = '第一行'; + const modifiedLine = '第一行第二行'; + const originalLines = ['第一行', '第二行']; + const modifiedLines = ['第一行第二行']; + + const result = compareWithLineBreakTolerance( + originalLine, + modifiedLine, + originalLines, + modifiedLines, + 0, + 0, + { ...defaultOptions, enableLineBreakTolerance: false } + ); + + expect(result).toBe(false); + }); + + it('应该拒绝真正的差异', () => { + const originalLine = '这是原始文本'; + const modifiedLine = '这是完全不同的文本'; + const originalLines = ['这是原始文本', '其他内容']; + const modifiedLines = ['这是完全不同的文本', '其他内容']; + + const result = compareWithLineBreakTolerance( + originalLine, + modifiedLine, + originalLines, + modifiedLines, + 0, + 0, + defaultOptions + ); + + expect(result).toBe(false); + }); + + it('应该处理扫描范围边界', () => { + const originalLines = Array.from({ length: 10 }, (_, i) => `行${i}`); + const modifiedLines = Array.from({ length: 10 }, (_, i) => `行${i}`); + + // 在中间位置测试 + const result = compareWithLineBreakTolerance( + '行5', + '行5', + originalLines, + modifiedLines, + 5, + 5, + { ...defaultOptions, scanRange: 2 } + ); + + expect(result).toBe(true); + }); + }); + + describe('compareDocumentsWithTolerance', () => { + it('应该处理简单的换行差异', () => { + const original = `第一行 +第二行 +第三行`; + + const modified = `第一行第二行 +第三行`; + + const diffs = compareDocumentsWithTolerance(original, modified); + + // 应该只有一个 unchanged(换行容差处理) + expect(diffs.length).toBe(2); + expect(diffs[0].type).toBe('unchanged'); + expect(diffs[1].type).toBe('unchanged'); + }); + + it('应该处理一行拆分为多行', () => { + const original = `第一行第二行 +第三行`; + + const modified = `第一行 +第二行 +第三行`; + + const diffs = compareDocumentsWithTolerance(original, modified); + + // 应该都识别为 unchanged(换行容差处理) + expect(diffs.length).toBe(3); + expect(diffs.every((diff) => diff.type === 'unchanged')).toBe(true); + }); + + it('应该保持对真实差异的敏感度', () => { + const original = `第一行 +第二行 +第三行`; + + const modified = `第一行 +修改的第二行 +第三行`; + + const diffs = compareDocumentsWithTolerance(original, modified); + + // 应该识别出修改 + const modifiedDiffs = diffs.filter((diff) => diff.type === 'modified'); + expect(modifiedDiffs.length).toBe(1); + expect(modifiedDiffs[0].original).toBe('第二行'); + expect(modifiedDiffs[0].modified).toBe('修改的第二行'); + }); + + it('应该处理混合场景(换行差异 + 真实修改)', () => { + const original = `第一段内容, +继续第二段。 +第三行 unchanged`; + + const modified = `第一段内容,继续第二段。 +第三行已修改`; + + const diffs = compareDocumentsWithTolerance(original, modified); + + // 应该识别出换行容差和真实修改 + const unchangedDiffs = diffs.filter((diff) => diff.type === 'unchanged'); + const modifiedDiffs = diffs.filter((diff) => diff.type === 'modified'); + + expect(modifiedDiffs.length).toBe(1); + expect(modifiedDiffs[0].original).toContain('第三行 unchanged'); + expect(modifiedDiffs[0].modified).toContain('第三行已修改'); + }); + + it('应该处理增加和删除', () => { + const original = `第一行 +第二行 +第三行`; + + const modified = `新增的第一行 +第一行 +第三行`; + + const diffs = compareDocumentsWithTolerance(original, modified); + + const addedDiffs = diffs.filter((diff) => diff.type === 'added'); + const removedDiffs = diffs.filter((diff) => diff.type === 'removed'); + const unchangedDiffs = diffs.filter((diff) => diff.type === 'unchanged'); + + expect(addedDiffs.length).toBe(1); + expect(addedDiffs[0].modified).toBe('新增的第一行'); + expect(removedDiffs.length).toBe(1); + expect(removedDiffs[0].original).toBe('第二行'); + expect(unchangedDiffs.length).toBeGreaterThan(0); + }); + + it('应该处理空文档情况', () => { + const diffs1 = compareDocumentsWithTolerance('', '新内容'); + const diffs2 = compareDocumentsWithTolerance('原始内容', ''); + + expect(diffs1.length).toBe(1); + expect(diffs1[0].type).toBe('added'); + expect(diffs1[0].modified).toBe('新内容'); + + expect(diffs2.length).toBe(1); + expect(diffs2[0].type).toBe('removed'); + expect(diffs2[0].original).toBe('原始内容'); + }); + + it('应该处理相同文档', () => { + const text = `第一行 +第二行 +第三行`; + + const diffs = compareDocumentsWithTolerance(text, text); + + expect(diffs.length).toBe(3); + expect(diffs.every((diff) => diff.type === 'unchanged')).toBe(true); + }); + + it('应该处理不同的扫描范围设置', () => { + const original = `第一行 +第二行 +第三行 +第四行`; + + const modified = `第一行第二行 +第三行 +第四行`; + + // 较小的扫描范围 + const diffs1 = compareDocumentsWithTolerance(original, modified, { + enableLineBreakTolerance: true, + scanRange: 1, + toleranceThreshold: 0.95 + }); + + // 较大的扫描范围 + const diffs2 = compareDocumentsWithTolerance(original, modified, { + enableLineBreakTolerance: true, + scanRange: 5, + toleranceThreshold: 0.95 + }); + + // 两种情况下都应该能处理换行差异 + expect(diffs1.every((diff) => diff.type === 'unchanged')).toBe(true); + expect(diffs2.every((diff) => diff.type === 'unchanged')).toBe(true); + }); + + it('应该处理不同的相似度阈值', () => { + const original = `文本行1 +文本行2`; + + const modified = `文本行1文本行2`; + + // 高阈值 + const diffs1 = compareDocumentsWithTolerance(original, modified, { + enableLineBreakTolerance: true, + scanRange: 3, + toleranceThreshold: 0.99 + }); + + // 低阈值 + const diffs2 = compareDocumentsWithTolerance(original, modified, { + enableLineBreakTolerance: true, + scanRange: 3, + toleranceThreshold: 0.8 + }); + + // 高阈值情况下可能不会识别为容差,低阈值会识别 + expect(diffs1.length).toBeGreaterThanOrEqual(0); + expect(diffs2.length).toBeGreaterThanOrEqual(0); + }); + }); + + describe('复杂场景测试', () => { + it('应该处理段落级别的换行差异', () => { + const original = `这是第一段文本。 +内容比较长,被分成了多行。 +这是第二段文本。`; + + const modified = `这是第一段文本。内容比较长,被分成了多行。 +这是第二段文本。`; + + const diffs = compareDocumentsWithTolerance(original, modified); + + // 大部分内容应该被识别为 unchanged + const unchangedDiffs = diffs.filter((diff) => diff.type === 'unchanged'); + expect(unchangedDiffs.length).toBeGreaterThan(0); + }); + + it('应该处理表格相关的换行差异', () => { + const original = `| 列1 | 列2 | +|-----|-----| +| 值1 | 值2 |`; + + const modified = `| 列1 | 列2 | +|-----|-----| +| 值1 | +值2 |`; + + const diffs = compareDocumentsWithTolerance(original, modified); + + // 应该能处理表格中的换行差异 + expect(diffs.length).toBeGreaterThan(0); + }); + + it('应该处理代码块中的换行差异', () => { + const original = `function test() { + return true; +}`; + + const modified = `function test() { return true; }`; + + const diffs = compareDocumentsWithTolerance(original, modified); + + // 应该能处理代码中的换行差异 + expect(diffs.length).toBeGreaterThanOrEqual(0); + }); + }); +});