Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 94 additions & 17 deletions packages/mcp/src/docs-search/doc-fetch.test.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,37 @@
import { describe, it, expect, vi } from 'vitest';
import { describe, it, expect, vi, afterEach } from 'vitest';
import { DocFetchTool, normalizeDocUrl } from './doc-fetch.js';

/**
 * Build a real `Response` for fetch stubs.
 *
 * Only `content` is required; the remaining fields default to a successful
 * HTML response (`text/html`, 200, "OK").
 */
const createMockResponse = (options: {
  content: string;
  contentType?: string;
  status?: number;
  statusText?: string;
}) => {
  const { content, contentType = 'text/html', status = 200, statusText = 'OK' } = options;
  // The content type travels as a header so callers can branch on it.
  const headers = { 'content-type': contentType };
  return new Response(content, { status, statusText, headers });
};

/**
 * Replace the global `fetch` with a vitest mock and return that mock.
 *
 * `factory` is invoked on every call so each request resolves to a freshly
 * built `Response`.
 */
const stubFetch = (factory: () => Response) => {
  const respond = () => Promise.resolve(factory());
  const mockedFetch = vi.fn().mockImplementation(respond);
  vi.stubGlobal('fetch', mockedFetch);
  return mockedFetch;
};

describe('DocFetchTool', () => {
const tool = new DocFetchTool();

afterEach(() => {
vi.clearAllMocks();
vi.unstubAllGlobals();
});

describe('URL validation', () => {
it('should accept valid HF and Gradio docs URLs', () => {
const validUrls = [
Expand Down Expand Up @@ -38,13 +66,30 @@ describe('DocFetchTool', () => {
});

describe('document chunking', () => {
it('uses markdown content from host when available', async () => {
const markdown = '# Heading\nBody content';
const fetchMock = stubFetch(() =>
createMockResponse({
content: markdown,
contentType: 'text/markdown',
}),
);

const result = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test' });
expect(fetchMock).toHaveBeenCalledWith('https://huggingface.co/docs/test', {
headers: { accept: 'text/markdown' },
});
expect(result).toBe(markdown);
});

it('should return small documents without chunking', async () => {

// Mock fetch to return HTML that converts to short markdown
global.fetch = vi.fn().mockResolvedValue({
ok: true,
text: () => Promise.resolve('<h1>Short Document</h1><p>This is a short document.</p>'),
});
stubFetch(() =>
createMockResponse({
content: '<h1>Short Document</h1><p>This is a short document.</p>',
}),
);

const result = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test' });

Expand All @@ -57,10 +102,11 @@ describe('DocFetchTool', () => {
// Mock fetch to return HTML that converts to long markdown
const longHtml = '<h1>Long Document</h1>' + '<p>This is a very long sentence that will be repeated many times to create a document that exceeds the 7500 token limit for testing chunking functionality.</p>'.repeat(200);

global.fetch = vi.fn().mockResolvedValue({
ok: true,
text: () => Promise.resolve(longHtml),
});
stubFetch(() =>
createMockResponse({
content: longHtml,
}),
);

const result = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test' });

Expand All @@ -74,21 +120,51 @@ describe('DocFetchTool', () => {
{ in: 'https://gradio.app/guides/x', out: 'https://www.gradio.app/guides/x' },
{ in: 'https://www.gradio.app/guides/x', out: 'https://www.gradio.app/guides/x' },
{ in: 'https://huggingface.co/docs/transformers', out: 'https://huggingface.co/docs/transformers' },
{ in: '/docs/diffusers/index', out: 'https://huggingface.co/docs/diffusers/index' },
{ in: './docs/diffusers/index', out: 'https://huggingface.co/docs/diffusers/index' },
{ in: 'not a url', out: 'not a url' },
];
for (const c of cases) {
expect(normalizeDocUrl(c.in)).toBe(c.out);
}
});

it('normalizes relative doc paths to the huggingface docs host', async () => {
const fetchMock = stubFetch(() =>
createMockResponse({
content: '<h1>Title</h1><p>Body</p>',
}),
);

const result = await tool.fetch({ doc_url: '/docs/test' });
expect(fetchMock).toHaveBeenCalledWith('https://huggingface.co/docs/test', {
headers: { accept: 'text/markdown' },
});
expect(result).toContain('# Title');
});

it('normalizes ./docs paths to the huggingface docs host', async () => {
const fetchMock = stubFetch(() =>
createMockResponse({
content: '<h1>Another Title</h1><p>Body</p>',
}),
);

await tool.fetch({ doc_url: './docs/another' });
expect(fetchMock).toHaveBeenCalledWith('https://huggingface.co/docs/another', {
headers: { accept: 'text/markdown' },
});
});

it('should return subsequent chunks with offset', async () => {
// Mock fetch to return the same long HTML
const longHtml = '<h1>Long Document</h1>' + '<p>This is a very long sentence that will be repeated many times to create a document that exceeds the 7500 token limit for testing chunking functionality.</p>'.repeat(200);

global.fetch = vi.fn().mockResolvedValue({
ok: true,
text: () => Promise.resolve(longHtml),
});
stubFetch(() =>
createMockResponse({
content: longHtml,
}),
);

// Get first chunk
const firstChunk = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test' });
Expand All @@ -106,10 +182,11 @@ describe('DocFetchTool', () => {
});

it('should handle offset beyond document length', async () => {
global.fetch = vi.fn().mockResolvedValue({
ok: true,
text: () => Promise.resolve('<h1>Short Document</h1><p>This is short.</p>'),
});
stubFetch(() =>
createMockResponse({
content: '<h1>Short Document</h1><p>This is short.</p>',
}),
);

const result = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test', offset: 10000 });

Expand Down
88 changes: 52 additions & 36 deletions packages/mcp/src/docs-search/doc-fetch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ export class DocFetchTool {
const nodeName = ((node as unknown as { nodeName?: string }).nodeName || '').toLowerCase();
if (nodeName === 'img') {
try {
const src = (node as unknown as { getAttribute?: (name: string) => string | null }).getAttribute?.('src') ??
const src =
(node as unknown as { getAttribute?: (name: string) => string | null }).getAttribute?.('src') ??
((node as unknown as { src?: string }).src || '');
if (
/\.svg(\?|$)/i.test(src) ||
Expand All @@ -87,21 +88,30 @@ export class DocFetchTool {
this.turndownService.addRule('dropHeadingAnchors', {
filter: (node) => {
try {
const n = node as unknown as { nodeName?: string; getAttribute?: (k: string) => string | null; textContent?: string; childNodes?: Array<{ nodeName?: string }> };
const n = node as unknown as {
nodeName?: string;
getAttribute?: (k: string) => string | null;
textContent?: string;
childNodes?: Array<{ nodeName?: string }>;
};
if ((n.nodeName || '').toLowerCase() !== 'a') return false;
const href = n.getAttribute?.('href') || '';
if (!href || !href.startsWith('#')) return false;
const text = (n.textContent || '').trim();
const children = (n as unknown as { childNodes?: Array<{ nodeName?: string }> }).childNodes || [];
const onlyIcons = children.length > 0 && children.every((c) => ((c.nodeName || '').toLowerCase() === 'img' || (c.nodeName || '').toLowerCase() === 'svg'));
const onlyIcons =
children.length > 0 &&
children.every(
(c) => (c.nodeName || '').toLowerCase() === 'img' || (c.nodeName || '').toLowerCase() === 'svg'
);
const looksLikeEncodedSvg = /data:image\/svg\+xml|%3csvg|svg%2bxml/i.test(text);
const noAlnumText = text.length <= 3 && !/[a-z0-9]/i.test(text);
return onlyIcons || looksLikeEncodedSvg || noAlnumText;
} catch {
return false;
}
},
replacement: () => ''
replacement: () => '',
});
}

Expand Down Expand Up @@ -136,37 +146,35 @@ export class DocFetchTool {
const normalizedUrl = normalizeDocUrl(params.doc_url);
this.validateUrl(normalizedUrl);

const response = await fetch(normalizedUrl);

const response = await fetch(normalizedUrl, { headers: { accept: 'text/markdown' } });
if (!response.ok) {
throw new Error(`Failed to fetch document: ${response.status} ${response.statusText}`);
}

const htmlContent = await response.text();

// Convert HTML to Markdown
let fullMarkdownContent = this.turndownService.turndown(htmlContent);

// Post-process: strip any leftover SVG images that slipped past DOM filters
// - Markdown images pointing to data:image/svg+xml or *.svg
// - Empty links left behind after image removal: [](...)
fullMarkdownContent = fullMarkdownContent
.replace(/!\[[^\]]*\]\(\s*(?:data:image\/svg\+xml[^)]*|[^)]*\.svg(?:\?[^)]*)?)\s*\)/gi, '')
.replace(/\[\s*\]\(\s*[^)]*\s*\)/g, '');

// Remove anchors whose link text still contains encoded SVG payloads (edge cases)
fullMarkdownContent = fullMarkdownContent
.replace(/\[[^\]]*(?:data:image\/svg\+xml|%3csvg|svg%2bxml)[^\]]*\]\([^)]*\)/gi, '');
let content = await response.text();
const contentType = response.headers.get('content-type') || '';
const isPlainOrMarkdown = contentType.includes('text/plain') || contentType.includes('text/markdown');
if (!isPlainOrMarkdown) {
// attempt conversion to markdown
content = this.turndownService.turndown(content);

// Post-process: strip any leftover SVG images that slipped past DOM filters
// - Markdown images pointing to data:image/svg+xml or *.svg
// - Empty links left behind after image removal: [](...)
content = content
.replace(/!\[[^\]]*\]\(\s*(?:data:image\/svg\+xml[^)]*|[^)]*\.svg(?:\?[^)]*)?)\s*\)/gi, '')
.replace(/\[\s*\]\(\s*[^)]*\s*\)/g, '');

// Remove anchors whose link text still contains encoded SVG payloads (edge cases)
content = content.replace(/\[[^\]]*(?:data:image\/svg\+xml|%3csvg|svg%2bxml)[^\]]*\]\([^)]*\)/gi, '');
}

// Apply chunking logic
return this.applyChunking(fullMarkdownContent, params.offset || 0);
return this.applyChunking(content, params.offset || 0);
} catch (error) {
throw new Error(`Failed to fetch document: ${error instanceof Error ? error.message : 'Unknown error'}`);
}
}



/**
* Apply chunking logic to markdown content
*/
Expand Down Expand Up @@ -213,15 +221,23 @@ export class DocFetchTool {
* - Convert gradio.app → www.gradio.app so pages resolve correctly
*/
export function normalizeDocUrl(input: string): string {
  // Normalization rules, applied to the whitespace-trimmed input:
  //   1. "/docs..."   -> prefixed with the huggingface.co origin
  //   2. "./docs..."  -> leading "./" dropped, then prefixed with the origin
  //   3. absolute URL on gradio.app -> host rewritten to www.gradio.app
  //   4. any other absolute URL     -> returned trimmed, otherwise unchanged
  // Input that cannot be parsed as a URL is returned exactly as received so
  // later validation can report it verbatim.
  try {
    const trimmed = input.trim();

    // Relative docs paths must be handled before `new URL`, which throws on
    // anything that is not an absolute URL.
    if (trimmed.startsWith('/docs')) {
      return `https://huggingface.co${trimmed}`;
    }
    if (trimmed.startsWith('./docs')) {
      // slice(2) removes the "./" so the path joins cleanly after the origin.
      return `https://huggingface.co/${trimmed.slice(2)}`;
    }

    const url = new URL(trimmed);
    if (url.hostname.toLowerCase() === 'gradio.app') {
      url.hostname = 'www.gradio.app';
      return url.toString();
    }
    return trimmed;
  } catch {
    // Not a parseable URL: hand the original input back untouched.
    return input;
  }
}