Skip to content

Commit f8bf663

Browse files
xiaokuili and jackwener authored
fix(xiaohongshu): improve search login-wall handling and detail output (#298)
* fix(xiaohongshu): improve search login-wall handling and detail output
* fix(xiaohongshu/search): keep login-wall detection & URL improvements, remove serial per-note enrichment
  - Detect login wall and throw a clear error message (from original PR)
  - Preserve search_result/ URL with xsec_token instead of degrading to /explore/<id>
  - Add author_url to results
  - Remove readNoteDetail() + sequential page.goto() per note (caused 60s+ delays for default limit=20 with 3s wait each)
  - Simplify and unify DOM extraction logic (remove unused fallback anchor scan)
  - Update tests: cover login-wall, URL preservation (assert single goto), and limit/filter

---------

Co-authored-by: jackwener <jakevingoo@gmail.com>
1 parent 22f5c7a commit f8bf663

File tree

2 files changed

+185
-15
lines changed

2 files changed

+185
-15
lines changed
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
import { describe, expect, it, vi } from 'vitest';
2+
import type { IPage } from '../../types.js';
3+
import { getRegistry } from '../../registry.js';
4+
import './search.js';
5+
6+
/**
 * Builds an `IPage` test double in which every method is a vitest mock.
 *
 * `evaluate` resolves with the entries of `evaluateResults` in order, one
 * entry per call. Once the queue is exhausted it falls back to the default
 * behaviour of a bare `vi.fn()`. All other methods resolve to an inert value
 * (`undefined`, an empty array, an empty string, or an empty form state).
 *
 * @param evaluateResults Payloads handed back by successive `page.evaluate()`
 *   calls. The array is only read, never mutated.
 * @returns A page mock whose methods can be inspected with vitest assertions.
 */
function createPageMock(evaluateResults: readonly unknown[]): IPage {
  const evaluate = vi.fn();
  // Queue one resolved value per expected evaluate() call, preserving order.
  for (const result of evaluateResults) {
    evaluate.mockResolvedValueOnce(result);
  }

  return {
    goto: vi.fn().mockResolvedValue(undefined),
    evaluate,
    snapshot: vi.fn().mockResolvedValue(undefined),
    click: vi.fn().mockResolvedValue(undefined),
    typeText: vi.fn().mockResolvedValue(undefined),
    pressKey: vi.fn().mockResolvedValue(undefined),
    scrollTo: vi.fn().mockResolvedValue(undefined),
    getFormState: vi.fn().mockResolvedValue({ forms: [], orphanFields: [] }),
    wait: vi.fn().mockResolvedValue(undefined),
    tabs: vi.fn().mockResolvedValue([]),
    closeTab: vi.fn().mockResolvedValue(undefined),
    newTab: vi.fn().mockResolvedValue(undefined),
    selectTab: vi.fn().mockResolvedValue(undefined),
    networkRequests: vi.fn().mockResolvedValue([]),
    consoleMessages: vi.fn().mockResolvedValue([]),
    scroll: vi.fn().mockResolvedValue(undefined),
    autoScroll: vi.fn().mockResolvedValue(undefined),
    installInterceptor: vi.fn().mockResolvedValue(undefined),
    getInterceptedRequests: vi.fn().mockResolvedValue([]),
    getCookies: vi.fn().mockResolvedValue([]),
    screenshot: vi.fn().mockResolvedValue(''),
  };
}
36+
37+
/**
 * Behavioural tests for the registered `xiaohongshu/search` command.
 *
 * Each case feeds the command a single mocked `page.evaluate()` payload of the
 * shape `{ loginWall, results }` and checks how the command post-processes it.
 */
describe('xiaohongshu search', () => {
  it('throws a clear error when the search page is blocked by a login wall', async () => {
    const cmd = getRegistry().get('xiaohongshu/search');
    expect(cmd?.func).toBeTypeOf('function');

    const page = createPageMock([
      {
        loginWall: true,
        results: [],
      },
    ]);

    await expect(cmd!.func!(page, { query: '特斯拉', limit: 5 })).rejects.toThrow(
      'Xiaohongshu search results are blocked behind a login wall'
    );
  });

  it('returns ranked results with search_result url and author_url preserved', async () => {
    const cmd = getRegistry().get('xiaohongshu/search');
    expect(cmd?.func).toBeTypeOf('function');

    const detailUrl =
      'https://www.xiaohongshu.com/search_result/68e90be80000000004022e66?xsec_token=test-token&xsec_source=';
    const authorUrl =
      'https://www.xiaohongshu.com/user/profile/635a9c720000000018028b40?xsec_token=user-token&xsec_source=pc_search';

    const page = createPageMock([
      {
        loginWall: false,
        results: [
          {
            title: '某鱼买FSD被坑了4万',
            author: '随风',
            likes: '261',
            url: detailUrl,
            author_url: authorUrl,
          },
        ],
      },
    ]);

    const result = await cmd!.func!(page, { query: '特斯拉', limit: 1 });

    // Should only do one goto (the search page itself), no per-note detail navigation
    expect(page.goto).toHaveBeenCalledTimes(1);

    expect(result).toEqual([
      {
        rank: 1,
        title: '某鱼买FSD被坑了4万',
        author: '随风',
        likes: '261',
        url: detailUrl,
        author_url: authorUrl,
      },
    ]);
  });

  it('filters out results with no title and respects the limit', async () => {
    const cmd = getRegistry().get('xiaohongshu/search');
    expect(cmd?.func).toBeTypeOf('function');

    const page = createPageMock([
      {
        loginWall: false,
        results: [
          {
            title: 'Result A',
            author: 'UserA',
            likes: '10',
            url: 'https://www.xiaohongshu.com/search_result/aaa',
            author_url: '',
          },
          {
            // Untitled entry: must be dropped before the limit is applied.
            title: '',
            author: 'UserB',
            likes: '5',
            url: 'https://www.xiaohongshu.com/search_result/bbb',
            author_url: '',
          },
          {
            title: 'Result C',
            author: 'UserC',
            likes: '3',
            url: 'https://www.xiaohongshu.com/search_result/ccc',
            author_url: '',
          },
          {
            // Titled, but beyond limit=2 once the untitled entry is removed.
            title: 'Result D',
            author: 'UserD',
            likes: '1',
            url: 'https://www.xiaohongshu.com/search_result/ddd',
            author_url: '',
          },
        ],
      },
    ]);

    const result = (await cmd!.func!(page, { query: '测试', limit: 2 })) as any[];

    // limit=2 over four raw entries: the untitled one is skipped (filter) and
    // the fourth is cut off (limit), so exactly A and C remain, in order.
    expect(result).toHaveLength(2);
    expect(result.map((item) => item.title)).toEqual(['Result A', 'Result C']);
    expect(result[0]).toMatchObject({ rank: 1, title: 'Result A' });
  });
});

src/clis/xiaohongshu/search.ts

Lines changed: 51 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ cli({
1818
{ name: 'query', required: true, positional: true, help: 'Search keyword' },
1919
{ name: 'limit', type: 'int', default: 20, help: 'Number of results' },
2020
],
21-
columns: ['rank', 'title', 'author', 'likes'],
21+
columns: ['rank', 'title', 'author', 'likes', 'url'],
2222
func: async (page, kwargs) => {
2323
const keyword = encodeURIComponent(kwargs.query);
2424
await page.goto(
@@ -29,34 +29,70 @@ cli({
2929
// Scroll a couple of times to load more results
3030
await page.autoScroll({ times: 2 });
3131

32-
const data = await page.evaluate(`
32+
const payload = await page.evaluate(`
3333
(() => {
34-
const notes = document.querySelectorAll('section.note-item');
34+
const loginWall = /登录后查看搜索结果/.test(document.body.innerText || '');
35+
36+
const normalizeUrl = (href) => {
37+
if (!href) return '';
38+
if (href.startsWith('http://') || href.startsWith('https://')) return href;
39+
if (href.startsWith('/')) return 'https://www.xiaohongshu.com' + href;
40+
return '';
41+
};
42+
43+
const cleanText = (value) => (value || '').replace(/\\s+/g, ' ').trim();
44+
3545
const results = [];
36-
notes.forEach(el => {
46+
const seen = new Set();
47+
48+
document.querySelectorAll('section.note-item').forEach(el => {
3749
// Skip "related searches" sections
3850
if (el.classList.contains('query-note-item')) return;
3951
40-
const titleEl = el.querySelector('.title, .note-title, a.title');
41-
const nameEl = el.querySelector('.name, .author-name, .nick-name');
52+
const titleEl = el.querySelector('.title, .note-title, a.title, .footer .title span');
53+
const nameEl = el.querySelector('a.author .name, .name, .author-name, .nick-name, a.author');
4254
const likesEl = el.querySelector('.count, .like-count, .like-wrapper .count');
43-
const linkEl = el.querySelector('a[href*="/explore/"], a[href*="/search_result/"], a[href*="/note/"]');
55+
// Prefer search_result link (preserves xsec_token) over generic /explore/ link
56+
const detailLinkEl =
57+
el.querySelector('a.cover.mask') ||
58+
el.querySelector('a[href*="/search_result/"]') ||
59+
el.querySelector('a[href*="/explore/"]') ||
60+
el.querySelector('a[href*="/note/"]');
61+
const authorLinkEl = el.querySelector('a.author, a[href*="/user/profile/"]');
4462
45-
const href = linkEl?.getAttribute('href') || '';
46-
const noteId = href.match(/\\/(?:explore|note)\\/([a-zA-Z0-9]+)/)?.[1] || '';
63+
const url = normalizeUrl(detailLinkEl?.getAttribute('href') || '');
64+
if (!url) return;
65+
66+
const key = url;
67+
if (seen.has(key)) return;
68+
seen.add(key);
4769
4870
results.push({
49-
title: (titleEl?.textContent || '').trim(),
50-
author: (nameEl?.textContent || '').trim(),
51-
likes: (likesEl?.textContent || '0').trim(),
52-
url: noteId ? 'https://www.xiaohongshu.com/explore/' + noteId : '',
71+
title: cleanText(titleEl?.textContent || ''),
72+
author: cleanText(nameEl?.textContent || ''),
73+
likes: cleanText(likesEl?.textContent || '0'),
74+
url,
75+
author_url: normalizeUrl(authorLinkEl?.getAttribute('href') || ''),
5376
});
5477
});
55-
return results;
78+
79+
return {
80+
loginWall,
81+
results,
82+
};
5683
})()
5784
`);
5885

59-
if (!Array.isArray(data)) return [];
86+
if (!payload || typeof payload !== 'object') return [];
87+
88+
if ((payload as any).loginWall) {
89+
throw new Error(
90+
'Xiaohongshu search results are blocked behind a login wall for the current browser session. ' +
91+
'Open https://www.xiaohongshu.com/search_result in Chrome and sign in, then retry.'
92+
);
93+
}
94+
95+
const data: any[] = Array.isArray((payload as any).results) ? (payload as any).results : [];
6096
return data
6197
.filter((item: any) => item.title)
6298
.slice(0, kwargs.limit)

0 commit comments

Comments
 (0)