Skip to content

Commit 70651d3

Browse files
ykfnxx and jackwener authored
feat(douban): add movie adapter with search, top250, subject, marks, reviews commands (#239)
* feat(douban): add movie adapter with search, top250, subject, marks, reviews commands
  - search: search movies by keyword
  - top250: get top 250 movies
  - subject: get movie details by id
  - marks: export personal viewing marks
  - reviews: export personal movie reviews
* review: resolve douban adapter blockers
---------
Co-authored-by: jackwener <jakevingoo@gmail.com>
1 parent 9696db9 commit 70651d3

File tree

7 files changed

+491
-0
lines changed

7 files changed

+491
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ Run `opencli list` for the live registry.
161161
| **stackoverflow** | `hot` `search` `bounties` `unanswered` | Public |
162162
| **steam** | `top-sellers` | Public |
163163
| **weread** | `shelf` `search` `book` `highlights` `notes` `notebooks` `ranking` | Browser |
164+
| **douban** | `search` `top250` `subject` `marks` `reviews` | Browser |
164165

165166
> **Bloomberg note**: The RSS-backed Bloomberg listing commands (`main`, section feeds, `feeds`) work without a browser. `bloomberg news` is for standard Bloomberg story/article pages that your current Chrome session can already access. Audio and some other non-standard pages may fail, and OpenCLI does not bypass Bloomberg paywall or entitlement checks.
166167

README.zh-CN.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ npm install -g @jackwener/opencli@latest
162162
| **stackoverflow** | `hot` `search` `bounties` `unanswered` | 公开 |
163163
| **steam** | `top-sellers` | 公开 |
164164
| **weread** | `shelf` `search` `book` `highlights` `notes` `notebooks` `ranking` | 浏览器 |
165+
| **douban** | `search` `top250` `subject` `marks` `reviews` | 浏览器 |
165166

166167
> **Bloomberg 说明**:Bloomberg 的 RSS 列表命令(`main`、各栏目 feed、`feeds`)无需浏览器即可使用。`bloomberg news` 适用于当前 Chrome 会话本身就能访问的标准 Bloomberg 文章页。音频页和部分非标准页面可能失败,OpenCLI 也不会绕过 Bloomberg 的付费墙、登录或权限校验。
167168

src/clis/douban/marks.ts

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import { cli, Strategy } from '../../registry.js';
2+
import type { IPage } from '../../types.js';
3+
import { DoubanMark, getSelfUid } from './utils.js';
4+
5+
// Registers the `douban marks` command (Strategy.COOKIE, movie.douban.com).
// It collects a user's movie marks for one status list, or for all three
// lists in turn, and returns them as rows matching `columns`.
cli({
  site: 'douban',
  name: 'marks',
  description: '导出个人观影标记',
  domain: 'movie.douban.com',
  strategy: Strategy.COOKIE,
  args: [
    {
      name: 'status',
      default: 'collect',
      choices: ['collect', 'wish', 'do', 'all'],
      help: '标记类型: collect(看过), wish(想看), do(在看), all(全部)'
    },
    { name: 'limit', type: 'int', default: 50, help: '导出数量, 0 表示全部' },
    { name: 'uid', help: '用户ID,不填则使用当前登录账号' },
  ],
  columns: ['title', 'year', 'myRating', 'myStatus', 'myDate', 'myComment', 'url'],
  func: async (page: IPage, kwargs: { status?: string; limit?: number; uid?: string }) => {
    const { status: wanted = 'collect', limit: cap = 50, uid: explicitUid } = kwargs;

    // A positive cap bounds the export; anything else means "export everything".
    const capped = cap > 0;

    // Fall back to the logged-in account when no uid was supplied.
    const owner = explicitUid ? explicitUid : await getSelfUid(page);

    const queue = wanted === 'all' ? ['collect', 'wish', 'do'] : [wanted];
    const collected: DoubanMark[] = [];

    for (const list of queue) {
      // Stop visiting further lists once the cap is already satisfied.
      if (capped && collected.length >= cap) break;

      // 0 tells fetchMarks not to cap the list it is reading.
      const budget = capped ? cap - collected.length : 0;
      for (const mark of await fetchMarks(page, owner, list, budget)) {
        collected.push(mark);
      }
    }

    return capped ? collected.slice(0, cap) : collected.slice(0);
  },
});
44+
45+
/**
 * Scrapes one status list of a user's movie marks from movie.douban.com,
 * walking the paginated grid listing until it is exhausted or `limit` rows
 * have been gathered.
 *
 * Pagination advances by the number of `.item` elements the page actually
 * contained rather than by an assumed page size, so entries the extractor
 * skips (no title link / no id) cannot end the export early or shift the
 * offset.
 * NOTE(review): the site may serve fewer rows per page in grid mode than the
 * previously hard-coded 30 — confirm against a live listing.
 *
 * @param page   Browser page used for navigation and in-page extraction.
 * @param uid    Douban user id; URL-encoded before being placed in the path.
 * @param status List to read (`collect`, `wish` or `do`); also stamped on each
 *               row as `myStatus`.
 * @param limit  Maximum number of rows to return; `0` or less means no cap.
 * @returns Marks in listing order, at most `limit` of them when `limit > 0`.
 */
async function fetchMarks(
  page: IPage,
  uid: string,
  status: string,
  limit: number
): Promise<DoubanMark[]> {
  const marks: DoubanMark[] = [];
  // Ids already collected; a page that adds nothing new ends the walk, which
  // guarantees termination even if the site keeps serving the same page.
  const seenIds = new Set<string>();
  let offset = 0;

  while (true) {
    const url = `https://movie.douban.com/people/${encodeURIComponent(uid)}/${encodeURIComponent(status)}?start=${offset}&sort=time&rating=all&filter=all&mode=grid`;

    await page.goto(url);

    await page.wait({ time: 2 });

    // The extractor returns the raw item count alongside the parsed rows so
    // the caller can tell "page had items but some were skipped" apart from
    // "page was empty". `status` is embedded via JSON.stringify so it is always
    // a well-formed string literal inside the evaluated script.
    const scraped = (await page.evaluate(`
      () => {
        const marks = [];
        const ids = [];

        const items = document.querySelectorAll('.item');

        items.forEach(item => {
          const titleLink = item.querySelector('.info a[href*="/subject/"]');
          if (!titleLink) return;

          const titleEl = titleLink.querySelector('em');
          const titleText = titleEl?.textContent?.trim() || titleLink.textContent?.trim() || '';
          const title = titleText.split('/')[0].trim();
          const href = titleLink.href || '';

          const idMatch = href.match(/subject\\/(\\d+)/);
          const movieId = idMatch ? idMatch[1] : '';

          if (!movieId || !title) return;

          const ratingSpan = item.querySelector('span[class*="rating"]');
          let myRating = null;
          if (ratingSpan) {
            const cls = ratingSpan.className || '';
            const ratingMatch = cls.match(/rating(\\d)-t/);
            if (ratingMatch) {
              myRating = parseInt(ratingMatch[1], 10) * 2;
            }
          }

          const dateSpan = item.querySelector('.date');
          const myDate = dateSpan?.textContent?.trim() || '';

          const commentSpan = item.querySelector('.comment');
          const myComment = commentSpan?.textContent?.trim() || '';

          const introSpan = item.querySelector('.intro');
          let year = '';
          if (introSpan) {
            const introText = introSpan.textContent || '';
            const yearMatch = introText.match(/(\\d{4})/);
            year = yearMatch ? yearMatch[1] : '';
          }

          ids.push(movieId);
          marks.push({
            movieId,
            title,
            year,
            myRating,
            myStatus: ${JSON.stringify(status)},
            myComment,
            myDate,
            url: href || 'https://movie.douban.com/subject/' + movieId
          });
        });

        return { itemCount: items.length, ids, marks };
      }
    `)) as { itemCount: number; ids: string[]; marks: DoubanMark[] } | null | undefined;

    if (!scraped || !Array.isArray(scraped.marks)) break;

    const pageIds = Array.isArray(scraped.ids) ? scraped.ids : [];
    const fresh = scraped.marks.filter((_mark, index) => {
      const id = pageIds[index];
      if (id === undefined) return true;
      if (seenIds.has(id)) return false;
      seenIds.add(id);
      return true;
    });

    // Empty page, or a page that only repeated earlier rows: nothing left.
    if (fresh.length === 0) break;

    marks.push(...fresh);

    if (limit > 0 && marks.length >= limit) break;

    // Step past everything the page showed, including skipped items.
    offset += scraped.itemCount > 0 ? scraped.itemCount : fresh.length;

    // Pause between page loads to avoid hammering the site.
    await new Promise(resolve => setTimeout(resolve, 1000));
  }

  return limit > 0 ? marks.slice(0, limit) : marks;
}

src/clis/douban/reviews.ts

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import { cli, Strategy } from '../../registry.js';
2+
import type { IPage } from '../../types.js';
3+
import { getSelfUid, DoubanReview } from './utils.js';
4+
5+
// Registers the `douban reviews` command (Strategy.COOKIE, movie.douban.com).
// It lists a user's movie reviews and, with `full`, replaces each excerpt with
// the text scraped from the review's own page.
cli({
  site: 'douban',
  name: 'reviews',
  description: '导出个人影评',
  domain: 'movie.douban.com',
  strategy: Strategy.COOKIE,
  args: [
    { name: 'limit', type: 'int', default: 20, help: '导出数量' },
    { name: 'uid', help: '用户ID,不填则使用当前登录账号' },
    { name: 'full', type: 'bool', default: false, help: '获取完整影评内容' },
  ],
  columns: ['movieTitle', 'title', 'myRating', 'votes', 'content', 'url'],
  func: async (page: IPage, kwargs: { limit?: number; uid?: string; full?: boolean }) => {
    const { limit: maxReviews = 20, full: wantFullText = false } = kwargs;

    // Fall back to the logged-in account when no uid was supplied.
    let owner = kwargs.uid;
    if (!owner) {
      owner = await getSelfUid(page);
    }

    return fetchReviews(page, owner, maxReviews, wantFullText);
  },
});
26+
27+
/**
 * Scrapes a user's movie reviews from movie.douban.com, page by page, and
 * optionally swaps each excerpt for the full review text.
 *
 * @param page  Browser page used for navigation and in-page extraction.
 * @param uid   Douban user id; URL-encoded before being placed in the path.
 * @param limit Maximum number of reviews to return; `0` or less means no cap.
 * @param full  When true, each review's page is visited and `content` is
 *              replaced by the full text found there. The listing excerpt is
 *              kept when no full text could be read.
 * @returns Reviews in listing order, at most `limit` of them when `limit > 0`.
 */
async function fetchReviews(
  page: IPage,
  uid: string,
  limit: number,
  full: boolean,
): Promise<DoubanReview[]> {
  const reviews: DoubanReview[] = [];
  let start = 0;
  const pageSize = 20;

  while (true) {
    const url = `https://movie.douban.com/people/${encodeURIComponent(uid)}/reviews?start=${start}&sort=time`;

    await page.goto(url);

    await page.wait({ time: 1 });

    const data = (await page.evaluate(`
      () => {
        const reviews = [];

        document.querySelectorAll('.tlst').forEach(el => {
          const movieLinkEl = el.querySelector('.ilst a');
          const reviewTitleEl = el.querySelector('.nlst a[title]');
          const ratingEl = el.querySelector('.clst span[class*="allstar"]');
          const contentEl = el.querySelector('.review-short span');
          const votesEl = el.querySelector('.review-short .pl span');

          const movieHref = movieLinkEl?.href || '';
          const movieId = movieHref.match(/subject\\/(\\d+)/)?.[1] || '';
          const movieTitle = movieLinkEl?.getAttribute('title') || movieLinkEl?.textContent?.trim() || '';

          const reviewHref = reviewTitleEl?.href || '';
          const reviewId = reviewHref.match(/reviews\\/(\\d+)/)?.[1] || '';
          const title = reviewTitleEl?.textContent?.trim() || '';

          let myRating = 0;
          if (ratingEl) {
            const cls = ratingEl.className || '';
            const ratingMatch = cls.match(/allstar(\\d)0/);
            if (ratingMatch) {
              myRating = parseInt(ratingMatch[1], 10) * 2;
            }
          }

          const votesText = votesEl?.textContent || '';
          const votesMatch = votesText.match(/(\\d+)/);
          const votes = votesMatch ? parseInt(votesMatch[1], 10) : 0;

          reviews.push({
            reviewId,
            movieId,
            movieTitle,
            title,
            content: contentEl?.textContent?.trim() || '',
            myRating,
            createdAt: '',
            votes,
            url: reviewHref,
          });
        });

        return reviews;
      }
    `)) as DoubanReview[] | null | undefined;

    // A failed evaluation or an empty page means there is nothing more to
    // read; spreading a non-array here would throw.
    if (!Array.isArray(data) || data.length === 0) break;

    reviews.push(...data);

    if (data.length < pageSize) break;
    if (limit > 0 && reviews.length >= limit) break;

    start += pageSize;
  }

  const result = limit > 0 ? reviews.slice(0, limit) : reviews.slice();

  if (full) {
    for (const review of result) {
      if (!review.url) continue;

      const fullContent = await fetchFullReview(page, review.url);
      // Keep the listing excerpt rather than blanking the row when the
      // review page yielded no text.
      if (fullContent) {
        review.content = fullContent;
      }
    }
  }

  return result;
}
114+
115+
/**
 * Loads a single review page and reads its body text.
 *
 * @param page      Browser page used for navigation and in-page extraction.
 * @param reviewUrl Absolute URL of the review to open.
 * @returns Trimmed text of the page's `.review-content` element, or `''` when
 *          the element is absent or empty.
 */
async function fetchFullReview(page: IPage, reviewUrl: string): Promise<string> {
  // In-page extractor: runs inside the loaded review document.
  const readReviewBody = `
    () => {
      const contentEl = document.querySelector('.review-content');
      return contentEl?.textContent?.trim() || '';
    }
  `;

  await page.goto(reviewUrl);
  await page.wait({ time: 1 });

  return (await page.evaluate(readReviewBody)) as string;
}

src/clis/douban/subject.yaml

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# `douban subject <id>`: opens a movie's subject page and extracts its details
# from the page markup as a single output row.
site: douban
name: subject
description: 获取电影详情
domain: movie.douban.com
strategy: cookie
browser: true

args:
  id:
    positional: true
    required: true
    type: str
    description: 电影 ID

pipeline:
  - navigate: https://movie.douban.com/subject/${{ args.id }}

  - evaluate: |
      (async () => {
        const id = '${{ args.id }}';

        // Fixed 2s pause before reading the DOM.
        await new Promise(r => setTimeout(r, 2000));

        // Raw text of the first match (undefined/null when nothing matches).
        const rawText = (selector) => document.querySelector(selector)?.textContent;
        // Trimmed text of the first match, '' when missing or empty.
        const trimmed = (selector) => rawText(selector)?.trim() || '';
        // Comma-joined trimmed texts of all matches, optionally only the first `max`.
        const joinTexts = (selector, max) => {
          const nodes = Array.from(document.querySelectorAll(selector));
          const kept = max === undefined ? nodes : nodes.slice(0, max);
          return kept.map(node => node.textContent?.trim()).filter(Boolean).join(',');
        };

        const summary = trimmed('span[property="v:summary"]');

        return [{
          id,
          title: trimmed('span[property="v:itemreviewed"]'),
          originalTitle: trimmed('span[property="v:originalTitle"]'),
          year: trimmed('.year'),
          rating: parseFloat(rawText('strong[property="v:average"]') || '0'),
          ratingCount: parseInt(rawText('span[property="v:votes"]') || '0', 10),
          genres: joinTexts('span[property="v:genre"]'),
          directors: joinTexts('a[rel="v:directedBy"]'),
          // Only the first five cast members are reported.
          casts: joinTexts('a[rel="v:starring"]', 5),
          // Summary is truncated to 200 characters.
          summary: summary.substring(0, 200),
          url: `https://movie.douban.com/subject/${id}`
        }];
      })()

columns: [id, title, originalTitle, year, rating, ratingCount, genres, directors, casts, summary, url]

0 commit comments

Comments
 (0)