diff --git a/README.md b/README.md index e77fad9..bdb01bf 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # OpenCLI > **Make any website or Electron App your CLI.** -> Zero risk · Reuse Chrome login · AI-powered discovery · 80+ commands · 19 sites +> Zero risk · Reuse Chrome login · AI-powered discovery · Browser + Desktop automation [中文文档](./README.zh-CN.md) @@ -22,6 +22,7 @@ Turn ANY Electron application into a CLI tool! Recombine, script, and extend app - [Prerequisites](#prerequisites) - [Quick Start](#quick-start) - [Built-in Commands](#built-in-commands) +- [Download Support](#download-support) - [Output Formats](#output-formats) - [For AI Agents (Developer Guide)](#for-ai-agents-developer-guide) - [Remote Chrome (Server/Headless)](#remote-chrome-serverheadless) @@ -143,39 +144,101 @@ npm install -g @jackwener/opencli@latest ## Built-in Commands -**32 sites · 162 commands** — run `opencli list` for the live registry. - -| Site | Commands | Count | Mode | -|------|----------|:-----:|------| -| **twitter** | `trending` `bookmarks` `profile` `search` `timeline` `thread` `following` `followers` `notifications` `post` `reply` `delete` `like` `article` `follow` `unfollow` `bookmark` `unbookmark` | 18 | 🔐 Browser | -| **reddit** | `hot` `frontpage` `popular` `search` `subreddit` `read` `user` `user-posts` `user-comments` `upvote` `save` `comment` `subscribe` `saved` `upvoted` | 15 | 🔐 Browser | -| **cursor** | `status` `send` `read` `new` `dump` `composer` `model` `extract-code` `ask` `screenshot` `history` `export` | 12 | 🖥️ Desktop | -| **bilibili** | `hot` `search` `me` `favorite` `history` `feed` `subtitle` `dynamic` `ranking` `following` `user-videos` | 11 | 🔐 Browser | -| **codex** | `status` `send` `read` `new` `extract-diff` `model` `ask` `screenshot` `history` `export` | 10 | 🖥️ Desktop | -| **chatwise** | `status` `new` `send` `read` `ask` `model` `history` `export` `screenshot` | 9 | 🖥️ Desktop | -| **notion** | `status` `search` `read` `new` 
`write` `sidebar` `favorites` `export` | 8 | 🖥️ Desktop | -| **discord** | `status` `send` `read` `channels` `servers` `search` `members` | 7 | 🖥️ Desktop | -| **v2ex** | `hot` `latest` `topic` `daily` `me` `notifications` | 6 | 🌐 / 🔐 | -| **xueqiu** | `feed` `hot-stock` `hot` `search` `stock` `watchlist` | 6 | 🔐 Browser | -| **antigravity** | `status` `send` `read` `new` `evaluate` | 5 | 🖥️ Desktop | -| **xiaohongshu** | `search` `notifications` `feed` `me` `user` | 5 | 🔐 Browser | -| **chatgpt** | `status` `new` `send` `read` `ask` | 5 | 🖥️ Desktop | -| **wechat** | `status` `send` `new` `search` `read` | 5 | 🖥️ Desktop | -| **feishu** | `status` `send` `new` `search` `read` | 5 | 🖥️ Desktop | -| **xiaoyuzhou** | `podcast` `podcast-episodes` `episode` | 3 | 🌐 Public | -| **youtube** | `search` `video` `transcript` | 3 | 🔐 Browser | -| **zhihu** | `hot` `search` `question` | 3 | 🔐 Browser | -| **boss** | `search` `detail` | 2 | 🔐 Browser | -| **coupang** | `search` `add-to-cart` | 2 | 🔐 Browser | -| **bbc** | `news` | 1 | 🌐 Public | -| **ctrip** | `search` | 1 | 🔐 Browser | -| **github** | `search` | 1 | 🌐 Public | -| **hackernews** | `top` | 1 | 🌐 Public | -| **linkedin** | `search` | 1 | 🔐 Browser | -| **reuters** | `search` | 1 | 🔐 Browser | -| **smzdm** | `search` | 1 | 🔐 Browser | -| **weibo** | `hot` | 1 | 🔐 Browser | -| **yahoo-finance** | `quote` | 1 | 🔐 Browser | +Run `opencli list` for the live registry. 
+ +| Site | Commands | Mode | +|------|----------|------| +| **twitter** | `trending` `bookmarks` `profile` `search` `timeline` `thread` `following` `followers` `notifications` `post` `reply` `delete` `like` `article` `follow` `unfollow` `bookmark` `unbookmark` `download` | 🔐 Browser | +| **reddit** | `hot` `frontpage` `popular` `search` `subreddit` `read` `user` `user-posts` `user-comments` `upvote` `save` `comment` `subscribe` `saved` `upvoted` | 🔐 Browser | +| **cursor** | `status` `send` `read` `new` `dump` `composer` `model` `extract-code` `ask` `screenshot` `history` `export` | 🖥️ Desktop | +| **bilibili** | `hot` `search` `me` `favorite` `history` `feed` `subtitle` `dynamic` `ranking` `following` `user-videos` `download` | 🔐 Browser | +| **codex** | `status` `send` `read` `new` `extract-diff` `model` `ask` `screenshot` `history` `export` | 🖥️ Desktop | +| **chatwise** | `status` `new` `send` `read` `ask` `model` `history` `export` `screenshot` | 🖥️ Desktop | +| **notion** | `status` `search` `read` `new` `write` `sidebar` `favorites` `export` | 🖥️ Desktop | +| **discord** | `status` `send` `read` `channels` `servers` `search` `members` | 🖥️ Desktop | +| **v2ex** | `hot` `latest` `topic` `daily` `me` `notifications` | 🌐 / 🔐 | +| **xueqiu** | `feed` `hot-stock` `hot` `search` `stock` `watchlist` | 🔐 Browser | +| **antigravity** | `status` `send` `read` `new` `evaluate` | 🖥️ Desktop | +| **chatgpt** | `status` `new` `send` `read` `ask` | 🖥️ Desktop | +| **wechat** | `status` `send` `new` `search` `read` | 🖥️ Desktop | +| **feishu** | `status` `send` `new` `search` `read` | 🖥️ Desktop | +| **xiaohongshu** | `search` `notifications` `feed` `me` `user` `download` | 🔐 Browser | +| **xiaoyuzhou** | `podcast` `podcast-episodes` `episode` | 🌐 Public | +| **zhihu** | `hot` `search` `question` `download` | 🔐 Browser | +| **youtube** | `search` `video` `transcript` | 🔐 Browser | +| **boss** | `search` `detail` | 🔐 Browser | +| **coupang** | `search` `add-to-cart` | 🔐 
Browser | +| **bbc** | `news` | 🌐 Public | +| **ctrip** | `search` | 🔐 Browser | +| **github** | `search` | 🌐 Public | +| **hackernews** | `top` | 🌐 Public | +| **linkedin** | `search` | 🔐 Browser | +| **reuters** | `search` | 🔐 Browser | +| **smzdm** | `search` | 🔐 Browser | +| **weibo** | `hot` | 🔐 Browser | +| **yahoo-finance** | `quote` | 🔐 Browser | + +## Download Support + +OpenCLI supports downloading images, videos, and articles from supported platforms. + +### Supported Platforms + +| Platform | Content Types | Notes | +|----------|---------------|-------| +| **xiaohongshu** | Images, Videos | Downloads all media from a note | +| **bilibili** | Videos | Requires `yt-dlp` installed | +| **twitter** | Images, Videos | Downloads from user media tab or single tweet | +| **zhihu** | Articles (Markdown) | Exports articles with optional image download | + +### Prerequisites + +For video downloads from streaming platforms, you need to install `yt-dlp`: + +```bash +# Install yt-dlp +pip install yt-dlp +# or +brew install yt-dlp +``` + +### Usage Examples + +```bash +# Download images/videos from Xiaohongshu note +opencli xiaohongshu download --note-id abc123 --output ./xhs + +# Download Bilibili video (requires yt-dlp) +opencli bilibili download --bvid BV1xxx --output ./bilibili +opencli bilibili download --bvid BV1xxx --quality 1080p # Specify quality + +# Download Twitter media from user +opencli twitter download --username elonmusk --limit 20 --output ./twitter + +# Download single tweet media +opencli twitter download --tweet-url "https://x.com/user/status/123" --output ./twitter + +# Export Zhihu article to Markdown +opencli zhihu download --url "https://zhuanlan.zhihu.com/p/xxx" --output ./zhihu + +# Export with local images +opencli zhihu download --url "https://zhuanlan.zhihu.com/p/xxx" --download-images +``` + +### Pipeline Step (for YAML adapters) + +The `download` step can be used in YAML pipelines: + +```yaml +pipeline: + - fetch: 
https://api.example.com/media + - download: + url: ${{ item.imageUrl }} + dir: ./downloads + filename: ${{ item.title | sanitize }}.jpg + concurrency: 5 + skip_existing: true +``` ## Output Formats @@ -220,7 +283,7 @@ Explore outputs to `.opencli/explore//` (manifest.json, endpoints.json, ca See **[TESTING.md](./TESTING.md)** for the full testing guide, including: -- Current test coverage (unit + E2E tests across 19 sites) +- Current test coverage (unit + E2E tests across browser and desktop adapters) - How to run tests locally - How to add tests when creating new adapters - CI/CD pipeline with sharding diff --git a/README.zh-CN.md b/README.zh-CN.md index 73e3e48..c741bbc 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -1,7 +1,7 @@ # OpenCLI > **把任何网站或 Electron 应用变成你的命令行工具。** -> 零风控 · 复用 Chrome 登录 · AI 自动发现接口 · 80+ 命令 · 19 站点 +> 零风控 · 复用 Chrome 登录 · AI 自动发现接口 · 浏览器与桌面端自动化 [English](./README.md) @@ -9,7 +9,7 @@ [![Node.js Version](https://img.shields.io/node/v/@jackwener/opencli?style=flat-square)](https://nodejs.org) [![License](https://img.shields.io/npm/l/@jackwener/opencli?style=flat-square)](./LICENSE) -OpenCLI 将任何网站或 Electron 应用(如 Antigravity)变成命令行工具 — B站、知乎、小红书、Twitter/X、Reddit、YouTube 等 [19 个站点](#内置命令) — 复用浏览器登录态,AI 驱动探索。 +OpenCLI 将任何网站或 Electron 应用(如 Antigravity)变成命令行工具 — B站、知乎、小红书、Twitter/X、Reddit、YouTube 等[多种站点与应用](#内置命令) — 复用浏览器登录态,AI 驱动探索。 🔥 **opencli 支持 CLI 化所有 electron 应用!最强大更新来袭!** 🔥 CLI all electron!现在支持把所有 electron 应用 CLI 化,从而组合出各种神奇的能力。 @@ -24,6 +24,7 @@ CLI all electron!现在支持把所有 electron 应用 CLI 化,从而组合 - [前置要求](#前置要求) - [快速开始](#快速开始) - [内置命令](#内置命令) +- [下载支持](#下载支持) - [输出格式](#输出格式) - [致 AI Agent(开发者指南)](#致-ai-agent开发者指南) - [远程 Chrome(服务器/无头环境)](#远程-chrome服务器无头环境) @@ -36,7 +37,7 @@ CLI all electron!现在支持把所有 electron 应用 CLI 化,从而组合 ## 亮点 - **CLI All Electron** — 支持把所有 electron 应用(如 Antigravity Ultra)CLI 化,让 AI 控制自己! 
-- **多站点覆盖** — B站、知乎、小红书、Twitter、Reddit 等 19 个站点,80+ 命令 +- **多站点覆盖** — 覆盖 B站、知乎、小红书、Twitter、Reddit,以及多种桌面应用 - **零风控** — 复用 Chrome 登录态,无需存储任何凭证 - **自修复配置** — `opencli setup` 自动发现 Token;`opencli doctor` 诊断 10+ 工具配置;`--fix` 一键修复 - **AI 原生** — `explore` 自动发现 API,`synthesize` 生成适配器,`cascade` 探测认证策略 @@ -144,39 +145,101 @@ npm install -g @jackwener/opencli@latest ## 内置命令 -**32 个站点 · 162 命令** — 运行 `opencli list` 查看完整注册表。 - -| 站点 | 命令 | 数量 | 模式 | -|------|------|:----:|------| -| **twitter** | `trending` `bookmarks` `profile` `search` `timeline` `thread` `following` `followers` `notifications` `post` `reply` `delete` `like` `article` `follow` `unfollow` `bookmark` `unbookmark` | 18 | 🔐 浏览器 | -| **reddit** | `hot` `frontpage` `popular` `search` `subreddit` `read` `user` `user-posts` `user-comments` `upvote` `save` `comment` `subscribe` `saved` `upvoted` | 15 | 🔐 浏览器 | -| **cursor** | `status` `send` `read` `new` `dump` `composer` `model` `extract-code` `ask` `screenshot` `history` `export` | 12 | 🖥️ 桌面端 | -| **bilibili** | `hot` `search` `me` `favorite` `history` `feed` `subtitle` `dynamic` `ranking` `following` `user-videos` | 11 | 🔐 浏览器 | -| **codex** | `status` `send` `read` `new` `extract-diff` `model` `ask` `screenshot` `history` `export` | 10 | 🖥️ 桌面端 | -| **chatwise** | `status` `new` `send` `read` `ask` `model` `history` `export` `screenshot` | 9 | 🖥️ 桌面端 | -| **notion** | `status` `search` `read` `new` `write` `sidebar` `favorites` `export` | 8 | 🖥️ 桌面端 | -| **discord** | `status` `send` `read` `channels` `servers` `search` `members` | 7 | 🖥️ 桌面端 | -| **v2ex** | `hot` `latest` `topic` `daily` `me` `notifications` | 6 | 🌐 / 🔐 | -| **xueqiu** | `feed` `hot-stock` `hot` `search` `stock` `watchlist` | 6 | 🔐 浏览器 | -| **antigravity** | `status` `send` `read` `new` `evaluate` | 5 | 🖥️ 桌面端 | -| **chatgpt** | `status` `new` `send` `read` `ask` | 5 | 🖥️ 桌面端 | -| **wechat** | `status` `send` `new` `search` `read` | 5 | 🖥️ 桌面端 | -| **feishu** | `status` `send` `new` `search` 
`read` | 5 | 🖥️ 桌面端 | -| **xiaohongshu** | `search` `notifications` `feed` `me` `user` | 5 | 🔐 浏览器 | -| **xiaoyuzhou** | `podcast` `podcast-episodes` `episode` | 3 | 🌐 公开 | -| **youtube** | `search` `video` `transcript` | 3 | 🔐 浏览器 | -| **zhihu** | `hot` `search` `question` | 3 | 🔐 浏览器 | -| **boss** | `search` `detail` | 2 | 🔐 浏览器 | -| **coupang** | `search` `add-to-cart` | 2 | 🔐 浏览器 | -| **bbc** | `news` | 1 | 🌐 公共 API | -| **ctrip** | `search` | 1 | 🔐 浏览器 | -| **github** | `search` | 1 | 🌐 公共 API | -| **hackernews** | `top` | 1 | 🌐 公共 API | -| **linkedin** | `search` | 1 | 🔐 浏览器 | -| **reuters** | `search` | 1 | 🔐 浏览器 | -| **smzdm** | `search` | 1 | 🔐 浏览器 | -| **weibo** | `hot` | 1 | 🔐 浏览器 | -| **yahoo-finance** | `quote` | 1 | 🔐 浏览器 | +运行 `opencli list` 查看完整注册表。 + +| 站点 | 命令 | 模式 | +|------|------|------| +| **twitter** | `trending` `bookmarks` `profile` `search` `timeline` `thread` `following` `followers` `notifications` `post` `reply` `delete` `like` `article` `follow` `unfollow` `bookmark` `unbookmark` `download` | 🔐 浏览器 | +| **reddit** | `hot` `frontpage` `popular` `search` `subreddit` `read` `user` `user-posts` `user-comments` `upvote` `save` `comment` `subscribe` `saved` `upvoted` | 🔐 浏览器 | +| **cursor** | `status` `send` `read` `new` `dump` `composer` `model` `extract-code` `ask` `screenshot` `history` `export` | 🖥️ 桌面端 | +| **bilibili** | `hot` `search` `me` `favorite` `history` `feed` `subtitle` `dynamic` `ranking` `following` `user-videos` `download` | 🔐 浏览器 | +| **codex** | `status` `send` `read` `new` `extract-diff` `model` `ask` `screenshot` `history` `export` | 🖥️ 桌面端 | +| **chatwise** | `status` `new` `send` `read` `ask` `model` `history` `export` `screenshot` | 🖥️ 桌面端 | +| **notion** | `status` `search` `read` `new` `write` `sidebar` `favorites` `export` | 🖥️ 桌面端 | +| **discord** | `status` `send` `read` `channels` `servers` `search` `members` | 🖥️ 桌面端 | +| **v2ex** | `hot` `latest` `topic` `daily` `me` `notifications` | 🌐 / 🔐 | +| **xueqiu** | 
`feed` `hot-stock` `hot` `search` `stock` `watchlist` | 🔐 浏览器 | +| **antigravity** | `status` `send` `read` `new` `evaluate` | 🖥️ 桌面端 | +| **chatgpt** | `status` `new` `send` `read` `ask` | 🖥️ 桌面端 | +| **wechat** | `status` `send` `new` `search` `read` | 🖥️ 桌面端 | +| **feishu** | `status` `send` `new` `search` `read` | 🖥️ 桌面端 | +| **xiaohongshu** | `search` `notifications` `feed` `me` `user` `download` | 🔐 浏览器 | +| **xiaoyuzhou** | `podcast` `podcast-episodes` `episode` | 🌐 公开 | +| **zhihu** | `hot` `search` `question` `download` | 🔐 浏览器 | +| **youtube** | `search` `video` `transcript` | 🔐 浏览器 | +| **boss** | `search` `detail` | 🔐 浏览器 | +| **coupang** | `search` `add-to-cart` | 🔐 浏览器 | +| **bbc** | `news` | 🌐 公共 API | +| **ctrip** | `search` | 🔐 浏览器 | +| **github** | `search` | 🌐 公共 API | +| **hackernews** | `top` | 🌐 公共 API | +| **linkedin** | `search` | 🔐 浏览器 | +| **reuters** | `search` | 🔐 浏览器 | +| **smzdm** | `search` | 🔐 浏览器 | +| **weibo** | `hot` | 🔐 浏览器 | +| **yahoo-finance** | `quote` | 🔐 浏览器 | + +## 下载支持 + +OpenCLI 支持从各平台下载图片、视频和文章。 + +### 支持的平台 + +| 平台 | 内容类型 | 说明 | +|------|----------|------| +| **小红书** | 图片、视频 | 下载笔记中的所有媒体文件 | +| **B站** | 视频 | 需要安装 `yt-dlp` | +| **Twitter/X** | 图片、视频 | 从用户媒体页或单条推文下载 | +| **知乎** | 文章(Markdown) | 导出文章,可选下载图片到本地 | + +### 前置依赖 + +下载流媒体平台的视频需要安装 `yt-dlp`: + +```bash +# 安装 yt-dlp +pip install yt-dlp +# 或者 +brew install yt-dlp +``` + +### 使用示例 + +```bash +# 下载小红书笔记中的图片/视频 +opencli xiaohongshu download --note-id abc123 --output ./xhs + +# 下载B站视频(需要 yt-dlp) +opencli bilibili download --bvid BV1xxx --output ./bilibili +opencli bilibili download --bvid BV1xxx --quality 1080p # 指定画质 + +# 下载 Twitter 用户的媒体 +opencli twitter download --username elonmusk --limit 20 --output ./twitter + +# 下载单条推文的媒体 +opencli twitter download --tweet-url "https://x.com/user/status/123" --output ./twitter + +# 导出知乎文章为 Markdown +opencli zhihu download --url "https://zhuanlan.zhihu.com/p/xxx" --output ./zhihu + +# 导出文章并下载图片到本地 +opencli zhihu download --url 
/**
 * Registers `opencli bilibili download`.
 *
 * Flow: verify yt-dlp is available, open the video page to read its title and
 * the page-visible cookies, write those cookies to a temporary Netscape cookie
 * file, then hand the video URL to yt-dlp. Always resolves to a single row
 * shaped like `columns` (`bvid`, `title`, `status`, `size`); on failure the
 * `size` column carries the error text.
 */
cli({
  site: 'bilibili',
  name: 'download',
  description: '下载B站视频(需要 yt-dlp)',
  domain: 'www.bilibili.com',
  strategy: Strategy.COOKIE,
  args: [
    { name: 'bvid', required: true, help: 'Video BV ID (e.g., BV1xxx)' },
    { name: 'output', default: './bilibili-downloads', help: 'Output directory' },
    { name: 'quality', default: 'best', help: 'Video quality (best, 1080p, 720p, 480p)' },
  ],
  columns: ['bvid', 'title', 'status', 'size'],
  func: async (page, kwargs) => {
    const bvid = kwargs.bvid;
    const output = kwargs.output;
    const quality = kwargs.quality;

    // Fail fast with an actionable message when yt-dlp is missing.
    if (!checkYtdlp()) {
      return [{
        bvid,
        title: '-',
        status: 'failed',
        size: 'yt-dlp not installed. Run: pip install yt-dlp',
      }];
    }

    // Open the video page so the title and cookies can be read from it.
    const videoUrl = `https://www.bilibili.com/video/${bvid}`;
    await page.goto(videoUrl);
    await page.wait(3); // presumably seconds, to let the page render — confirm against page.wait

    const data = await page.evaluate(`
      (() => {
        const title = document.querySelector('h1.video-title, .video-title')?.textContent?.trim() || 'video';
        return { title };
      })()
    `);

    const displayTitle = data?.title || 'video';
    const safeTitle = sanitizeFilename(displayTitle);

    // NOTE(review): document.cookie only exposes cookies that are not HttpOnly,
    // so some session cookies may be absent from the exported file — confirm.
    const cookieString = await page.evaluate(`(() => document.cookie)()`);

    fs.mkdirSync(output, { recursive: true });

    // Requested quality -> maximum video height. Anything else means "best".
    const maxHeightByQuality = new Map<string, number>([
      ['1080p', 1080],
      ['720p', 720],
      ['480p', 480],
    ]);
    const maxHeight = maxHeightByQuality.get(quality);
    const format = maxHeight === undefined
      ? 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best'
      : `bestvideo[height<=${maxHeight}][ext=mp4]+bestaudio[ext=m4a]/best[height<=${maxHeight}]`;

    const destPath = path.join(output, `${bvid}_${safeTitle}.mp4`);

    let cookiesFile: string | undefined;
    try {
      // Export cookies to a Netscape-format file that yt-dlp can consume.
      if (typeof cookieString === 'string' && cookieString) {
        const tempDir = getTempDir();
        fs.mkdirSync(tempDir, { recursive: true });
        cookiesFile = path.join(tempDir, `bilibili_cookies_${Date.now()}.txt`);

        const cookies = cookieString
          .split(';')
          .map((pair) => {
            // Only the first '=' separates name from value; values may contain '='.
            const [name, ...rest] = pair.trim().split('=');
            return {
              name: name || '',
              value: rest.join('=') || '',
              domain: '.bilibili.com',
              path: '/',
              secure: true,
              httpOnly: false,
            };
          })
          .filter((cookie) => cookie.name);

        exportCookiesToNetscape(cookies, cookiesFile);
      }

      const tracker = new DownloadProgressTracker(1, true);
      const progressBar = tracker.onFileStart(`${bvid}.mp4`, 0);

      try {
        const result = await ytdlpDownload(videoUrl, destPath, {
          cookiesFile,
          format,
          extraArgs: [
            '--merge-output-format', 'mp4',
            '--embed-thumbnail',
          ],
          onProgress: (percent: number) => {
            progressBar?.update(percent, 100);
          },
        });

        progressBar?.complete(result.success, result.success ? formatBytes(result.size) : undefined);
        tracker.onFileComplete(result.success);
        tracker.finish();

        return [{
          bvid,
          title: displayTitle,
          status: result.success ? 'success' : 'failed',
          size: result.success ? formatBytes(result.size) : (result.error || 'unknown error'),
        }];
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        progressBar?.fail(message);
        tracker.onFileComplete(false);
        tracker.finish();

        return [{
          bvid,
          title: displayTitle,
          status: 'failed',
          size: message,
        }];
      }
    } finally {
      // The cookie file holds session credentials: remove it on every exit path,
      // including failures that happen before the download itself starts.
      if (cookiesFile) {
        fs.rmSync(cookiesFile, { force: true });
      }
    }
  },
});
/**
 * Registers `opencli twitter download`.
 *
 * Opens either a single tweet (`--tweet-url`) or a user's media tab
 * (`--username`), collects image / video URLs from the rendered page, and
 * downloads up to `limit` unique items. Images and direct video URLs go through
 * `httpDownload`; tweets that contain a video player go through yt-dlp.
 * Resolves to one row per item shaped like `columns`; on failure the `size`
 * column carries the error text.
 */
cli({
  site: 'twitter',
  name: 'download',
  description: '下载 Twitter/X 媒体(图片和视频)',
  domain: 'x.com',
  strategy: Strategy.COOKIE,
  args: [
    { name: 'username', help: 'Twitter username (downloads from media tab)' },
    { name: 'tweet-url', help: 'Single tweet URL to download' },
    { name: 'limit', type: 'int', default: 10, help: 'Number of tweets to scan' },
    { name: 'output', default: './twitter-downloads', help: 'Output directory' },
  ],
  columns: ['index', 'type', 'status', 'size'],
  func: async (page, kwargs) => {
    type MediaItem = { type: 'image' | 'video' | 'video-tweet'; url: string };
    type DownloadResult = { success: boolean; size: number; error?: string };
    type Row = { index: number; type: string; status: string; size: string };

    const username = kwargs.username;
    const tweetUrl = kwargs['tweet-url'];
    const limit = kwargs.limit;
    const output = kwargs.output;

    if (!username && !tweetUrl) {
      return [{
        index: 0,
        type: '-',
        status: 'failed',
        size: 'Must provide --username or --tweet-url',
      }];
    }

    // A tweet URL takes precedence over the username's media tab.
    if (tweetUrl) {
      await page.goto(tweetUrl);
    } else {
      await page.goto(`https://x.com/${username}/media`);
    }
    await page.wait(3); // presumably seconds, to let the page render — confirm against page.wait

    // Only the media tab needs scrolling to load more items.
    if (!tweetUrl) {
      await page.autoScroll({ times: Math.ceil(limit / 5) });
    }

    const data = await page.evaluate(`
      (() => {
        const media = [];

        // Images: request the large rendition by setting the "name" query parameter.
        document.querySelectorAll('img[src*="pbs.twimg.com/media"]').forEach(img => {
          const src = img.src || '';
          if (!src) return;
          let large = src;
          try {
            const parsed = new URL(src);
            parsed.searchParams.set('name', 'large');
            large = parsed.toString();
          } catch (e) {
            // Not a parseable URL: keep the source as it is.
          }
          media.push({ type: 'image', url: large });
        });

        // Videos that expose a direct source URL.
        document.querySelectorAll('video').forEach(video => {
          const src = video.src || '';
          if (src) {
            media.push({ type: 'video', url: src, poster: video.poster || '' });
          }
        });

        // Tweets with a video player: record the tweet URL for yt-dlp.
        document.querySelectorAll('[data-testid="videoPlayer"]').forEach(player => {
          const tweetLink = player.closest('article')?.querySelector('a[href*="/status/"]');
          const href = tweetLink?.getAttribute('href') || '';
          if (href) {
            media.push({ type: 'video-tweet', url: 'https://x.com' + href });
          }
        });

        return media;
      })()
    `);

    if (!data || data.length === 0) {
      return [{
        index: 0,
        type: '-',
        status: 'failed',
        size: 'No media found',
      }];
    }

    // NOTE(review): document.cookie only exposes cookies that are not HttpOnly,
    // so some session cookies may be absent — confirm.
    const cookieString = await page.evaluate(`(() => document.cookie)()`);
    const cookieHeader = typeof cookieString === 'string' ? cookieString : '';

    const outputDir = tweetUrl
      ? path.join(output, 'tweets')
      : path.join(output, username || 'media');
    fs.mkdirSync(outputDir, { recursive: true });

    let cookiesFile: string | undefined;
    try {
      // Export cookies to a Netscape-format file that yt-dlp can consume.
      if (cookieHeader) {
        const tempDir = getTempDir();
        fs.mkdirSync(tempDir, { recursive: true });
        cookiesFile = path.join(tempDir, `twitter_cookies_${Date.now()}.txt`);

        const cookies = cookieHeader
          .split(';')
          .map((pair) => {
            // Only the first '=' separates name from value; values may contain '='.
            const [name, ...rest] = pair.trim().split('=');
            return {
              name: name || '',
              value: rest.join('=') || '',
              domain: '.x.com',
              path: '/',
              secure: true,
              httpOnly: false,
            };
          })
          .filter((cookie) => cookie.name);

        exportCookiesToNetscape(cookies, cookiesFile);
      }

      // Keep the first occurrence of each URL, then cap at `limit`.
      const seen = new Set<string>();
      const uniqueMedia: MediaItem[] = data
        .filter((item: MediaItem) => {
          if (seen.has(item.url)) return false;
          seen.add(item.url);
          return true;
        })
        .slice(0, limit);

      // Probe for yt-dlp once, and only when some item actually needs it.
      const ytdlpAvailable =
        uniqueMedia.some((item) => item.type === 'video-tweet') && checkYtdlp();

      const tracker = new DownloadProgressTracker(uniqueMedia.length, true);
      const results: Row[] = [];

      for (let i = 0; i < uniqueMedia.length; i++) {
        const media = uniqueMedia[i];
        const isImage = media.type === 'image';
        // Rows report both direct videos and video tweets as "video".
        const rowType = isImage ? 'image' : 'video';
        const filename = `${username || 'tweet'}_${i + 1}.${isImage ? 'jpg' : 'mp4'}`;
        const destPath = path.join(outputDir, filename);

        const progressBar = tracker.onFileStart(filename, i);

        try {
          let result: DownloadResult;

          if (media.type === 'video-tweet') {
            if (ytdlpAvailable) {
              result = await ytdlpDownload(media.url, destPath, {
                cookiesFile,
                extraArgs: ['--merge-output-format', 'mp4'],
                onProgress: (percent: number) => {
                  progressBar?.update(percent, 100);
                },
              });
            } else {
              // The URL here is a tweet page, not a media file: fetching it over
              // HTTP would save HTML under an .mp4 name, so report a failure.
              result = {
                success: false,
                size: 0,
                error: 'yt-dlp not installed. Run: pip install yt-dlp',
              };
            }
          } else {
            // Direct HTTP download; videos get a longer timeout than images.
            result = await httpDownload(media.url, destPath, {
              cookies: cookieHeader,
              timeout: isImage ? 30000 : 60000,
              onProgress: (received: number, total: number) => {
                progressBar?.update(received, total);
              },
            });
          }

          progressBar?.complete(result.success, result.success ? formatBytes(result.size) : undefined);
          tracker.onFileComplete(result.success);

          results.push({
            index: i + 1,
            type: rowType,
            status: result.success ? 'success' : 'failed',
            size: result.success ? formatBytes(result.size) : (result.error || 'unknown error'),
          });
        } catch (err: unknown) {
          const message = err instanceof Error ? err.message : String(err);
          progressBar?.fail(message);
          tracker.onFileComplete(false);

          results.push({
            index: i + 1,
            type: rowType,
            status: 'failed',
            size: message,
          });
        }
      }

      tracker.finish();
      return results;
    } finally {
      // The cookie file holds session credentials: remove it on every exit path.
      if (cookiesFile) {
        fs.rmSync(cookiesFile, { force: true });
      }
    }
  },
});
/**
 * Registers `opencli xiaohongshu download`.
 *
 * Opens the note page for `--note-id`, collects video and image URLs from the
 * rendered DOM, and downloads each one into `<output>/<note-id>/` via
 * `httpDownload`, sending the page-visible cookies with every request.
 * Resolves to one row per item shaped like `columns`; on failure the `size`
 * column carries the error text.
 */
cli({
  site: 'xiaohongshu',
  name: 'download',
  description: '下载小红书笔记中的图片和视频',
  domain: 'www.xiaohongshu.com',
  strategy: Strategy.COOKIE,
  args: [
    { name: 'note-id', required: true, help: 'Note ID (from URL)' },
    { name: 'output', default: './xiaohongshu-downloads', help: 'Output directory' },
  ],
  columns: ['index', 'type', 'status', 'size'],
  func: async (page, kwargs) => {
    type Row = { index: number; type: string; status: string; size: string };

    const noteId = kwargs['note-id'];
    const output = kwargs.output;

    await page.goto(`https://www.xiaohongshu.com/explore/${noteId}`);
    await page.wait(3); // presumably seconds, to let the page render — confirm against page.wait

    // The note id is embedded as a JSON string literal so that quotes or
    // backslashes in it cannot break out of the evaluated script.
    const noteIdLiteral = JSON.stringify(String(noteId));

    const data = await page.evaluate(`
      (() => {
        const result = {
          noteId: ${noteIdLiteral},
          title: '',
          author: '',
          media: []
        };

        // Title
        const titleEl = document.querySelector('.title, #detail-title, .note-content .title');
        result.title = titleEl?.textContent?.trim() || 'untitled';

        // Author
        const authorEl = document.querySelector('.username, .author-name, .name');
        result.author = authorEl?.textContent?.trim() || 'unknown';

        // Images: try several selectors and de-duplicate through a Set.
        const imageSelectors = [
          '.swiper-slide img',
          '.carousel-image img',
          '.note-slider img',
          '.note-image img',
          '.image-wrapper img',
          '#noteContainer img[src*="xhscdn"]',
          'img[src*="ci.xiaohongshu.com"]'
        ];

        const imageUrls = new Set();
        for (const selector of imageSelectors) {
          document.querySelectorAll(selector).forEach(img => {
            let src = img.src || img.getAttribute('data-src') || '';
            if (src && (src.includes('xhscdn') || src.includes('xiaohongshu'))) {
              // Drop the query string and any imageView resize segment.
              src = src.split('?')[0];
              src = src.replace(/\\/imageView\\d+\\/\\d+\\/w\\/\\d+/, '');
              imageUrls.add(src);
            }
          });
        }

        // Videos, if the note has any.
        const videoSelectors = [
          'video source',
          'video[src]',
          '.player video',
          '.video-player video'
        ];

        for (const selector of videoSelectors) {
          document.querySelectorAll(selector).forEach(v => {
            const src = v.src || v.getAttribute('src') || '';
            if (src) {
              result.media.push({
                type: 'video',
                url: src
              });
            }
          });
        }

        // Images are listed after videos.
        imageUrls.forEach(url => {
          result.media.push({
            type: 'image',
            url: url
          });
        });

        return result;
      })()
    `);

    if (!data || !data.media || data.media.length === 0) {
      return [{ index: 0, type: '-', status: 'failed', size: 'No media found' }];
    }

    // NOTE(review): document.cookie only exposes cookies that are not HttpOnly — confirm
    // this is sufficient for the media hosts.
    const cookies = await page.evaluate(`(() => document.cookie)()`);
    const cookieHeader = typeof cookies === 'string' ? cookies : '';

    const outputDir = path.join(output, noteId);
    fs.mkdirSync(outputDir, { recursive: true });

    const tracker = new DownloadProgressTracker(data.media.length, true);
    const results: Row[] = [];

    for (let i = 0; i < data.media.length; i++) {
      const media = data.media[i];
      const extension = media.type === 'video' ? 'mp4' : 'jpg';
      const filename = `${noteId}_${i + 1}.${extension}`;
      const destPath = path.join(outputDir, filename);

      const progressBar = tracker.onFileStart(filename, i);

      try {
        const result = await httpDownload(media.url, destPath, {
          cookies: cookieHeader,
          timeout: 60000,
          onProgress: (received: number, total: number) => {
            progressBar?.update(received, total);
          },
        });

        progressBar?.complete(result.success, result.success ? formatBytes(result.size) : undefined);
        tracker.onFileComplete(result.success);

        results.push({
          index: i + 1,
          type: media.type,
          status: result.success ? 'success' : 'failed',
          size: result.success ? formatBytes(result.size) : (result.error || 'unknown error'),
        });
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        progressBar?.fail(message);
        tracker.onFileComplete(false);

        results.push({
          index: i + 1,
          type: media.type,
          status: 'failed',
          size: message,
        });
      }
    }

    tracker.finish();

    return results;
  },
});
<ol><li>First item</li><li>Second item</li></ol>
/**
 * Convert a fragment of article HTML into Markdown.
 *
 * This is a simplified, regex-based converter, not an HTML parser. It handles
 * `script`/`style` removal, `h1`–`h4`, paragraphs, links, images, unordered
 * and ordered lists, bold/italic, code blocks and inline code, blockquotes and
 * `<br>`. Every other tag is stripped while its text content is kept, and a
 * small set of entities (`&nbsp;`, `&lt;`, `&gt;`, `&quot;`, `&amp;`) is
 * decoded.
 *
 * Tag names are matched with a trailing `\b` so that short names do not
 * swallow longer ones (`<p` vs `<pre>`, `<b` vs `<br>` / `<blockquote>`,
 * `<i` vs `<iframe>`).
 *
 * @param html - HTML source to convert.
 * @returns Markdown text, trimmed, with runs of three or more newlines
 *   collapsed to two.
 */
export function htmlToMarkdown(html: string): string {
  let md = html;

  // Remove script and style elements together with their content.
  md = md.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, '');
  md = md.replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, '');

  // Headers (single-line content only: `.` does not match newlines).
  md = md.replace(/<h1\b[^>]*>(.*?)<\/h1>/gi, '# $1\n\n');
  md = md.replace(/<h2\b[^>]*>(.*?)<\/h2>/gi, '## $1\n\n');
  md = md.replace(/<h3\b[^>]*>(.*?)<\/h3>/gi, '### $1\n\n');
  md = md.replace(/<h4\b[^>]*>(.*?)<\/h4>/gi, '#### $1\n\n');

  // Paragraphs
  md = md.replace(/<p\b[^>]*>([\s\S]*?)<\/p>/gi, '$1\n\n');

  // Links
  md = md.replace(/<a\b[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)');

  // Images: first the form with an alt after src, then any remaining image.
  md = md.replace(/<img\b[^>]*src="([^"]*)"[^>]*alt="([^"]*)"[^>]*\/?>/gi, '![$2]($1)');
  md = md.replace(/<img\b[^>]*src="([^"]*)"[^>]*\/?>/gi, '![]($1)');

  // Unordered lists
  md = md.replace(/<ul\b[^>]*>([\s\S]*?)<\/ul>/gi, (_match: string, content: string) => {
    return content.replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n') + '\n';
  });

  // Ordered lists: numbering restarts at 1 for every <ol>. A callback is used
  // for the items so the running index can be interpolated next to the text.
  md = md.replace(/<ol\b[^>]*>([\s\S]*?)<\/ol>/gi, (_match: string, content: string) => {
    let index = 0;
    return content.replace(
      /<li\b[^>]*>([\s\S]*?)<\/li>/gi,
      (_itemMatch: string, itemContent: string) => `${++index}. ${itemContent}\n`,
    ) + '\n';
  });

  // Bold and italic
  md = md.replace(/<strong\b[^>]*>(.*?)<\/strong>/gi, '**$1**');
  md = md.replace(/<b\b[^>]*>(.*?)<\/b>/gi, '**$1**');
  md = md.replace(/<em\b[^>]*>(.*?)<\/em>/gi, '*$1*');
  md = md.replace(/<i\b[^>]*>(.*?)<\/i>/gi, '*$1*');

  // Code blocks first, then whatever inline code is left.
  md = md.replace(/<pre\b[^>]*><code\b[^>]*>([\s\S]*?)<\/code><\/pre>/gi, '```\n$1\n```\n\n');
  md = md.replace(/<code\b[^>]*>(.*?)<\/code>/gi, '`$1`');

  // Blockquotes: prefix every line of the quoted content.
  md = md.replace(/<blockquote\b[^>]*>([\s\S]*?)<\/blockquote>/gi, (_match: string, content: string) => {
    return content.split('\n').map((line: string) => `> ${line}`).join('\n') + '\n\n';
  });

  // Line breaks
  md = md.replace(/<br\s*\/?>/gi, '\n');

  // Strip any tag that was not converted above.
  md = md.replace(/<[^>]+>/g, '');

  // Decode entities. `&amp;` goes last so that an escaped entity such as
  // `&amp;quot;` is decoded exactly once (to `&quot;`), not twice.
  md = md.replace(/&nbsp;/g, ' ');
  md = md.replace(/&lt;/g, '<');
  md = md.replace(/&gt;/g, '>');
  md = md.replace(/&quot;/g, '"');
  md = md.replace(/&amp;/g, '&');

  // Collapse runs of blank lines and trim the ends.
  md = md.replace(/\n{3,}/g, '\n\n');
  md = md.trim();

  return md;
}
h1.ContentItem-title, .ArticleTitle'); + result.title = titleEl?.textContent?.trim() || 'untitled'; + + // Get author + const authorEl = document.querySelector('.AuthorInfo-name, .UserLink-link'); + result.author = authorEl?.textContent?.trim() || 'unknown'; + + // Get publish time + const timeEl = document.querySelector('.ContentItem-time, .Post-Time'); + result.publishTime = timeEl?.textContent?.trim() || ''; + + // Get content HTML + const contentEl = document.querySelector('.Post-RichTextContainer, .RichText, .ArticleContent'); + if (contentEl) { + result.content = contentEl.innerHTML; + + // Extract image URLs + contentEl.querySelectorAll('img').forEach(img => { + const src = img.getAttribute('data-original') || img.getAttribute('data-actualsrc') || img.src; + if (src && !src.includes('data:image')) { + result.images.push(src); + } + }); + } + + return result; + })() + `); + + if (!data || !data.content) { + return [{ + title: 'Error', + author: '-', + status: 'failed', + size: 'Could not extract article content', + }]; + } + + // Create output directory + fs.mkdirSync(output, { recursive: true }); + + // Convert HTML to Markdown + let markdown = htmlToMarkdown(data.content); + + // Create frontmatter + const frontmatter = [ + '---', + `title: "${data.title.replace(/"/g, '\\"')}"`, + `author: "${data.author.replace(/"/g, '\\"')}"`, + `source: "${url}"`, + data.publishTime ? 
`date: "${data.publishTime}"` : '', + '---', + '', + ].filter(Boolean).join('\n'); + + // Download images if requested + if (downloadImages && data.images && data.images.length > 0) { + const imagesDir = path.join(output, 'images'); + fs.mkdirSync(imagesDir, { recursive: true }); + + const cookies = await page.evaluate(`(() => document.cookie)()`); + + for (let i = 0; i < data.images.length; i++) { + const imgUrl = data.images[i]; + const ext = imgUrl.match(/\.(jpg|jpeg|png|gif|webp)/i)?.[1] || 'jpg'; + const imgFilename = `img_${i + 1}.${ext}`; + const imgPath = path.join(imagesDir, imgFilename); + + try { + await httpDownload(imgUrl, imgPath, { + cookies: typeof cookies === 'string' ? cookies : '', + timeout: 30000, + }); + + // Replace image URL in markdown with local path + markdown = markdown.replace( + new RegExp(imgUrl.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g'), + `./images/${imgFilename}`, + ); + } catch { + // Keep original URL if download fails + } + } + } + + // Write markdown file + const safeTitle = sanitizeFilename(data.title, 100); + const filename = `${safeTitle}.md`; + const filePath = path.join(output, filename); + + const fullContent = frontmatter + '\n' + markdown; + fs.writeFileSync(filePath, fullContent, 'utf-8'); + + const size = Buffer.byteLength(fullContent, 'utf-8'); + + return [{ + title: data.title, + author: data.author, + status: 'success', + size: formatBytes(size), + }]; + }, +}); diff --git a/src/download/index.ts b/src/download/index.ts new file mode 100644 index 0000000..8253795 --- /dev/null +++ b/src/download/index.ts @@ -0,0 +1,395 @@ +/** + * Download utilities: HTTP downloads, yt-dlp wrapper, format conversion. 
+ */ + +import { spawn, execSync } from 'node:child_process'; +import * as fs from 'node:fs'; +import * as path from 'node:path'; +import * as https from 'node:https'; +import * as http from 'node:http'; +import * as os from 'node:os'; +import { URL } from 'node:url'; +import type { ProgressBar } from './progress.js'; + +export interface DownloadOptions { + cookies?: string; + headers?: Record; + timeout?: number; + onProgress?: (received: number, total: number) => void; +} + +export interface YtdlpOptions { + cookies?: string; + cookiesFile?: string; + format?: string; + extraArgs?: string[]; + onProgress?: (percent: number) => void; +} + +/** + * Check if yt-dlp is available in PATH. + */ +export function checkYtdlp(): boolean { + try { + execSync('yt-dlp --version', { encoding: 'utf-8', stdio: 'pipe' }); + return true; + } catch { + return false; + } +} + +/** + * Check if ffmpeg is available in PATH. + */ +export function checkFfmpeg(): boolean { + try { + execSync('ffmpeg -version', { encoding: 'utf-8', stdio: 'pipe' }); + return true; + } catch { + return false; + } +} + +/** + * Detect content type from URL and optional headers. 
+ */ +export function detectContentType(url: string, contentType?: string): 'image' | 'video' | 'document' | 'binary' { + // Check content-type header first + if (contentType) { + if (contentType.startsWith('image/')) return 'image'; + if (contentType.startsWith('video/')) return 'video'; + if (contentType.startsWith('text/') || contentType.includes('json') || contentType.includes('xml')) return 'document'; + } + + // Detect from URL + const urlLower = url.toLowerCase(); + const ext = path.extname(new URL(url).pathname).toLowerCase(); + + // Image extensions + if (['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico', '.bmp', '.avif'].includes(ext)) { + return 'image'; + } + + // Video extensions + if (['.mp4', '.webm', '.avi', '.mov', '.mkv', '.flv', '.m3u8', '.ts'].includes(ext)) { + return 'video'; + } + + // Video platforms (need yt-dlp) + if (urlLower.includes('youtube.com') || urlLower.includes('youtu.be') || + urlLower.includes('bilibili.com') || urlLower.includes('twitter.com') || + urlLower.includes('x.com') || urlLower.includes('tiktok.com') || + urlLower.includes('vimeo.com') || urlLower.includes('twitch.tv')) { + return 'video'; + } + + // Document extensions + if (['.html', '.htm', '.json', '.xml', '.txt', '.md', '.markdown'].includes(ext)) { + return 'document'; + } + + return 'binary'; +} + +/** + * Check if URL requires yt-dlp for download. + */ +export function requiresYtdlp(url: string): boolean { + const urlLower = url.toLowerCase(); + return ( + urlLower.includes('youtube.com') || + urlLower.includes('youtu.be') || + urlLower.includes('bilibili.com/video') || + urlLower.includes('twitter.com') || + urlLower.includes('x.com') || + urlLower.includes('tiktok.com') || + urlLower.includes('vimeo.com') || + urlLower.includes('twitch.tv') + ); +} + +/** + * HTTP download with progress callback. 
+ */ +export async function httpDownload( + url: string, + destPath: string, + options: DownloadOptions = {}, +): Promise<{ success: boolean; size: number; error?: string }> { + const { cookies, headers = {}, timeout = 30000, onProgress } = options; + + return new Promise((resolve) => { + const parsedUrl = new URL(url); + const protocol = parsedUrl.protocol === 'https:' ? https : http; + + const requestHeaders: Record = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + ...headers, + }; + + if (cookies) { + requestHeaders['Cookie'] = cookies; + } + + // Ensure directory exists + const dir = path.dirname(destPath); + fs.mkdirSync(dir, { recursive: true }); + + const tempPath = `${destPath}.tmp`; + const file = fs.createWriteStream(tempPath); + + const request = protocol.get(url, { headers: requestHeaders, timeout }, (response) => { + // Handle redirects + if (response.statusCode && response.statusCode >= 300 && response.statusCode < 400 && response.headers.location) { + file.close(); + fs.unlinkSync(tempPath); + httpDownload(response.headers.location, destPath, options).then(resolve); + return; + } + + if (response.statusCode !== 200) { + file.close(); + fs.unlinkSync(tempPath); + resolve({ success: false, size: 0, error: `HTTP ${response.statusCode}` }); + return; + } + + const totalSize = parseInt(response.headers['content-length'] || '0', 10); + let received = 0; + + response.on('data', (chunk: Buffer) => { + received += chunk.length; + if (onProgress) onProgress(received, totalSize); + }); + + response.pipe(file); + + file.on('finish', () => { + file.close(); + // Rename temp file to final destination + fs.renameSync(tempPath, destPath); + resolve({ success: true, size: received }); + }); + }); + + request.on('error', (err) => { + file.close(); + if (fs.existsSync(tempPath)) fs.unlinkSync(tempPath); + resolve({ success: false, size: 0, error: err.message }); + }); + + 
request.on('timeout', () => { + request.destroy(); + file.close(); + if (fs.existsSync(tempPath)) fs.unlinkSync(tempPath); + resolve({ success: false, size: 0, error: 'Timeout' }); + }); + }); +} + +/** + * Export cookies to Netscape format for yt-dlp. + */ +export function exportCookiesToNetscape( + cookies: Array<{ name: string; value: string; domain: string; path?: string; secure?: boolean; httpOnly?: boolean }>, + filePath: string, +): void { + const lines = [ + '# Netscape HTTP Cookie File', + '# https://curl.se/docs/http-cookies.html', + '# This is a generated file! Do not edit.', + '', + ]; + + for (const cookie of cookies) { + const domain = cookie.domain.startsWith('.') ? cookie.domain : `.${cookie.domain}`; + const includeSubdomains = 'TRUE'; + const cookiePath = cookie.path || '/'; + const secure = cookie.secure ? 'TRUE' : 'FALSE'; + const expiry = Math.floor(Date.now() / 1000) + 86400 * 365; // 1 year from now + lines.push(`${domain}\t${includeSubdomains}\t${cookiePath}\t${secure}\t${expiry}\t${cookie.name}\t${cookie.value}`); + } + + fs.mkdirSync(path.dirname(filePath), { recursive: true }); + fs.writeFileSync(filePath, lines.join('\n')); +} + +/** + * Download video using yt-dlp. + */ +export async function ytdlpDownload( + url: string, + destPath: string, + options: YtdlpOptions = {}, +): Promise<{ success: boolean; size: number; error?: string }> { + const { cookiesFile, format = 'best', extraArgs = [], onProgress } = options; + + if (!checkYtdlp()) { + return { success: false, size: 0, error: 'yt-dlp not installed. 
Install with: pip install yt-dlp' }; + } + + return new Promise((resolve) => { + const dir = path.dirname(destPath); + fs.mkdirSync(dir, { recursive: true }); + + // Build yt-dlp arguments + const args = [ + url, + '-o', destPath, + '-f', format, + '--no-playlist', + '--progress', + ]; + + if (cookiesFile && fs.existsSync(cookiesFile)) { + args.push('--cookies', cookiesFile); + } else { + // Try to use browser cookies + args.push('--cookies-from-browser', 'chrome'); + } + + args.push(...extraArgs); + + const proc = spawn('yt-dlp', args, { + stdio: ['ignore', 'pipe', 'pipe'], + }); + + let lastPercent = 0; + let errorOutput = ''; + + proc.stderr.on('data', (data: Buffer) => { + const line = data.toString(); + errorOutput += line; + + // Parse progress from yt-dlp output + const match = line.match(/(\d+\.?\d*)%/); + if (match && onProgress) { + const percent = parseFloat(match[1]); + if (percent > lastPercent) { + lastPercent = percent; + onProgress(percent); + } + } + }); + + proc.stdout.on('data', (data: Buffer) => { + const line = data.toString(); + const match = line.match(/(\d+\.?\d*)%/); + if (match && onProgress) { + const percent = parseFloat(match[1]); + if (percent > lastPercent) { + lastPercent = percent; + onProgress(percent); + } + } + }); + + proc.on('close', (code) => { + if (code === 0 && fs.existsSync(destPath)) { + const stats = fs.statSync(destPath); + resolve({ success: true, size: stats.size }); + } else { + // Check for common yt-dlp output patterns + const patterns = fs.readdirSync(dir).filter(f => f.startsWith(path.basename(destPath, path.extname(destPath)))); + if (patterns.length > 0) { + const actualFile = path.join(dir, patterns[0]); + const stats = fs.statSync(actualFile); + resolve({ success: true, size: stats.size }); + } else { + resolve({ success: false, size: 0, error: errorOutput.slice(0, 200) || `Exit code ${code}` }); + } + } + }); + + proc.on('error', (err) => { + resolve({ success: false, size: 0, error: err.message }); + }); + 
}); +} + +/** + * Save document content to file. + */ +export async function saveDocument( + content: string, + destPath: string, + format: 'json' | 'markdown' | 'html' | 'text' = 'markdown', + metadata?: Record, +): Promise<{ success: boolean; size: number; error?: string }> { + try { + const dir = path.dirname(destPath); + fs.mkdirSync(dir, { recursive: true }); + + let output: string; + + if (format === 'json') { + output = JSON.stringify({ ...metadata, content }, null, 2); + } else if (format === 'markdown') { + // Add frontmatter if metadata exists + const frontmatter = metadata ? `---\n${Object.entries(metadata).map(([k, v]) => `${k}: ${JSON.stringify(v)}`).join('\n')}\n---\n\n` : ''; + output = frontmatter + content; + } else { + output = content; + } + + fs.writeFileSync(destPath, output, 'utf-8'); + return { success: true, size: Buffer.byteLength(output, 'utf-8') }; + } catch (err: any) { + return { success: false, size: 0, error: err.message }; + } +} + +/** + * Sanitize filename by removing invalid characters. + */ +export function sanitizeFilename(name: string, maxLength: number = 200): string { + return name + .replace(/[<>:"/\\|?*\x00-\x1f]/g, '_') // Remove invalid chars + .replace(/\s+/g, '_') // Replace spaces with underscores + .replace(/_+/g, '_') // Collapse multiple underscores + .replace(/^_|_$/g, '') // Trim underscores + .slice(0, maxLength); +} + +/** + * Generate filename from URL if not provided. 
+ */ +export function generateFilename(url: string, index: number, extension?: string): string { + try { + const parsedUrl = new URL(url); + const pathname = parsedUrl.pathname; + const basename = path.basename(pathname); + + if (basename && basename !== '/' && basename.includes('.')) { + return sanitizeFilename(basename); + } + + // Generate from hostname and index + const ext = extension || detectExtension(url); + const hostname = parsedUrl.hostname.replace(/^www\./, ''); + return sanitizeFilename(`${hostname}_${index + 1}${ext}`); + } catch { + const ext = extension || '.bin'; + return `download_${index + 1}${ext}`; + } +} + +/** + * Detect file extension from URL. + */ +function detectExtension(url: string): string { + const type = detectContentType(url); + switch (type) { + case 'image': return '.jpg'; + case 'video': return '.mp4'; + case 'document': return '.md'; + default: return '.bin'; + } +} + +/** + * Get temp directory for cookie files. + */ +export function getTempDir(): string { + return path.join(os.tmpdir(), 'opencli-download'); +} diff --git a/src/download/progress.ts b/src/download/progress.ts new file mode 100644 index 0000000..b7c6ddb --- /dev/null +++ b/src/download/progress.ts @@ -0,0 +1,125 @@ +/** + * Download progress display: terminal progress bars, status updates. + */ + +import chalk from 'chalk'; + +export interface ProgressBar { + update(current: number, total: number, label?: string): void; + complete(success: boolean, message?: string): void; + fail(error: string): void; +} + +/** + * Format bytes as human-readable string (KB, MB, GB). + */ +export function formatBytes(bytes: number): string { + if (bytes === 0) return '0 B'; + const k = 1024; + const sizes = ['B', 'KB', 'MB', 'GB', 'TB']; + const i = Math.floor(Math.log(bytes) / Math.log(k)); + return `${(bytes / Math.pow(k, i)).toFixed(1)} ${sizes[i]}`; +} + +/** + * Format milliseconds as human-readable duration. 
+ */ +export function formatDuration(ms: number): string { + if (ms < 1000) return `${ms}ms`; + const seconds = Math.floor(ms / 1000); + if (seconds < 60) return `${seconds}s`; + const minutes = Math.floor(seconds / 60); + const remainingSeconds = seconds % 60; + return `${minutes}m ${remainingSeconds}s`; +} + +/** + * Create a simple progress bar for terminal display. + */ +export function createProgressBar(filename: string, index: number, total: number): ProgressBar { + const prefix = chalk.dim(`[${index + 1}/${total}]`); + const truncatedName = filename.length > 40 ? filename.slice(0, 37) + '...' : filename; + + return { + update(current: number, totalBytes: number, label?: string) { + const percent = totalBytes > 0 ? Math.round((current / totalBytes) * 100) : 0; + const bar = createBar(percent); + const size = totalBytes > 0 ? formatBytes(totalBytes) : ''; + const extra = label ? ` ${label}` : ''; + process.stderr.write(`\r${prefix} ${truncatedName} ${bar} ${percent}% ${size}${extra}`); + }, + complete(success: boolean, message?: string) { + const icon = success ? chalk.green('✓') : chalk.red('✗'); + const msg = message ? ` ${chalk.dim(message)}` : ''; + process.stderr.write(`\r${prefix} ${icon} ${truncatedName}${msg}\n`); + }, + fail(error: string) { + process.stderr.write(`\r${prefix} ${chalk.red('✗')} ${truncatedName} ${chalk.red(error)}\n`); + }, + }; +} + +/** + * Create a progress bar string. + */ +function createBar(percent: number, width: number = 20): string { + const filled = Math.round((percent / 100) * width); + const empty = width - filled; + return chalk.cyan('█'.repeat(filled)) + chalk.dim('░'.repeat(empty)); +} + +/** + * Multi-file download progress tracker. 
+ */ +export class DownloadProgressTracker { + private completed = 0; + private failed = 0; + private skipped = 0; + private total: number; + private startTime: number; + private verbose: boolean; + + constructor(total: number, verbose: boolean = true) { + this.total = total; + this.startTime = Date.now(); + this.verbose = verbose; + } + + onFileStart(filename: string, index: number): ProgressBar | null { + if (!this.verbose) return null; + return createProgressBar(filename, index, this.total); + } + + onFileComplete(success: boolean, skipped: boolean = false): void { + if (skipped) { + this.skipped++; + } else if (success) { + this.completed++; + } else { + this.failed++; + } + } + + getSummary(): string { + const elapsed = formatDuration(Date.now() - this.startTime); + const parts: string[] = []; + + if (this.completed > 0) { + parts.push(chalk.green(`${this.completed} downloaded`)); + } + if (this.skipped > 0) { + parts.push(chalk.yellow(`${this.skipped} skipped`)); + } + if (this.failed > 0) { + parts.push(chalk.red(`${this.failed} failed`)); + } + + return `${parts.join(', ')} in ${elapsed}`; + } + + finish(): void { + if (this.verbose) { + process.stderr.write(`\n${chalk.bold('Download complete:')} ${this.getSummary()}\n`); + } + } +} diff --git a/src/pipeline/registry.ts b/src/pipeline/registry.ts index 74d8794..cdd9e53 100644 --- a/src/pipeline/registry.ts +++ b/src/pipeline/registry.ts @@ -11,6 +11,7 @@ import { stepFetch } from './steps/fetch.js'; import { stepSelect, stepMap, stepFilter, stepSort, stepLimit } from './steps/transform.js'; import { stepIntercept } from './steps/intercept.js'; import { stepTap } from './steps/tap.js'; +import { stepDownload } from './steps/download.js'; /** * Step handler: all pipeline steps conform to this generic interface. 
@@ -58,3 +59,4 @@ registerStep('sort', stepSort); registerStep('limit', stepLimit); registerStep('intercept', stepIntercept); registerStep('tap', stepTap); +registerStep('download', stepDownload); diff --git a/src/pipeline/steps/download.ts b/src/pipeline/steps/download.ts new file mode 100644 index 0000000..15453a6 --- /dev/null +++ b/src/pipeline/steps/download.ts @@ -0,0 +1,310 @@ +/** + * Pipeline step: download — file download with concurrency and progress. + * + * Supports: + * - Direct HTTP downloads (images, documents) + * - yt-dlp integration for video platforms + * - Browser cookie forwarding for authenticated downloads + * - Filename templating and deduplication + */ + +import * as fs from 'node:fs'; +import * as path from 'node:path'; +import * as os from 'node:os'; +import type { IPage } from '../../types.js'; +import { render } from '../template.js'; +import { + httpDownload, + ytdlpDownload, + saveDocument, + detectContentType, + requiresYtdlp, + sanitizeFilename, + generateFilename, + exportCookiesToNetscape, + getTempDir, +} from '../../download/index.js'; +import { DownloadProgressTracker, formatBytes } from '../../download/progress.js'; + +export interface DownloadResult { + status: 'success' | 'skipped' | 'failed'; + path?: string; + size?: number; + error?: string; + duration?: number; +} + +/** + * Simple async concurrency limiter for downloads. + */ +async function mapConcurrent( + items: T[], + limit: number, + fn: (item: T, index: number) => Promise, +): Promise { + const results: R[] = new Array(items.length); + let index = 0; + + async function worker() { + while (index < items.length) { + const i = index++; + results[i] = await fn(items[i], i); + } + } + + const workers = Array.from({ length: Math.min(limit, items.length) }, () => worker()); + await Promise.all(workers); + return results; +} + +/** + * Extract cookies from browser page. 
+ */ +async function extractBrowserCookies(page: IPage, domain?: string): Promise { + try { + // Use browser evaluate to get document.cookie + const cookieString = await page.evaluate(`(() => document.cookie)()`); + return typeof cookieString === 'string' ? cookieString : ''; + } catch { + return ''; + } +} + +/** + * Extract cookies as array for yt-dlp Netscape format. + */ +async function extractCookiesArray( + page: IPage, + domain: string, +): Promise> { + try { + const cookieString = await extractBrowserCookies(page); + if (!cookieString) return []; + + return cookieString.split(';').map((c) => { + const [name, ...rest] = c.trim().split('='); + return { + name: name || '', + value: rest.join('=') || '', + domain, + path: '/', + secure: true, + httpOnly: false, + }; + }).filter((c) => c.name); + } catch { + return []; + } +} + +/** + * Download step handler for YAML pipelines. + * + * Usage in YAML: + * ```yaml + * pipeline: + * - download: + * url: ${{ item.imageUrl }} + * dir: ./downloads + * filename: ${{ item.title }}.jpg + * concurrency: 5 + * skip_existing: true + * use_ytdlp: false + * type: auto + * ``` + */ +export async function stepDownload( + page: IPage | null, + params: any, + data: any, + args: Record, +): Promise { + // Parse parameters with defaults + const urlTemplate = typeof params === 'string' ? params : (params?.url ?? ''); + const dirTemplate = params?.dir ?? './downloads'; + const filenameTemplate = params?.filename ?? ''; + const concurrency = typeof params?.concurrency === 'number' ? params.concurrency : 3; + const skipExisting = params?.skip_existing !== false; + const timeout = typeof params?.timeout === 'number' ? params.timeout * 1000 : 30000; + const useYtdlp = params?.use_ytdlp ?? false; + const ytdlpArgs = Array.isArray(params?.ytdlp_args) ? params.ytdlp_args : []; + const contentType = params?.type ?? 
'auto'; + const showProgress = params?.progress !== false; + const contentTemplate = params?.content; + const metadataTemplate = params?.metadata; + + // Resolve output directory + const dir = String(render(dirTemplate, { args, data })); + fs.mkdirSync(dir, { recursive: true }); + + // Normalize data to array + const items: any[] = Array.isArray(data) ? data : data ? [data] : []; + if (items.length === 0) { + return []; + } + + // Create progress tracker + const tracker = new DownloadProgressTracker(items.length, showProgress); + + // Extract cookies if browser is available + let cookies = ''; + let cookiesFile: string | undefined; + + if (page) { + cookies = await extractBrowserCookies(page); + + // For yt-dlp, we need to export cookies to Netscape format + if (useYtdlp || items.some((item, index) => { + const url = String(render(urlTemplate, { args, data, item, index })); + return requiresYtdlp(url); + })) { + try { + // Try to get domain from first URL + const firstUrl = String(render(urlTemplate, { args, data, item: items[0], index: 0 })); + const domain = new URL(firstUrl).hostname; + const cookiesArray = await extractCookiesArray(page, domain); + + if (cookiesArray.length > 0) { + const tempDir = getTempDir(); + fs.mkdirSync(tempDir, { recursive: true }); + cookiesFile = path.join(tempDir, `cookies_${Date.now()}.txt`); + exportCookiesToNetscape(cookiesArray, cookiesFile); + } + } catch { + // Ignore cookie extraction errors + } + } + } + + // Process downloads with concurrency + const results = await mapConcurrent(items, concurrency, async (item, index): Promise => { + const startTime = Date.now(); + + // Render URL + const url = String(render(urlTemplate, { args, data, item, index })); + if (!url) { + tracker.onFileComplete(false); + return { + ...item, + _download: { status: 'failed', error: 'Empty URL' } as DownloadResult, + }; + } + + // Render filename + let filename: string; + if (filenameTemplate) { + filename = String(render(filenameTemplate, { args, 
data, item, index })); + } else { + filename = generateFilename(url, index); + } + filename = sanitizeFilename(filename); + + const destPath = path.join(dir, filename); + + // Check if file exists and skip_existing is true + if (skipExisting && fs.existsSync(destPath)) { + tracker.onFileComplete(true, true); + return { + ...item, + _download: { + status: 'skipped', + path: destPath, + size: fs.statSync(destPath).size, + } as DownloadResult, + }; + } + + // Create progress bar for this file + const progressBar = tracker.onFileStart(filename, index); + + // Determine download method + const detectedType = contentType === 'auto' ? detectContentType(url) : contentType; + const shouldUseYtdlp = useYtdlp || (detectedType === 'video' && requiresYtdlp(url)); + + let result: { success: boolean; size: number; error?: string }; + + try { + if (detectedType === 'document' && contentTemplate) { + // Save extracted content as document + const content = String(render(contentTemplate, { args, data, item, index })); + const metadata = metadataTemplate + ? Object.fromEntries( + Object.entries(metadataTemplate).map(([k, v]) => [k, render(v, { args, data, item, index })]), + ) + : undefined; + + const ext = path.extname(filename).toLowerCase(); + const format = ext === '.json' ? 'json' : ext === '.html' ? 'html' : 'markdown'; + result = await saveDocument(content, destPath, format, metadata); + + if (progressBar) { + progressBar.complete(result.success, result.success ? formatBytes(result.size) : undefined); + } + } else if (shouldUseYtdlp) { + // Use yt-dlp for video downloads + result = await ytdlpDownload(url, destPath, { + cookiesFile, + extraArgs: ytdlpArgs, + onProgress: (percent) => { + if (progressBar) { + progressBar.update(percent, 100); + } + }, + }); + + if (progressBar) { + progressBar.complete(result.success, result.success ? 
formatBytes(result.size) : undefined); + } + } else { + // Direct HTTP download + result = await httpDownload(url, destPath, { + cookies, + timeout, + onProgress: (received, total) => { + if (progressBar) { + progressBar.update(received, total); + } + }, + }); + + if (progressBar) { + progressBar.complete(result.success, result.success ? formatBytes(result.size) : undefined); + } + } + } catch (err: any) { + result = { success: false, size: 0, error: err.message }; + if (progressBar) { + progressBar.fail(err.message); + } + } + + tracker.onFileComplete(result.success); + + const duration = Date.now() - startTime; + + return { + ...item, + _download: { + status: result.success ? 'success' : 'failed', + path: result.success ? destPath : undefined, + size: result.size, + error: result.error, + duration, + } as DownloadResult, + }; + }); + + // Cleanup temp cookie file + if (cookiesFile && fs.existsSync(cookiesFile)) { + try { + fs.unlinkSync(cookiesFile); + } catch { + // Ignore cleanup errors + } + } + + // Show summary + tracker.finish(); + + return results; +} diff --git a/src/pipeline/template.ts b/src/pipeline/template.ts index 49ae588..5081e49 100644 --- a/src/pipeline/template.ts +++ b/src/pipeline/template.ts @@ -119,6 +119,32 @@ function applyFilter(filterExpr: string, value: any): any { return Array.isArray(value) ? value[value.length - 1] : value; case 'json': return JSON.stringify(value ?? null); + case 'slugify': + // Convert to URL-safe slug + return typeof value === 'string' + ? value + .toLowerCase() + .replace(/[^\p{L}\p{N}]+/gu, '-') + .replace(/^-|-$/g, '') + : value; + case 'sanitize': + // Remove invalid filename characters + return typeof value === 'string' + ? 
value.replace(/[<>:"/\\|?*\x00-\x1f]/g, '_') + : value; + case 'ext': { + // Extract file extension from URL or path + if (typeof value !== 'string') return value; + const lastDot = value.lastIndexOf('.'); + const lastSlash = Math.max(value.lastIndexOf('/'), value.lastIndexOf('\\')); + return lastDot > lastSlash ? value.slice(lastDot) : ''; + } + case 'basename': { + // Extract filename from URL or path + if (typeof value !== 'string') return value; + const parts = value.split(/[/\\]/); + return parts[parts.length - 1] || value; + } default: return value; }