diff --git a/.github/workflows/comprehensive-test.yml b/.github/workflows/comprehensive-test.yml index 3d8b3ada4..1f472c0f4 100644 --- a/.github/workflows/comprehensive-test.yml +++ b/.github/workflows/comprehensive-test.yml @@ -2,7 +2,7 @@ name: Comprehensive Tests on: push: - branches: [master, develop] + branches: [master] pull_request: branches: [master, develop] diff --git a/Dockerfile b/Dockerfile index 89019cd7f..751c95524 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,8 +9,10 @@ COPY ./requirements.txt . # apk repository RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.ustc.edu.cn/g' /etc/apk/repositories -# timezone -RUN apk add -U tzdata && cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && apk del tzdata +# timezone, init process (tini) and bash (the ENTRYPOINT runs proxy_pool.sh with bash, which Alpine does not ship) +RUN apk add -U tzdata tini bash && \ + cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \ + apk del tzdata # runtime environment RUN apk add musl-dev gcc libxml2-dev libxslt-dev && \ @@ -21,4 +23,4 @@ COPY . . EXPOSE 5010 -ENTRYPOINT [ "sh", "start.sh" ] +ENTRYPOINT ["tini", "--", "bash", "proxy_pool.sh", "start", "--fg"] diff --git a/README.md b/README.md index 64a387e60..ffbd5bedd 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ ProxyPool 爬虫代理IP池 * git clone ```bash -git clone git@github.com:jhao104/proxy_pool.git +git clone https://github.com/jhao104/proxy_pool.git ``` * releases @@ -238,7 +238,7 @@ PROXY_FETCHER = [   这里感谢以下contributor的无私奉献: -  [@kangnwh](https://github.com/kangnwh) | [@bobobo80](https://github.com/bobobo80) | [@halleywj](https://github.com/halleywj) | [@newlyedward](https://github.com/newlyedward) | [@wang-ye](https://github.com/wang-ye) | [@gladmo](https://github.com/gladmo) | [@bernieyangmh](https://github.com/bernieyangmh) | [@PythonYXY](https://github.com/PythonYXY) | [@zuijiawoniu](https://github.com/zuijiawoniu) | [@netAir](https://github.com/netAir) | [@scil](https://github.com/scil) | [@tangrela](https://github.com/tangrela) | [@highroom](https://github.com/highroom) | 
[@luocaodan](https://github.com/luocaodan) | [@vc5](https://github.com/vc5) | [@1again](https://github.com/1again) | [@obaiyan](https://github.com/obaiyan) | [@zsbh](https://github.com/zsbh) | [@jiannanya](https://github.com/jiannanya) | [@Jerry12228](https://github.com/Jerry12228) +  [@kangnwh](https://github.com/kangnwh) | [@bobobo80](https://github.com/bobobo80) | [@halleywj](https://github.com/halleywj) | [@newlyedward](https://github.com/newlyedward) | [@wang-ye](https://github.com/wang-ye) | [@gladmo](https://github.com/gladmo) | [@bernieyangmh](https://github.com/bernieyangmh) | [@PythonYXY](https://github.com/PythonYXY) | [@zuijiawoniu](https://github.com/zuijiawoniu) | [@netAir](https://github.com/netAir) | [@scil](https://github.com/scil) | [@tangrela](https://github.com/tangrela) | [@highroom](https://github.com/highroom) | [@luocaodan](https://github.com/luocaodan) | [@vc5](https://github.com/vc5) | [@1again](https://github.com/1again) | [@obaiyan](https://github.com/obaiyan) | [@zsbh](https://github.com/zsbh) | [@jiannanya](https://github.com/jiannanya) | [@Jerry12228](https://github.com/Jerry12228) | [@zeyudada](https://github.com/zeyudada) ### Release Notes diff --git a/docs/index.rst b/docs/index.rst index 6a52ed4c6..0bc35ce7c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -29,7 +29,7 @@ Python爬虫代理IP池 .. code-block:: console - $ git clone git@github.com:jhao104/proxy_pool.git + $ git clone https://github.com/jhao104/proxy_pool.git * 安装依赖 diff --git a/docs/user/how_to_manage.rst b/docs/user/how_to_manage.rst new file mode 100644 index 000000000..4f0bde455 --- /dev/null +++ b/docs/user/how_to_manage.rst @@ -0,0 +1,163 @@ +.. how_to_manage + +服务管理 +-------- + +``proxy_pool.sh`` 是项目的服务管理脚本, 提供统一的命令行接口来启动、停止和管理服务。 + +基本用法 +>>>>>>>>> + +.. code-block:: console + + $ ./proxy_pool.sh <command> [options] + +可用命令 +>>>>>>>>> + +start - 启动服务 +^^^^^^^^^^^^^^^^ + +启动所有服务(调度程序和API服务)。 + +.. 
code-block:: console + + # 后台启动(默认) + $ ./proxy_pool.sh start + + # 前台启动(适用于容器环境) + $ ./proxy_pool.sh start --fg + +后台模式下, 服务会在后台运行, 并生成 ``proxy_pool.pid`` 文件记录进程ID。 + +前台模式下, 服务在当前终端运行, 按 ``Ctrl+C`` 可停止服务。 + +stop - 停止服务 +^^^^^^^^^^^^^^^^ + +停止所有正在运行的服务。 + +.. code-block:: console + + $ ./proxy_pool.sh stop + +脚本会读取 ``proxy_pool.pid`` 文件, 向所有子进程发送终止信号。 + +restart - 重启服务 +^^^^^^^^^^^^^^^^^^^ + +重启所有服务。 + +.. code-block:: console + + # 后台重启 + $ ./proxy_pool.sh restart + + # 前台重启 + $ ./proxy_pool.sh restart --fg + +status - 查看状态 +^^^^^^^^^^^^^^^^^^ + +查看当前服务运行状态。 + +.. code-block:: console + + $ ./proxy_pool.sh status + +输出示例:: + + [INFO] Services: 2 running, 0 dead + PID 12345: running + PID 12346: running + +环境变量 +>>>>>>>>> + +PYTHON +^^^^^^ + +指定Python解释器路径, 默认为 ``python``。 + +.. code-block:: console + + $ PYTHON=python3 ./proxy_pool.sh start + +PID 文件 +>>>>>>>> + +服务启动后会在项目根目录生成 ``proxy_pool.pid`` 文件, 记录所有子进程的PID。 + +该文件用于: + +- ``stop`` 命令识别需要终止的进程 +- ``status`` 命令检查进程状态 +- 防止重复启动 + +``stop`` 命令执行后会自动删除该文件。 + +Docker 环境 +>>>>>>>>>>>> + +在Docker环境中, 建议使用前台模式: + +.. code-block:: dockerfile + + ENTRYPOINT ["tini", "--", "bash", "proxy_pool.sh", "start", "--fg"] + +``docker-compose.yml`` 示例: + +.. code-block:: yaml + + version: '2' + services: + proxy_pool: + build: . + container_name: proxy_pool + ports: + - "5010:5010" + links: + - proxy_redis + environment: + DB_CONN: "redis://@proxy_redis:6379/0" + proxy_redis: + image: "redis" + container_name: proxy_redis + +故障排除 +>>>>>>>>> + +服务启动失败 +^^^^^^^^^^^^^ + +检查日志输出, 确认配置是否正确: + +.. code-block:: console + + # 前台启动查看详细日志 + $ ./proxy_pool.sh start --fg + +端口被占用 +^^^^^^^^^^^ + +如果API端口被占用, 修改 ``setting.py`` 中的 ``PORT`` 配置: + +.. code-block:: python + + PORT = 5010 # 修改为其他端口 + +无法停止服务 +^^^^^^^^^^^^^ + +如果 ``stop`` 命令无法停止服务, 可手动终止: + +.. 
code-block:: console + + # 查看PID文件 + $ cat proxy_pool.pid + + # 手动终止进程 + $ kill <PID> + + # 删除PID文件 + $ rm proxy_pool.pid diff --git a/docs/user/how_to_run.rst b/docs/user/how_to_run.rst index 91bc41ff9..61ab31ab3 100644 --- a/docs/user/how_to_run.rst +++ b/docs/user/how_to_run.rst @@ -11,7 +11,7 @@ .. code-block:: console - $ git clone git@github.com:jhao104/proxy_pool.git + $ git clone https://github.com/jhao104/proxy_pool.git 或者下载特定的 ``release`` 版本: @@ -58,10 +58,35 @@ 启动项目 >>>>>>>>> -如果已配置好运行环境, 具备运行条件, 可以通过 ``proxyPool.py`` 启动. ``proxyPool.py`` 是项目的CLI入口. +如果已配置好运行环境, 具备运行条件, 可以通过 ``proxyPool.py`` 或 ``proxy_pool.sh`` 启动. + 完整程序包含两部份: ``schedule`` 调度程序和 ``server`` API服务, 调度程序负责采集和验证代理, API服务提供代理服务HTTP接口. -通过命令行程序分别启动调度程序和API服务: +方式一: 使用 ``proxy_pool.sh`` (推荐) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``proxy_pool.sh`` 提供统一的服务管理接口, 支持后台运行和进程管理: + +.. code-block:: console + + # 后台启动所有服务 + $ ./proxy_pool.sh start + + # 前台启动(容器环境) + $ ./proxy_pool.sh start --fg + + # 停止服务 + $ ./proxy_pool.sh stop + + # 查看状态 + $ ./proxy_pool.sh status + +更多用法请参考 :doc:`/user/how_to_manage` + +方式二: 使用 ``proxyPool.py`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``proxyPool.py`` 是项目的Python CLI入口, 可以分别启动调度程序和API服务: .. 
code-block:: console
 
diff --git a/docs/user/index.rst b/docs/user/index.rst
index 97b156d31..62a8623e3 100644
--- a/docs/user/index.rst
+++ b/docs/user/index.rst
@@ -8,5 +8,6 @@
    :maxdepth: 2
 
    how_to_run
+   how_to_manage
    how_to_use
    how_to_config
diff --git a/proxy_pool.sh b/proxy_pool.sh
new file mode 100755
index 000000000..cae60196c
--- /dev/null
+++ b/proxy_pool.sh
@@ -0,0 +1,252 @@
+#!/usr/bin/env bash
+#
+# proxy_pool.sh - service manager for ProxyPool.
+#
+# Starts, stops, restarts and reports the status of the two processes
+# launched through proxyPool.py ("server" and "schedule"). Their PIDs are
+# recorded in proxy_pool.pid next to this script.
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PID_FILE="$SCRIPT_DIR/proxy_pool.pid"
+# Interpreter used to launch proxyPool.py; override with PYTHON=<path>.
+PYTHON="${PYTHON:-python}"
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
+log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
+log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
+
+# Print the PIDs recorded in the PID file (prints nothing if it is absent).
+get_pids() {
+    if [ -f "$PID_FILE" ]; then
+        cat "$PID_FILE"
+    fi
+}
+
+# Succeed if a signal can be delivered to the given PID (kill -0 probe).
+is_running() {
+    local pid=$1
+    kill -0 "$pid" 2>/dev/null
+}
+
+# Foreground-mode shutdown handler: stop both children, remove the PID file.
+# The traps are cleared first so the final "exit 0" does not re-enter this
+# handler through the EXIT trap.
+shutdown_foreground() {
+    trap - EXIT INT TERM
+    log_info "Shutting down..."
+    # The children may already be gone (or not started yet); "|| true" keeps
+    # "set -e" from aborting the handler before the PID file is removed.
+    kill $SERVER_PID $SCHEDULER_PID 2>/dev/null || true
+    wait
+    rm -f "$PID_FILE"
+    exit 0
+}
+
+# Start both services.
+#   --fg | --foreground   stay attached and wait for the children
+#                         (container use); default is background via nohup.
+# Exits 1 on an unknown option, if a recorded PID is still alive, or if a
+# background service is not running two seconds after launch.
+cmd_start() {
+    local foreground=false
+
+    while [[ $# -gt 0 ]]; do
+        case $1 in
+            --fg|--foreground) foreground=true; shift ;;
+            *) log_error "Unknown option: $1"; exit 1 ;;
+        esac
+    done
+
+    # Refuse to start while any recorded PID is still alive
+    local pids pid
+    pids=$(get_pids)
+    for pid in $pids; do
+        if is_running "$pid"; then
+            log_warn "Service already running (PID: $pid)"
+            log_warn "Use '$0 stop' first, or '$0 restart'"
+            exit 1
+        fi
+    done
+
+    # Remove a stale PID file
+    rm -f "$PID_FILE"
+
+    cd "$SCRIPT_DIR"
+
+    if [ "$foreground" = true ]; then
+        # Foreground mode (container environments)
+        log_info "Starting in foreground mode..."
+
+        trap shutdown_foreground EXIT INT TERM
+
+        $PYTHON proxyPool.py server &
+        SERVER_PID=$!
+
+        $PYTHON proxyPool.py schedule &
+        SCHEDULER_PID=$!
+
+        echo "$SERVER_PID" >> "$PID_FILE"
+        echo "$SCHEDULER_PID" >> "$PID_FILE"
+
+        log_info "Services started (PIDs: $SERVER_PID $SCHEDULER_PID)"
+        wait
+    else
+        # Background mode (non-container environments)
+        log_info "Starting in background mode..."
+
+        nohup $PYTHON proxyPool.py server > /dev/null 2>&1 &
+        SERVER_PID=$!
+
+        nohup $PYTHON proxyPool.py schedule > /dev/null 2>&1 &
+        SCHEDULER_PID=$!
+
+        echo "$SERVER_PID" >> "$PID_FILE"
+        echo "$SCHEDULER_PID" >> "$PID_FILE"
+
+        sleep 2
+
+        # Verify both processes survived startup
+        local failed=false
+        if ! is_running "$SERVER_PID"; then
+            log_error "Server failed to start"
+            failed=true
+        fi
+        if ! is_running "$SCHEDULER_PID"; then
+            log_error "Scheduler failed to start"
+            failed=true
+        fi
+
+        if [ "$failed" = true ]; then
+            cmd_stop
+            exit 1
+        fi
+
+        log_info "Services started"
+        log_info "  Server PID: $SERVER_PID"
+        log_info "  Scheduler PID: $SCHEDULER_PID"
+        log_info "Use '$0 stop' to stop, '$0 status' to check"
+    fi
+}
+
+# Stop every process recorded in the PID file: SIGTERM first, then SIGKILL
+# for anything still alive one second later; finally remove the PID file.
+# Returns (rather than exits) when there is nothing to stop, so that
+# cmd_restart can go on to start the services.
+cmd_stop() {
+    local pids pid
+    pids=$(get_pids)
+
+    if [ -z "$pids" ]; then
+        log_warn "No PID file found. Services may not be running."
+        return 0
+    fi
+
+    log_info "Stopping services..."
+
+    local stopped=0
+    for pid in $pids; do
+        if is_running "$pid"; then
+            kill "$pid" 2>/dev/null || true
+            stopped=$((stopped + 1))
+        fi
+    done
+
+    # Give the processes a moment to exit
+    sleep 1
+
+    # Force-kill anything still running
+    for pid in $pids; do
+        if is_running "$pid"; then
+            log_warn "Force killing PID $pid"
+            kill -9 "$pid" 2>/dev/null || true
+        fi
+    done
+
+    rm -f "$PID_FILE"
+    log_info "Stopped $stopped service(s)"
+}
+
+# Restart: stop whatever is recorded, then start with the given options.
+cmd_restart() {
+    cmd_stop
+    sleep 1
+    cmd_start "$@"
+}
+
+# Report, per recorded PID, whether the process is alive. When none of them
+# are alive, the stale PID file is removed.
+cmd_status() {
+    local pids pid
+    pids=$(get_pids)
+
+    if [ -z "$pids" ]; then
+        log_info "No PID file found. Services are not running."
+        return 0
+    fi
+
+    local running=0
+    local dead=0
+
+    for pid in $pids; do
+        if is_running "$pid"; then
+            running=$((running + 1))
+        else
+            dead=$((dead + 1))
+        fi
+    done
+
+    if [ $running -gt 0 ]; then
+        log_info "Services: $running running, $dead dead"
+        for pid in $pids; do
+            local status="stopped"
+            if is_running "$pid"; then
+                status="running"
+            fi
+            echo "  PID $pid: $status"
+        done
+    else
+        log_warn "All services are stopped"
+        rm -f "$PID_FILE"
+    fi
+}
+
+# Print usage information
+cmd_help() {
+    cat <<EOF
+Usage: $0 <command> [options]
+
+Commands:
+  start [--fg]    Start services (background by default)
+                  --fg  Run in foreground (for containers)
+  stop            Stop all services
+  restart [--fg]  Restart services
+  status          Show service status
+  help            Show this help
+
+Examples:
+  $0 start              # Start in background
+  $0 start --fg         # Start in foreground (containers)
+  $0 stop               # Stop all services
+  $0 status             # Check status
+
+Environment:
+  PYTHON          Python executable (default: python)
+EOF
+}
+
+# Entry point: dispatch on the first argument (defaults to help)
+case "${1:-help}" in
+    start)   shift; cmd_start "$@" ;;
+    stop)    cmd_stop ;;
+    restart) shift; cmd_restart "$@" ;;
+    status)  cmd_status ;;
+    help|-h|--help) cmd_help ;;
+    *) log_error "Unknown command: $1"; cmd_help; exit 1 ;;
+esac
diff --git a/setting.py b/setting.py
index e8a669f58..ff616aabc 100644
--- a/setting.py
+++ b/setting.py
@@ -37,7 +37,7 @@
 # example:
 #      Redis: redis://:password@ip:port/db
 #      Ssdb:  ssdb://:password@ip:port
-DB_CONN = 'redis://:pwd@127.0.0.1:6379/0'
+DB_CONN = 'redis://:pwdstring@127.0.0.1:6379/0'
 
 # proxy table name
 TABLE_NAME = 'use_proxy'
diff --git a/start.sh b/start.sh
deleted file mode 100755
index 4c9b48f5d..000000000
--- a/start.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/usr/bin/env bash
-python proxyPool.py server &
-python proxyPool.py schedule
\ No newline at end of file
diff --git a/tox.ini b/tox.ini
index 6f993ff95..90cd97751 100644
--- a/tox.ini
+++ b/tox.ini
@@ -3,16 +3,7 @@ envlist = py38,py39,py310,py311
 skip_missing_interpreters = true
 
 [testenv]
+recreate = true
 deps =
-    six
-    requests
-    
pyquery - gunicorn - lxml - redis - APScheduler==3.2.0;python_version<"3.10" - APScheduler==3.10.0;python_version>="3.10" - click==8.0.1 - Flask==2.1.1 - werkzeug==2.1.0 + -r requirements.txt commands = python test.py \ No newline at end of file