# 微博数据采集（HTTP版）
本Notebook使用 requests + BeautifulSoup + lxml + Weibo 评论接口批量抓取指定关键词相关的帖子，覆盖正文、互动指标与评论抽取。关键词和配置通过 `config.json` 文件自定义。

## 环境依赖
如首次运行，请先安装 requests / beautifulsoup4 / lxml / pandas / openpyxl / tqdm。

In [1]:
!pip install -q requests beautifulsoup4 lxml pandas openpyxl tqdm

/bin/bash: /root/autodl-tmp/conda/envs/bipo/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[0m

## Cookie 及采集配置
使用浏览器登录微博后复制请求头中的Cookie填入 `cookie_str`。Cookie 失效将导致403或评论接口报错。

In [None]:
import datetime as dt
import json
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree
from tqdm import tqdm

PROJECT_ROOT = Path('/root/autodl-tmp/dhd/net_space').resolve()
DATA_DIR = PROJECT_ROOT / 'data'
RAW_DIR = DATA_DIR / 'raw'
HTML_DIR = DATA_DIR / 'html'
for path in (RAW_DIR, HTML_DIR):
    path.mkdir(parents=True, exist_ok=True)

# 加载配置文件
config_path = PROJECT_ROOT / 'config.json'
with open(config_path, 'r', encoding='utf-8') as f:
    config = json.load(f)

cookie_str = config.pop('COOKIE_STR')  # 从配置中移除并赋值

In [3]:
def set_cookie(cookie: str):
    if 'YOUR_COOKIE_HERE' in cookie:
        raise ValueError('请先填写有效的Cookie字符串。')
    config['COOKIE_DICT'].clear()
    for segment in cookie.split(';'):
        if '=' not in segment:
            continue
        k, v = segment.strip().split('=', 1)
        config['COOKIE_DICT'][k] = v

set_cookie(cookie_str)
config['COOKIE_DICT']

{'_T_WM': '96082886014',
 'WEIBOCN_FROM': '1110006030',
 'SCF': 'AjE92wD88hvHibcpGBpP5lcH53fAEoPVz_qnWgdVRPpMzwpEGeTCtidKECrIoQ8dxh7-4bxNMpauJxPF4vvEp3M.',
 'SUB': '_2A25EIvVwDeRhGeFI7lsQ-CvNyjiIHXVnXgi4rDV6PUJbktANLU2gkW1NfOvLZnJf5B7CLqZP8_Jz98lqqvalXcFm',
 'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5bd4R5_2rgUZb8Sv6e2fzX5NHD95QNSo-4eKnfeK2XWs4Dqcj_i--4iKL2iKL8i--Xi-iWi-2ci--4iKL2iKL8i--4iKL2iKL8i--4iKL2iKL8',
 'SSOLoginState': '1764132128',
 'ALF': '1766724128',
 'MLOGIN': '1',
 'M_WEIBOCN_PARAMS': 'lfid%3D102803%26luicode%3D20000174%26uicode%3D10000011%26fid%3D102803',
 'XSRF-TOKEN': 'd3a0e1'}

In [4]:
def get_cookie(with_csrf_token: bool = False) -> str:
    items = []
    for k, v in config['COOKIE_DICT'].items():
        if k == 'XSRF-TOKEN' and not with_csrf_token:
            continue
        items.append(f'{k}={v}')
    return '; '.join(items)

def update_xsrf_token() -> str:
    if 'XSRF-TOKEN' in config['COOKIE_DICT']:
        return config['COOKIE_DICT']['XSRF-TOKEN']
    url = 'https://weibo.com/ajax/getClientMessageSettings'
    headers = {
        'accept': 'application/json, text/plain, */*',
        'cookie': get_cookie(),
        'referer': 'https://s.weibo.com/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    new_cookie = response.headers.get('Set-Cookie', '')
    for segment in new_cookie.split(';'):
        if '=' not in segment:
            continue
        k, v = segment.strip().split('=', 1)
        if k == 'XSRF-TOKEN':
            config['COOKIE_DICT'][k] = v
            return v
    raise RuntimeError('未能刷新XSRF-TOKEN，请检查Cookie。')

def get_xsrf_token() -> str:
    return config['COOKIE_DICT'].get('XSRF-TOKEN') or update_xsrf_token()

In [5]:
last_request = dt.datetime.now()

def limit_interval():
    global last_request
    now = dt.datetime.now()
    delta = (now - last_request).total_seconds()
    if delta < config['HTTP_INTERVAL']:
        time.sleep(config['HTTP_INTERVAL'] - delta)
    last_request = dt.datetime.now()

def save_as_html(html: str, filename: str, dir_path: Path):
    dir_path.mkdir(parents=True, exist_ok=True)
    (dir_path / filename).write_text(html, encoding='utf-8')

def weibo_search(topic: str, page: int) -> str:
    url = f'https://s.weibo.com/weibo?q={topic}'
    if page > 1:
        url = f'{url}&page={page}'
    headers = {
        'cookie': get_cookie(),
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'upgrade-insecure-requests': '1'
    }
    limit_interval()
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    return resp.text

In [6]:
def get_content(html: str):
    soup = BeautifulSoup(html, 'html.parser')
    dom = etree.HTML(str(soup))
    weibo_id = dom.xpath("//div[@action-type='feed_list_item']/@mid")
    author = dom.xpath("//div[@action-type='feed_list_item']//a[@class='name' and @target='_blank']/text()")
    author_url = dom.xpath("//div[@action-type='feed_list_item']//a[@class='name' and @target='_blank']/@href")
    author_id = [int(url.split('?')[0].split('/')[-1]) for url in author_url]
    publish_time = dom.xpath("//div[@action-type='feed_list_item']//div[@class='from']/a[1]/text()")
    cards = dom.xpath("//div[@node-type='like']")
    content = []
    for card in cards:
        full = card.xpath(".//p[@node-type='feed_list_content_full']")
        if not full:
            full = card.xpath(".//p[@node-type='feed_list_content']")
        content.append([''.join(t.itertext()).strip() for t in full])
    retweets = dom.xpath("//div[@class='card-act']/ul/li[1]/a")
    retweet_nums = [''.join(t.itertext()).strip() for t in retweets]
    retweet_nums = [int(n) if n.isdigit() else 0 for n in retweet_nums]
    reviews = dom.xpath("//div[@class='card-act']/ul/li[2]/a")
    review_nums = [''.join(t.itertext()).strip() for t in reviews]
    review_nums = [int(n) if n.isdigit() else 0 for n in review_nums]
    likes = dom.xpath("//div[@class='card-act']/ul/li[3]//span[@class='woo-like-count']/text()")
    like_nums = [int(n) if n.strip().isdigit() else 0 for n in likes]
    detail_url = dom.xpath("//div[@class='content']/div[@class='from']/a[1]/@href")
    save_as_html(html, 'weibo_assert.html', HTML_DIR)
    assert len(weibo_id) == len(author) == len(author_id)
    assert len(retweet_nums) == len(review_nums) == len(like_nums) == len(detail_url)
    return {
        'weibo_id': weibo_id,
        'author': author,
        'author_id': author_id,
        'author_url': author_url,
        'publish_time': publish_time,
        'content': content,
        'retweet_nums': retweet_nums,
        'review_nums': review_nums,
        'like_nums': like_nums,
        'detail_url': detail_url
    }

In [8]:
def request_comments(mid: str, uid: int, max_id: int | None = None):
    base = f'https://weibo.com/ajax/statuses/buildComments?is_reload=1&id={mid}&is_show_bulletin=2&is_mix=0&count=10&uid={uid}&fetch_level=0&locale=zh-CN'
    if max_id:
        base = f'{base}&max_id={max_id}&flow=0'
    headers = {
        'accept': 'application/json, text/plain, */*',
        'cookie': get_cookie(with_csrf_token=True),
        'referer': 'https://weibo.com/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
        'x-xsrf-token': get_xsrf_token()
    }
    limit_interval()
    resp = requests.get(base, headers=headers, timeout=30)
    resp.raise_for_status()
    review_data = resp.json()
    if review_data.get('ok') != 1:
        raise RuntimeError(f'获取评论失败: {review_data}')
    detail = review_data.get('data', [])
    review_detail = [{
        'review_id': r.get('id_str', ''),
        'review_time': r.get('created_at', ''),
        'review_like': r.get('like_counts', 0),
        'review_content': r.get('text_raw', ''),
        'review_loc': r.get('source', '')
    } for r in detail]
    reviewer_detail = [{
        'reviewer_id': r.get('user', {}).get('id'),
        'reviewer_name': r.get('user', {}).get('screen_name'),
        'reviewer_url': r.get('user', {}).get('profile_url'),
        'reviewer_loc': r.get('user', {}).get('location'),
        'reviewer_followers': r.get('user', {}).get('followers_count'),
        'reviewer_following': r.get('user', {}).get('friends_count'),
        'reviewer_weibo_count': r.get('user', {}).get('statuses_count'),
        'reviewer_description': r.get('user', {}).get('description'),
        'reviewer_gender': r.get('user', {}).get('gender')
    } for r in detail]
    return review_data.get('max_id'), review_data.get('total_number', 0), review_detail, reviewer_detail

def all_comments(mid: str, uid: int, max_n: int | None = None):
    limit = max_n or config['MAX_COMMENTS']
    comments, reviewers = [], []
    max_id = None
    total_number = 0
    while len(comments) < limit:
        next_data = request_comments(mid, uid, max_id=max_id)
        max_id, total_number, review_detail, reviewer_detail = next_data
        if not review_detail:
            break
        comments.extend(review_detail)
        reviewers.extend(reviewer_detail)
        if max_id == 0:
            break
    return total_number, comments[:limit], reviewers[:limit]

## 批量采集函数

In [9]:
def collect_topic(topic: str, page_depth: int):
    topic_dir = HTML_DIR / topic
    topic_dir.mkdir(parents=True, exist_ok=True)
    records = []
    for page in range(1, page_depth + 1):
        print(f'采集主题={topic}, 页码={page}')
        html = weibo_search(topic, page)
        save_as_html(html, f'weibo_{page}.html', topic_dir)
        elements = get_content(html)
        total_items = len(elements['weibo_id'])
        for idx in range(total_items):
            mid = elements['weibo_id'][idx]
            uid = elements['author_id'][idx]
            content_text = ' '.join(elements['content'][idx])
            try:
                total_review, comments, reviewers = all_comments(mid, uid)
            except Exception as err:
                print(f'获取评论失败 mid={mid}: {err}')
                total_review, comments, reviewers = 0, [], []
            records.append({
                'topic': topic,
                'page': page,
                'weibo_id': mid,
                'author': elements['author'][idx],
                'author_id': uid,
                'author_url': elements['author_url'][idx],
                'publish_time': elements['publish_time'][idx],
                'content': content_text,
                'retweet_count': elements['retweet_nums'][idx],
                'review_count': elements['review_nums'][idx],
                'like_count': elements['like_nums'][idx],
                'detail_url': elements['detail_url'][idx],
                'fetched_review_total': total_review,
                'comments': json.dumps(comments, ensure_ascii=False),
                'reviewers': json.dumps(reviewers, ensure_ascii=False)
            })
    df = pd.DataFrame(records)
    df.drop_duplicates(subset=['weibo_id'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

In [10]:
def collect_topics(topics, page_depth, min_total, max_total):
    combined = pd.DataFrame()
    frames = []
    for topic in topics:
        topic_df = collect_topic(topic, page_depth)
        frames.append(topic_df)
        combined = pd.concat(frames, ignore_index=True)
        combined.drop_duplicates(subset=['weibo_id'], inplace=True)
        combined.reset_index(drop=True, inplace=True)
        print(f"当前累计 {len(combined)} 条")
        if len(combined) >= max_total:
            print(f'达到上限 {max_total} 条，提前停止。')
            combined = combined.head(max_total)
            break
    if len(combined) < min_total:
        print(f"警告：仅采集到 {len(combined)} 条，低于目标 {min_total} 条，可适当增加PAGE_DEPTH或拓展关键词。")
    else:
        print(f"采集完成，共 {len(combined)} 条。")
    return combined

## 执行采集（多关键词合并）

In [11]:
df_quanyunhui = collect_topics(
    topics=config['TOPICS'],
    page_depth=config['PAGE_DEPTH'],
    min_total=config['MIN_TOTAL_RECORDS'],
    max_total=config['MAX_TOTAL_RECORDS']
 )
df_quanyunhui.head()

采集主题=全运会, 页码=1


采集主题=全运会, 页码=2
采集主题=全运会, 页码=3
采集主题=全运会, 页码=3
采集主题=全运会, 页码=4
采集主题=全运会, 页码=4
采集主题=全运会, 页码=5
采集主题=全运会, 页码=5
采集主题=全运会, 页码=6
采集主题=全运会, 页码=6
采集主题=全运会, 页码=7
采集主题=全运会, 页码=7
采集主题=全运会, 页码=8
采集主题=全运会, 页码=8
采集主题=全运会, 页码=9
采集主题=全运会, 页码=9
采集主题=全运会, 页码=10
采集主题=全运会, 页码=10
采集主题=全运会, 页码=11
采集主题=全运会, 页码=11
采集主题=全运会, 页码=12
采集主题=全运会, 页码=12
采集主题=全运会, 页码=13
采集主题=全运会, 页码=13
采集主题=全运会, 页码=14
采集主题=全运会, 页码=14
采集主题=全运会, 页码=15
采集主题=全运会, 页码=15
采集主题=全运会, 页码=16
采集主题=全运会, 页码=16
采集主题=全运会, 页码=17
采集主题=全运会, 页码=17
采集主题=全运会, 页码=18
采集主题=全运会, 页码=18
采集主题=全运会, 页码=19
采集主题=全运会, 页码=19
采集主题=全运会, 页码=20
采集主题=全运会, 页码=20
采集主题=全运会, 页码=21
采集主题=全运会, 页码=21
采集主题=全运会, 页码=22
采集主题=全运会, 页码=22
采集主题=全运会, 页码=23
采集主题=全运会, 页码=23
采集主题=全运会, 页码=24
采集主题=全运会, 页码=24
采集主题=全运会, 页码=25
采集主题=全运会, 页码=25
采集主题=全运会, 页码=26
采集主题=全运会, 页码=26
采集主题=全运会, 页码=27
采集主题=全运会, 页码=27
采集主题=全运会, 页码=28
采集主题=全运会, 页码=28
采集主题=全运会, 页码=29
采集主题=全运会, 页码=29
采集主题=全运会, 页码=30
采集主题=全运会, 页码=30
采集主题=全运会, 页码=31
采集主题=全运会, 页码=31
采集主题=全运会, 页码=32
采集主题=全运会, 页码=32
采集主题=全运会, 页码=33
采集主题=全运会, 页码=33
采集主题=全运

Unnamed: 0,topic,page,weibo_id,author,author_id,author_url,publish_time,content,retweet_count,review_count,like_count,detail_url,fetched_review_total,comments,reviewers
0,全运会,1,5237267904857892,全世界游得最快的人,7806874547,//weibo.com/7806874547?refer_flag=1001030103_,\n 今天16:23\n ...,“相信自己，每天都是新的一天。” ©️新华社 潘展乐 专访 ​,83,29,151,//weibo.com/7806874547/QfDvwknL6?refer_flag=10...,29,"[{""review_id"": """", ""review_time"": ""Wed Nov 26 ...","[{""reviewer_id"": 7806874547, ""reviewer_name"": ..."
1,全运会,1,5237209479515516,无人区不打码,2884596377,//weibo.com/2884596377?refer_flag=1001030103_,\n 今天12:31\n ...,邓亚萍在全运会直播解说中称樊振东为T0级选手，即Top 0，是乒坛最顶尖的那一档，独一档实力...,20,39,556,//weibo.com/2884596377/QfBZhDVq4?refer_flag=10...,39,"[{""review_id"": """", ""review_time"": ""Wed Nov 26 ...","[{""reviewer_id"": 2661954943, ""reviewer_name"": ..."
2,全运会,1,5237223583910249,麦子哥哥QaQ,3847642518,//weibo.com/3847642518?refer_flag=1001030103_,\n 今天13:27\n ...,林雨薇说争取明年亚运会卫冕 | 全运会赛后100米栏铜牌获得者林雨薇接受采访，满分100分给...,524,44,1666,//weibo.com/3847642518/QfCm2gpex?refer_flag=10...,44,"[{""review_id"": """", ""review_time"": ""Wed Nov 26 ...","[{""reviewer_id"": 6867450328, ""reviewer_name"": ..."
3,全运会,1,5237157448124772,平原跑马,1060387602,//weibo.com/1060387602?refer_flag=1001030103_,\n 今天09:05\n ...,你周雨有资格定义一名国手的身份吗？如果缺席境外商业赛事就等同于自动退出国家队，那乒协为何又明...,28,236,4020,//weibo.com/1060387602/QfADmy5CI?refer_flag=10...,226,"[{""review_id"": """", ""review_time"": ""Wed Nov 26 ...","[{""reviewer_id"": 6913420351, ""reviewer_name"": ..."
4,全运会,1,5237177730468242,-Veronica-Y,1740626210,//weibo.com/1740626210?refer_flag=1001030103_,\n 今天10:25\n ...,林高远全运会拿了一金一铜单打也走到八强本来老D们想着他全运会打不好就“借坡下驴”让他走人结果...,0,21,238,//weibo.com/1740626210/QfBa51XOi?refer_flag=10...,21,"[{""review_id"": """", ""review_time"": ""Wed Nov 26 ...","[{""reviewer_id"": 2438911677, ""reviewer_name"": ..."


## 数据持久化

In [12]:
topic_slug = 'quanyunhui_multi'
output_base = RAW_DIR / f'weibo_{topic_slug}_http'
excel_path = output_base.with_suffix('.xlsx')
csv_path = output_base.with_suffix('.csv')
df_quanyunhui.to_excel(excel_path, index=False)
df_quanyunhui.to_csv(csv_path, index=False)
metadata = {
    'keywords': config['TOPICS'],
    'pages_per_keyword': config['PAGE_DEPTH'],
    'records': len(df_quanyunhui),
    'generated_at': dt.datetime.now().isoformat(),
    'paths': {
        'excel': str(excel_path.relative_to(PROJECT_ROOT)),
        'csv': str(csv_path.relative_to(PROJECT_ROOT))
    }
}
with open(output_base.with_suffix('.json'), 'w', encoding='utf-8') as f:
    json.dump(metadata, f, ensure_ascii=False, indent=2)
metadata

{'keywords': ['全运会', '全运会志愿者', '全运会金牌', '全运会开幕式', '全运会闭幕式'],
 'pages_per_keyword': 60,
 'records': 2420,
 'generated_at': '2025-11-26T18:33:47.361661',
 'paths': {'excel': 'data/raw/weibo_quanyunhui_multi_http.xlsx',
  'csv': 'data/raw/weibo_quanyunhui_multi_http.csv'}}

## 采集结果速览

In [13]:
df_quanyunhui['publish_time'] = pd.to_datetime(df_quanyunhui['publish_time'], errors='coerce')
summary = df_quanyunhui.groupby(df_quanyunhui['publish_time'].dt.date)['weibo_id'].count().tail()
print(f'共采集 {len(df_quanyunhui)} 条，时间范围：{df_quanyunhui.publish_time.min()} - {df_quanyunhui.publish_time.max()}')
summary

共采集 2420 条，时间范围：NaT - NaT


Series([], Name: weibo_id, dtype: int64)