In [None]:
pip install selenium
pip install webdriver-manager  # 这个库可以自动管理浏览器驱动

In [None]:
import time
import csv
import os
import random
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# --- 1. 配置区域 ---

# Output directory for the scraped CSV; created below when it is missing.
SAVE_DIR = r"D:\5507课件\作业"
# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test and is a no-op when the directory already exists.
os.makedirs(SAVE_DIR, exist_ok=True)

# The file name records that the crawl starts from page 1.
CSV_FILE = os.path.join(SAVE_DIR, '泡泡玛特_1页起_截止250601.csv')

# Base URL of the Weibo page to crawl; "?page=N" is appended per page.
BASE_URL = "https://weibo.com/p/1008088142edf6a82c40093893fa3c5cff63e8/super_index"

# Core crawl settings
START_PAGE = 1        # first page to crawl
MAX_PAGE = 500        # deliberately large upper bound on the page number
CUTOFF_DATE = datetime(2025, 6, 1)  # cutoff date: 2025-06-01

# --- 2. 辅助函数 ---

def save_to_csv(data_list):
    """Append scraped post rows to ``CSV_FILE``.

    The header row is written whenever the target file is missing or still
    empty, so a zero-byte file left behind by an earlier run still ends up
    with column names.

    Args:
        data_list: Iterable of dicts keyed by the column names listed in
            ``fieldnames`` below.

    Returns:
        None. Failures while opening or writing the file are reported on
        stdout instead of being raised, so one failed save does not abort
        the caller.
    """
    fieldnames = ['发布者', '发布时间', '微博内容', '转发', '评论', '点赞']
    try:
        # Checking only "does the file exist" would skip the header for an
        # existing-but-empty file, so also look at its size.
        needs_header = (
            not os.path.isfile(CSV_FILE) or os.path.getsize(CSV_FILE) == 0
        )
        with open(CSV_FILE, 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            if needs_header:
                writer.writeheader()
            writer.writerows(data_list)
    except (OSError, csv.Error, ValueError) as e:
        # OSError: file locked / path missing; ValueError: a row carries a
        # key that is not in fieldnames (raised by DictWriter).
        print(f"保存出错: {e}")

# "2023-12-20", "2019-6-5 12:00" or "2019年6月5日" at the start of the text.
_FULL_DATE_RE = re.compile(r'(\d{4})(?:-|年)(\d{1,2})(?:-|月)(\d{1,2})')
# Year-less "06-15 12:00" or "6月15日 12:00" at the start of the text.
_MONTH_DAY_RE = re.compile(r'(\d{1,2})(?:-|月)(\d{1,2})')


def _latest_past_date(month, day, now):
    """Return the most recent ``month``/``day`` date that is not after ``now``.

    Tries the current year first and falls back to the previous year, so a
    year-less "12-31" seen in January is not mapped to a future date.
    Returns ``now`` when neither year yields a valid, non-future date.
    """
    for year in (now.year, now.year - 1):
        try:
            candidate = datetime(year, month, day)
        except ValueError:
            # Invalid calendar date for this year (e.g. Feb 29, month 13).
            continue
        if candidate <= now:
            return candidate
    return now


def parse_weibo_date(date_str):
    """Parse a post timestamp string into a ``datetime`` (date part only).

    Supported forms, matched at the start of the text:
      * full dates: "2023-12-20", "2019-6-5 12:00", "2019年6月5日"
      * year-less dates: "06-15 12:00", "6月15日 12:00", resolved to the most
        recent such date that is not in the future

    Args:
        date_str: Raw timestamp text; any value is accepted and converted
            with ``str()``.

    Returns:
        A ``datetime`` at midnight of the parsed day. Empty, relative
        ("今天", "10分钟前") or otherwise unparseable text yields
        ``datetime.now()``, i.e. the post is treated as new.
    """
    now = datetime.now()
    text = str(date_str).strip()
    if not text:
        return now

    full = _FULL_DATE_RE.match(text)
    if full:
        # Use the captured groups rather than slicing a fixed 10 characters:
        # non-padded dates such as "2019-6-5 12:00" are shorter than that.
        try:
            return datetime(*map(int, full.groups()))
        except ValueError:
            return now

    partial = _MONTH_DAY_RE.match(text)
    if partial:
        month, day = map(int, partial.groups())
        return _latest_past_date(month, day, now)

    # Relative expressions such as "今天" or "10分钟前": treat as a new post.
    return now

def aggressive_scroll(driver):
    """Scroll to the bottom repeatedly until the page stops growing.

    After each scroll the function waits a random 2-3 seconds and re-reads
    the document height. The page counts as fully loaded once the height
    has been unchanged for three consecutive checks; any growth resets that
    counter. Finally the view is nudged 200px back up.

    Args:
        driver: WebDriver-like object providing ``execute_script``.
    """
    height_js = "return document.body.scrollHeight"

    print("   正在滚动加载...", end="", flush=True)
    known_height = driver.execute_script(height_js)
    unchanged_rounds = 0

    while unchanged_rounds < 3:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give the page time to append more content before measuring again.
        time.sleep(random.uniform(2.0, 3.0))

        current_height = driver.execute_script(height_js)
        if current_height != known_height:
            # The page grew: start counting idle rounds from scratch.
            unchanged_rounds = 0
            known_height = current_height
            continue

        unchanged_rounds += 1
        print(".", end="", flush=True)

    driver.execute_script("window.scrollBy(0, -200);")
    print(" 完成")

def click_expand_text(driver):
    """Click every "展开" (expand) link on the current page, best effort.

    Links are clicked through JavaScript rather than ``element.click()``.
    Any failure to look up the links or to click an individual link is
    ignored, so the caller always continues.

    Args:
        driver: WebDriver-like object providing ``find_elements`` and
            ``execute_script``.
    """
    try:
        expand_links = driver.find_elements(By.XPATH, "//a[contains(text(), '展开')]")
    except Exception:
        # Catch Exception, not a bare except, so Ctrl+C (KeyboardInterrupt)
        # and SystemExit still stop the program.
        return

    for link in expand_links:
        try:
            driver.execute_script("arguments[0].click();", link)
        except Exception:
            # A link that cannot be clicked must not block the others.
            continue

def get_cards(driver):
    """Return the post card elements found on the current page.

    Looks up ``div[action-type='feed_list_item']`` first; when that yields
    nothing, returns whatever ``div.WB_cardwrap`` matches instead.
    """
    primary = driver.find_elements(By.CSS_SELECTOR, "div[action-type='feed_list_item']")
    return primary or driver.find_elements(By.CSS_SELECTOR, "div.WB_cardwrap")

# --- 3. 主程序 ---

if __name__ == '__main__':
    # Crawl pages START_PAGE..MAX_PAGE of BASE_URL, appending every post that
    # is not older than CUTOFF_DATE to CSV_FILE. The crawl stops early when a
    # page contains only posts older than the cutoff.
    chrome_options = Options()
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # Kept inside the try block so the browser is still closed by the
        # finally clause if this call fails.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        })

        driver.get("https://weibo.com/")
        driver.maximize_window()

        print("\n" + "="*50)
        print(f"【任务配置】")
        print(f"起始页码: {START_PAGE}")
        print(f"截止日期: {CUTOFF_DATE.strftime('%Y-%m-%d')}")
        print(f"策略: 遇到旧贴自动跳过，整页全旧则停止。")
        print(f"请在 25 秒内扫码登录...")
        print("="*50 + "\n")
        # Fixed pause that gives the user time to log in manually.
        time.sleep(25)

        for page in range(START_PAGE, MAX_PAGE + 1):
            print(f"\n>>> [第 {page} 页] 处理中...")

            # --- Retry: reload the page when too few cards were loaded ---
            retry_limit = 2
            cards = []

            for attempt in range(retry_limit + 1):
                if attempt > 0:
                    print(f"   [重试] 加载不全(只有{len(cards)}条)，正在第{attempt}次刷新...")

                driver.get(f"{BASE_URL}?page={page}")
                time.sleep(5)  # base wait for the initial render

                aggressive_scroll(driver)
                click_expand_text(driver)

                cards = get_cards(driver)

                # More than 5 cards counts as a successful load: stop retrying.
                if len(cards) > 5:
                    break
                if attempt == retry_limit:
                    print(f"   [放弃] 重试后依然只有 {len(cards)} 条，强行处理。")

            # --- Extraction and cutoff-date check ---
            current_page_data = []
            valid_posts_count = 0
            skipped_posts_count = 0

            for card in cards:
                # Exception (not a bare except) is caught throughout so that
                # Ctrl+C still interrupts a long crawl.
                try:
                    # 1. Publish time: prefer the element's title attribute,
                    #    fall back to its visible text.
                    pub_time_text = ""
                    try:
                        time_elem = card.find_element(By.CSS_SELECTOR, "a[node-type='feed_list_item_date']")
                        pub_time_text = time_elem.text
                        title_time = time_elem.get_attribute("title")
                        if title_time:
                            pub_time_text = title_time
                    except Exception:
                        pass

                    post_date = parse_weibo_date(pub_time_text)

                    # 2. Cutoff check: posts older than the cutoff are skipped
                    #    and the loop moves on to the next card.
                    if post_date < CUTOFF_DATE:
                        skipped_posts_count += 1
                        continue

                    valid_posts_count += 1

                    # 3. Content: the whole card text is the fallback when the
                    #    dedicated content node is missing.
                    content = card.text.replace('\n', ' ')
                    try:
                        content = card.find_element(By.CSS_SELECTOR, "div[node-type='feed_list_content']").text.strip()
                    except Exception:
                        pass

                    author = "未知"
                    try:
                        author = card.find_element(By.CSS_SELECTOR, "div.WB_info a.S_txt1").text
                    except Exception:
                        pass

                    fwd, cmt, like = "0", "0", "0"
                    try:
                        handles = card.find_elements(By.CSS_SELECTOR, "ul.WB_row_line li")
                        if len(handles) >= 4:
                            fwd = handles[1].text.replace('转发', '').strip() or "0"
                            cmt = handles[2].text.replace('评论', '').strip() or "0"
                            # NOTE(review): 'ñ' is presumably the icon-font
                            # glyph shown next to the like count - confirm.
                            like = handles[3].text.replace('ñ', '').strip() or "0"
                    except Exception:
                        pass

                    if content:
                        current_page_data.append({
                            '发布者': author, '发布时间': pub_time_text, '微博内容': content,
                            '转发': fwd, '评论': cmt, '点赞': like
                        })
                except Exception:
                    # A card that cannot be processed is skipped.
                    continue

            # --- Save and stop logic ---
            if current_page_data:
                save_to_csv(current_page_data)
                print(f"   [保存] 本页录入 {len(current_page_data)} 条 (跳过 {skipped_posts_count} 条旧数据)")
            else:
                print(f"   [提示] 本页无有效录入。")
                # Old posts were skipped and nothing new was found: the crawl
                # has moved past the cutoff date, so stop right away.
                if skipped_posts_count > 0 and valid_posts_count == 0:
                    print(f"\n   ★ 触发停止条件：本页所有数据均早于 {CUTOFF_DATE.strftime('%Y-%m-%d')}")
                    break

            # Random pause between pages to reduce the risk of being blocked.
            time.sleep(random.randint(4, 7))

    except Exception as e:
        print(f"发生错误: {e}")
    finally:
        print(f"任务结束，文件: {CSV_FILE}")
        driver.quit()