In [9]:
import requests
from lxml import etree
import pandas as pd
from urllib.parse import urljoin

# Browser-like request headers shared by the HTTP requests in this notebook.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0"
    ),
}

# Fetch one list page of a stock's Guba (stock forum) board
def get_page_content(stock_code, page_num, timeout=10):
    """
    Download the HTML of one Guba list page.

    :param stock_code: stock code inserted into the list URL
    :param page_num: page number inserted into the list URL
    :param timeout: seconds to wait for the server before giving up
    :return: decoded page HTML, or None when the request fails, times out,
             or the server answers with an HTTP error status
    """
    url = f"https://guba.eastmoney.com/list,{stock_code}_{page_num}.html"
    try:
        # Reuse the module-level `headers` (same User-Agent as parse_page) and
        # always pass a timeout so a stalled server cannot hang the notebook.
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx answers into an exception instead of returning an
        # error page as if it were a valid list page.
        response.raise_for_status()
        # Let requests guess the charset from the body before decoding.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException as e:
        print(f"请求页面时发生错误: {e}")
        return None

# Parse one list page and collect the details of every linked post
def parse_page(html, max_rows=80, timeout=10):
    """
    Extract post records from the HTML of a Guba list page.

    For each table row that links to a post (links containing 'caifuhao'
    are skipped), the linked detail page is downloaded and its body text is
    stored together with the row's title, author and update time. Rows whose
    detail request fails are reported on stdout and skipped.

    :param html: HTML text of the list page; None or blank input yields []
    :param max_rows: maximum number of table rows to inspect (default 80)
    :param timeout: seconds to wait for each detail-page request
    :return: list of dicts with the keys 记录ID, 评论标题, 评论内容,
             发布时间 and 用户ID
    """
    # Nothing to parse: avoid handing None/blank text to the HTML parser.
    if not html or not html.strip():
        return []
    tree = etree.HTML(html)
    if tree is None:  # guard in case the parser produced no document
        return []

    # Query the rows once, then use paths relative to each row instead of
    # re-scanning the whole document several times per row index.
    rows = tree.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr')

    comments = []
    for row in rows[:max_rows]:
        hrefs = row.xpath('td[3]/div/a/@href')
        if not hrefs or 'caifuhao' in hrefs[0]:
            continue
        link = hrefs[0]
        title = row.xpath('td[3]/div/a/text()')
        user_name = row.xpath('td[4]/div/a/text()')
        update_time = row.xpath('td[5]/div/text()')

        # The ID is taken from the third comma-separated part of the href,
        # up to the first dot. A link without that part gets an empty ID
        # instead of raising IndexError and aborting the whole page.
        href_parts = link.split(',')
        comment_id = href_parts[2].split('.')[0] if len(href_parts) > 2 else ""

        target_url = urljoin("https://guba.eastmoney.com", link)
        try:
            target_response = requests.get(target_url, headers=headers, timeout=timeout)
            target_response.encoding = target_response.apparent_encoding
            detail_text = target_response.text
        except requests.RequestException as e:
            print(f"请求评论详情页面时发生错误: {e}")
            continue

        # A blank detail page is stored with empty content rather than
        # crashing on a missing parse tree.
        detail_tree = etree.HTML(detail_text) if detail_text.strip() else None
        if detail_tree is None:
            body_nodes = []
        else:
            body_nodes = detail_tree.xpath('//*[@id="newscontent"]/div[4]')
        comment_text = body_nodes[0].xpath('string(.)').strip() if body_nodes else ""

        comments.append({
            "记录ID": comment_id,
            "评论标题": title[0].strip() if title else "",
            "评论内容": comment_text,
            "发布时间": update_time[0].strip() if update_time else "",
            "用户ID": user_name[0].strip() if user_name else ""
        })
    return comments

# Write the collected records to an Excel workbook
def save_to_excel(data, filename):
    """
    Save the records to an Excel file.

    :param data: list of record dicts (one row per dict, keys become columns)
    :param filename: path of the Excel file to write
    """
    df = pd.DataFrame(data)
    # No `encoding` argument: it was deprecated in pandas 1.5 and removed in
    # pandas 2.0, where passing it raises TypeError. It had no effect on
    # .xlsx output.
    df.to_excel(filename, index=False)


In [10]:
# Main program: crawl a range of list pages and export everything collected
if __name__ == "__main__":
    stock_code = '510050'
    start_page = 12  # first list page to crawl
    end_page = 13    # last list page to crawl (inclusive)
    all_comments = []  # accumulates the records from every page

    # Crawl every page from start_page through end_page
    for page_num in range(start_page, end_page + 1):
        print(f"正在爬取第{page_num}页...")
        html_content = get_page_content(stock_code, page_num)
        if not html_content:
            # The list page could not be downloaded; move on to the next one.
            print(f"无法获取第{page_num}页的内容，请检查网页链接的合法性或稍后重试")
            continue

        comments = parse_page(html_content)
        if not comments:
            print(f"第{page_num}页未找到有效评论数据")
            continue

        # Add this page's records to the overall list.
        all_comments.extend(comments)

    if not all_comments:
        print("未找到任何有效评论数据")
    else:
        save_to_excel(all_comments, "股吧数据记录.xlsx")
        print("所有数据已成功保存到Excel文件")

正在爬取第12页...
正在爬取第13页...
所有数据已成功保存到Excel文件
