In [None]:
# Time range: 2024.01 - 2025.02

In [None]:
!pip install pytesseract

In [None]:
!pip install pytesseract pillow

In [None]:
import os
import requests
from bs4 import BeautifulSoup

# 1) WeChat official-account article links to process go in this list
article_urls = []

# 2) Shared HTTP session whose request headers imitate a browser
session = requests.Session()
session.headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/110.0.5481.77 Safari/537.36"
)
session.headers["Referer"] = "https://mp.weixin.qq.com/"

# 3) Folder that receives the downloaded images (created when absent)
save_dir = "wechat_images"
os.makedirs(save_dir, exist_ok=True)

def get_soup(url):
    """Fetch ``url`` through the shared session and parse the HTML.

    Returns a BeautifulSoup object built with the ``html.parser`` backend,
    or ``None`` when the request fails or the server answers with an HTTP
    error status (the failure is printed, not raised).
    """
    try:
        page = session.get(url, timeout=10)
        page.raise_for_status()
        html = page.text
        return BeautifulSoup(html, "html.parser")
    except requests.RequestException as err:
        print(f"请求失败: {url}，错误: {err}")
        return None

def _article_slug(url, max_len=100):
    """Build a filesystem-safe filename prefix from the last segment of ``url``."""
    # Drop a trailing slash first so ".../s/abc/" yields "abc" instead of "".
    tail = url.rstrip("/").split("/")[-1]
    # Query-style links ("s?__biz=...&mid=...") contain characters that are
    # illegal in Windows filenames; keep letters, digits, "-" and "_" only.
    safe = "".join(ch if (ch.isalnum() or ch in "-_") else "_" for ch in tail)
    # Cap the length so long query strings cannot exceed filename limits.
    return safe[:max_len] or "article"


def download_images_from_url(url):
    """Download every image referenced by the article at ``url``.

    The page is fetched with ``get_soup``; nothing happens when that returns
    ``None``. Image addresses are read from the ``data-src`` attribute of each
    ``<img>`` tag, and each image is written to ``save_dir`` as
    ``<slug>_image_<index>.jpg``, where ``<slug>`` is a sanitised form of the
    last URL segment. Request failures for a single image are printed and the
    loop continues with the next one.
    """
    soup = get_soup(url)
    if soup is None:
        return

    # The image address is read from data-src; tags without it are skipped.
    img_urls = [
        img["data-src"]
        for img in soup.find_all("img")
        if "data-src" in img.attrs
    ]
    slug = _article_slug(url)

    for idx, img_url in enumerate(img_urls):
        try:
            img_response = session.get(img_url, timeout=10)
            img_response.raise_for_status()
            # Sanitised article slug + running index form the filename.
            filename = f"{slug}_image_{idx}.jpg"
            img_path = os.path.join(save_dir, filename)

            with open(img_path, "wb") as f:
                f.write(img_response.content)

            print(f"已下载: {img_path}")
        except requests.RequestException as e:
            print(f"下载失败: {img_url}，错误: {e}")

if __name__ == "__main__":
    # Process the configured article links one by one, downloading the
    # images of each article as it is visited.
    for article_url in article_urls:
        print(f"正在处理文章：{article_url}")
        download_images_from_url(article_url)

    # Printed once the loop is over, whatever the per-image outcomes were.
    print("所有文章的图片下载完成！")

In [None]:
import pytesseract
from PIL import Image
import csv
import re

# Directory holding the images to OCR (placeholder: replace with the real path)
image_directory = "Your Address"

# Name of the CSV file that receives the extracted text
output_csv = "bad_loan_output.csv"

def extract_relevant(text):
    """
    Keep only the characters of interest from ``text`` and return them joined.

    Retained characters are CJK ideographs in U+4E00..U+9FFF, ASCII digits and
    the date/time punctuation ``-``, ``/``, ``.`` and ``:``; everything else
    (spaces, Latin letters, commas, ...) is dropped. Returns ``""`` when
    nothing matches.
    """
    # Character class: Chinese characters, digits and - / . :
    pattern = r'[\u4e00-\u9fff0-9\-/\.:]+'
    kept_runs = (match.group(0) for match in re.finditer(pattern, text))
    return ''.join(kept_runs)

def ocr_image(image_path):
    """
    Run OCR on the image at ``image_path`` and return the recognised text.

    Tesseract is invoked with the ``chi_sim+eng`` language setting so that
    digits and date characters are recognised alongside Chinese text.
    """
    # Open the image in a context manager so its file handle is released
    # after OCR instead of staying open for every processed image.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img, lang="chi_sim+eng")

# Collect the image files before touching the CSV: if image_directory is
# wrong, os.listdir raises here and an existing output file is not truncated.
# Sorting makes the row order reproducible (os.listdir order is arbitrary).
image_extensions = (".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif")
image_filenames = sorted(
    name
    for name in os.listdir(image_directory)
    if name.lower().endswith(image_extensions)
)

# Open the CSV for writing; utf-8-sig is used so Chinese text is stored correctly.
with open(output_csv, mode="w", newline='', encoding="utf-8-sig") as file:
    writer = csv.writer(file)

    # Process every image found in the directory
    for filename in image_filenames:
        image_path = os.path.join(image_directory, filename)
        print(f"Processing {image_path}")

        # OCR the whole image, then handle the text line by line
        text = ocr_image(image_path)
        for line in text.split('\n'):
            # Keep only Chinese characters, digits and date punctuation
            relevant_text = extract_relevant(line)
            if relevant_text:
                # One CSV row (single column) per non-empty extracted line
                writer.writerow([relevant_text])

print("CSV file created successfully.")