In [3]:
import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests

# Serializes updates to the shared skip counter: download_image is run on
# several worker threads, and ``+=`` on a dict entry is a read-modify-write.
_SKIP_COUNT_LOCK = threading.Lock()

# Seconds to wait on a request before giving up. Without a timeout a stalled
# connection would block its worker thread indefinitely.
_REQUEST_TIMEOUT = 30


def download_image(image_url, save_dir, mime_types, skip_count):
    """Download one image into ``save_dir`` unless it is already present.

    The target file name is the last path component of ``image_url`` (query
    string removed) plus the extension that ``mime_types`` maps the response's
    Content-Type to. If that file already exists, nothing is downloaded and
    ``skip_count['count']`` is incremented instead.

    Args:
        image_url: URL of the image to fetch.
        save_dir: Existing directory the image is written into.
        mime_types: Mapping of Content-Type values (e.g. ``"image/png"``) to
            file extensions including the leading dot.
        skip_count: Mutable mapping with an integer ``'count'`` entry, shared
            between callers, counting images skipped as already downloaded.

    Returns:
        None. Failures are reported with ``print`` and never raised, so one
        bad URL does not abort a batch.
    """
    try:
        # A HEAD request is enough to learn the content type (and therefore
        # the extension) without transferring the image body.
        head_response = requests.head(image_url, timeout=_REQUEST_TIMEOUT)
        content_type = head_response.headers.get('Content-Type', '')

        # Try the raw header value first, then the bare media type with any
        # parameters (e.g. "; charset=...") stripped and case normalized.
        media_type = content_type.split(';', 1)[0].strip().lower()
        extension = mime_types.get(content_type, mime_types.get(media_type, ''))

        # Drop the query string *before* taking the basename so that a "/"
        # inside the query cannot be mistaken for a path separator.
        base_name = os.path.basename(image_url.split('?', 1)[0])
        if not base_name:
            print(f"无法下载或保存图片 {image_url}: 无法从URL中提取文件名")
            return
        image_path = os.path.join(save_dir, base_name + extension)

        # Skip images that have already been downloaded.
        if os.path.exists(image_path):
            with _SKIP_COUNT_LOCK:
                skip_count['count'] += 1
            return

        # Fetch the image body.
        response = requests.get(image_url, timeout=_REQUEST_TIMEOUT)
        if response.status_code != 200:
            # Report the failure instead of silently dropping the image.
            print(f"无法下载或保存图片 {image_url}: HTTP {response.status_code}")
            return

        # Persist the image bytes.
        with open(image_path, 'wb') as image_file:
            image_file.write(response.content)
    except Exception as e:
        # Deliberately broad: this is the worker boundary and the download is
        # best-effort, so any failure is logged and the batch carries on.
        print(f"无法下载或保存图片 {image_url}: {e}")

# Counter of images skipped because they already exist on disk. It is a dict
# so that download_image can update it in place through the shared reference.
skip_count = {'count': 0}

# Input file and output directory.
file_path = 'data/tweets_2024-03-07_22-02-24.json'
save_dir = '𝕏_likes'
os.makedirs(save_dir, exist_ok=True)

# Content-Type -> file extension appended to the saved file name.
mime_types = {
    "image/jpeg": ".jpg",
    "image/png": ".png",
    "image/gif": ".gif",
}

# The input holds one JSON document per line; blank lines are ignored so a
# trailing newline or spacer line does not abort the whole run.
with open(file_path, 'r', encoding='utf-8') as file:
    tweets = [json.loads(line) for line in file if line.strip()]

# Download the images concurrently on a small thread pool.
with ThreadPoolExecutor(max_workers=5) as executor:
    # Maps each submitted future to the URL it downloads, so a failure can be
    # reported against the URL rather than the whole tweet record.
    future_to_url = {}
    for tweet in tweets:
        if tweet.get('media_type') != 'Image' or not tweet.get('images_urls'):
            continue
        # NOTE(review): only the first image of each tweet is downloaded;
        # confirm whether tweets with several images should fetch them all.
        image_url = tweet['images_urls'][0]
        future = executor.submit(
            download_image, image_url, save_dir, mime_types, skip_count
        )
        future_to_url[future] = image_url

    for future in as_completed(future_to_url):
        url = future_to_url[future]
        try:
            # The result itself is unused; calling result() surfaces any
            # exception raised inside the worker.
            future.result()
        except Exception as exc:
            print(f'{url} generated an exception: {exc}')

print(f"下载完成。跳过重复x {skip_count['count']} ")

无法下载或保存图片 https://pbs.twimg.com/media/GHOJD4daYAAOyFh?format=jpg&name=medium: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
下载完成。跳过重复x 534 
