# Konachan Image Downloader
这个脚本通过 Konachan 的官方 JSON API 获取带有 `genshin_impact` 标签的帖子，过滤掉非安全分级（`rating` 不为 `s`）、包含敏感标签以及带有 `parent_id` 的变体图之后，将其余图片的高清原图保存到 `./dataset/highres/original/` 目录下。
使用 API 爬取比直接解析 HTML 网页更稳定，且能直接获取到无损/高分辨率的 `file_url`。

In [2]:
import os
import requests
import time
from urllib.parse import unquote

# Query settings: the tag to search for and the number of posts requested
# in the single API call.
TAGS = "genshin_impact"
LIMIT = 1000
API_URL = f"https://konachan.net/post.json?tags={TAGS}&limit={LIMIT}"

# Download destination; created (with any missing parents) when this cell /
# module is executed, and left alone if it already exists.
SAVE_DIR = "./dataset/highres/original/"
os.makedirs(name=SAVE_DIR, exist_ok=True)

# Tags that disqualify a post even when its rating is 's'.
_NSFW_KEYWORDS = frozenset(['uncensored', 'nude', 'nsfw', 'nipples', 'sex', 'pussy'])

# (connect, read) timeouts in seconds. Without a timeout, requests can block
# indefinitely on a stalled connection and freeze the whole crawl.
_REQUEST_TIMEOUT = (10, 60)


def _skip_reason(post):
    """Return the log message explaining why *post* is filtered out, or None to keep it."""
    # 1. Rating filter (rating: s=safe, q=questionable, e=explicit).
    rating = post.get('rating')
    if rating != 's':
        return f"跳过 NSFW 图片 (rating: {rating})"

    # 2. Tag filter. Tags are matched as whole words; `or ''` tolerates a
    #    missing or null 'tags' field.
    tag_words = (post.get('tags') or '').lower().split()
    if not _NSFW_KEYWORDS.isdisjoint(tag_words):
        return "跳过包含 NSFW 标签的图片"

    # 3. Posts with a parent_id are treated as near-duplicate variants.
    parent_id = post.get('parent_id')
    if parent_id:
        return f"跳过变体图片 (存在 parent_id: {parent_id})"

    return None


def _filename_from_url(img_url):
    """Return the percent-decoded last path segment of *img_url* as a local file name."""
    name = unquote(img_url.split('/')[-1])
    # Decoding can reintroduce path separators (%2F, %5C); replace them so the
    # joined path cannot point outside the save directory.
    for separator in ('/', '\\'):
        name = name.replace(separator, '_')
    return name


def _download_file(img_url, filepath, headers):
    """Stream *img_url* into *filepath*.

    The body is written to ``filepath + '.part'`` and renamed into place only
    after the stream finished, so an interrupted transfer never leaves a
    truncated file under the final name (which the caller's "already exists"
    check would otherwise skip on the next run).

    Returns True when the file was saved, False on a non-200 reply.
    Network and file-system errors propagate to the caller.
    """
    part_path = filepath + '.part'
    # Using the response as a context manager releases the connection even
    # when the body is not (fully) read.
    with requests.get(img_url, headers=headers, stream=True,
                      timeout=_REQUEST_TIMEOUT) as img_response:
        if img_response.status_code != 200:
            print(f"  -> 下载失败: HTTP {img_response.status_code}")
            return False
        try:
            with open(part_path, 'wb') as f:
                for chunk in img_response.iter_content(1024 * 8):
                    f.write(chunk)
            os.replace(part_path, filepath)
        finally:
            # After a successful rename the temp file is gone; this only
            # removes the leftovers of a failed or interrupted transfer.
            if os.path.exists(part_path):
                os.remove(part_path)
    return True


def download_images():
    """Fetch post metadata from API_URL and download the accepted images into SAVE_DIR.

    Posts are skipped when their rating is not 's', when their tags contain
    one of the blocked keywords, when they have a parent_id, or when they
    carry no file_url. Files already present in SAVE_DIR are not downloaded
    again. Progress and failures are reported with print(); a failed image
    does not stop the remaining downloads. Returns None.
    """
    print(f"正在从 {API_URL} 获取图片元数据...")

    # Browser-like User-Agent so simple anti-bot checks do not reject us.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(API_URL, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # Report connection problems the same way as a bad HTTP status.
        print(f"获取数据失败: {e}")
        return

    if response.status_code != 200:
        print(f"获取数据失败: HTTP {response.status_code}")
        return

    posts = response.json()
    total = len(posts)
    print(f"找到 {total} 张图片。开始下载...")

    for index, post in enumerate(posts, start=1):
        progress = f"[{index}/{total}]"

        reason = _skip_reason(post)
        if reason is not None:
            print(f"{progress} {reason}")
            continue

        # file_url is the original, full-resolution image.
        img_url = post.get('file_url')
        if not img_url:
            continue

        # Protocol-relative URLs need an explicit scheme.
        if img_url.startswith('//'):
            img_url = 'https:' + img_url

        filename = _filename_from_url(img_url)
        filepath = os.path.join(SAVE_DIR, filename)

        if os.path.exists(filepath):
            print(f"{progress} 文件已存在，跳过: {filename}")
            continue

        print(f"{progress} 正在下载: {filename}")
        try:
            _download_file(img_url, filepath, headers)
        except Exception as e:
            # Deliberate per-image boundary: one bad download must not abort
            # the rest of the batch.
            print(f"  -> 下载出错 {filename}: {e}")

        # Be polite: pause 1 second after every download attempt to limit
        # server load and the risk of an IP ban.
        time.sleep(1)

    print("\n所有下载任务完成！")

# Entry point: start the crawl only when this code runs as the main program,
# not when it is imported from another module.
if __name__ == "__main__":
    download_images()

正在从 https://konachan.net/post.json?tags=genshin_impact&limit=1000 获取图片元数据...
找到 1000 张图片。开始下载...
[1/1000] 正在下载: Konachan.com - 399613 breasts cho!cho! genshin_impact gray_hair long_hair navel ofuda see_through shenhe_(genshin_impact) spear weapon.jpg
[2/1000] 正在下载: Konachan.com - 399583 building furina_(genshin_impact) garter genshin_impact gloves gray_hair hat long_hair moon night sky sushispin tears water.jpg
[3/1000] 跳过 NSFW 图片 (rating: e)
[4/1000] 跳过 NSFW 图片 (rating: e)
[5/1000] 跳过 NSFW 图片 (rating: q)
[6/1000] 正在下载: Konachan.com - 399401 aratakosu_(tako's) braids breasts building chinese_clothes city cleavage genshin_impact gray_hair long_hair ponytail see_through wristwear yellow_eyes.png
[7/1000] 正在下载: Konachan.com - 399397 animal brown_hair chibi close genshin_impact gray_eyes gray_hair hat horse japanese_clothes long_hair purple_hair qiqi_(genshin_impact) yu_e_baba.jpg
[8/1000] 跳过 NSFW 图片 (rating: q)
[9/1000] 正在下载: Konachan.com - 399290 braids chinese_dress dress ermu_(enmmm) g