In [1]:
import re
from collections import defaultdict

# =========================
# 美食主题词表
# =========================
FOOD_CATEGORIES = {
    "ingredient": [
        "米饭","面条","鸡蛋","猪肉","牛肉","羊肉","鸡肉","鱼","虾","蟹",
        "豆腐","土豆","西红柿","胡萝卜","青椒","洋葱","蒜","姜","葱",
        "面粉","酱油","醋","盐","糖","油","辣椒","香菜","花椒","八角",
        "茶叶","奶","奶酪","面包","蘑菇","海鲜","藕","紫菜","茄子",'菜','汤','饭','米'
    ],
    "method": [
        "炒","煮","蒸","烤","炖","炸","煎","煨","凉拌","熬","腌","焯","熏","焗","煲","拌","红烧","清蒸","水煮","爆炒"
    ],
    "dish": [
        "宫保鸡丁","红烧肉","麻婆豆腐","鱼香肉丝","酸辣土豆丝","回锅肉","西红柿炒蛋",
        "青椒炒肉","小龙虾","火锅","麻辣烫","烧烤","拉面","寿司","咖喱","汉堡","披萨",
        "沙拉","炒饭","面包","点心","粤菜","川菜","湘菜","鲁菜","苏菜","闽菜","浙菜","徽菜","西餐","中餐","日料","韩餐"
    ],
    "taste": [
        "鲜","香","辣","咸","甜","酸","苦","麻","鲜美","浓郁","清淡","爽口","细腻","柔软","脆","酥","滑","嚼劲","入味","回味","清香","丰富"
    ],
    "review": [
        "好吃","难吃","美味","可口","味道棒",
       "吃过","必吃",
    ],
    "context": [
        "餐厅","厨房","烹饪","美食","菜谱","食谱","饭店","饮食","小吃","夜市","宴席",
        "早餐","午餐","晚餐","家常菜","下饭菜","甜品","饮料","饮品","烹调","美味佳肴",
        "餐桌","聚餐","食文化","地方特色菜"
    ]
}


# =========================
# 核心过滤函数
# =========================
def is_food_related(text: str, min_category_hits: int = 3) -> bool:
    """
    判断一段文本是否与美食相关（命中>=2种类别关键词即为True）
    :param text: 输入文本
    :param min_category_hits: 至少命中多少类关键词才算通过
    :return: bool
    """
    if not text or len(text.strip()) < 2:
        return False

    hits = defaultdict(bool)
    for category, keywords in FOOD_CATEGORIES.items():
        for kw in keywords:
            if re.search(re.escape(kw), text):
                hits[category] = True
                break  # 命中一个词就够了，不重复统计该类
    if sum(hits.values()) >= min_category_hits:
        print(hits)
    return sum(hits.values()) >= min_category_hits



In [4]:
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
from datasets import load_dataset
from huggingface_hub import login

# 1. 直接登录（安全写法：不要硬编码 token）
login("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")
dataset = load_dataset("BAAI/CCI3-HQ", split="train",streaming=True,token=True)



In [None]:
import json
data = []
a = 0
with open('data.log','w',encoding='utf-8') as f1:
    for example in dataset:
        if is_food_related( example['text'],min_category_hits=5):
            data.append(example)
            f1.write(str(a)+'\n')
            a  += 1
        if len(data) >= 1000:
            with open(str(a) + '.json','w',encoding='utf-8' ) as f:
                json.dump(data,f,ensure_ascii=True,indent=2)
                data = []
        if a > 150000:
            break
        

defaultdict(<class 'bool'>, {'ingredient': True, 'dish': True, 'taste': True, 'review': True, 'context': True})
defaultdict(<class 'bool'>, {'ingredient': True, 'method': True, 'dish': True, 'taste': True, 'context': True})
defaultdict(<class 'bool'>, {'ingredient': True, 'method': True, 'dish': True, 'taste': True, 'context': True})
defaultdict(<class 'bool'>, {'ingredient': True, 'method': True, 'taste': True, 'review': True, 'context': True})
defaultdict(<class 'bool'>, {'ingredient': True, 'method': True, 'taste': True, 'review': True, 'context': True})
defaultdict(<class 'bool'>, {'ingredient': True, 'method': True, 'dish': True, 'taste': True, 'review': True})
defaultdict(<class 'bool'>, {'ingredient': True, 'method': True, 'dish': True, 'taste': True, 'review': True, 'context': True})
defaultdict(<class 'bool'>, {'ingredient': True, 'method': True, 'dish': True, 'taste': True, 'context': True})
defaultdict(<class 'bool'>, {'ingredient': True, 'method': True, 'dish': True, 'taste