In [1]:
# Authenticate against the Hugging Face Hub using a token kept outside the notebook.
from huggingface_hub import login
from dotenv import load_dotenv
# Must run before os.getenv below — presumably this loads a local .env file
# into the process environment (python-dotenv); confirm the .env location.
load_dotenv()

import os
# NOTE(review): os.getenv returns None when HG_KEY is unset; confirm how
# login() behaves with token=None in this environment.
token = os.getenv("HG_KEY")
login(token=token)

### Restructure huatuo data (AI Farm)

#### General

In [36]:
import json
import random
from collections import defaultdict
from typing import Dict, List
from tqdm import tqdm

def process_huatuo_data(
    file_path: str,
    dataset_type: str,  # 'train', 'valid', 'test'
    output_data: Dict[str, List[List[str]]],
    percent: float = 100,
    shuffle: bool = False
):
    """Collect ``[instruction, output]`` pairs of one split from a JSONL file.

    Each non-blank line of ``file_path`` is parsed as a JSON object. The
    record's ``id`` string is searched for a known source name (which becomes
    the grouping label) and for a split marker (``train_datasets``,
    ``validation_datasets`` or ``test_datasets``). Records whose id matches
    no known source are dropped. Records without any split marker are
    treated as belonging to the ``'train'`` split.

    Args:
        file_path: Path to a UTF-8 JSONL file.
        dataset_type: Split to keep: ``'train'``, ``'valid'`` or ``'test'``.
        output_data: Mapping ``label -> list of [instruction, output]`` that
            is extended in place. Any mutable mapping works (a plain ``dict``
            as well as a ``defaultdict(list)``).
        percent: Percentage (0 < percent <= 100) of each label's examples to
            keep, taken from the front of the (optionally shuffled) list.
        shuffle: Shuffle each label's examples with the global ``random``
            state before truncating; seed ``random`` beforehand for
            reproducible results.

    Raises:
        AssertionError: If ``dataset_type`` or ``percent`` is out of range.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    assert dataset_type in {'train', 'valid', 'test'}, "dataset_type must be one of 'train', 'valid', 'test'"
    assert 0 < percent <= 100, "percent must be between 0 and 100"

    # Substring of the record id -> label used as key in output_data.
    classification_map = {
        'HuatuoGPT2_Pretrain_Meidcal_Encyclopedia_cn': 'Meidcal_Encyclopedia_cn',
        'HuatuoGPT2_Pretrain_Meidcal_Encyclopedia_en': 'Meidcal_Encyclopedia_en',
        'huatuo_encyclopedia_qa': 'huatuo_encyclopedia_qa',
        'huatuo_knowledge_graph_qa': 'huatuo_knowledge_graph_qa',
    }

    # Substring of the record id -> split name.
    dataset_map = {
        'train_datasets': 'train',
        'validation_datasets': 'valid',
        'test_datasets': 'test',
    }

    def last_match(text, mapping):
        # Value of the last mapping key contained in `text`, or None.
        found = None
        for needle, label in mapping.items():
            if needle in text:
                found = label
        return found

    # Per-label staging area so the percentage cut applies to this file only.
    temp_data = defaultdict(list)

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)

            data = json.loads(line)
            src_id = data.get("id") or ""

            matched_class = last_match(src_id, classification_map)
            if matched_class is None:
                continue

            # Ids without a split marker count as training data.
            matched_type = last_match(src_id, dataset_map) or 'train'
            if matched_type != dataset_type:
                continue

            # `or ""` also covers fields that are present but null.
            instruction = (data.get("instruction") or "").strip()
            output = (data.get("output") or "").strip()
            temp_data[matched_class].append([instruction, output])

    # Optionally shuffle, keep the leading `percent` %, and merge into output_data.
    for key, examples in temp_data.items():
        if shuffle:
            random.shuffle(examples)
        keep_n = int(len(examples) * (percent / 100))
        # setdefault keeps this working for plain dicts, not only defaultdict.
        output_data.setdefault(key, []).extend(examples[:keep_n])

# Accumulate the 'valid' split of both translated JSONL files into one mapping.
# percent=5 keeps the first 5 % of each label per file (no shuffling).
# NOTE: `output_data` is read by the following cells (summary + JSON dump).
output_data = defaultdict(list)
process_huatuo_data('/mnt/c/Users/HOME/Downloads/HuatuoGPT-II/all_data/huatuogpt2_vi.jsonl', 'valid', output_data, percent=5)
process_huatuo_data('/mnt/c/Users/HOME/Downloads/HuatuoGPT-II/all_data/huatuo26m_vi.jsonl', 'valid', output_data, percent=5)

0it [00:00, ?it/s]

65765it [00:06, 10316.82it/s]
57080it [00:03, 16859.61it/s]


In [37]:
# Number of collected examples per label; the bare `s` renders the dict as cell output.
s = {label: len(examples) for label, examples in output_data.items()}
s

{'huatuo_knowledge_graph_qa': 2, 'huatuo_encyclopedia_qa': 10}

In [38]:
# Persist the collected label -> [[instruction, output], ...] mapping as pretty-printed JSON.
# ensure_ascii=False keeps non-ASCII text readable in the file instead of \u escapes.
with open("/mnt/c/Users/HOME/Downloads/HuatuoGPT-II/all_data/huatuo_validated_5p.json", "w", encoding="utf-8") as f:
    json.dump(output_data, f, ensure_ascii=False, indent=2)

#### Old ver

In [None]:
import os
import json
import random

def convert_jsonl_to_huatuo_format(file1, file2, output_file, percent=1):
    """Merge the leading part of two JSONL files into one ``{type: [[q, a], ...]}`` JSON file.

    Args:
        file1: JSONL file whose record ids are expected to contain one of the
            known source types. Records with an unrecognised id are filed
            under ``"huatuo_knowledge_graph_qa"``.
        file2: JSONL file whose records all go under
            ``"huatuo_knowledge_graph_qa"``. A record whose id does contain a
            known source type is reported on stdout and skipped.
        output_file: Path of the JSON file to write.
        percent: Despite the name this is a FRACTION in ``[0, 1]`` (``1``
            keeps everything, ``0.001`` keeps 0.1 %). The first
            ``int(len(data) * percent)`` records of each file are used.

    Records whose ``instruction`` or ``output`` is missing, null or blank
    after stripping are skipped.
    """
    # Source types recognised inside a record id.
    known_types = [
        "Meidcal_Web_Corpus_en", "Meidcal_Web_Corpus_cn",
        "Meidcal_Literature_cn", "Meidcal_Literature_en",
        "Meidcal_Encyclopedia_cn", "Meidcal_Encyclopedia_en",
        "Meidcal_Books_cn", "Meidcal_Books_en",
        "SFT_data"
    ]
    fallback_key = "huatuo_knowledge_graph_qa"

    def load_jsonl(filepath):
        # Parse every non-blank line of a UTF-8 JSONL file.
        with open(filepath, "r", encoding="utf-8") as f:
            return [json.loads(line.strip()) for line in f if line.strip()]

    def leading_fraction(records):
        # Keep the first `percent` fraction of the records.
        return records[:int(len(records) * percent)]

    def match_known_type(item):
        # First known type contained in the record id, or None.
        id_str = item.get("id") or ""
        return next((k for k in known_types if k in id_str), None)

    def clean_pair(item):
        # Stripped [question, answer], or None when either side is empty/missing.
        q = (item.get("instruction") or "").strip()
        a = (item.get("output") or "").strip()
        return [q, a] if q and a else None

    # Load both inputs up front so a missing file fails before anything is written.
    data1 = leading_fraction(load_jsonl(file1))
    data2 = leading_fraction(load_jsonl(file2))

    merged_data = {}

    # file1: group by detected type, falling back to the knowledge-graph bucket.
    for item in data1:
        key = match_known_type(item) or fallback_key
        pair = clean_pair(item)
        if pair is None:
            continue
        merged_data.setdefault(key, []).append(pair)

    # file2: everything belongs to the knowledge-graph bucket.
    for item in data2:
        if match_known_type(item) is not None:
            print('UNKNOWN_TYPE', item)
            continue  # unexpected record in file2: report and skip
        pair = clean_pair(item)
        if pair is None:
            continue
        merged_data.setdefault(fallback_key, []).append(pair)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(merged_data, f, ensure_ascii=False, indent=2)

    print(f"✅ Saved merged dataset to {output_file}")
    for key, items in merged_data.items():
        print(f" - {key}: {len(items)} samples")

# 🔧 Example call:
# `p` is a fraction, not a percentage: 0.1/100 keeps the first 0.1 % of each file.
p = 0.1/100
file1 = "/mnt/c/Users/HOME/Downloads/HuatuoGPT-II/all_data/huatuogpt2_vi.jsonl"
file2 = "/mnt/c/Users/HOME/Downloads/HuatuoGPT-II/all_data/huatuo26m_vi.jsonl"
# The output is written next to file1. The file name embeds str(p*100);
# NOTE(review): that is a float repr, so check the resulting name is the one expected downstream.
convert_jsonl_to_huatuo_format(
    file1=file1,
    file2=file2,
    output_file=os.path.join(os.path.dirname(file1), "train_qa_" + str(p*100) + "p_vi.json"),
    percent=p
)

### Check unique ids

In [18]:
import json

# Dictionary to store counts of unique base IDs
# Occurrences of each base id (the part of "id" before the first '---').
id_counts = {}

with open('/mnt/c/Users/HOME/Downloads/HuatuoGPT-II/all_data/huatuogpt2_vi.jsonl', 'r', encoding='utf-8') as jsonl_file:
    for raw_line in jsonl_file:
        record_id = json.loads(raw_line)['id']
        source_name = record_id.partition('---')[0]
        if source_name in id_counts:
            id_counts[source_name] += 1
        else:
            id_counts[source_name] = 1

# One "base_id: count" line per base id, in sorted order.
report_lines = [f"{name}: {count}" for name, count in sorted(id_counts.items())]

# Show the counts in the cell output ...
for report_line in report_lines:
    print(report_line)

# ... and also save them to a text file in the working directory.
with open('huotuogpt2_unique_ids.txt', 'w', encoding='utf-8') as report_file:
    for report_line in report_lines:
        report_file.write(report_line + "\n")

11_HuatuoGPT2_chinese/HuatuoGPT2_Pretrain_Meidcal_Encyclopedia_cn.json: 43888
11_HuatuoGPT2_chinese/HuatuoGPT2_Pretrain_Meidcal_Encyclopedia_en.json: 21877


### Check null input

In [None]:
import json, os

input_file = "/mnt/c/Users/HOME/Downloads/HuatuoGPT-II/all_data/huatuo26m_vi.jsonl"  # Replace with the real file path
# NOTE: this rebinds `output_data` to a list; an earlier cell uses the same
# name for a dict, so re-run that cell before reusing its dump step.
output_data = []

with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines

        item = json.loads(line)
        # `or ""` also covers fields that are present but null.
        instruction = (item.get("instruction") or "").strip()
        input_field = (item.get("input") or "").strip()
        output = (item.get("output") or "").strip()

        if input_field:
            # Report records that carry a non-empty "input" and fold it into the instruction.
            print(f"[Non-empty input] ID: {item.get('id')}, input: {input_field}")
            full_instruction = f"{instruction}\n\n{input_field}"
        else:
            full_instruction = instruction

        # Store as a single-turn ["question", "answer"] pair.
        output_data.append([full_instruction, output])

# Shallow copy kept for backward compatibility with the original cell;
# each element is ["question", "answer"].
formatted_data = list(output_data)

# Write next to the input as "train_<name>.json". splitext replaces the old
# `[:-1]` trick, which only worked when the input ended in ".jsonl".
output_path = os.path.join(
    os.path.dirname(input_file),
    "train_" + os.path.splitext(os.path.basename(input_file))[0] + ".json",
)
with open(output_path, "w", encoding="utf-8") as out_f:
    json.dump({"SFT_data": output_data}, out_f, ensure_ascii=False, indent=2)

# Report the file that was actually written (the old message named a file that is never created).
print(f"\n✅ Đã xử lý {len(output_data)} mẫu. Lưu vào '{output_path}'")

### Restructure huatuo data (Paper)

In [None]:
import json
import ijson
import os
import math
import time

# List of data sources. Every name except "SFT_data" is interpolated into the
# file name "HuatuoGPT2_Pretrain_<name>.json" further down, so the "Meidcal"
# spelling must stay exactly as it appears in those file names.
data_sources = [
    "Meidcal_Web_Corpus_en",
    "Meidcal_Web_Corpus_cn",
    "Meidcal_Literature_cn",
    "Meidcal_Literature_en",
    "Meidcal_Encyclopedia_cn",
    "Meidcal_Encyclopedia_en",
    "Meidcal_Books_cn",
    "Meidcal_Books_en",
    "SFT_data"
]

# Final mapping: source name -> list of conversations, each a list of turn strings.
final_dict = {}

# Function to count total conversations in a JSON file
def count_conversations(file_path):
    """Return how many top-level array elements the JSON file contains.

    The file is streamed with ijson instead of being loaded whole. Any
    error (missing file, malformed JSON, ...) is printed and reported as 0.
    """
    total = 0
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            for _ in ijson.items(handle, 'item'):
                total += 1
    except Exception as e:
        print(f"Error counting conversations in {file_path}: {e}")
        return 0
    return total

# Function to process a chunk of conversations (first 1%)
def process_json_chunk(file_path, num_to_include):
    """Return the turn texts of the first ``num_to_include`` conversations.

    Each top-level array element of the JSON file must have a
    "conversations" list whose entries carry a "value" string; one list of
    those strings is produced per conversation. On any error the problem is
    printed and an empty list is returned (partial results are discarded).
    """
    extracted = []
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            remaining = num_to_include
            for conv in ijson.items(handle, 'item'):
                if remaining <= 0:  # quota reached
                    break
                extracted.append([turn["value"] for turn in conv["conversations"]])
                remaining -= 1
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return []
    return extracted

# Process each data source
# Fraction of every source to keep: .01/100 is 0.01 % (not 1 %).
used_percent = .01/100

for source in data_sources:
    if source == "SFT_data":
        file_path = os.path.join("/mnt/c/Users/HOME/Downloads", "HuatuoGPT2-GPT4-SFT-140K.json")  # Adjust path as needed
    else:
        file_path = os.path.join("/mnt/c/Users/HOME/Downloads", f"HuatuoGPT2_Pretrain_{source}.json")  # Adjust path as needed

    if not os.path.exists(file_path):
        # Previously skipped silently; say so, so a missing download is noticed.
        print(f"Skipping {source}: {file_path} not found")
        continue

    # First pass counts the conversations so the quota can be derived.
    total_conversations = count_conversations(file_path)
    # ceil() guarantees at least one conversation from any non-empty source.
    num_to_include = math.ceil(total_conversations * used_percent) if total_conversations > 0 else 0

    # Second pass keeps only the leading `num_to_include` conversations.
    if num_to_include > 0:
        final_dict[source] = process_json_chunk(file_path, num_to_include)
    else:
        final_dict[source] = []
        print(f"No conversations found in {file_path}")

# Save the combined JSON. NOTE(review): the file name embeds str(used_percent*100),
# a float repr; a later cell reads 'train_qa_0.01p.json', so keep the two in sync.
output_path = '/mnt/c/Users/HOME/Downloads/HuatuoGPT-II/adaption/one_stage_training/train_qa_' + str(used_percent*100) + 'p.json'
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(final_dict, f, ensure_ascii=False, indent=4)

# Report the real destination (the old message named "combined.json").
print(f"Processing complete. Output saved to {output_path}")

### Cn to En

In [None]:
import json
from argostranslate import package, translate

# === 1. Install the zh -> en translation package ===
# NOTE(review): this downloads and installs on every run; consider skipping
# the step when the package is already installed.
package.update_package_index()
available_packages = package.get_available_packages()
zh_en_package = next(
    (p for p in available_packages if p.from_code == "zh" and p.to_code == "en"),
    None,
)
if zh_en_package is None:
    # Clear error instead of the bare StopIteration the old next() raised.
    raise RuntimeError("No zh -> en package found in the Argos Translate package index")
package.install_from_path(zh_en_package.download())

# === 2. Read the source JSON ===
input_path = '/mnt/c/Users/HOME/Downloads/HuatuoGPT-II/adaption/one_stage_training/train_qa_0.01p.json'
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

print('loaded')

# === 3. Translate every turn of every conversation ===
translated_data = {}

for key, qa_list in data.items():
    # Conversations may hold more than two turns (the restructure step emits
    # one entry per turn), so translate turn by turn rather than unpacking a
    # fixed (question, answer) pair.
    translated_pairs = [
        [translate.translate(turn, "zh", "en") for turn in turns]
        for turns in qa_list
    ]
    # NOTE(review): keys are renamed "_en" -> "_vi" and the output file gets a
    # "_vi" suffix although the target language here is English. Left as is
    # because downstream code may rely on these names — confirm.
    translated_key = key.replace("_en", "_vi")
    translated_data[translated_key] = translated_pairs
    print(key)

# === 4. Write the translated JSON ===
output_path = input_path[:-5] + '_vi.json'
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(translated_data, f, ensure_ascii=False, indent=2)

# Report the file that was actually written (the old message named "output_vi.json").
print(f"✅ Đã dịch và lưu vào {output_path}")
