In [1]:
import json
from typing import List, Dict, Optional

In [8]:
def load_data(file_path: str) -> List[Dict]:
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file. The annotation
        says ``List[Dict]``; the function itself does not validate the shape.
    """
    with open(file_path, mode='r', encoding='utf-8') as json_file:
        parsed = json.load(json_file)
    return parsed

def save_to_json(data, filename: str):
    """Write a Python object (list or dict) to a pretty-printed JSON file.

    Args:
        data: JSON-serializable object to write.
        filename: Destination path. It is opened in ``'w'`` mode, so an
            existing file is overwritten.

    The file is UTF-8 encoded, non-ASCII characters are written as-is
    (``ensure_ascii=False``) and nested levels are indented by two spaces.
    """
    dump_options = {'ensure_ascii': False, 'indent': 2}
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, **dump_options)


def extract_labels(sample: Dict, target_modality: Optional[str] = None) -> Dict:
    """Extract the chosen label(s) from the first annotation of a sample.

    Only ``sample['annotations'][0]`` is inspected. Each entry of its
    ``'result'`` list is expected to carry a ``'from_name'`` and, for the
    entries that are actually extracted, a non-empty ``value['choices']`` list
    whose first element is taken as the label.

    Args:
        sample: One exported task (a dict with an ``'annotations'`` list).
        target_modality: ``from_name`` of the entry to extract. When ``None``,
            every entry is extracted.

    Returns:
        ``{from_name: label, ...}`` for every entry when ``target_modality`` is
        ``None``; otherwise ``{'label': label}`` for the matching entry. The
        dict is empty when nothing matches, when ``'result'`` is empty, or when
        the sample has no annotations at all. If several entries share a key,
        the last one wins.

    Raises:
        KeyError: if ``'annotations'``, ``'result'`` or ``'from_name'`` is
            missing, or a *selected* entry has no ``value``/``choices``.
        IndexError: if a *selected* entry has an empty ``choices`` list.
    """
    annotations = sample['annotations']
    if not annotations:
        # An un-annotated sample is treated like one with an empty result list.
        return {}

    labels = {}
    for entry in annotations[0]['result']:
        from_name = entry['from_name']

        # Filter before touching `value`, so entries that belong to another
        # control (and may not hold `choices` at all) cannot break a targeted
        # extraction.
        if target_modality is not None and from_name != target_modality:
            continue

        key = from_name if target_modality is None else 'label'
        labels[key] = entry['value']['choices'][0]
    return labels


def get_all_modality_label(exported_file_path: str) -> List[Dict]:
    """Load an export file and pair every sample's data with all of its labels.

    Args:
        exported_file_path: Path of the exported JSON file, read via
            ``load_data``.

    Returns:
        One dict per sample: the fields of ``sample['data']`` merged with the
        result of ``extract_labels(sample)``. On a key clash the label value
        overrides the data field.
    """
    records = []
    for sample in load_data(exported_file_path):
        record = {**sample['data']}
        record.update(extract_labels(sample))
        records.append(record)
    return records


def get_text_modality_label(exported_file_path: str) -> List[Dict]:
    """Load an export file and pair each caption with its text-modality label.

    Args:
        exported_file_path: Path of the exported JSON file, read via
            ``load_data``.

    Returns:
        One dict per sample with ``'caption'`` (from ``sample['data']``) plus
        whatever ``extract_labels`` returns for ``'text_modality'``.
    """
    records = []
    for sample in load_data(exported_file_path):
        record = {'caption': sample['data']['caption']}
        record.update(extract_labels(sample, target_modality='text_modality'))
        records.append(record)
    return records


def get_image_modality_label(exported_file_path: str) -> List[Dict]:
    """Load an export file and pair each image with its image-modality label.

    Args:
        exported_file_path: Path of the exported JSON file, read via
            ``load_data``.

    Returns:
        One dict per sample with ``'image_url'`` (taken from
        ``sample['data']['image']``) plus whatever ``extract_labels`` returns
        for ``'image_modality'``.
    """
    records = []
    for sample in load_data(exported_file_path):
        record = {'image_url': sample['data']['image']}
        record.update(extract_labels(sample, target_modality='image_modality'))
        records.append(record)
    return records

def get_multi_modality_label(exported_file_path: str) -> List[Dict]:
    """Load an export file and pair caption + image with the multi-modality label.

    Args:
        exported_file_path: Path of the exported JSON file, read via
            ``load_data``.

    Returns:
        One dict per sample with ``'caption'`` and ``'image_url'`` (taken from
        ``sample['data']['caption']`` and ``sample['data']['image']``) plus
        whatever ``extract_labels`` returns for ``'multi_modality'``.
    """
    records = []
    for sample in load_data(exported_file_path):
        sample_data = sample['data']
        record = {
            'caption': sample_data['caption'],
            'image_url': sample_data['image'],
        }
        record.update(extract_labels(sample, target_modality='multi_modality'))
        records.append(record)
    return records



In [22]:
# Exported round-2 annotation file that the label-extraction cells read.
# NOTE(review): absolute, machine-specific Windows path — adjust it before
# running this notebook on another machine.
file_path = r'D:\Git_repo\ViSoMMSD\data\interim\round_2\export\round_2.json'

In [23]:
# Build the four label views from the same export file; each call re-reads it.
# NOTE(review): `all` shadows the builtin all() for the rest of the kernel
# session. The name is kept here because later cells reference it.
all = get_all_modality_label(exported_file_path=file_path)
img = get_image_modality_label(exported_file_path=file_path)
text = get_text_modality_label(exported_file_path=file_path)
multi = get_multi_modality_label(exported_file_path=file_path)

In [26]:
# Sanity check: number of records in each view, one count per line
# (order: all, img, text, multi).
print(len(all), len(img), len(text), len(multi), sep='\n')

200
200
200
200


In [33]:
# Directory that receives the round-2 label files written by the save cell.
# NOTE(review): absolute, machine-specific Windows path; presumably the
# directory must already exist, since save_to_json only opens the file — confirm.
output_path = r'D:\Git_repo\ViSoMMSD\data\interim\round_2\label'

In [34]:
# Persist every label view as its own JSON file inside output_path.
# File names are appended with a Windows path separator.
save_to_json(data=all, filename=output_path + r'\all.json')
save_to_json(data=text, filename=output_path + r'\text.json')
save_to_json(data=img, filename=output_path + r'\img.json')
save_to_json(data=multi, filename=output_path + r'\multi.json')