In [3]:
import os
import json
from glob import glob

# Build one JSON record per caption file found under SOURCE_DIR.
# Every immediate sub-directory of SOURCE_DIR is treated as one class, and
# each *.txt file inside it contributes one record.
SOURCE_DIR = r'd:\DATA CACA\00. College\Skripsi\ta_nmt\text_c10_translated_gt'
OUTPUT_FILE = './translated_dataset.json'

# Class folders in sorted order; a folder's position in this list is its class_id.
class_folders = sorted(
    folder for folder in os.listdir(SOURCE_DIR)
    if os.path.isdir(os.path.join(SOURCE_DIR, folder))
)

all_data = []

for class_id, class_folder in enumerate(class_folders):
    folder_path = os.path.join(SOURCE_DIR, class_folder)
    txt_files = sorted(glob(os.path.join(folder_path, '*.txt')))

    for txt_path in txt_files:
        try:
            with open(txt_path, 'r', encoding='utf-8') as f:
                # Keep non-blank lines only, with surrounding whitespace removed.
                lines = [text for text in (line.strip() for line in f) if text]
        except (OSError, UnicodeError) as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"❌ Error reading {txt_path}: {e}")
            continue

        # Swap only the trailing extension. str.replace('.txt', '.jpg') would
        # also rewrite a '.txt' occurring earlier in the name and would leave
        # an upper-case '.TXT' extension untouched.
        stem = os.path.splitext(os.path.basename(txt_path))[0]

        all_data.append({
            "filename": stem + '.jpg',
            "class_name": class_folder,
            "class_id": class_id,
            "captions": lines
        })

# Write all records as a single JSON array.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(all_data, f, ensure_ascii=False, indent=2)

print(f"✅ Selesai! Total data: {len(all_data)}")
print(f"📦 File disimpan di: {OUTPUT_FILE}")


✅ Selesai! Total data: 11788
📦 File disimpan di: ./translated_dataset.json


In [2]:
import os
# Show the directory the kernel is currently running in.
working_dir = os.getcwd()
print("Current working directory:", working_dir)


Current working directory: d:\DATA CACA\00. College\Skripsi\ta_nmt\notebooks


In [None]:
# Merge (gabung) all class_*.json files into a single dataset file
import os
import json
from glob import glob

# Directory holding the per-class JSON files, and the path of the merged output.
SOURCE_DIR = r'd:\DATA CACA\00. College\Skripsi\ta_nmt\paralel_disempurnakan'
OUTPUT_FILE = './merged_dataset.json'

# Every class_*.json directly under SOURCE_DIR, in sorted path order.
json_paths = glob(os.path.join(SOURCE_DIR, 'class_*.json'))
json_paths.sort()

all_data = []

for json_path in json_paths:
    try:
        with open(json_path, 'r', encoding='utf-8') as fh:
            payload = json.load(fh)

        # A file lacking the "dataset" key is reported and contributes nothing.
        if "dataset" not in payload:
            print(f"⚠️  File tanpa key 'dataset': {json_path}")
            continue

        all_data.extend(payload["dataset"])

    except Exception as err:
        # Any failure on one file is reported; the remaining files are still processed.
        print(f"❌ Gagal memproses {json_path}: {err}")

# Wrap the combined records under a "dataset" key, mirroring the input files.
output = {"dataset": all_data}

with open(OUTPUT_FILE, 'w', encoding='utf-8') as fh:
    json.dump(output, fh, ensure_ascii=False, indent=2)

print(f"✅ Gabungan selesai. Total item: {len(all_data)}")
print(f"📁 Disimpan di: {OUTPUT_FILE}")


✅ Gabungan selesai. Total item: 11788
📁 Disimpan di: ./merged_dataset.json


In [1]:
# mengambil hanya caption bahasa Indonesia dari dataset gabungan
import json

# === Path configuration ===
# (An earlier version of this cell read the notebook's own merged_dataset.json.)
INPUT_FILE = r'd:\DATA CACA\00. College\Skripsi\ta_nmt\data\paralel_cub_200_2011_captions.json'
OUTPUT_FILE = './paralel_cub_200_2011_indo_only.json'

# Class names, one per line; the list index is used below as the class_id.
# NOTE(review): 'classes.txt' is resolved against the current working
# directory and must already exist. A later cell in this notebook writes
# ./classes.txt, so run that one first on a fresh checkout.
with open('classes.txt', 'r', encoding='utf-8') as fh:
    class_names = list(map(str.strip, fh))

# Load the source JSON document.
with open(INPUT_FILE, 'r', encoding='utf-8') as fh:
    source = json.load(fh)

result = []

for entry in source.get("dataset", []):
    # Keep only the "indo" text of each caption; captions that are not dicts
    # or that have no "indo" key are skipped.
    indo_captions = []
    for caption in entry.get("captions", []):
        if isinstance(caption, dict) and "indo" in caption:
            indo_captions.append(caption["indo"])

    record = {
        "filename": entry["filename"],
        "class_id": entry["class_id"],
        "class_name": class_names[entry["class_id"]],
        "captions": indo_captions
    }
    result.append(record)

print(f"✅ Total data: {len(result)}")

# Write the filtered records as a JSON array.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as fh:
    json.dump(result, fh, ensure_ascii=False, indent=2)

print(f"📁 File disimpan ke: {OUTPUT_FILE}")


✅ Total data: 11788
📁 File disimpan ke: ./paralel_cub_200_2011_indo_only.json


In [18]:
import json

# Path to the JSON file whose "class" list supplies the class names.
CLASS_FILE = r'd:\DATA CACA\00. College\Skripsi\ta_nmt\data\paralel_cub_200_2011_captions.json'
OUTPUT_TXT = './classes.txt'          # text file the names are written to, one per line

# Load the JSON document.
with open(CLASS_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Place every class name at the list index given by its "id".
class_names = [None] * len(data["class"])
for entry in data["class"]:
    class_names[entry["id"]] = entry["class"]

# An id that never appears leaves a None slot. Fail here, before OUTPUT_TXT is
# opened for writing, so an existing classes.txt is not truncated and then
# left half-written.
missing_ids = [idx for idx, name in enumerate(class_names) if name is None]
if missing_ids:
    raise ValueError(f"No class name found for id(s): {missing_ids}")

# Preview at most five entries; slicing keeps this safe when the file holds
# fewer than five classes.
print("✅ Contoh class_names[0:5]:")
for i, name in enumerate(class_names[:5]):
    print(f"{i:03d} → {name}")

# Save the names, one per line, in id order.
with open(OUTPUT_TXT, 'w', encoding='utf-8') as f:
    for name in class_names:
        f.write(name + '\n')

print(f"\n📁 Disimpan ke: {OUTPUT_TXT}")


✅ Contoh class_names[0:5]:
000 → 001.Black_footed_Albatross
001 → 002.Laysan_Albatross
002 → 003.Sooty_Albatross
003 → 004.Groove_billed_Ani
004 → 005.Crested_Auklet

📁 Disimpan ke: ./classes.txt


In [20]:
import json

import os  # needed for the temp-file swap below; this cell's import block only has json

# Input and output are the same path, so the file is rewritten in place.
INPUT_FILE = './indo_captions_only.json'
OUTPUT_FILE = './indo_captions_only.json'

with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Rebuild every item with a fixed key order. A key missing from an item comes
# out as None ("captions" as []); keys other than these four are not carried over.
reordered_data = [
    {
        "filename": item.get("filename"),
        "class_name": item.get("class_name"),
        "class_id": item.get("class_id"),
        "captions": item.get("captions", [])
    }
    for item in data
]

# Opening OUTPUT_FILE with 'w' would truncate the only copy of the data before
# serialization finishes. Write to a sibling temp file instead and swap it in
# only after the new content has been fully written.
tmp_file = OUTPUT_FILE + '.tmp'
try:
    with open(tmp_file, 'w', encoding='utf-8') as f:
        json.dump(reordered_data, f, ensure_ascii=False, indent=2)
    os.replace(tmp_file, OUTPUT_FILE)
finally:
    # After a successful swap the temp file no longer exists; on failure,
    # remove the partial temp file and leave the original untouched.
    if os.path.exists(tmp_file):
        os.remove(tmp_file)

print(f"✅ Data berhasil disusun ulang dan disimpan ke: {OUTPUT_FILE}")


✅ Data berhasil disusun ulang dan disimpan ke: ./indo_captions_only.json
