In [5]:
# Nama file sumber
file1 = "bird_glossary_terms.txt"
file2 = "glossary_terms.txt"

# Baca semua baris dari kedua file
with open(file1, "r", encoding="utf-8") as f1:
    terms1 = [line.strip() for line in f1 if line.strip()]

with open(file2, "r", encoding="utf-8") as f2:
    terms2 = [line.strip() for line in f2 if line.strip()]

# Gabungkan dan hapus duplikat
combined_terms = sorted(set(terms1 + terms2))

# Simpan ke file baru
with open("glossary_combined.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(combined_terms))

print(f"✅ Jumlah istilah unik: {len(combined_terms)}")
print("✅ File berhasil disimpan sebagai 'glossary_combined.txt'")


✅ Jumlah istilah unik: 1932
✅ File berhasil disimpan sebagai 'glossary_combined.txt'


In [2]:
import json

# Buka hasil glosarium gabungan
with open("glossary_combined.txt", "r", encoding="utf-8") as f:
    terms = [line.strip().lower() for line in f if line.strip()]

# Kata kunci morfologi umum
morph_keywords = [
    "bill", "beak", "tail", "feather", "plumage", "crown", "nape", "mantle", "scapular", "wing",
    "leg", "toe", "foot", "throat", "abdomen", "breast", "eye", "iris", "eyering", "flank",
    "auricular", "coverts", "rump", "vent", "thigh", "neck", "tarsus", "mandible", "nostril", "supercilium"
]

# Ambil hanya istilah morfologi berdasarkan kata kunci
morph_terms = [term for term in terms if any(keyword in term for keyword in morph_keywords)]

# Simpan ke JSON
with open("glossary_morphology_filtered.json", "w", encoding="utf-8") as f:
    json.dump(morph_terms, f, indent=4, ensure_ascii=False)

print(f"✅ Istilah morfologi terpilih: {len(morph_terms)}")
print(morph_terms)

✅ Istilah morfologi terpilih: 129
['convention on international trade in endangered species (cites)', 'coverts', 'crown', 'crown-stripe', 'ear-coverts', 'eclipse plumage', 'eye-ring', 'eye-stripe', 'flank', 'flight feathers', 'foreneck', 'house finch eye disease', 'iris', 'mandible', 'mantle', 'nape', 'necklace', 'nuptial plumage', 'plumage', 'rump', 'scapulars', 'shoebill', 'supercilium', 'tail streamers', 'tarsus', 'underwing', 'vent', 'ventral', 'wing-bar', 'wing-coverts', 'wing-span', 'winter plumage', 'afterfeather', 'alternate plumage', 'atrioventricular valve', 'auricular feathers', 'basic plumage', 'beak', 'bend of the wing', 'bill', 'bill tip organ', 'bill-wiping', 'breast', 'breeding plumage', 'broadbills', 'contour feathers', 'coverts', 'crown', 'crown group', 'definitive plumages', 'delayed plumage maturation', 'down feathers', 'eclipse plumage', 'elliptical wings', 'emarginate tail', 'eventual variety', 'extinction events', 'eye ring', 'eyeball', 'eyebrow stripe', 'eyeline

In [3]:
captions_wrong = [
    "burung ini memiliki tagihan tebal dan ekor datar",
    "warna bulunya coklat dengan garis superciliary tipis",
    "ia memiliki bulu atas yang kontras dan dagu pucat"
]


In [19]:
from difflib import get_close_matches

# Daftar caption salah (contoh)
captions_wrong = [
    "burung kecil memiliki perut emas, paruh putih pendek dan lubang putih.",
    "warna bulunya coklat dengan garis superciliary tipis",
    "burung kecil ini memiliki bulu halus, batang kuning dan tubuh putih, dengan kaki hitam kurus."
]

# Glosarium morfologi (bisa dimuat dari JSON jika sudah disimpan)
glossary_morph = [
    "abdomen", "auriculars", "beak", "bill", "breast", "crown", "coverts", "eye", "eyering",
    "feather", "flank", "foot", "iris", "leg", "mandible", "mantle", "nape", "neck", "nostril",
    "plumage", "rump", "scapulars", "supercilium", "tail", "tarsus", "thigh", "throat", "toe",
    "vent", "wing"
]

# Tokenisasi semua kata dari caption salah
tokens = set(" ".join(captions_wrong).lower().split())

# Cari istilah yang tidak ada di glosarium (kemungkinan terjemahan literal/salah)
non_morph_terms = []
for word in tokens:
    match = get_close_matches(word, glossary_morph, n=1, cutoff=0.85)
    if not match:
        non_morph_terms.append(word)

print("🧠 Istilah non-ornitologis terdeteksi (perlu dicek/koreksi):")
print(non_morph_terms)


🧠 Istilah non-ornitologis terdeteksi (perlu dicek/koreksi):
['putih.', 'ini', 'dan', 'coklat', 'lubang', 'bulunya', 'memiliki', 'warna', 'kaki', 'superciliary', 'putih,', 'bulu', 'kuning', 'tipis', 'perut', 'burung', 'kurus.', 'paruh', 'garis', 'halus,', 'putih', 'emas,', 'dengan', 'hitam', 'kecil', 'pendek', 'batang', 'tubuh']


In [None]:
# contoh salah -> benar
correction_map = {
    "tagihan": "paruh",
    "payudara": "dada",
    "tambalan": "bercak",
    "datar": "pipih",
    "garis superciliary": "garis alis",
    "persegi panjang": "",
}

corrected_captions = []
for caption in captions_wrong:
    corrected = caption
    for wrong_term, correct_term in correction_map.items():
        if wrong_term in corrected:
            corrected = corrected.replace(wrong_term, correct_term)
    corrected_captions.append((caption, corrected))

import pandas as pd
df = pd.DataFrame(corrected_captions, columns=["Sebelum Koreksi", "Setelah Koreksi"])
print(df.to_markdown(index=False))


| Sebelum Koreksi                                                                               | Setelah Koreksi                                                                                     |
|:----------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------|
| burung kecil memiliki perut emas, paruh putih pendek dan lubang putih.                        | burung kecil memiliki perut emas, paruh putih pendek dan lubang putih.                              |
| warna bulunya coklat dengan garis superciliary tipis                                          | warna bulunya coklat dengan garis alis tipis                                                        |
| burung kecil ini memiliki bulu halus, batang kuning dan tubuh putih, dengan kaki hitam kurus. | burung kecil ini memiliki bulu halus, batang tubuh kuning dan tubuh putih, dengan kaki hitam kurus. |
