In [1]:
import json

# Membuka file JSON
filename = "paralel_cub_200_2011_captions.json"

with open(filename, "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Mengumpulkan caption dalam dua bahasa
english_captions = []
indo_captions = []

for entry in json_data['dataset']:
    if "captions" in entry:
        for caption in entry["captions"]:
            if "english" in caption and isinstance(caption["english"], str):
                english_captions.append(caption["english"])
            if "indo" in caption and isinstance(caption["indo"], str):
                indo_captions.append(caption["indo"])

# Debugging: Menampilkan beberapa contoh caption
print("Contoh Caption English:", english_captions[:5])
print("Contoh Caption Indo:", indo_captions[:5])


Contoh Caption English: ['the medium sized bird has a dark grey color, a black downward curved beak, and long wings.', 'the bird is dark grey brown with a thick curved bill and a flat shaped tail.', 'bird has brown body feathers, white breast feathers and black beak', 'this bird has a dark brown overall body color, with a small white patch around the base of the bill.', 'the bird has very long and large brown wings, as well as a black body and a long black beak.']
Contoh Caption Indo: ['burung berukuran sedang memiliki warna abu-abu gelap, paruh melengkung hitam, dan sayap panjang.', 'burung berwarna coklat abu-abu gelap dengan tagihan melengkung tebal dan ekor berbentuk datar.', 'burung ini memiliki warna tubuh keseluruhan coklat gelap, dengan bercak putih kecil di sekitar pangkal tagihan.', 'burung ini memiliki warna tubuh keseluruhan coklat gelap, dengan bercak putih kecil di sekitar pangkal tagihan.', 'burung itu memiliki sayap cokelat yang sangat panjang dan besar, serta tubuh hit

In [2]:
import re
from collections import Counter

# Fungsi membersihkan teks dari karakter non-alfabet
def clean_text(text):
    return re.sub(r"[^a-zA-Z\s]", "", text).lower()

# Membersihkan dan tokenisasi (memecah menjadi kata-kata)
english_words = []
indo_words = []

for text in english_captions:
    english_words.extend(clean_text(text).split())

for text in indo_captions:
    indo_words.extend(clean_text(text).split())

# Menghitung frekuensi kata
english_word_counts = Counter(english_words)
indo_word_counts = Counter(indo_words)

# Tampilkan 20 kata yang paling sering muncul
print("\n20 Kata Paling Umum dalam English Captions:")
for word, freq in english_word_counts.most_common(20):
    print(f"{word}: {freq}")

print("\n20 Kata Paling Umum dalam Indo Captions:")
for word, freq in indo_word_counts.most_common(20):
    print(f"{word}: {freq}")



20 Kata Paling Umum dalam English Captions:
a: 178326
and: 162860
bird: 115431
with: 80503
has: 79738
black: 77652
this: 74266
white: 65086
brown: 42196
is: 41179
beak: 36880
belly: 34114
small: 33090
wings: 31192
yellow: 30684
the: 30174
bill: 29021
long: 21664
grey: 20271
breast: 20115

20 Kata Paling Umum dalam Indo Captions:
dan: 166886
burung: 115978
memiliki: 80548
dengan: 80431
hitam: 77380
ini: 75874
putih: 63974
yang: 42183
paruh: 36564
kecil: 36562
perut: 35995
sayap: 34879
abuabu: 33355
kuning: 30768
berwarna: 29632
tagihan: 28157
cokelat: 27107
panjang: 22076
payudara: 19577
mahkota: 19199


In [3]:
import nltk
from nltk.corpus import stopwords
import json
import re
from collections import Counter

# Download stopwords NLTK (hanya perlu dilakukan sekali)
nltk.download("stopwords")

# Load stopwords bahasa Inggris dan Indonesia
stopwords_english = set(stopwords.words("english"))
stopwords_indo = set(stopwords.words("indonesian"))  # Bahasa Indonesia tersedia di NLTK


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cacai\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
# Fungsi membersihkan teks
def clean_text(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Hanya huruf dan spasi
    return text.lower()

# Tokenisasi + filtering stopwords
def filter_stopwords(text_list, stopword_set):
    words = [clean_text(text).split() for text in text_list]  # Tokenisasi
    filtered_words = [word for sublist in words for word in sublist if word not in stopword_set]  # Hapus stopwords
    return filtered_words

# Proses filtering stopwords
filtered_english_words = filter_stopwords(english_captions, stopwords_english)
filtered_indo_words = filter_stopwords(indo_captions, stopwords_indo)

# Menghitung frekuensi kata setelah stopwords dihapus
filtered_english_word_counts = Counter(filtered_english_words)
filtered_indo_word_counts = Counter(filtered_indo_words)

# Menampilkan hasil setelah filtering
print("\n50 Kata Paling Umum dalam English Captions (Tanpa Stopwords):")
for word, freq in filtered_english_word_counts.most_common(50):
    print(f"{word}: {freq}")

print("\n50 Kata Paling Umum dalam Indo Captions (Tanpa Stopwords):")
for word, freq in filtered_indo_word_counts.most_common(50):
    print(f"{word}: {freq}")



50 Kata Paling Umum dalam English Captions (Tanpa Stopwords):
bird: 115431
black: 77652
white: 65086
brown: 42196
beak: 36880
belly: 34114
small: 33090
wings: 31192
yellow: 30684
bill: 29021
long: 21664
grey: 20271
breast: 20115
crown: 19960
head: 18841
short: 13910
orange: 13593
gray: 13376
body: 13328
red: 12946
feathers: 10762
throat: 9887
blue: 9353
large: 9107
tail: 8460
pointed: 8354
color: 7574
pointy: 7409
back: 7232
feet: 6993
dark: 6967
green: 6430
light: 5980
eyes: 5757
eye: 5524
bright: 5151
neck: 5048
medium: 4786
wing: 4541
nape: 3991
colored: 3806
sized: 3485
tarsus: 3460
wingbars: 3354
chest: 3269
patch: 3221
secondaries: 3180
curved: 3152
sharp: 3144
particular: 3018

50 Kata Paling Umum dalam Indo Captions (Tanpa Stopwords):
burung: 115978
memiliki: 80548
hitam: 77380
putih: 63974
paruh: 36564
perut: 35995
sayap: 34879
abuabu: 33355
kuning: 30768
berwarna: 29632
tagihan: 28157
cokelat: 27107
payudara: 19577
mahkota: 19199
coklat: 16982
kepala: 16191
runcing: 14838
pe