In [5]:
# Preparing dataset
from gensim.corpora import WikiCorpus
from tqdm import tqdm

# Path to the downloaded Wikipedia dump
wiki_dump_path = 'idwiki-latest-pages-articles.xml.bz2'

# Create a WikiCorpus object
wiki_corpus = WikiCorpus(wiki_dump_path, dictionary={})

# Process the dump with a progress bar
sentences = []
for text in tqdm(wiki_corpus.get_texts(), desc="Processing Wikipedia Dump"):
    sentences.append(text)


Processing Wikipedia Dump: 500903it [09:11, 908.62it/s] 


In [8]:
# Phrase Detection
from gensim.models.phrases import Phrases, Phraser

# Detect bigrams
print("Detecting bigrams...")
bigram_model = Phrases(sentences, min_count=10, threshold=10)
bigram_phraser = Phraser(bigram_model)

# Apply bigram model to the sentences
print("Applying bigram model to sentences...")
bigram_sentences = [bigram_phraser[sentence] for sentence in tqdm(sentences, desc="Bigram Processing")]

# Detect trigrams
print("Detecting trigrams...")
trigram_model = Phrases(bigram_sentences, min_count=10, threshold=10)
trigram_phraser = Phraser(trigram_model)

# Apply trigram model
print("Applying trigram model to sentences...")
trigram_sentences = [trigram_phraser[bigram_sentence] for bigram_sentence in tqdm(bigram_sentences, desc="Trigram Processing")]


Detecting bigrams...
Applying bigram model to sentences...


Bigram Processing: 100%|██████████| 500903/500903 [09:35<00:00, 870.52it/s]  


Detecting trigrams...
Applying trigram model to sentences...


Trigram Processing: 100%|██████████| 500903/500903 [09:56<00:00, 839.68it/s]  


In [9]:
print(trigram_sentences[:2])  

[['berkas', 'dna', 'structure', 'key', 'labelled', 'pn', 'nobb', 'png_jmpl_ka_px', 'struktur_heliks', 'ganda', 'dna', 'atom_atom', 'pada', 'struktur', 'tersebut', 'diwarnai', 'sesuai_dengan', 'unsur', 'kimianya', 'dan', 'struktur', 'detail', 'dua', 'pasangan_basa', 'ditunjukkan', 'oleh', 'gambar', 'kanan', 'bawah', 'gambaran', 'tiga_dimensi', 'dna', 'asam', 'lebih', 'dikenal', 'dengan', 'singkatan', 'dna', 'bahasa_inggris', 'eoxyribo', 'ucleic', 'cid', 'adalah', 'salah_satu_jenis', 'asam_nukleat', 'yang', 'memiliki_kemampuan', 'pewarisan_sifat', 'keberadaan', 'asam', 'ditemukan', 'di', 'dalam', 'nukleoprotein', 'yang', 'membentuk_inti', 'sel', 'james', 'dewey', 'watson', 'dan', 'francis_crick', 'merupakan', 'ilmuwan', 'pertama', 'yang', 'mengajukan', 'model', 'struktur', 'dna', 'pada', 'tahun', 'dengan', 'bentuk', 'pilinan', 'ganda', 'setiap', 'dna', 'tersusun', 'dari', 'dua_buah', 'rantai', 'polinukleotida', 'dna', 'merupakan', 'sejenis', 'biomolekul', 'yang', 'menyimpan', 'dan', 'men

In [10]:
#Step 3: Training the Word2Vec Model

from gensim.models import Word2Vec
import logging

# Enable logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Initialize Word2Vec model
model = Word2Vec(vector_size=200, window=5, min_count=5, workers=4)

# Build the vocabulary
model.build_vocab(trigram_sentences)

# Train the model
model.train(trigram_sentences, total_examples=model.corpus_count, epochs=model.epochs)

# Save the model
model.save("w2v_wiki_own_phrase_training_200.model")




2023-12-08 11:30:04,700 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=200, alpha=0.025>', 'datetime': '2023-12-08T11:30:04.700103', 'gensim': '4.3.0', 'python': '3.11.4 (main, Jul  5 2023, 08:40:20) [Clang 14.0.6 ]', 'platform': 'macOS-14.1.2-arm64-arm-64bit', 'event': 'created'}
2023-12-08 11:30:04,704 : INFO : collecting all words and their counts
2023-12-08 11:30:04,705 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-12-08 11:30:08,744 : INFO : PROGRESS: at sentence #10000, processed 9009663 words, keeping 465179 word types
2023-12-08 11:30:11,435 : INFO : PROGRESS: at sentence #20000, processed 14424113 words, keeping 621765 word types
2023-12-08 11:30:13,996 : INFO : PROGRESS: at sentence #30000, processed 18418817 words, keeping 727800 word types
2023-12-08 11:30:16,009 : INFO : PROGRESS: at sentence #40000, processed 22057741 words, keeping 812319 word types
2023-12-08 11:30:18,327 : INFO : PROGRESS: at sentence #5000