In [1]:
import fasttext
import os
import json
import re
from tqdm import tqdm

from nltk.tokenize import word_tokenize
from gensim.models.phrases import Phrases, Phraser

In [2]:
# PreProcess data
def preprocess_text(text):
    """Normalize raw text into one line of space-separated tokens.

    The input is lowercased, each run of non-word characters is collapsed
    into a single space, the result is tokenized with NLTK's
    ``word_tokenize`` and the tokens are joined back with single spaces.
    """
    # Lowercase first, then squash every run of non-word characters to a space
    normalized = re.sub(r'\W+', ' ', text.lower())
    # Tokenize and re-join so tokens are separated by exactly one space
    return ' '.join(word_tokenize(normalized))

def process_wiki_files(input_dir, output_file):
    """Preprocess every article found under ``input_dir`` into one text file.

    ``input_dir`` is walked recursively. Each file is read as UTF-8 with one
    JSON object per line; the object's ``'text'`` field is passed through
    ``preprocess_text`` and written as a single line of ``output_file``.

    Directories and files are visited in sorted order so the line order of
    the resulting corpus is the same on every run. Blank input lines are
    skipped instead of aborting the run.

    Args:
        input_dir: Root directory to walk.
        output_file: Path of the UTF-8 text file to write (opened with
            mode ``'w'``, so existing content is replaced).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a JSON object has no ``'text'`` key.
    """
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for root, dirs, files in os.walk(input_dir):
            # os.walk yields entries in arbitrary order; sorting ``dirs`` in
            # place steers the traversal itself, making the output reproducible.
            dirs.sort()
            for file in tqdm(sorted(files), desc="Processing files"):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f_in:
                    for line in tqdm(f_in, desc=f"Processing lines in {file}", leave=False):
                        # A blank line is not a JSON document; json.loads
                        # would raise on it and abort the whole run.
                        if not line.strip():
                            continue
                        article = json.loads(line)
                        text = preprocess_text(article['text'])
                        f_out.write(text + '\n')

# Run the preprocessing: read the files under 'extracted_text' and write the cleaned corpus to 'processed_wiki_id.txt'
process_wiki_files('extracted_text', 'processed_wiki_id.txt')



Processing files: 0it [00:00, ?it/s]
Processing files: 100%|██████████| 35/35 [00:28<00:00,  1.24it/s]
Processing files: 100%|██████████| 100/100 [01:12<00:00,  1.38it/s]
Processing files: 100%|██████████| 100/100 [01:10<00:00,  1.42it/s]
Processing files: 100%|██████████| 100/100 [01:06<00:00,  1.51it/s]
Processing files: 100%|██████████| 100/100 [01:11<00:00,  1.40it/s]
Processing files: 100%|██████████| 100/100 [00:59<00:00,  1.67it/s]
Processing files: 100%|██████████| 100/100 [01:14<00:00,  1.34it/s]
Processing files: 100%|██████████| 100/100 [01:15<00:00,  1.32it/s]
Processing files: 100%|██████████| 100/100 [01:09<00:00,  1.45it/s]
Processing files: 100%|██████████| 100/100 [01:05<00:00,  1.52it/s]


In [3]:
# Train the FastText Model

def train_fasttext_with_phrases(input_file, epochs=5, lr=0.05):
    """Train a 200-dimensional skip-gram fastText model on a phrase-merged corpus.

    The corpus is read from ``input_file`` (one whitespace-tokenized sentence
    per line). Bigram and trigram phrase models are fitted with gensim, the
    corpus is rewritten with detected phrases merged into single tokens, and
    fastText is trained once on the rewritten corpus.

    Args:
        input_file: Path to the UTF-8 corpus file.
        epochs: Number of training epochs passed to fastText. Must be >= 1.
        lr: Learning rate passed to fastText.

    Returns:
        The trained fastText model.

    Raises:
        ValueError: If ``epochs`` is smaller than 1.
    """
    # Fail fast: with no epochs there is nothing to train and no model to return.
    if epochs < 1:
        raise ValueError(f"epochs must be >= 1, got {epochs}")

    # Read the file and build sentences
    with open(input_file, 'r', encoding='utf-8') as f:
        sentences = [line.split() for line in f]

    # Detect and create bigrams/trigrams
    bigram = Phraser(Phrases(sentences, min_count=5, threshold=10))
    trigram = Phraser(Phrases(bigram[sentences], min_count=5, threshold=10))

    temp_file = 'temp_sentences_with_phrases.txt'
    try:
        # Stream the phrase-merged sentences straight to disk instead of
        # materializing a second full copy of the corpus in memory.
        with open(temp_file, 'w', encoding='utf-8') as f:
            for sentence in sentences:
                f.write(' '.join(trigram[bigram[sentence]]) + '\n')

        # The in-memory corpus is no longer needed; release it before training.
        del sentences

        # Train exactly once. Each call to train_unsupervised starts from
        # scratch, so calling it in a loop with epoch=1..epochs would repeat
        # the work and discard every model except the last one.
        model = fasttext.train_unsupervised(
            temp_file, model='skipgram', lr=lr, epoch=epochs, dim=200
        )
    finally:
        # Clean up the temporary file even if phrase application or training fails
        if os.path.exists(temp_file):
            os.remove(temp_file)

    return model


# Train the model on the preprocessed Wikipedia text using the function's
# default hyper-parameters (epochs=5, lr=0.05)
fasttext_model = train_fasttext_with_phrases('processed_wiki_id.txt')

# Save the model to disk so it can be reloaded later with fasttext.load_model
fasttext_model.save_model("fasttext_ina_200_with_phrases.bin")


Read 87M wordsess:   0%|          | 0/5 [00:00<?, ?it/s]
Number of words:  710071
Number of labels: 0
Progress: 100.0% words/sec/thread:    9596 lr:  0.000000 avg.loss:  1.226992 ETA:   0h 0m 0s avg.loss:  1.416471 ETA:   0h20m11s  5.7% words/sec/thread:    9701 lr:  0.047163 avg.loss:  1.404183 ETA:   0h20m10s  6.5% words/sec/thread:    9746 lr:  0.046726 avg.loss:  1.401475 ETA:   0h19m54s 16.2% words/sec/thread:    9100 lr:  0.041886 avg.loss:  1.388726 ETA:   0h19m 6s 17.0% words/sec/thread:    9085 lr:  0.041476 avg.loss:  1.388488 ETA:   0h18m57s   9065 lr:  0.041206 avg.loss:  1.389065 ETA:   0h18m52s ETA:   0h18m 0s 23.1% words/sec/thread:    8929 lr:  0.038464 avg.loss:  1.385370 ETA:   0h17m53s 28.9% words/sec/thread:    8884 lr:  0.035569 avg.loss:  1.363605 ETA:   0h16m37s 31.0% words/sec/thread:    8865 lr:  0.034510 avg.loss:  1.356263 ETA:   0h16m 9s 0.032402 avg.loss:  1.347154 ETA:   0h15m10s 39.5% words/sec/thread:    9039 lr:  0.030262 avg.loss:  1.338279 ETA:   0h13

In [4]:
# Using the Model
# Load the model file written by the training step
model = fasttext.load_model("fasttext_ina_200_with_phrases.bin")

# Get the embedding vector for a word in Bahasa Indonesia
word_vector = model.get_word_vector("singa")
print(word_vector)
# Find the words whose vectors are closest to the query word
similar_words = model.get_nearest_neighbors("harimau")
print(similar_words)




[ 9.61573839e-01  8.38488162e-01  2.13865265e-01  2.21283183e-01
  7.49650225e-02  2.88928837e-01 -1.52228311e-01  1.19806945e-01
 -1.76374361e-01 -3.07138294e-01 -2.93017596e-01 -2.89286375e-01
 -3.96756232e-02 -2.48717636e-01 -2.53759325e-01 -9.49425936e-01
 -2.62389004e-01  1.46104563e-02  1.33989498e-01 -9.44157243e-02
 -2.42440611e-01  3.98812085e-01 -9.27317739e-02  4.84634697e-01
  4.39166665e-01 -3.31306487e-01  2.79952019e-01 -6.71473593e-02
 -3.79338115e-02  2.56961763e-01 -2.27411777e-01 -3.28549594e-01
  1.32463843e-01 -1.29975513e-01 -5.72263777e-01  2.08370537e-01
  9.64891836e-02 -2.62546062e-01  2.79067725e-01  7.29772091e-01
  2.49267012e-01  7.62313008e-02 -2.29858696e-01 -1.24097936e-01
 -2.99408566e-02  1.07388198e+00 -1.11079760e-01  4.24602538e-01
 -6.63948655e-02 -3.23917389e-01 -1.44984856e-01  1.57006875e-01
  2.71386445e-01 -6.66582882e-01 -9.77723673e-02  3.47777128e-01
 -6.95869327e-01  2.78268725e-01  1.53705418e-01 -2.33361274e-01
 -2.23944649e-01 -6.70941