In [1]:
import fasttext
import os
import json
import re
from tqdm import tqdm

from nltk.tokenize import word_tokenize
from gensim.models.phrases import Phrases, Phraser

In [2]:
# PreProcess data
def preprocess_text(text):
    # Lowercasing and removing special characters
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    return ' '.join(tokens)

def process_wiki_files(input_dir, output_file):
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for root, dirs, files in os.walk(input_dir):
            for file in tqdm(files, desc="Processing files"):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f_in:
                    for line in tqdm(f_in, desc=f"Processing lines in {file}", leave=False):
                        article = json.loads(line)
                        text = preprocess_text(article['text'])
                        f_out.write(text + '\n')

# Assuming preprocess_text is a function you have defined
process_wiki_files('extracted_text', 'processed_wiki_id.txt')



Processing files: 0it [00:00, ?it/s]
Processing files: 100%|██████████| 35/35 [00:27<00:00,  1.29it/s]
Processing files: 100%|██████████| 100/100 [01:12<00:00,  1.38it/s]
Processing files: 100%|██████████| 100/100 [01:15<00:00,  1.32it/s]
Processing files: 100%|██████████| 100/100 [01:15<00:00,  1.33it/s]
Processing files: 100%|██████████| 100/100 [01:22<00:00,  1.22it/s]
Processing files: 100%|██████████| 100/100 [01:05<00:00,  1.52it/s]
Processing files: 100%|██████████| 100/100 [01:16<00:00,  1.31it/s]
Processing files: 100%|██████████| 100/100 [01:19<00:00,  1.25it/s]
Processing files: 100%|██████████| 100/100 [01:14<00:00,  1.35it/s]
Processing files: 100%|██████████| 100/100 [01:15<00:00,  1.32it/s]


In [3]:
# Train the FastText Model

def train_fasttext_with_phrases(input_file, epochs=5, lr=0.05):
    # Read the file and build sentences
    sentences = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            sentences.append(line.split())

    # Detect and create bigrams/trigrams
    bigram = Phraser(Phrases(sentences, min_count=5, threshold=10))
    trigram = Phraser(Phrases(bigram[sentences], min_count=5, threshold=10))

    # Apply the bigram/trigram models to the sentences
    sentences_with_phrases = [trigram[bigram[sentence]] for sentence in sentences]

    # Write the sentences with phrases to a temporary file
    temp_file = 'temp_sentences_with_phrases.txt'
    with open(temp_file, 'w', encoding='utf-8') as f:
        for sentence in sentences_with_phrases:
            f.write(' '.join(sentence) + '\n')

    # Train FastText model with a progress bar
    model = None
    for epoch in tqdm(range(1, epochs + 1), desc='Training Progress'):
        model = fasttext.train_unsupervised(temp_file, model='skipgram', lr=lr, epoch=epoch, dim=200)

    # Clean up the temporary file
    if os.path.exists(temp_file):
        os.remove(temp_file)

    return model


# Train the model with the preprocessed Wikipedia text
fasttext_model = train_fasttext_with_phrases('processed_wiki_id.txt')

# Save the model
fasttext_model.save_model("fasttext_ina_200_with_phrases.bin")


Read 87M wordsess:   0%|          | 0/5 [00:00<?, ?it/s]
Number of words:  710071
Number of labels: 0
Progress: 100.0% words/sec/thread:   10828 lr:  0.000000 avg.loss:  1.220157 ETA:   0h 0m 0s  2.2% words/sec/thread:   10039 lr:  0.048904 avg.loss:  1.394719 ETA:   0h20m13s 18.3% words/sec/thread:   10216 lr:  0.040832 avg.loss:  1.364442 ETA:   0h16m35s 36.0% words/sec/thread:   10313 lr:  0.031987 avg.loss:  1.331006 ETA:   0h12m52s 1.302212 ETA:   0h 9m19s  10591 lr:  0.006877 avg.loss:  1.216736 ETA:   0h 2m41s
Read 87M wordsess:  20%|██        | 1/5 [19:34<1:18:18, 1174.73s/it]
Number of words:  710071
Number of labels: 0
Progress: 100.0% words/sec/thread:   12194 lr:  0.000000 avg.loss:  0.803152 ETA:   0h 0m 0s 0.048667 avg.loss:  1.360469 ETA:   0h35m42s 0.009289 avg.loss:  0.910470 ETA:   0h 5m58s 84.6% words/sec/thread:   12756 lr:  0.007689 avg.loss:  0.888377 ETA:   0h 5m 0s 92.0% words/sec/thread:   12463 lr:  0.004004 avg.loss:  0.843869 ETA:   0h 2m40sm 0s
Read 87M wor

In [None]:
# Using the Model
# Load model
model = fasttext.load_model("fasttext_ina_200_with_phrases.bin")

# Get word vector for a word in Bahasa Indonesia
word_vector = model.get_word_vector("singa")
print(word_vector)
# Find similar words
similar_words = model.get_nearest_neighbors("harimau")
print(similar_words)


[-0.5710175   0.30058366  0.34796658  0.88001174  0.52425134 -0.02964741
 -0.17222954 -0.5002736   0.762855   -0.7790979   0.08115216  0.14185356
 -0.6173891  -0.0973238   0.3765095  -0.18282877 -0.02402508  0.19270276
 -0.63856393 -0.22311787  0.02718417  0.09362951 -0.13754086  0.12484796
  0.29579833 -1.0931784  -0.21157692  0.85819167  0.21909578 -0.35906434
  0.33472145 -0.01592343 -0.26751727 -0.82759494 -0.17570087 -0.4351454
 -0.0893899  -0.73693675 -0.04124119 -0.35097325 -0.89134544 -0.8440361
 -0.2711332   0.20650207  0.37705436  0.02467851 -0.34632164  0.0023733
  0.334291    0.01247291 -0.6152394  -0.4529973  -0.43055084 -0.08579167
  0.6661112  -0.42551488  0.7379743   0.06338852 -0.23205082 -0.49874333
 -0.94483197 -0.38910538  0.19761562 -0.48624828 -0.50785965 -0.03037283
  0.05746352  0.21924978 -0.0416922  -0.31730434  0.40931764 -0.69406706
  0.2582438   0.7814642  -1.0616256  -0.19469967 -0.15268809  0.24732283
  0.3922155  -0.57278365 -0.31278664  0.31659085  0.18

