<a href="https://colab.research.google.com/github/ksnugroho/feel-in/blob/main/model-word-embedding/01_id_wiki_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Create Word Embedding (Word2Vec & FastText) from ID Wikipedia Dump**

**Thesis: Emotion Detection in Indonesian Text**

Kuncahyo Setyo Nugroho<br>
Supervisors:
1. Dr. Eng. Fitra A. Bachtiar, S.T., M.Eng.
2. Prof. Ir. Wayan Firdaus Mahmudy, S.Si., M.T., Ph.D.

Faculty of Computer Science, Brawijaya University, Indonesia &copy; 2021-2022

# 01 Setup

In [None]:
# Mount Google Drive under /content/drive so the project folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Make the thesis project folder on Google Drive the working directory;
# the relative paths used later (data/..., model-word-embedding/...) resolve against it.
%cd /content/drive/MyDrive/Code/tesis

/content/drive/MyDrive/Code/tesis


In [None]:
import io
import os
import time
from datetime import timedelta
import multiprocessing
import gensim
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec, FastText, KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec

from tqdm import tqdm
tqdm.pandas()

In [None]:
# Show the installed gensim version; the training cells below pass the gensim 3 keyword names (size, iter).
gensim.__version__

'3.6.0'

# 02 Load & Prepare Data

In [None]:
# Extract the article texts from the Indonesian Wikipedia dump and write them to a
# plain-text file: one article per line, tokens separated by single spaces.
start_time = time.time()

file_path = 'data/id-wiki-dump'

id_wiki = WikiCorpus(
    f'{file_path}/idwiki-latest-pages-articles.xml.bz2',
    # An explicit (empty) dictionary is passed so WikiCorpus does not build its own
    # vocabulary first; per the gensim docs that extra scan of the dump is very slow.
    dictionary={},
    # Leave one core free, but never request fewer than one worker process
    # (cpu_count() - 1 evaluates to 0 on a single-core machine).
    processes=max(1, multiprocessing.cpu_count() - 1),
    lower=True
  )

article_count = 0
with io.open(f'{file_path}/id-wiki_dump_lower.txt', 'w', encoding='utf-8') as wiki_txt:
    for text in id_wiki.get_texts():
        wiki_txt.write(' '.join(text) + '\n')
        article_count += 1

        # Progress marker every 10,000 articles.
        if article_count % 10000 == 0:
            print(f'{article_count} articles processed')

finish_time = time.time()

print(f'total: {article_count} articles')
print(f'Elapsed time: {timedelta(seconds=finish_time-start_time)}')

10000 articles processed
20000 articles processed
30000 articles processed
40000 articles processed
50000 articles processed
60000 articles processed
70000 articles processed
80000 articles processed
90000 articles processed
100000 articles processed
110000 articles processed
120000 articles processed
130000 articles processed
140000 articles processed
150000 articles processed
160000 articles processed
170000 articles processed
180000 articles processed
190000 articles processed
200000 articles processed
210000 articles processed
220000 articles processed
230000 articles processed
240000 articles processed
250000 articles processed
260000 articles processed
270000 articles processed
280000 articles processed
290000 articles processed
300000 articles processed
310000 articles processed
320000 articles processed
330000 articles processed
340000 articles processed
350000 articles processed
360000 articles processed
370000 articles processed
380000 articles processed
390000 articles proce

In [None]:
# Iterate over the text file written above; each line holds one article's space-separated tokens.
sentences = gensim.models.word2vec.LineSentence(f'{file_path}/id-wiki_dump_lower.txt')

# 03 Define Training Parameters

In [None]:
# https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec

CPU_CORES = multiprocessing.cpu_count()  # Number of CPU cores on this machine
EMBEDDING_SIZE = 300                     # Dimensionality of the word vectors
WINDOW_SIZE = 5                          # Window size: maximum distance between the current and the predicted word within a sentence
MIN_WORD = 5                             # The model ignores all words with a total frequency lower than this (optional)
EPOCH = 10                               # Number of iterations (epochs)
SG = 1                                   # Training algorithm: 1 for skip-gram, 0 for CBOW
NEGATIVE = 5                             # Negative sampling; if 0, negative sampling is not used
HS = 0                                   # Hierarchical softmax; if 1, hierarchical softmax is used
SEED = 69                                # Seed for the random number generator

print('CPU CORES:', CPU_CORES)

CPU CORES: 4


In [None]:
class EpochLogger(CallbackAny2Vec):
    """Training callback that prints a marker at the start and end of every epoch."""

    def __init__(self):
        # Zero-based index of the epoch currently being reported.
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Print the start marker for the current epoch."""
        print(f"Epoch #{self.epoch} start")

    def on_epoch_end(self, model):
        """Print the end marker for the current epoch, then advance the counter."""
        finished = self.epoch
        print(f"Epoch #{finished} end")
        self.epoch = finished + 1

In [None]:
# Original source: https://github.com/HichemMaiza/Word2tensor/blob/master/word2vec2tensor.py

from smart_open import smart_open

def word2vec2tensor(word2vec_model_path, tensor_filename):
    """Export a saved gensim model's vocabulary and vectors as two TSV files.

    Writes ``<tensor_filename>_tensor.tsv`` (one tab-separated vector per line) and
    ``<tensor_filename>_metadata.tsv`` (one word per line, in the same order).

    Parameters
    ----------
    word2vec_model_path : str
        Path to a file written by gensim's native ``save()``: either a full model
        (which exposes its vectors as ``.wv``) or bare keyed vectors.
    tensor_filename : str
        Prefix for output files.
    """
    model = gensim.models.KeyedVectors.load(word2vec_model_path)
    # A full model keeps its vectors under `.wv`; bare keyed vectors are used directly.
    keyed_vectors = getattr(model, 'wv', model)
    # gensim 4 renamed `index2word` to `index_to_key`; accept whichever is present.
    vocabulary = getattr(keyed_vectors, 'index_to_key', None)
    if vocabulary is None:
        vocabulary = keyed_vectors.index2word

    outfiletsv = tensor_filename + '_tensor.tsv'
    outfiletsvmeta = tensor_filename + '_metadata.tsv'

    with smart_open(outfiletsv, 'wb') as file_vector, smart_open(outfiletsvmeta, 'wb') as file_metadata:
        for word in vocabulary:
            file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n'))
            # Look the vector up on the keyed vectors: indexing the model object itself
            # (`model[word]`) is deprecated in gensim 3 and no longer works in gensim 4.
            vector_row = '\t'.join(str(x) for x in keyed_vectors[word])
            file_vector.write(gensim.utils.to_utf8(vector_row) + gensim.utils.to_utf8('\n'))

# 04 Word2Vec

In [None]:
%%time

# Proses training Word2Vec 
word2vec_model = Word2Vec(
    sentences, 
    # vector_size=EMBEDDING_SIZE, # gensim 4
    size=EMBEDDING_SIZE, # gensim 3
    sg=SG, 
    min_count=MIN_WORD, 
    window=WINDOW_SIZE, 
    # epochs=EPOCH, # gensim 4
    iter=EPOCH, # gensim 3
    workers=CPU_CORES-1,
    negative=NEGATIVE,
    hs=HS,
    seed=SEED,
    callbacks=[EpochLogger()]
  )

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Epoch #5 start
Epoch #5 end
Epoch #6 start
Epoch #6 end
Epoch #7 start
Epoch #7 end
Epoch #8 start
Epoch #8 end
Epoch #9 start
Epoch #9 end
CPU times: user 7h 39min 42s, sys: 44.8 s, total: 7h 40min 27s
Wall time: 2h 35min 42s


In [None]:
# Save as a full model
word2vec_model.save('model-word-embedding/checkpoint/idwiki-word2vec/idwiki-word2vec-300.model')

In [None]:
# Save the trained word vectors in the word2vec C binary format.
# `model.save()` writes gensim's native format, not the C format, so the export goes
# through the keyed vectors; load it back with KeyedVectors.load_word2vec_format(..., binary=True).
word2vec_model.wv.save_word2vec_format(
    'model-word-embedding/checkpoint/idwiki-word2vec/idwiki-word2vec-300.bin',
    binary=True
)

In [None]:
# Save as word vectors: stores only the words and their trained embeddings
word2vec_word_vectors = word2vec_model.wv
word2vec_word_vectors.save('model-word-embedding/checkpoint/idwiki-word2vec/idwiki-word2vec-300.wordvectors')

In [None]:
# Export the saved Word2Vec model to two TSV files:
# <tensor_filename>_tensor.tsv (vectors) and <tensor_filename>_metadata.tsv (words)
word2vec_model_path = 'model-word-embedding/checkpoint/idwiki-word2vec/idwiki-word2vec-300.model'
tensor_filename = 'model-word-embedding/checkpoint/idwiki-word2vec/idwiki-word2vec-300'

word2vec2tensor(word2vec_model_path, tensor_filename)

# 05 FastText

In [None]:
%%time

# Proses training FastText 
fasttext_model = FastText(
    sentences, 
    # vector_size=EMBEDDING_SIZE, # gensim 4
    size=EMBEDDING_SIZE, # gensim 3
    sg=SG, 
    min_count=MIN_WORD, 
    window=WINDOW_SIZE, 
    # epochs=EPOCH, # gensim 4
    iter=EPOCH, # gensim 3
    workers=CPU_CORES-1,
    negative=NEGATIVE,
    hs=HS,
    seed=SEED,
    callbacks=[EpochLogger()]
  )

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Epoch #5 start
Epoch #5 end
Epoch #6 start
Epoch #6 end
Epoch #7 start
Epoch #7 end
Epoch #8 start
Epoch #8 end
Epoch #9 start
Epoch #9 end
CPU times: user 14h 15min 18s, sys: 1min, total: 14h 16min 19s
Wall time: 4h 49min 16s


In [None]:
# Save as a full model
fasttext_model.save('model-word-embedding/checkpoint/idwiki-fasttext/idwiki-fasttext-300.model')

In [None]:
# Save the trained word vectors in the word2vec C binary format.
# `model.save()` writes gensim's native format, not the C format, so the export goes
# through the keyed vectors. NOTE: this format stores one vector per vocabulary word only;
# the full FastText model (saved separately as .model) is still needed for subword lookups.
fasttext_model.wv.save_word2vec_format(
    'model-word-embedding/checkpoint/idwiki-fasttext/idwiki-fasttext-300.bin',
    binary=True
)

In [None]:
# Save as word vectors: stores only the words and their trained embeddings
fasttext_word_vectors = fasttext_model.wv
fasttext_word_vectors.save('model-word-embedding/checkpoint/idwiki-fasttext/idwiki-fasttext-300.wordvectors')

In [None]:
# Export the saved FastText model to two TSV files:
# <tensor_filename>_tensor.tsv (vectors) and <tensor_filename>_metadata.tsv (words).
# The variable keeps the name `word2vec_model_path`, but it holds the FastText model path here.
word2vec_model_path = 'model-word-embedding/checkpoint/idwiki-fasttext/idwiki-fasttext-300.model'
tensor_filename = 'model-word-embedding/checkpoint/idwiki-fasttext/idwiki-fasttext-300'

word2vec2tensor(word2vec_model_path, tensor_filename)