<a href="https://colab.research.google.com/github/ksnugroho/feel-in/blob/main/model-word-embedding/02_id_tweet_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Create Word Embedding (Word2Vec & FastText) from ID Tweet**

**Thesis: Emotion Detection in Indonesian Text**

Kuncahyo Setyo Nugroho<br>
Supervisors:
1. Dr. Eng. Fitra A. Bachtiar, S.T., M.Eng.
2. Prof. Ir. Wayan Firdaus Mahmudy, S.Si., M.T., Ph.D.

Faculty of Computer Science, Brawijaya University, Indonesia &copy; 2021-2022

# 01 Setup

In [None]:
# Mount Google Drive under /content/drive so the project files are reachable from Colab.
# force_remount=True asks Colab to mount again even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on Google Drive,
# so the relative paths used in later cells (data/..., model-word-embedding/...) resolve against it.
%cd /content/drive/MyDrive/Code/tesis

/content/drive/MyDrive/Code/tesis


In [None]:
import re
import pandas as pd
import multiprocessing
import gensim
from gensim.models import Word2Vec, FastText, KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec

from tqdm import tqdm
tqdm.pandas()

In [None]:
# Show the installed gensim version; the training cells below pass the gensim 3 keyword names (size=, iter=).
gensim.__version__

'3.6.0'

# 02 Load & Prepare Data

In [None]:
# Read only the 'tweet' column of the cleaned tweet dump, parsed as strings.
df = pd.read_csv('data/id-tweet-dump/id-tweet-dump-clean.csv', usecols=['tweet'], dtype={'tweet':'str'})

In [None]:
# (rows, columns) of the loaded frame.
df.shape

(3126987, 1)

In [None]:
def text_preprocessing(text):
    """Normalize a raw tweet before tokenization.

    The text is lower-cased, then URLs, (optionally signed) digit runs and
    every character that is neither a word character nor whitespace are
    deleted, in that order. Finally leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Inner whitespace is left as-is, so removed items
        may leave consecutive spaces behind.
    """
    # Patterns are applied in this order; each match is replaced by an empty string.
    removal_patterns = (
        r'https?://\S+|www\.\S+',  # URLs
        r'[-+]?[0-9]+',            # numbers, with an optional leading sign
        r'[^\w\s]',                # punctuation and other non-word symbols
    )

    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned.strip()

def tokenize(text):
    """Split text into a list of word tokens.

    A token is a maximal run of word characters (regex ``\\w+``); everything
    else acts as a separator. Unlike a plain ``re.split`` on ``\\W+``, this
    never yields empty-string tokens, so empty text or text that starts/ends
    with a separator does not add ``''`` to the training vocabulary.

    Parameters
    ----------
    text : str
        Text to tokenize (normally the output of ``text_preprocessing``).

    Returns
    -------
    list of str
        The tokens in order of appearance; ``[]`` when the text has no word characters.
    """
    # Raw string avoids the invalid "\W" escape warning of the non-raw literal.
    return re.findall(r'\w+', text)

In [None]:
%%time 
# Clean every tweet. Missing tweets are replaced by '' first: astype(str) alone would turn
# NaN into the literal string 'nan', which would then be trained on as a real word.
df['clean_tweet'] = df['tweet'].fillna('').astype(str).progress_apply(text_preprocessing)

100%|██████████| 3126987/3126987 [00:30<00:00, 102487.78it/s]


CPU times: user 30.4 s, sys: 487 ms, total: 30.9 s
Wall time: 30.7 s


In [None]:
%%time
# Tokenize the cleaned tweets. text_preprocessing has already lower-cased them,
# so tokenize is applied directly instead of lower-casing every row a second time.
sentences = df['clean_tweet'].progress_apply(tokenize)

100%|██████████| 3126987/3126987 [00:23<00:00, 130391.88it/s]

CPU times: user 22.3 s, sys: 1.86 s, total: 24.2 s
Wall time: 24 s





In [None]:
# Preview the tokenized tweets (one list of tokens per tweet).
print(sentences)

0          [eh, l, nas, l, watya, ely, f, hayaty, dol, ms...
1          [lihat, kehidupanku, jauh, lebih, baik, kan, m...
2                                           [ramai, nya, dm]
3          [udane, kyk, aku, mengagumi, mu, dek, awet, ka...
4          [fpi, dibubarkan, kembali, viral, video, tito,...
                                 ...                        
3126982                           [trims, dinda, km, jg, yh]
3126983    [mantap, betul, dbk, udah, engsubbbbb, cus, no...
3126984                 [krucukan, adalah, nada, nada, alam]
3126985    [ana, belum, yo, tapi, dah, boleh, jalan, naik...
3126986                                 [suka, baca, au, ga]
Name: clean_tweet, Length: 3126987, dtype: object


# 03 Define Training Parameter

In [None]:
# Parameter reference:
# https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec

# Number of CPU cores on this machine.
CPU_CORES = multiprocessing.cpu_count()

# Dimensionality of the word vectors.
EMBEDDING_SIZE = 300

# Window size: maximum distance between the current and the predicted word within a sentence.
WINDOW_SIZE = 5

# Words whose total frequency is lower than this are ignored by the model (optional).
MIN_WORD = 5

# Number of training iterations (epochs).
EPOCH = 10

# Training algorithm: 1 for skip-gram, 0 for CBOW.
SG = 1

# Negative sampling; 0 means negative sampling is not used.
NEGATIVE = 5

# Hierarchical softmax; 1 means hierarchical softmax is used.
HS = 0

# Seed for the random number generator.
SEED = 69

print('CPU CORES:', CPU_CORES)

CPU CORES: 4


In [None]:
class EpochLogger(CallbackAny2Vec):
    """Training callback that prints a line at the start and end of every epoch."""

    def __init__(self):
        # Zero-based counter of the epoch being reported; advanced in on_epoch_end.
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Print the start marker for the current epoch (`model` is not used)."""
        message = "Epoch #{} start".format(self.epoch)
        print(message)

    def on_epoch_end(self, model):
        """Print the end marker for the current epoch, then move to the next one."""
        message = "Epoch #{} end".format(self.epoch)
        print(message)
        self.epoch = self.epoch + 1

In [None]:
# Original source: https://github.com/HichemMaiza/Word2tensor/blob/master/word2vec2tensor.py

from smart_open import smart_open

def word2vec2tensor(word2vec_model_path, tensor_filename):
    """Export the word vectors of a saved gensim model as two TSV files.

    "tensor_filename"_tensor.tsv receives one tab-separated vector per line and
    "tensor_filename"_metadata.tsv receives the matching word per line, in the
    same order.

    Parameters
    ----------
    word2vec_model_path : str
        Path to an object written with gensim's native ``save()``: either a
        full model that exposes its vectors as ``.wv`` or a bare KeyedVectors
        object.
    tensor_filename : str
        Prefix for output files.
    """
    model = gensim.models.KeyedVectors.load(word2vec_model_path)

    # A full model keeps its vectors under `.wv`; a bare KeyedVectors object is
    # the vector store itself. Looking words up on the vector store (instead of
    # `model[word]`) avoids the model-level indexing that gensim 4 no longer supports.
    keyed_vectors = getattr(model, 'wv', model)

    # gensim 4 renamed `index2word` to `index_to_key`; fall back to the gensim 3 name.
    vocabulary = getattr(keyed_vectors, 'index_to_key', None)
    if vocabulary is None:
        vocabulary = keyed_vectors.index2word

    outfiletsv = tensor_filename + '_tensor.tsv'
    outfiletsvmeta = tensor_filename + '_metadata.tsv'
    newline = gensim.utils.to_utf8('\n')

    with smart_open(outfiletsv, 'wb') as file_vector, smart_open(outfiletsvmeta, 'wb') as file_metadata:
        for word in vocabulary:
            # Files are opened in binary mode, so every line is encoded to UTF-8 explicitly.
            file_metadata.write(gensim.utils.to_utf8(word) + newline)
            vector_row = '\t'.join(str(x) for x in keyed_vectors[word])
            file_vector.write(gensim.utils.to_utf8(vector_row) + newline)

# 04 Word2Vec

In [None]:
%%time

# Train the Word2Vec model (gensim 3 keyword names; the gensim 4 equivalents are kept as comments)
word2vec_model = Word2Vec(
    sentences,
    # vector_size=EMBEDDING_SIZE, # gensim 4
    size=EMBEDDING_SIZE, # gensim 3
    sg=SG,
    min_count=MIN_WORD,
    window=WINDOW_SIZE,
    # epochs=EPOCH, # gensim 4
    iter=EPOCH, # gensim 3
    # Leave one core free, but never drop to 0 workers on a single-core machine.
    workers=max(1, CPU_CORES - 1),
    negative=NEGATIVE,
    hs=HS,
    seed=SEED,
    callbacks=[EpochLogger()]
)

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Epoch #5 start
Epoch #5 end
Epoch #6 start
Epoch #6 end
Epoch #7 start
Epoch #7 end
Epoch #8 start
Epoch #8 end
Epoch #9 start
Epoch #9 end
CPU times: user 1h 18min 58s, sys: 7.13 s, total: 1h 19min 5s
Wall time: 27min 9s


In [None]:
# Save the full model
word2vec_model.save('model-word-embedding/checkpoint/idtweet-word2vec/idtweet-word2vec-300.model')

In [None]:
# Save the trained vectors in the word2vec C binary format.
# `model.save()` writes gensim's own native format whatever the file extension is,
# so the C-format exporter of the word vectors is used here instead.
# This file holds only the vocabulary and its vectors; the full model is saved separately.
word2vec_model.wv.save_word2vec_format(
    'model-word-embedding/checkpoint/idtweet-word2vec/idtweet-word2vec-300.bin',
    binary=True,
)

In [None]:
# Save as word vectors: stores only the words and their trained embeddings
word2vec_word_vectors = word2vec_model.wv
word2vec_word_vectors.save('model-word-embedding/checkpoint/idtweet-word2vec/idtweet-word2vec-300.wordvectors')

In [None]:
# Export the saved Word2Vec model as two TSV files (vectors + word metadata).
# The files are written next to the checkpoint as <tensor_filename>_tensor.tsv and <tensor_filename>_metadata.tsv.
word2vec_model_path = 'model-word-embedding/checkpoint/idtweet-word2vec/idtweet-word2vec-300.model'
tensor_filename = 'model-word-embedding/checkpoint/idtweet-word2vec/idtweet-word2vec-300'

word2vec2tensor(word2vec_model_path, tensor_filename)

# 05 FastText

In [None]:
%%time

# Train the FastText model (gensim 3 keyword names; the gensim 4 equivalents are kept as comments)
fasttext_model = FastText(
    sentences,
    # vector_size=EMBEDDING_SIZE, # gensim 4
    size=EMBEDDING_SIZE, # gensim 3
    sg=SG,
    min_count=MIN_WORD,
    window=WINDOW_SIZE,
    # epochs=EPOCH, # gensim 4
    iter=EPOCH, # gensim 3
    # Leave one core free, but never drop to 0 workers on a single-core machine.
    workers=max(1, CPU_CORES - 1),
    negative=NEGATIVE,
    hs=HS,
    seed=SEED,
    callbacks=[EpochLogger()]
)

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Epoch #5 start
Epoch #5 end
Epoch #6 start
Epoch #6 end
Epoch #7 start
Epoch #7 end
Epoch #8 start
Epoch #8 end
Epoch #9 start
Epoch #9 end
CPU times: user 2h 28min 37s, sys: 13.1 s, total: 2h 28min 50s
Wall time: 51min 6s


In [None]:
# Save the full model
fasttext_model.save('model-word-embedding/checkpoint/idtweet-fasttext/idtweet-fasttext-300.model')

In [None]:
# Save the trained vectors in the word2vec C binary format.
# `model.save()` writes gensim's own native format whatever the file extension is,
# so the C-format exporter of the word vectors is used here instead.
# NOTE(review): this format stores one vector per vocabulary word only; FastText subword
# information stays in the full model saved separately - confirm downstream loaders expect this.
fasttext_model.wv.save_word2vec_format(
    'model-word-embedding/checkpoint/idtweet-fasttext/idtweet-fasttext-300.bin',
    binary=True,
)

In [None]:
# Save as word vectors: stores only the words and their trained embeddings
fasttext_word_vectors = fasttext_model.wv
fasttext_word_vectors.save('model-word-embedding/checkpoint/idtweet-fasttext/idtweet-fasttext-300.wordvectors')

In [None]:
# Export the saved FastText model as two TSV files (vectors + word metadata).
# Note: the variable is still called word2vec_model_path, but here it points at the FastText checkpoint.
word2vec_model_path = 'model-word-embedding/checkpoint/idtweet-fasttext/idtweet-fasttext-300.model'
tensor_filename = 'model-word-embedding/checkpoint/idtweet-fasttext/idtweet-fasttext-300'

word2vec2tensor(word2vec_model_path, tensor_filename)