# Import Modules

In [1]:
import csv
import string
import pandas as pd
import numpy as np
import plotly.express as px

from tqdm.notebook import tqdm
from umap import UMAP
from polyglot.mapping import Embedding, CaseExpander, DigitExpander

# Read dataset and word embeddings

In [2]:
dataset = pd.read_csv("../datasets/raw/Indonesian_Manually_Tagged_Corpus.tsv", sep="\t", header=None, names=["token", "tag"], quoting=csv.QUOTE_NONE, skip_blank_lines=False)
chars = list(string.ascii_letters + string.digits + string.punctuation)
vocabs = set(dataset.token.values.tolist() + chars)
vocabs.remove(np.nan)
vocabs = list(vocabs)
n_vocab = len(vocabs)

In [3]:
embeddings = Embedding.load("../word_embeddings/polyglot/idn_embeddings.tar.bz2")
embeddings.apply_expansion(DigitExpander)
embeddings.apply_expansion(CaseExpander)

In [4]:
def count_oov(vocabs, return_vocabs=False):
    counter = 0

    if return_vocabs:
        oov_tokens = []
        for vocab in tqdm(vocabs):
            if vocab not in embeddings:
                counter += 1
                oov_tokens.append(vocab)
        
        return counter, oov_tokens
    else:
        for vocab in tqdm(vocabs):
            if vocab not in embeddings:
                counter += 1        
    
        return counter

# Check character in word embedding

In [5]:
n_oov_chars, oov_chars = count_oov(chars, return_vocabs=True)

print(f"Characters: {chars}")
print(f"Number of OOV chars: {n_oov_chars}")
print(f"OOV chars: {oov_chars}")

  0%|          | 0/94 [00:00<?, ?it/s]

Characters: ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']
Number of OOV chars: 0
OOV chars: []


# Filter vocabs not in words embedding

In [6]:
n_oov_vocab, oov_vocabs = count_oov(vocabs + chars, return_vocabs=True)
n_not_oov_vocab, not_oov_vocabs= len(vocabs) - n_oov_vocab, list(set(vocabs).difference(set(oov_vocabs)))

print(f"Number of vocab: {n_vocab}\n")

print(f"Number of OOV vocab: {n_oov_vocab} ({round(n_oov_vocab / n_vocab * 100, 2)}%)")
print(f"OOV vocab: {oov_vocabs[:10]}\n")

print(f"Number of not OOV vocab: {n_not_oov_vocab} ({round(n_not_oov_vocab / n_vocab * 100, 2)}%)")
print(f"Not OOV vocab: {not_oov_vocabs[:10]}")

  0%|          | 0/18421 [00:00<?, ?it/s]

Number of vocab: 18327

Number of OOV vocab: 3748 (20.45%)
OOV vocab: ['semaksimal mungkin', 'Perusahaan-perusahaan', 'angkatan darat', 'UKTI', 'Joelianto', 'baru-baru ini', 'Resettlement', 'Al-Hariri', 'berkali-kali', 'partikulir']

Number of not OOV vocab: 14579 (79.55%)
Not OOV vocab: ['Polri', 'keampuhan', '400', 'menetralkan', 'Fatah', 'tenang', 'system', '17,00', 'alternatif', 'Rahmat']


# Visualize word embedding

In [7]:
word_embeddings = []

for vocab in not_oov_vocabs:
    word_embeddings.append(embeddings[vocab])

word_embeddings = np.array(word_embeddings)

## 2 Dimensions

In [8]:
embedding_2dim = UMAP(n_neighbors=15, n_components=2, n_epochs=500, metric="correlation", random_state=42).fit_transform(word_embeddings)

In [9]:
embedding_2dim_df = pd.DataFrame(data={
    "dim_1": embedding_2dim[:, 0],
    "dim_2": embedding_2dim[:, 1],
    "vocab": not_oov_vocabs
})

In [10]:
fig = px.scatter(embedding_2dim_df, x="dim_1", y="dim_2", hover_data=["vocab"], width=800, height=800, title="Word Embedding Polyglot Quality in 2 Dimensions")
fig.write_html("../images/data_analysis/embedding_2dim.html")

## 3 Dimensions

In [11]:
embedding_3dim = UMAP(n_neighbors=15, n_components=3, n_epochs=500, metric="correlation", random_state=42).fit_transform(word_embeddings)

embedding_3dim_df = pd.DataFrame(data={
    "dim_1": embedding_3dim[:, 0],
    "dim_2": embedding_3dim[:, 1],
    "dim_3": embedding_3dim[:, 2],
    "vocab": not_oov_vocabs
})

In [12]:
fig = px.scatter_3d(embedding_3dim_df, x="dim_1", y="dim_2", z="dim_3", hover_data=["vocab"], width=800, height=800, title="Word Embedding Polyglot Quality in 3 Dimensions")
fig.write_html("../images/data_analysis/embedding_3dim.html")

# Save vectors

In [13]:
id_not_oov_embeddings = pd.DataFrame(word_embeddings, columns=["dim_" + str(i) for i in range(1, 65)])
id_not_oov_embeddings["token"] = not_oov_vocabs
id_not_oov_embeddings = id_not_oov_embeddings[["token"] + ["dim_" + str(i) for i in range(1, 65)]]
id_not_oov_embeddings[id_not_oov_embeddings["token"] == "p"]
id_not_oov_embeddings.to_csv("../datasets/data_analysis/id_not_oov_embeddings.csv", index=False)

# Notes

- Polyglot mengandung 100.004 vocab dengan dimensi embedding 64.
- Polyglot mencakup 14.579 kosakata (79.55% dari keseluruhan dataset) dan sisanya 3.748 (20.45%) Out Of Vocabulary.
- OOV umumnya berupa, singkatan, nama orang, nama tempat, nama merek (barang dan jasa), mengandung tanda baca (spasi dan kurang) dan bahasa asing.
- Semua karakter (tanda baca, digit angka, huruf kapital, dan huruf kecil) berada dalam word embedding polyglot.
- Polyglot mengandung semua embedding numerikal positif.
- Nilai vektor huruf kapital dan vektor huruf kecil pada Polyglot berbeda.
- Word embedding Polyglot memperlihatkan hasil embedding yang cukup baik. Dapat dilihat pada hasil visualisasi sebagian token dikelompokkan sesuai dengan karakteristiknya.

# Questions
- Mekanisme pengujian OOV?

- Masiih ada kekeliruan dibagian data preprocessing dan model (classifier dan word embedding).
- OOV bisa berada di kalimat lain dengan posisi dan konteks yang berbeda? Cara penanganannya bagaimana? Alternatif gunakan fungsi aggregate,seperti mean atau sum. Apakah efektif??

***

In [14]:
embedding_3dim = UMAP(n_neighbors=15, n_components=3, n_epochs=500, metric="correlation", random_state=42, verbose=True).fit_transform(embeddings.vectors)

embedding_3dim_df = pd.DataFrame(data={
    "dim_1": embedding_3dim[:, 0],
    "dim_2": embedding_3dim[:, 1],
    "dim_3": embedding_3dim[:, 2],
    "vocab": embeddings.words
})

fig = px.scatter_3d(embedding_3dim_df, x="dim_1", y="dim_2", z="dim_3", hover_data=["vocab"], width=800, height=800, title="Word Embedding Polyglot Quality in 3 Dimensions")
fig.write_html("../images/data_analysis/idn_embedding_polyglot_3dim.html")

UMAP(angular_rp_forest=True, metric='correlation', n_components=3, n_epochs=500, random_state=42, verbose=True)
Thu Sep 15 03:47:16 2022 Construct fuzzy simplicial set
Thu Sep 15 03:47:16 2022 Finding Nearest Neighbors
Thu Sep 15 03:47:16 2022 Building RP forest with 21 trees
Thu Sep 15 03:47:17 2022 NN descent for 17 iterations
	 1  /  17
	 2  /  17
	 3  /  17
	 4  /  17
	 5  /  17
	Stopping threshold met -- exiting after 5 iterations
Thu Sep 15 03:47:25 2022 Finished Nearest Neighbor Search
Thu Sep 15 03:47:25 2022 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Thu Sep 15 03:51:08 2022 Finished embedding


# Char Embedding

In [15]:
import numpy as np

with open("../word_embeddings/chars_embedding/char_mimick_glove_d100_c20") as f:
    chars_embedding = f.readlines()

chars_embedding = [embedding.split("\n") for embedding in chars_embedding]
chars_embedding = [embedding[0].split(" ") for embedding in chars_embedding]

chars_embedding = np.array(chars_embedding)
chars = chars_embedding[:, 0]
embeddings = chars_embedding[:, 1:].astype(np.float32)
char_embeddings = {char: embedding for char, embedding in zip(chars, embeddings)}
char_embeddings["!"]

NameError: name 'chars_embedding_df' is not defined