# Import Modules

In [1]:
import pickle
from itertools import chain

import numpy as np
import torch
from polyglot.mapping import Embedding, CaseExpander, DigitExpander
from torch import nn

# Read word embedding

In [2]:
# Load the polyglot embedding archive from the local word_embeddings folder.
embeddings = Embedding.load("../word_embeddings/polyglot/idn_embeddings.tar.bz2")
# Register the digit and case expansions on the loaded vocabulary, in that order.
# NOTE(review): per polyglot's documentation these add digit-normalised and
# case-variant lookups for words — confirm against the installed polyglot version.
embeddings.apply_expansion(DigitExpander)
embeddings.apply_expansion(CaseExpander)

# Vocabs

In [3]:
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
# Each file is opened in a `with` block so the handle is closed right after loading.
with open("../datasets/ngrams/left_79_context_vocabs.pkl", "rb") as vocab_file:
    left_context_vocabs = pickle.load(vocab_file)

with open("../datasets/ngrams/right_79_context_vocabs.pkl", "rb") as vocab_file:
    right_context_vocabs = pickle.load(vocab_file)

# Union of both context vocabularies. It is sorted because the iteration order
# of a set of strings changes between kernel sessions (hash randomisation),
# which would make every index derived from this list non-reproducible.
left_right_context_vocabs = sorted(set(left_context_vocabs + right_context_vocabs))
# Special tokens are appended after the real vocabulary.
left_right_context_vocabs = left_right_context_vocabs + ['</S>', '<UNK>', '<PAD>', '<S>']

with open("../datasets/ngrams/oov_79_context_vocabs.pkl", "rb") as vocab_file:
    oov_context_vocabs = pickle.load(vocab_file)

# Distinct lower-cased characters that occur in the OOV vocabulary (sorted for
# the same reproducibility reason as above).
oov_chars = sorted(set(chain.from_iterable(vocab.lower() for vocab in oov_context_vocabs)))

In [4]:
# Sizes of: combined vocab, left vocab, right vocab, OOV vocab, OOV character set.
tuple(map(len, (
    left_right_context_vocabs,
    left_context_vocabs,
    right_context_vocabs,
    oov_context_vocabs,
    oov_chars,
)))

(13203, 10801, 10602, 3611, 48)

# Check vocab

In [5]:
# OOV context: count the OOV vocab entries that have no embedding.
num_oov = sum(vocab not in embeddings for vocab in oov_context_vocabs)

# 0 means every OOV vocab entry is missing from the embeddings, as intended.
num_oov - len(oov_context_vocabs)

0

In [6]:
# Left context: count the vocab entries that do have an embedding.
num_not_oov = sum(vocab in embeddings for vocab in left_context_vocabs)

# 0 means every left-context vocab entry is covered by the embeddings.
num_not_oov - len(left_context_vocabs)

0

In [7]:
# Right context: count the vocab entries that do have an embedding.
num_not_oov = sum(vocab in embeddings for vocab in right_context_vocabs)

# 0 means every right-context vocab entry is covered by the embeddings.
num_not_oov - len(right_context_vocabs)

0

In [8]:
# Left + Right context (including the appended special tokens): count the
# entries that do have an embedding.
num_not_oov = sum(vocab in embeddings for vocab in left_right_context_vocabs)

# 0 means every entry of the combined vocabulary is covered by the embeddings.
num_not_oov - len(left_right_context_vocabs)

0

# Context

In [9]:
# Load the OOV n-gram contexts. The file is opened in a `with` block so the
# handle is closed after loading instead of being left open for the session.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("../datasets/ngrams/oov_ngrams_79_context.pkl", "rb") as ngrams_file:
    oov_ngrams = pickle.load(ngrams_file)

# Preview: judging from the recorded output, each item is a list of
# (token, flag) pairs — the flag presumably marks the OOV n-gram; confirm.
oov_ngrams[:5]

[[('Kera', False), ('untuk', False), ('amankan', True)],
 [('pesta olahraga', True)],
 [('Pemerintah', False),
  ('kota', False),
  ('Delhi', False),
  ('mengerahkan', False),
  ('monyet', False),
  ('untuk', False),
  ('mengusir', False),
  ('monyet-monyet', True),
  ('lain', False),
  ('yang', False),
  ('berbadan', False),
  ('lebih', False),
  ('kecil', False),
  ('dari', False),
  ('arena', False)],
 [('lain', False),
  ('yang', False),
  ('berbadan', False),
  ('lebih', False),
  ('kecil', False),
  ('dari', False),
  ('arena', False),
  ('Pesta Olahraga', True),
  ('Persemakmuran', False),
  ('.', False)],
 [('Pemkot', False),
  ('Delhi', False),
  ('memiliki', False),
  ('28', False),
  ('monyet', False),
  ('dan', False),
  ('berencana', False),
  ('mendatangkan', False),
  ('10', False),
  ('monyet', False),
  ('sejenis', False),
  ('dari', False),
  ('negara bagian', True),
  ('Rajasthan', False),
  ('.', False)]]

In [10]:
# Load the split OOV n-gram contexts. The file is opened in a `with` block so
# the handle is closed after loading instead of being left open.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("../datasets/ngrams/split_oov_ngrams_79_context.pkl", "rb") as ngrams_file:
    split_oov_ngrams = pickle.load(ngrams_file)

# Preview: judging from the recorded output, each item is a 3-tuple of
# (tokens before, characters of the OOV n-gram, tokens after).
split_oov_ngrams[:5]

[(['Kera', 'untuk'], ['a', 'm', 'a', 'n', 'k', 'a', 'n'], []),
 ([],
  ['p', 'e', 's', 't', 'a', ' ', 'o', 'l', 'a', 'h', 'r', 'a', 'g', 'a'],
  []),
 (['Pemerintah',
   'kota',
   'Delhi',
   'mengerahkan',
   'monyet',
   'untuk',
   'mengusir'],
  ['m', 'o', 'n', 'y', 'e', 't', '-', 'm', 'o', 'n', 'y', 'e', 't'],
  ['lain', 'yang', 'berbadan', 'lebih', 'kecil', 'dari', 'arena']),
 (['lain', 'yang', 'berbadan', 'lebih', 'kecil', 'dari', 'arena'],
  ['p', 'e', 's', 't', 'a', ' ', 'o', 'l', 'a', 'h', 'r', 'a', 'g', 'a'],
  ['Persemakmuran', '.']),
 (['Pemkot',
   'Delhi',
   'memiliki',
   '28',
   'monyet',
   'dan',
   'berencana',
   'mendatangkan',
   '10',
   'monyet',
   'sejenis',
   'dari'],
  ['n', 'e', 'g', 'a', 'r', 'a', ' ', 'b', 'a', 'g', 'i', 'a', 'n'],
  ['Rajasthan', '.'])]

# Setting word embedding

## Word Embedding

In [11]:
def token_to_token_embedding(tokens, token_embeddings):
    """Build a ``{token: vector}`` dict for the given tokens.

    Each token is looked up with ``token_embeddings[token]``; whatever that
    lookup raises for an unknown token propagates to the caller. Repeated
    tokens collapse into a single key.
    """
    selected = {}
    for tok in tokens:
        selected[tok] = token_embeddings[tok]
    return selected

# Keep only the vectors for the combined left/right context vocabulary
# (which also contains the special tokens appended to it).
word_embeddings = token_to_token_embedding(left_right_context_vocabs, embeddings)

In [12]:
# Dict views over `word_embeddings`: they follow the dict's insertion order
# and stay in sync with the dict if it is modified later.
word_embedding_keys = word_embeddings.keys()
word_embedding_values = word_embeddings.values()

In [13]:
# Map every token to its position in the key order of `word_embeddings`.
word_to_idx = dict(zip(word_embedding_keys, range(len(word_embedding_keys))))

## Char Embedding

In [14]:
# Parse the character-embedding text file: one row per line, space-separated,
# with the character symbol first and its vector components after it.
# (numpy is already imported in the notebook's first cell.)
with open("../word_embeddings/chars_embedding/char_mimick_glove_d100_c20", encoding='utf-8') as f:
    chars_embedding = f.readlines()

# Strip only the line terminator and skip blank lines, so that a stray empty
# line in the file cannot produce a ragged row in the array.
char_embeddings = np.array([
    line.rstrip("\n").split(" ")
    for line in chars_embedding
    if line.strip()
])
# Column 0 holds the symbols; the remaining columns hold the vector components.
chars = char_embeddings[:, 0]
chars_embeddings = char_embeddings[:, 1:].astype(np.float32)

In [15]:
# Wrap the parsed character vectors in an embedding lookup table.
# `torch.tensor` replaces the legacy `torch.FloatTensor` constructor and makes
# the dtype explicit; `from_pretrained` keeps the weights frozen by default.
embed_chars = torch.tensor(chars_embeddings, dtype=torch.float32)
embedding = nn.Embedding.from_pretrained(embed_chars)

# Sanity check: look up row 0 of the table.
embedding(torch.tensor([0], dtype=torch.long))

tensor([[ 0.4507,  0.2386, -0.1907,  0.4303,  0.2195,  0.2263,  0.3209,  0.0868,
         -0.0430,  0.4124,  0.1751,  0.3666,  0.3070, -0.3652, -0.1363,  0.4268,
         -0.0977, -0.2378, -0.2367, -0.4339]])

# Notes
- Beberapa dokumen tidak memiliki konteks kiri atau kanan.
- Rentang left dan right context berada di antara 0-79.

In [16]:
# Character symbols taken from the first column of the char-embedding file.
chars

array(['PAD', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',',
       '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
       '{', '|', '}', '~', '¡', '¢', '£', '¤', '¥', '¨', 'ª', '«', '·',
       '»', '¼', '½', '¾', '¿', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ',
       'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó',
       'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ', 'ā', 'ă',
       'ą', 'ć', 'č', 'ď', 'đ', 'ē', 'ė', 'ę', 'ě', 'ğ', 'ġ', 'ħ', 'ĩ',
       'ī', 'ı', 'ľ', 'ł', 'ń', 'ņ', 'ň', 'ŋ', 'ō', 'ŏ', 'ő', 'œ', 'ř',
       'ś', 'ş', 'š', 'ţ', 'ť', 'ũ', 'ū', 'ŭ', 'ů', 'ű', 'ŵ', 'ŷ', 'ź',
       'ż', 'ž', 'ƒ', 'ơ', 'ư', 'ǐ', 'ǒ', 'ǔ', 'ș', 'ț', 'ɔ', 'ə', 'ɛ',
       'ɣ', 'ʃ', 'ʔ', 'ʻ', 'ʿ', '˚', '́', '̇', '̍', 'ά', 'α',

In [31]:
# Load the padded left-context token sequences. The file is opened in a `with`
# block so the handle is closed after loading instead of being left open.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("../datasets/features/79_context/left_context_with_pad.pkl", "rb") as context_file:
    left_context = pickle.load(context_file)

# Preview only the first rows: every row is padded with '<PAD>' tokens, so
# displaying the whole list floods the notebook output.
left_context[:2]

[['Kera',
  'untuk',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>'],
 ['<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',

In [17]:
# Load the left-context index -> token mapping. The file is opened in a `with`
# block so the handle is closed after loading instead of being left open.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("../datasets/features/79_context/idx2token_left_context.pkl", "rb") as mapping_file:
    idx2token = pickle.load(mapping_file)

In [18]:
# Load the mapping stored in token2idx_left_context.pkl. The file is opened in
# a `with` block so the handle is closed after loading instead of being left open.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("../datasets/features/79_context/token2idx_left_context.pkl", "rb") as mapping_file:
    token2idx = pickle.load(mapping_file)

# idx2token

In [19]:
# Print the full mapping loaded from idx2token_left_context.pkl.
print(idx2token)

{0: '!', 1: '"', 2: '$', 3: '%', 4: '&', 5: "'", 6: '(', 7: ')', 8: '+', 9: ',', 10: '-', 11: '.', 12: '...', 13: '/', 14: '0', 15: '0,01', 16: '0,02', 17: '0,025', 18: '0,03', 19: '0,05', 20: '0,07', 21: '0,08', 22: '0,1', 23: '0,10', 24: '0,12', 25: '0,16', 26: '0,17', 27: '0,18', 28: '0,19', 29: '0,23', 30: '0,24', 31: '0,25', 32: '0,29', 33: '0,3', 34: '0,32', 35: '0,33', 36: '0,34', 37: '0,38', 38: '0,39', 39: '0,4', 40: '0,43', 41: '0,45', 42: '0,46', 43: '0,5', 44: '0,50', 45: '0,51', 46: '0,52', 47: '0,59', 48: '0,62', 49: '0,65', 50: '0,66', 51: '0,7', 52: '0,75', 53: '0,77', 54: '0,79', 55: '0,8', 56: '0,80', 57: '0,85', 58: '0,87', 59: '0,927', 60: '1', 61: '1,00', 62: '1,02', 63: '1,021', 64: '1,04', 65: '1,080', 66: '1,09', 67: '1,1', 68: '1,11', 69: '1,12', 70: '1,14', 71: '1,16', 72: '1,2', 73: '1,21', 74: '1,217', 75: '1,22', 76: '1,225', 77: '1,25', 78: '1,28', 79: '1,29', 80: '1,3', 81: '1,30', 82: '1,4', 83: '1,41', 84: '1,46', 85: '1,48', 86: '1,5', 87: '1,568', 88:

# token2idx

In [20]:
# NOTE(review): the recorded output of this cell has integer keys and token
# values, exactly like idx2token — confirm that token2idx_left_context.pkl
# really stores token -> index and was not written from the reverse mapping.
print(token2idx)

{0: '!', 1: '"', 2: '$', 3: '%', 4: '&', 5: "'", 6: '(', 7: ')', 8: '+', 9: ',', 10: '-', 11: '.', 12: '...', 13: '/', 14: '0', 15: '0,01', 16: '0,02', 17: '0,025', 18: '0,03', 19: '0,05', 20: '0,07', 21: '0,08', 22: '0,1', 23: '0,10', 24: '0,12', 25: '0,16', 26: '0,17', 27: '0,18', 28: '0,19', 29: '0,23', 30: '0,24', 31: '0,25', 32: '0,29', 33: '0,3', 34: '0,32', 35: '0,33', 36: '0,34', 37: '0,38', 38: '0,39', 39: '0,4', 40: '0,43', 41: '0,45', 42: '0,46', 43: '0,5', 44: '0,50', 45: '0,51', 46: '0,52', 47: '0,59', 48: '0,62', 49: '0,65', 50: '0,66', 51: '0,7', 52: '0,75', 53: '0,77', 54: '0,79', 55: '0,8', 56: '0,80', 57: '0,85', 58: '0,87', 59: '0,927', 60: '1', 61: '1,00', 62: '1,02', 63: '1,021', 64: '1,04', 65: '1,080', 66: '1,09', 67: '1,1', 68: '1,11', 69: '1,12', 70: '1,14', 71: '1,16', 72: '1,2', 73: '1,21', 74: '1,217', 75: '1,22', 76: '1,225', 77: '1,25', 78: '1,28', 79: '1,29', 80: '1,3', 81: '1,30', 82: '1,4', 83: '1,41', 84: '1,46', 85: '1,48', 86: '1,5', 87: '1,568', 88:

In [30]:
# Parallel lists of the mapping's keys and values, in the dict's iteration order.
idx2token_key = [*idx2token]
idx2token_val = [*idx2token.values()]