In [1]:
import json
import glob
import malaya
from unidecode import unidecode
import re

In [2]:
# Bound `tokenize` method of a fresh malaya social tokenizer; used by
# `preprocessing` below to split a raw sentence into tokens.
# NOTE(review): `_SocialTokenizer` is a private malaya name (leading
# underscore), so it may move between malaya releases -- confirm the version.
tokenizer = malaya.preprocessing._SocialTokenizer().tokenize
# NOTE(review): `rules_normalizer` is not referenced by any of the cells
# visible in this notebook; it also comes from a private malaya module.
rules_normalizer = malaya.texts._tatabahasa.rules_normalizer

def is_number_regex(s):
    """Return True when `s` is an unsigned integer or decimal number.

    Accepted forms are a run of digits ("123") or digits, a single dot and
    more digits ("3.14"). Signs, exponents, thousands separators and
    surrounding whitespace are rejected.

    Args:
        s: the token to test (a string).

    Returns:
        bool: True if the whole token is numeric, False otherwise.
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    # fullmatch: `re.match` with a trailing "$" also accepts a final newline,
    # so "1.5\n" used to be reported as a number.
    if re.fullmatch(r"\d+\.\d+", s) is not None:
        return True
    return s.isdigit()

def detect_money(word):
    """Tell whether `word` is a ringgit amount such as 'rm10' or 'rm2.50'.

    The token must start with the lowercase prefix 'rm' and the remainder
    must be accepted by `is_number_regex`.

    Returns:
        bool: True for a money token, False otherwise.
    """
    return word[:2] == 'rm' and is_number_regex(word[2:])

def preprocessing(string):
    """Tokenize a raw sentence and normalise its tokens.

    The sentence is passed through `unidecode`, split with the module-level
    `tokenizer`, and then each token is handled as follows:

    * tokens of length 0 or 1 are dropped;
    * the rest are lowercased;
    * numeric tokens become '<NUM>';
    * 'rm'-prefixed amounts become '<MONEY>'.

    Returns:
        list[str]: the cleaned tokens, in their original order.
    """
    cleaned = []
    for token in tokenizer(unidecode(string)):
        if len(token) <= 1:
            continue
        token = token.lower()
        if is_number_regex(token):
            cleaned.append('<NUM>')
        elif detect_money(token):
            cleaned.append('<MONEY>')
        else:
            cleaned.append(token)
    return cleaned

In [3]:
# Parallel corpus: english[i] and bahasa[i] are the two sides of one pair.
english, bahasa = [], []

# glob returns paths in arbitrary, filesystem-dependent order. Sorting makes
# the pair order (and therefore the later "first 100k" slice) reproducible.
files = sorted(glob.glob('Malaya-Dataset/english-malay/*.json'))
for file in files:
    # Explicit UTF-8: the default text encoding of open() is platform
    # dependent, while JSON files are UTF-8 by convention.
    with open(file, encoding='utf-8') as fopen:
        pairs = json.load(fopen)
    # Each entry is unpacked as a two-item (english, malay) pair.
    for english_text, malay_text in pairs:
        english.append(english_text)
        bahasa.append(malay_text)

In [4]:
# Sanity check: both sides were appended pairwise, so the two counts must match.
len(english), len(bahasa)

(332964, 332964)

In [5]:
from tqdm import tqdm

# Tokenize every sentence pair and keep only those where both sides have more
# than 3 tokens and the English side has at most 100 tokens.
x, y = [], []
for idx in tqdm(range(len(english))):
    source_tokens = preprocessing(english[idx])
    target_tokens = preprocessing(bahasa[idx])
    too_short = len(source_tokens) <= 3 or len(target_tokens) <= 3
    if too_short or len(source_tokens) > 100:
        continue
    x.append(source_tokens)
    y.append(target_tokens)

100%|██████████| 332964/332964 [01:37<00:00, 3412.79it/s]


In [6]:
# Number of pairs that survived the length filter; x and y are appended
# together, so the two counts must match.
len(x), len(y)

(321749, 321749)

## Limit to the first 100k sentence pairs only; the full dataset is too big

In [7]:
# Keep only the first 100,000 filtered pairs. Note that this rebinds `english`
# and `bahasa` from the raw sentences to the tokenized lists built above.
english, bahasa = x[:100000], y[:100000]

In [8]:
import collections

def build_dataset(words, n_words, atleast=1):
    """Build a vocabulary from `words` and encode the words as integer ids.

    Ids 0-3 are reserved for 'PAD', 'GO', 'EOS' and 'UNK'. The remaining ids
    are assigned to the `n_words` most frequent words, most frequent first,
    skipping any word that occurs fewer than `atleast` times.

    Args:
        words: re-iterable sequence of tokens (it is traversed more than once).
        n_words: maximum number of distinct corpus words to keep
            (None keeps all of them).
        atleast: minimum frequency a word needs to enter the vocabulary.

    Returns:
        tuple: (data, count, dictionary, reversed_dictionary) where
            data is the list of ids, one per item of `words`;
            count is the list of [token, n] / (token, n) entries, reserved
            tokens first; the 'UNK' entry carries the number of
            out-of-vocabulary tokens found in `words`;
            dictionary maps token -> id;
            reversed_dictionary maps id -> token.
    """
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    frequent = collections.Counter(words).most_common(n_words)
    count.extend(pair for pair in frequent if pair[1] >= atleast)

    dictionary = {}
    for word, _ in count:
        # setdefault keeps the reserved id when the corpus itself contains a
        # token spelled like a reserved one; plain assignment would overwrite
        # it and make the next word share the same index.
        dictionary.setdefault(word, len(dictionary))

    # Out-of-vocabulary words must map to UNK, not to PAD (id 0).
    unk_id = dictionary['UNK']
    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word)
        if index is None:
            index = unk_id
            unk_count += 1
        data.append(index)
    # Record the OOV tally on the UNK entry rather than on PAD.
    count[unk_id][1] = unk_count

    reversed_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

In [9]:
import itertools

# Flatten the tokenized English sentences into one token stream and build the
# English vocabulary from every distinct token.
concat = list(itertools.chain.from_iterable(english))
vocabulary_size_english = len(set(concat))
data, count, dictionary_english, rev_dictionary_english = build_dataset(
    concat, vocabulary_size_english
)
sample_ids = data[:10]
print('vocab from size: %d' % vocabulary_size_english)
# Entries 0-3 of `count` are the reserved tokens, so real words start at 4.
print('Most common words', count[4:10])
print('Sample data', sample_ids, [rev_dictionary_english[token_id] for token_id in sample_ids])

vocab from size: 35547
Most common words [('the', 132997), ('and', 67192), ('of', 67190), ('to', 65903), ('in', 44747), ('that', 41188)]
Sample data [12, 10, 8, 54, 592, 9, 30, 6, 66, 17] ['it', 'is', 'in', 'their', 'interest', 'that', 'all', 'of', 'us', 'are']


In [10]:
# Flatten the tokenized Malay sentences into one token stream and build the
# Malay vocabulary from every distinct token.
concat = list(itertools.chain.from_iterable(bahasa))
vocabulary_size_bahasa = len(set(concat))
data, count, dictionary_bahasa, rev_dictionary_bahasa = build_dataset(
    concat, vocabulary_size_bahasa
)
sample_ids = data[:10]
print('vocab from size: %d' % vocabulary_size_bahasa)
# Entries 0-3 of `count` are the reserved tokens, so real words start at 4.
print('Most common words', count[4:10])
print('Sample data', sample_ids, [rev_dictionary_bahasa[token_id] for token_id in sample_ids])

vocab from size: 27434
Most common words [('yang', 78745), ('dan', 67280), ('untuk', 37803), ('saya', 37802), ('ini', 31480), ('di', 31300)]
Sample data [20, 15, 298, 16, 17, 10, 40, 15, 10247, 10] ['ia', 'adalah', 'kepentingan', 'mereka', 'bahawa', 'kita', 'semua', 'adalah', 'idiot', 'kita']


In [11]:
# Persist both vocabularies in one file.
# NOTE: the rev_dictionary maps use integer ids as keys; JSON object keys are
# always strings, so they come back as '0', '1', ... when the file is loaded.
vocab_payload = {
    'english': {
        'dictionary': dictionary_english,
        'rev_dictionary': rev_dictionary_english,
    },
    'bahasa': {
        'dictionary': dictionary_bahasa,
        'rev_dictionary': rev_dictionary_bahasa,
    },
}
with open('dictionary.json', 'w') as fout:
    json.dump(vocab_payload, fout)

In [12]:
# Persist the truncated, tokenized corpus as a two-item list:
# [english sentences, malay sentences], aligned by position.
parallel_corpus = [english, bahasa]
with open('english-malay.json', 'w') as fout:
    json.dump(parallel_corpus, fout)