In [1]:
import os
import re
import tensorflow as tf
import numpy as np
import json
import random

In [2]:
labels = os.listdir('news')
news = ['news/' + i for i in labels if '.json' in i]
labels = [i.replace('.json','') for i in labels]
len(news)

123

In [3]:
import malaya
tokenizer = malaya.preprocessing._SocialTokenizer().tokenize
split_sentence = malaya.texts._text_functions.split_into_sentences

In [4]:
accept_tokens = ',-.()"\''

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    splitted = split_sentence(string)
    for i, string in enumerate(splitted):
        tokenized = tokenizer(string)
        tokenized = [w.lower() for w in tokenized if len(w) > 1]
        tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
        tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
        splitted[i] = tokenized
    return splitted

In [5]:
min_len = 20
x = []
for no, n in enumerate(news):
    with open(n) as fopen: 
        news_ = json.load(fopen)
    for row in news_:
        if len(row['text'].split()) > min_len:
            p = preprocessing(row['text'])
            x.extend(p)
            
len(x)

263638

In [6]:
x = random.sample(x, 10000)

In [7]:
import collections

def batch_sequence(sentences, dictionary, maxlen = 50):
    np_array = np.zeros((len(sentences), maxlen), dtype = np.int32)
    for no_sentence, sentence in enumerate(sentences):
        current_no = 0
        for no, word in enumerate(sentence[: maxlen - 2]):
            np_array[no_sentence, no] = dictionary.get(word, 1)
            current_no = no
        np_array[no_sentence, current_no + 1] = 3
    return np_array

def build_dataset(words, n_words, atleast=2):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    counter = [i for i in counter if i[1] >= atleast]
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [8]:
import itertools

X = list(itertools.chain(*x))

In [9]:
maxlen = 50
vocabulary_size = len(set(X))
embedding_size = 256
learning_rate = 1e-3
batch_size = 16
vocabulary_size

24384

In [10]:
from sklearn.utils import shuffle

stride = 1
t_range = int((len(x) - 3) / stride + 1)
left, middle, right = [], [], []
for i in range(t_range):
    slices = x[i * stride : i * stride + 3]
    left.append(slices[0])
    middle.append(slices[1])
    right.append(slices[2])

left, middle, right = shuffle(left, middle, right)
len(left), len(middle), len(right)

(9998, 9998, 9998)

In [11]:
concat = X
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])
print('filtered vocab size:',len(dictionary))
print("% of vocab used: {}%".format(round(len(dictionary)/vocabulary_size,4)*100))

vocab from size: 24384
Most common words [('the', 3997), ('yang', 3660), ('dan', 3369), ('<NUM>', 2534), ('di', 2310), ('to', 1847)]
Sample data [14, 272, 138, 7, 7, 32, 4038, 532, 3178, 21] ['dalam', 'laporan', 'ekonomi', '<NUM>', '<NUM>', 'kerajaan', 'menjangkakan', 'pertumbuhan', 'kdnk', 'malaysia']
filtered vocab size: 12188
% of vocab used: 49.980000000000004%


In [12]:
class Attention:
    def __init__(self,hidden_size):
        self.hidden_size = hidden_size
        self.dense_layer = tf.layers.Dense(hidden_size)
        self.v = tf.random_normal([hidden_size],mean=0,stddev=1/np.sqrt(hidden_size))
        
    def score(self, hidden_tensor, encoder_outputs):
        energy = tf.nn.tanh(self.dense_layer(tf.concat([hidden_tensor,encoder_outputs],2)))
        energy = tf.transpose(energy,[0,2,1])
        batch_size = tf.shape(encoder_outputs)[0]
        v = tf.expand_dims(tf.tile(tf.expand_dims(self.v,0),[batch_size,1]),1)
        energy = tf.matmul(v,energy)
        return tf.squeeze(energy,1)
    
    def __call__(self, hidden, encoder_outputs):
        seq_len = tf.shape(encoder_outputs)[1]
        batch_size = tf.shape(encoder_outputs)[0]
        H = tf.tile(tf.expand_dims(hidden, 1),[1,seq_len,1])
        attn_energies = self.score(H,encoder_outputs)
        return tf.expand_dims(tf.nn.softmax(attn_energies),1)
    
class Model:
    def __init__(
        self,
        dict_size,
        size_layers,
        learning_rate,
        maxlen,
        num_blocks = 3,
    ):
        block_size = size_layers
        self.BEFORE = tf.placeholder(tf.int32,[None,maxlen])
        self.INPUT = tf.placeholder(tf.int32,[None,maxlen])
        self.AFTER = tf.placeholder(tf.int32,[None,maxlen])
        self.batch_size = tf.shape(self.INPUT)[0]
        self.output_layer = tf.layers.Dense(dict_size, name="output_layer")
        self.output_layer.build(size_layers)
        self.embeddings = tf.Variable(tf.random_uniform([dict_size, size_layers], -1, 1))
        embedded = tf.nn.embedding_lookup(self.embeddings, self.INPUT)
        self.attention = Attention(size_layers)

        def residual_block(x, size, rate, block, reuse = False):
            with tf.variable_scope(
                'block_%d_%d' % (block, rate), reuse = reuse
            ):
                attn_weights = self.attention(tf.reduce_sum(x,axis=1), x)
                conv_filter = tf.layers.conv1d(
                    attn_weights,
                    x.shape[2] // 4,
                    kernel_size = size,
                    strides = 1,
                    padding = 'same',
                    dilation_rate = rate,
                    activation = tf.nn.tanh,
                )
                conv_gate = tf.layers.conv1d(
                    x,
                    x.shape[2] // 4,
                    kernel_size = size,
                    strides = 1,
                    padding = 'same',
                    dilation_rate = rate,
                    activation = tf.nn.sigmoid,
                )
                out = tf.multiply(conv_filter, conv_gate)
                out = tf.layers.conv1d(
                    out,
                    block_size,
                    kernel_size = 1,
                    strides = 1,
                    padding = 'same',
                    activation = tf.nn.tanh,
                )
                return tf.add(x, out), out

        forward = tf.layers.conv1d(
            embedded, block_size, kernel_size = 1, strides = 1, padding = 'SAME'
        )
        zeros = tf.zeros_like(forward)
        for i in range(num_blocks):
            for r in [1, 2, 4, 8, 16]:
                forward, s = residual_block(
                    forward, size = 7, rate = r, block = i
                )
                zeros = tf.add(zeros, s)
        forward = tf.layers.conv1d(
            zeros,
            block_size,
            kernel_size = 1,
            strides = 1,
            padding = 'SAME',
            activation = tf.nn.tanh,
        )
        self.get_thought = tf.reduce_sum(forward,axis=1, name = 'logits')
        
        def decoder(labels, reuse):
            decoder_in = tf.nn.embedding_lookup(self.embeddings, labels)
            forward = tf.layers.conv1d(
                decoder_in, block_size, kernel_size = 1, strides = 1, padding = 'SAME'
            )
            zeros = tf.zeros_like(forward)
            for r in [8, 16, 24]:
                forward, s = residual_block(forward, size = 7, rate = r, block = 10, reuse = reuse)
                zeros = tf.add(zeros, s)
            return tf.layers.conv1d(
                zeros,
                block_size,
                kernel_size = 1,
                strides = 1,
                padding = 'SAME',
                activation = tf.nn.tanh,
            )
        
        fw_logits = decoder(self.AFTER, False)
        bw_logits = decoder(self.BEFORE, True)
        self.attention = tf.matmul(
            self.get_thought, tf.transpose(self.embeddings), name = 'attention'
        )
        self.loss = self.calculate_loss(fw_logits, self.AFTER) + self.calculate_loss(bw_logits, self.BEFORE)
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
    
    def calculate_loss(self, outputs, labels):
        mask = tf.cast(tf.sign(labels), tf.float32)
        logits = self.output_layer(outputs)
        return tf.contrib.seq2seq.sequence_loss(logits, labels, mask)

In [13]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(len(dictionary), embedding_size, learning_rate, maxlen)
sess.run(tf.global_variables_initializer())

In [14]:
from tqdm import tqdm

for i in range(10):
    pbar = tqdm(range(0, len(middle), batch_size), desc='train minibatch loop')
    for p in pbar:
        index = min(p + batch_size, len(middle))
        batch_x = batch_sequence(
                middle[p : index],
                dictionary,
                maxlen = maxlen,
        )
        batch_y_before = batch_sequence(
                left[p : index],
                dictionary,
                maxlen = maxlen,
        )
        batch_y_after = batch_sequence(
                right[p : index],
                dictionary,
                maxlen = maxlen,
        )
        loss, _ = sess.run([model.loss, model.optimizer], 
                           feed_dict = {model.BEFORE: batch_y_before,
                                        model.INPUT: batch_x,
                                        model.AFTER: batch_y_after,})
        pbar.set_postfix(cost=loss)

train minibatch loop: 100%|██████████| 625/625 [00:36<00:00, 13.49it/s, cost=9.81]
train minibatch loop: 100%|██████████| 625/625 [00:35<00:00, 17.48it/s, cost=7.15]
train minibatch loop: 100%|██████████| 625/625 [00:36<00:00, 17.30it/s, cost=5.54]
train minibatch loop: 100%|██████████| 625/625 [00:35<00:00, 17.39it/s, cost=4.42]
train minibatch loop: 100%|██████████| 625/625 [00:36<00:00, 17.25it/s, cost=3.6] 
train minibatch loop: 100%|██████████| 625/625 [00:35<00:00, 17.36it/s, cost=2.95]
train minibatch loop: 100%|██████████| 625/625 [00:36<00:00, 17.22it/s, cost=2.38]
train minibatch loop: 100%|██████████| 625/625 [00:35<00:00, 17.36it/s, cost=1.87]
train minibatch loop: 100%|██████████| 625/625 [00:36<00:00, 17.25it/s, cost=1.59]
train minibatch loop: 100%|██████████| 625/625 [00:36<00:00, 17.35it/s, cost=1.36]


In [15]:
test = random.sample(x, 100)

sequences = batch_sequence(test, dictionary, maxlen = maxlen)
encoded, attention = sess.run([model.get_thought, model.attention],feed_dict={model.INPUT:sequences})

In [16]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans = kmeans.fit(encoded)
avg = []
closest = []
for j in range(n_clusters):
    idx = np.where(kmeans.labels_ == j)[0]
    avg.append(np.mean(idx))
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,encoded)
ordering = sorted(range(n_clusters), key=lambda k: avg[k])
sentences = [test[closest[idx]] for idx in ordering]

In [17]:
sentences = [' '.join(s) for s in sentences]
'. '.join(sentences)

'jelasnya malaysia mempunyai kemudahan lengkap yang boleh ditawarkan sebagai venue kejohanan itu antaranya arena axiata bukit jalil dan stadium malawati shah alam malah sukan gimnastik estetik lebih mudah dikendalikan kerana kurang menggunakan peralatan. ini penyelesaian jangka pendek yang lebih praktikal dalam memastikan semua isu berkaitan kebajikan dan perlindungan kanak kanak dapat dipantau serta memastikan agar agensi atau badan badan berkaitan memikul tanggungjawab masing masing katanya sewaktu sesi soal jawab lisan di dewan rakyat hari ini. presiden persatuan bola sepak malaysia fam tunku ismail sultan ibrahim berkata semua jurulatih terlibat tan cheng hoe harimau malaya datuk ong kim swee <NUM> bojan hodak <NUM> dan lim teong kim ppbn perlu mengikut perancangan dan strategi taktikal yang dirangka pengarah teknikal fam peter de roo. four people have been charged with multiple counts of money laundering in connection to the case and are now out on bail pending trial. saya perlu b

In [18]:
indices = np.argsort(attention.mean(axis=0))[::-1]
rev_dictionary = {v:k for k, v in dictionary.items()}
[rev_dictionary[i] for i in indices[:10]]

['mendukung',
 'mind',
 'interest',
 'introduced',
 'paul',
 'evolusi',
 'entire',
 'sejujurnya',
 'ilmuwan',
 'barangkali']