In [1]:
import os
import re
import tensorflow as tf
import numpy as np
import json
import random

In [2]:
labels = os.listdir('news')
news = ['news/' + i for i in labels if '.json' in i]
labels = [i.replace('.json','') for i in labels]
len(news)

123

In [3]:
import malaya
tokenizer = malaya.preprocessing._SocialTokenizer().tokenize
split_sentence = malaya.texts._text_functions.split_into_sentences

In [4]:
accept_tokens = ',-.()"\''

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    splitted = split_sentence(string)
    for i, string in enumerate(splitted):
        tokenized = tokenizer(string)
        tokenized = [w.lower() for w in tokenized if len(w) > 1]
        tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
        tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
        splitted[i] = tokenized
    return splitted

In [5]:
min_len = 20
x = []
for no, n in enumerate(news):
    with open(n) as fopen: 
        news_ = json.load(fopen)
    for row in news_:
        if len(row['text'].split()) > min_len:
            p = preprocessing(row['text'])
            x.extend(p)
            
len(x)

263638

In [6]:
x = random.sample(x, 10000)

In [7]:
import collections

def batch_sequence(sentences, dictionary, maxlen = 50):
    np_array = np.zeros((len(sentences), maxlen), dtype = np.int32)
    for no_sentence, sentence in enumerate(sentences):
        current_no = 0
        for no, word in enumerate(sentence[: maxlen - 2]):
            np_array[no_sentence, no] = dictionary.get(word, 1)
            current_no = no
        np_array[no_sentence, current_no + 1] = 3
    return np_array

def build_dataset(words, n_words, atleast=2):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    counter = [i for i in counter if i[1] >= atleast]
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [8]:
import itertools

X = list(itertools.chain(*x))

In [9]:
maxlen = 50
vocabulary_size = len(set(X))
embedding_size = 256
learning_rate = 1e-3
batch_size = 16
vocabulary_size

24667

In [10]:
from sklearn.utils import shuffle

stride = 1
t_range = int((len(x) - 3) / stride + 1)
left, middle, right = [], [], []
for i in range(t_range):
    slices = x[i * stride : i * stride + 3]
    left.append(slices[0])
    middle.append(slices[1])
    right.append(slices[2])

left, middle, right = shuffle(left, middle, right)
len(left), len(middle), len(right)

(9998, 9998, 9998)

In [11]:
concat = X
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])
print('filtered vocab size:',len(dictionary))
print("% of vocab used: {}%".format(round(len(dictionary)/vocabulary_size,4)*100))

vocab from size: 24667
Most common words [('yang', 3919), ('the', 3820), ('dan', 3525), ('<NUM>', 2603), ('di', 2357), ('ini', 1876)]
Sample data [2292, 173, 1674, 2485, 12, 1859, 337, 0, 5462, 356] ['meanwhile', 'sabah', 'tourism', 'culture', 'and', 'environment', 'minister', 'PAD', 'liew', 'when']
filtered vocab size: 12246
% of vocab used: 49.65%


In [12]:
class Model:
    def __init__(self,maxlen=50, 
                 vocabulary_size=20000,
                 learning_rate=1e-3,
                 embedding_size = 256):
        self.output_size = embedding_size
        self.maxlen = maxlen
        word_embeddings = tf.Variable(
            tf.random_uniform(
                [vocabulary_size, embedding_size], -np.sqrt(3), np.sqrt(3)
            )
        )
        self.global_step = tf.get_variable(
            "global_step", shape=[], trainable=False,
            initializer=tf.initializers.zeros())
        self.embeddings = word_embeddings
        self.output_layer = tf.layers.Dense(vocabulary_size, name="output_layer")
        self.output_layer.build(self.output_size)
        
        self.BEFORE = tf.placeholder(tf.int32,[None,maxlen])
        self.INPUT = tf.placeholder(tf.int32,[None,maxlen])
        self.AFTER = tf.placeholder(tf.int32,[None,maxlen])
        self.batch_size = tf.shape(self.INPUT)[0]
        
        self.get_thought = self.thought(self.INPUT)
        self.attention = tf.matmul(
            self.get_thought, tf.transpose(self.embeddings), name = 'attention'
        )
        fw_logits = self.decoder(self.get_thought, self.AFTER)
        bw_logits = self.decoder(self.get_thought, self.BEFORE)
        self.loss = self.calculate_loss(fw_logits, self.AFTER) + self.calculate_loss(bw_logits, self.BEFORE)
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

    def get_embedding(self, inputs):
        return tf.nn.embedding_lookup(self.embeddings, inputs)
        
    def thought(self, inputs):
        encoder_in = self.get_embedding(inputs)
        fw_cell = tf.nn.rnn_cell.GRUCell(self.output_size)
        bw_cell = tf.nn.rnn_cell.GRUCell(self.output_size)
        sequence_length = tf.reduce_sum(tf.sign(inputs), axis=1)
        rnn_output = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, encoder_in, sequence_length=sequence_length,
            dtype=tf.float32)[1]
        return sum(rnn_output)
        
    def decoder(self, thought, labels):
        main = tf.strided_slice(labels, [0, 0], [self.batch_size, -1], [1, 1])
        shifted_labels = tf.concat([tf.fill([self.batch_size, 1], 2), main], 1)
        decoder_in = self.get_embedding(shifted_labels)
        cell = tf.nn.rnn_cell.GRUCell(self.output_size)
        max_seq_lengths = tf.fill([self.batch_size], self.maxlen)
        helper = tf.contrib.seq2seq.TrainingHelper(
            decoder_in, max_seq_lengths, time_major = False
        )
        decoder = tf.contrib.seq2seq.BasicDecoder(cell, helper, thought)
        decoder_out = tf.contrib.seq2seq.dynamic_decode(decoder)[0].rnn_output
        return decoder_out
        
    def calculate_loss(self, outputs, labels):
        mask = tf.cast(tf.sign(labels), tf.float32)
        logits = self.output_layer(outputs)
        return tf.contrib.seq2seq.sequence_loss(logits, labels, mask)

In [13]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(vocabulary_size = len(dictionary), embedding_size = embedding_size)
sess.run(tf.global_variables_initializer())

In [14]:
from tqdm import tqdm

for i in range(10):
    pbar = tqdm(range(0, len(middle), batch_size), desc='train minibatch loop')
    for p in pbar:
        index = min(p + batch_size, len(middle))
        batch_x = batch_sequence(
                middle[p : index],
                dictionary,
                maxlen = maxlen,
        )
        batch_y_before = batch_sequence(
                left[p : index],
                dictionary,
                maxlen = maxlen,
        )
        batch_y_after = batch_sequence(
                right[p : index],
                dictionary,
                maxlen = maxlen,
        )
        loss, _ = sess.run([model.loss, model.optimizer], 
                           feed_dict = {model.BEFORE: batch_y_before,
                                        model.INPUT: batch_x,
                                        model.AFTER: batch_y_after,})
        pbar.set_postfix(cost=loss)

train minibatch loop: 100%|██████████| 625/625 [01:59<00:00,  5.37it/s, cost=12.7]
train minibatch loop: 100%|██████████| 625/625 [01:58<00:00,  5.37it/s, cost=11.1]
train minibatch loop: 100%|██████████| 625/625 [01:58<00:00,  5.32it/s, cost=9.72]
train minibatch loop: 100%|██████████| 625/625 [01:59<00:00,  5.38it/s, cost=8.56]
train minibatch loop: 100%|██████████| 625/625 [01:58<00:00,  5.39it/s, cost=7.55]
train minibatch loop: 100%|██████████| 625/625 [01:59<00:00,  5.39it/s, cost=6.65]
train minibatch loop: 100%|██████████| 625/625 [01:59<00:00,  5.44it/s, cost=5.95]
train minibatch loop: 100%|██████████| 625/625 [01:59<00:00,  5.34it/s, cost=5.34]
train minibatch loop: 100%|██████████| 625/625 [01:59<00:00,  5.34it/s, cost=4.85]
train minibatch loop: 100%|██████████| 625/625 [01:58<00:00,  5.40it/s, cost=4.36]


In [17]:
test = random.sample(x, 100)

sequences = batch_sequence(test, dictionary, maxlen = maxlen)
encoded, attention = sess.run([model.get_thought, model.attention],feed_dict={model.INPUT:sequences})

In [23]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans = kmeans.fit(encoded)
avg = []
closest = []
for j in range(n_clusters):
    idx = np.where(kmeans.labels_ == j)[0]
    avg.append(np.mean(idx))
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,encoded)
ordering = sorted(range(n_clusters), key=lambda k: avg[k])
sentences = [test[closest[idx]] for idx in ordering]

In [26]:
sentences = [' '.join(s) for s in sentences]
'. '.join(sentences)

'kita sudah banyak pegawai tadbir diplomatik dalam sektor awam dan syarikat syarikat besar kepunyaan kerajaan juga perlu bersaing dengan syarikat gergasi antarabangsa. dalam tempoh sama mcmc juga melaksanakan tindakan sekatan terhadap <NUM> laman sesawang portal dan blog yang menyebarkan kandungan atau berita palsu. gambas instagram bahkan ia menambahkan konservasi alam di bali pun erat kaitannya dengan budaya. begitu juga bn. sikap keterbukaan dan faham memahami amat diperlukan di antara umat islam dan bukan islam bagi menjamin keharmonian kaum. sesungguhnya yang demikian itu mengandungi tanda tanda membuktikan kekuasaan allah bagi kaum yang berfikir untuk memahaminya. khoo added that it was difficult for the police to take any action because these are civil cases between tnb and the property owners. usaha ini boleh menambah pendapatan penduduk jika ia dikendalikan dengan penuh minat sebelum ia mengeluarkan hasil nanti katanya. sebagai contoh insentif untuk meningkatkan perbelanjaan p

In [27]:
indices = np.argsort(attention.mean(axis=0))[::-1]
rev_dictionary = {v:k for k, v in dictionary.items()}
[rev_dictionary[i] for i in indices[:10]]

['garden',
 'ditanggung',
 'majesty',
 'maritime',
 'himpunan',
 'statik',
 'mbm',
 'permukaan',
 'trial',
 'pass']