In [1]:
import malaya
# Display the version string of the installed malaya package, so the notebook
# records which release produced the outputs below.
malaya.bump_version

'2.3.4'

In [2]:
# Short aliases for two private malaya text helpers used by the cells below:
# `text_split` is applied to raw article text, `text_cleaning` to each sentence.
text_split, text_cleaning = (
    malaya.texts._text_functions.split_into_sentences,
    malaya.texts._text_functions.summary_textcleaning,
)

In [3]:
import json

# News dumps to load; each file is expected to hold a JSON list of articles
# that carry their body under the 'text' key.
files = ['politics.json', 'education.json', 'economy.json', 'business.json']
sentences = []
for path in files:
    # Pin the encoding: JSON text is UTF-8, and relying on the platform default
    # can mis-decode non-ASCII characters on systems with another locale.
    with open(path, encoding = 'utf-8') as fopen:
        news = json.load(fopen)
    for article in news:
        body = article['text']
        # Skip very short bodies (50 characters or fewer).
        if len(body) > 50:
            sentences.extend(text_split(body))

# Cell output: number of sentences collected across all files.
len(sentences)

11258

In [4]:
# Replace every sentence with the second element returned by `text_cleaning`.
# NOTE: this rebinds `sentences`, so running the cell twice cleans the text
# twice; later cells read the cleaned list under the same name.
sentences = list(map(lambda raw: text_cleaning(raw)[1], sentences))

In [5]:
# Hyper-parameters consumed by the cells below.
embedding_size = 128  # width of the word / topic embedding vectors
n_topics = 10         # number of topics (columns of the document weights)
window_size = 4       # skip-gram context radius, in positions on each side
epoch = 5             # number of passes over the training pairs
switch_loss = 2       # global-step threshold from which the prior term joins the cost

In [6]:
class LDA2VEC:
    """lda2vec-style model built as a TensorFlow 1.x graph.

    A pivot-word embedding is added to a document vector (the document's
    topic weights multiplied by the topic embedding matrix) and the sum is
    trained to predict the target word through an NCE loss. A Dirichlet-style
    prior over the softmaxed document weights is added to the cost once the
    global step reaches ``switch_loss``.
    """

    def __init__(
        self,
        num_unique_documents,
        vocab_size,
        num_topics,
        freqs,
        embedding_size = 128,
        num_sampled = 40,
        learning_rate = 1e-3,
        lmbda = 150.0,
        alpha = None,
        power = 0.75,
        batch_size = 32,
        clip_gradients = 5.0,
        **kwargs
    ):
        """Build the graph, open a session and initialise all variables.

        Parameters
        ----------
        num_unique_documents : int
            Number of rows of the document-topic weight matrix.
        vocab_size : int
            Number of rows of the word embedding and NCE weight matrices.
        num_topics : int
            Number of topics (columns of the document weights).
        freqs : list
            Per-word unigram counts handed to the negative sampler.
        embedding_size : int
            Width of the word and topic embeddings.
        num_sampled : int
            Number of negative classes drawn per batch.
        learning_rate : float
            Learning rate passed to the Adam optimizer.
        lmbda : float
            Multiplier applied to the prior term.
        alpha : float or None
            Dirichlet concentration; defaults to ``1 / num_topics``.
        power : float
            Distortion applied to the unigram counts by the sampler.
        batch_size : int
            Minibatch size used by ``train``.
        clip_gradients : float
            Gradient clipping value passed to ``optimize_loss``.
        **kwargs
            Accepted for call compatibility and ignored.
        """
        moving_avgs = tf.train.ExponentialMovingAverage(0.9)
        self.batch_size = batch_size
        self.freqs = freqs
        self.sess = tf.InteractiveSession()
        # Pivot word ids, target word ids and document ids of one minibatch.
        self.X = tf.placeholder(tf.int32, shape = [None])
        self.Y = tf.placeholder(tf.int64, shape = [None])
        self.DOC = tf.placeholder(tf.int32, shape = [None])
        step = tf.Variable(0, trainable = False, name = 'global_step')
        # Global-step threshold below which only the word2vec loss is used;
        # the value is assigned in `train`.
        self.switch_loss = tf.Variable(0, trainable = False)
        train_labels = tf.reshape(self.Y, [-1, 1])
        sampler = tf.nn.fixed_unigram_candidate_sampler(
            train_labels,
            num_true = 1,
            num_sampled = num_sampled,
            unique = True,
            range_max = vocab_size,
            distortion = power,
            unigrams = self.freqs,
        )

        self.word_embedding = tf.Variable(
            tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0)
        )
        self.nce_weights = tf.Variable(
            tf.truncated_normal(
                [vocab_size, embedding_size],
                stddev = tf.sqrt(1 / embedding_size),
            )
        )
        self.nce_biases = tf.Variable(tf.zeros([vocab_size]))
        scalar = 1 / np.sqrt(num_unique_documents + num_topics)
        # Unnormalised document-topic weights; softmaxed only inside the prior.
        self.doc_embedding = tf.Variable(
            tf.random_normal(
                [num_unique_documents, num_topics],
                mean = 0,
                stddev = 50 * scalar,
            )
        )
        self.topic_embedding = tf.get_variable(
            'topic_embedding',
            shape = [num_topics, embedding_size],
            dtype = tf.float32,
            initializer = tf.orthogonal_initializer(gain = scalar),
        )
        pivot = tf.nn.embedding_lookup(self.word_embedding, self.X)
        proportions = tf.nn.embedding_lookup(self.doc_embedding, self.DOC)
        # Document vector = raw topic weights projected into embedding space.
        doc = tf.matmul(proportions, self.topic_embedding)
        context = tf.add(pivot, doc)
        loss_word2vec = tf.reduce_mean(
            tf.nn.nce_loss(
                weights = self.nce_weights,
                biases = self.nce_biases,
                # nce_loss documents labels as [batch_size, num_true]; use the
                # same reshaped tensor that was given to the sampler.
                labels = train_labels,
                inputs = context,
                num_sampled = num_sampled,
                num_classes = vocab_size,
                num_true = 1,
                sampled_values = sampler,
            )
        )
        # Batch fraction that scales the prior; the value is assigned in `train`.
        self.fraction = tf.Variable(1, trainable = False, dtype = tf.float32)

        log_proportions = tf.nn.log_softmax(self.doc_embedding)
        if alpha is None:
            alpha = 1.0 / num_topics
        # Negated Dirichlet log-likelihood term summed over every document and
        # topic; with alpha < 1 this term is negative, so the total cost can
        # go below zero.
        prior = tf.reduce_sum(-(alpha - 1) * log_proportions)

        loss_lda = lmbda * self.fraction * prior
        self.cost = tf.cond(
            step < self.switch_loss,
            lambda: loss_word2vec,
            lambda: loss_word2vec + loss_lda,
        )
        loss_avgs_op = moving_avgs.apply([loss_lda, loss_word2vec, self.cost])
        with tf.control_dependencies([loss_avgs_op]):
            # Hand over the step variable itself instead of re-discovering it
            # by name through tf.train.get_global_step().
            self.optimizer = tf.contrib.layers.optimize_loss(
                self.cost,
                step,
                learning_rate,
                'Adam',
                clip_gradients = clip_gradients,
            )
        self.sess.run(tf.global_variables_initializer())

    def train(
        self, pivot_words, target_words, doc_ids, num_epochs, switch_loss = 3
    ):
        """Run minibatch training over aligned (pivot, target, doc) triples.

        Parameters
        ----------
        pivot_words, target_words, doc_ids : sequence of int
            Aligned sequences of equal length; element ``k`` of each one
            describes the same training pair.
        num_epochs : int
            Number of full passes over the data.
        switch_loss : int
            Value assigned to the graph's ``switch_loss`` variable, which the
            cost compares against the global step.

        Raises
        ------
        ValueError
            If the inputs are empty or their lengths differ.
        """
        num_pairs = len(pivot_words)
        if num_pairs == 0:
            raise ValueError('pivot_words is empty, nothing to train on')
        if len(target_words) != num_pairs or len(doc_ids) != num_pairs:
            raise ValueError(
                'pivot_words, target_words and doc_ids must have the same length'
            )

        from tqdm import tqdm

        temp_fraction = self.batch_size / num_pairs
        self.sess.run(tf.assign(self.fraction, temp_fraction))
        self.sess.run(tf.assign(self.switch_loss, switch_loss))
        for e in range(num_epochs):
            pbar = tqdm(
                range(0, num_pairs, self.batch_size),
                desc = 'minibatch loop',
            )
            for i in pbar:
                # Slicing clamps at the end, so the last batch may be shorter.
                batch = slice(i, i + self.batch_size)
                _, cost = self.sess.run(
                    [self.optimizer, self.cost],
                    feed_dict = {
                        self.X: pivot_words[batch],
                        self.Y: target_words[batch],
                        self.DOC: doc_ids[batch],
                    },
                )
                pbar.set_postfix(cost = cost, epoch = e + 1)

In [7]:
import random
from sklearn.utils import shuffle

def skipgrams(
    sequence,
    vocabulary_size,
    window_size = 4,
    negative_samples = 1.0,
    shuffle = True,
    categorical = False,
    sampling_table = None,
    seed = None,
):
    """Generate skip-gram (pivot, context) pairs from a sequence of word ids.

    Parameters
    ----------
    sequence : sequence of int
        Word ids. Falsy ids (i.e. 0) are skipped both as pivot and as
        context, so id 0 never appears in a positive pair.
    vocabulary_size : int
        Upper bound (exclusive) for the ids drawn as random negatives.
    window_size : int
        Number of positions looked at on each side of the pivot.
    negative_samples : float
        Ratio of random negative pairs to positive pairs; 0 disables them.
    shuffle : bool
        Shuffle couples and labels with the same permutation.
    categorical : bool
        If True labels are ``[0, 1]`` (positive) / ``[1, 0]`` (negative),
        otherwise ``1`` / ``0``.
    sampling_table : sequence of float or None
        Optional per-id keep probability; a pivot is dropped when its entry
        is smaller than a fresh ``random.random()`` draw.
    seed : int or None
        Seed used for the shuffle; drawn at random when None.

    Returns
    -------
    couples : list of [int, int]
    labels : list
    """
    couples = []
    labels = []
    for i, wi in enumerate(sequence):
        if not wi:
            continue
        if sampling_table is not None:
            if sampling_table[wi] < random.random():
                continue

        window_start = max(0, i - window_size)
        window_end = min(len(sequence), i + window_size + 1)
        for j in range(window_start, window_end):
            if j != i:
                wj = sequence[j]
                if not wj:
                    continue
                couples.append([wi, wj])
                if categorical:
                    labels.append([0, 1])
                else:
                    labels.append(1)

    if negative_samples > 0:
        num_negative_samples = int(len(labels) * negative_samples)
        words = [c[0] for c in couples]
        random.shuffle(words)

        couples += [
            [words[i % len(words)], random.randint(1, vocabulary_size - 1)]
            for i in range(num_negative_samples)
        ]
        if categorical:
            # Build independent lists so that mutating one label cannot
            # silently change every other negative label.
            labels += [[1, 0] for _ in range(num_negative_samples)]
        else:
            labels += [0] * num_negative_samples

    if shuffle:
        if seed is None:
            # randint needs integer bounds: the float literal 10e6 is rejected
            # with TypeError on Python 3.12+.
            seed = random.randint(0, int(10e6))
        # Re-seeding before each shuffle applies the same permutation to both
        # lists, keeping couples and labels aligned.
        random.seed(seed)
        random.shuffle(couples)
        random.seed(seed)
        random.shuffle(labels)

    return couples, labels

In [8]:
import tensorflow as tf
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Fit a bag-of-words vocabulary on the cleaned sentences; every sentence is
# treated as one document of the model.
bow = CountVectorizer().fit(sentences)
transformed = bow.transform(sentences)

# NOTE(review): nonzero()[1] gives the distinct vocabulary ids present in each
# sentence, not the tokens in reading order, so the skip-gram "window" below
# runs over that id list rather than over true word positions. Also note that
# skipgrams() skips id 0, so the first vocabulary entry never forms a pair.
idx_text_clean = [row.nonzero()[1] for row in transformed]

# word -> id and id -> word lookups. `vocabulary_` is used instead of
# get_feature_names(), which is deprecated and absent from newer scikit-learn
# releases; sorting by id keeps the same ordering as before.
dictionary = {
    term: int(idx)
    for term, idx in sorted(bow.vocabulary_.items(), key = lambda kv: kv[1])
}
reversed_dictionary = {idx: term for term, idx in dictionary.items()}

# Corpus-wide count per vocabulary id. Summing the sparse matrix directly
# avoids materialising the dense documents x vocabulary array.
freqs = np.asarray(transformed.sum(axis = 0)).ravel().tolist()

# Build aligned (pivot, target, document id) training triples.
pivot_words, target_words, doc_ids = [], [], []
for doc_id, token_ids in enumerate(idx_text_clean):
    pairs = skipgrams(
        token_ids,
        vocabulary_size = len(dictionary),
        window_size = window_size,
        shuffle = True,
        negative_samples = 0,
    )[0]
    for pivot, target in pairs:
        pivot_words.append(pivot)
        target_words.append(target)
        doc_ids.append(doc_id)

# Shuffle the three lists together so that triples stay aligned.
pivot_words, target_words, doc_ids = shuffle(
    pivot_words, target_words, doc_ids, random_state = 10
)
num_unique_documents = len(idx_text_clean)

In [9]:
# Build the graph: one document row per sentence, one vocabulary row per
# CountVectorizer feature. Remaining hyper-parameters keep their defaults.
model = LDA2VEC(
    num_unique_documents = num_unique_documents,
    vocab_size = len(dictionary),
    num_topics = n_topics,
    freqs = freqs,
    embedding_size = embedding_size,
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [10]:
# Train for `epoch` passes over the shuffled (pivot, target, doc) triples.
model.train(
    pivot_words = pivot_words,
    target_words = target_words,
    doc_ids = doc_ids,
    num_epochs = epoch,
    switch_loss = switch_loss,
)

minibatch loop: 100%|██████████| 45372/45372 [08:43<00:00, 86.60it/s, cost=-2.21e+4, epoch=1]
minibatch loop: 100%|██████████| 45372/45372 [08:44<00:00, 86.50it/s, cost=-4.71e+4, epoch=2]
minibatch loop: 100%|██████████| 45372/45372 [08:44<00:00, 86.50it/s, cost=-7.2e+4, epoch=3] 
minibatch loop: 100%|██████████| 45372/45372 [08:44<00:00, 86.44it/s, cost=-9.62e+4, epoch=4]
minibatch loop: 100%|██████████| 45372/45372 [08:44<00:00, 86.45it/s, cost=-1.19e+5, epoch=5]


In [11]:
# Pull the trained document, topic and word matrices out of the session as
# arrays, using a single fetch for all three variables.
doc_embed, topic_embed, word_embed = model.sess.run(
    [model.doc_embedding, model.topic_embedding, model.word_embedding]
)

In [12]:
# Score every word against every topic: rows are topics, columns are word ids.
components = topic_embed.dot(word_embed.T)
for topic_no, scores in enumerate(components, start = 1):
    # Walking the ascending argsort backwards with [: -10 : -1] selects the
    # nine highest-scoring word ids.
    # NOTE(review): use [: -11 : -1] if ten words per topic were intended.
    top_ids = scores.argsort()[: -10 : -1]
    top_words = [reversed_dictionary[word_id] for word_id in top_ids]
    print('topic %d : %s'%(topic_no, ' '.join(top_words)))

topic 1 : g25 hardiknas doktor kashif kompetensi keep banjir harvest ditargetkan
topic 2 : g25 izzah kashif ioi halilintar harvest 1984 keep candreva
topic 3 : g25 hardiknas 1984 hamisah 2001 kashif doktor keep halilintar
topic 4 : g25 hardiknas keep kashif doktor alfamart lombok diimport washing
topic 5 : keep g25 harvest halilintar menghiraukan 1984 administrative kejahatan marketplace
topic 6 : keep ioi kompetensi washing kashif g25 dominan halilintar asuhan
topic 7 : g25 kashif harvest keep kritis diimport chow escas berbau
topic 8 : keep citi mulai 1984 escas g25 doktor garis asuhan
topic 9 : g25 asuhan 2001 gapoktan doktor halilintar umt kashif harmonis
topic 10 : g25 keep harvest doktor kashif tribunkaltim administrative asuhan halilintar
