In [1]:
import re
import numpy as np
import collections
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import tensorflow as tf
import pandas as pd
from unidecode import unidecode
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import time
import malaya



In [2]:
tokenizer = malaya.preprocessing._SocialTokenizer().tokenize
rules_normalizer = malaya.texts._tatabahasa.rules_normalizer

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    tokenized = tokenizer(unidecode(string))
    tokenized = [malaya.stem.naive(w) for w in tokenized]
    tokenized = [w.lower() for w in tokenized if len(w) > 1]
    tokenized = [rules_normalizer.get(w, w) for w in tokenized]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

In [3]:
def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 3)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

def str_idx(corpus, dic, maxlen, UNK = 3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen][::-1]):
            X[i, -1 - no] = dic.get(k, UNK)
    return X

In [4]:
preprocessing('kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya jd')

['raja',
 'benar',
 'sangat',
 'benci',
 'rakyat',
 'minyak',
 'naik',
 'gala',
 'jadi']

In [6]:
with open('subjectivity-negative-bm.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('subjectivity-positive-bm.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts

assert len(labels) == len(texts)

In [7]:
from tqdm import tqdm
pbar = tqdm(range(len(texts)))
for i in pbar:
    texts[i] = preprocessing(texts[i])

100%|██████████| 9962/9962 [00:03<00:00, 3057.22it/s]


In [8]:
import itertools

concat = list(itertools.chain(*texts))
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 13353
Most common words [('yang', 11804), ('untuk', 3880), ('tidak', 2898), ('deng', 2827), ('ada', 2356), ('dalam', 2194)]
Sample data [10, 70, 13, 28, 57, 54, 11, 382, 36, 187] ['filem', 'mula', 'pada', 'masa', 'lalu', 'mana', 'orang', 'budak', 'lelaki', 'nama']


In [9]:
def position_encoding(inputs):
    T = tf.shape(inputs)[1]
    repr_dim = inputs.get_shape()[-1].value
    pos = tf.reshape(tf.range(0.0, tf.to_float(T), dtype=tf.float32), [-1, 1])
    i = np.arange(0, repr_dim, 2, np.float32)
    denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])
    enc = tf.expand_dims(tf.concat([tf.sin(pos / denom), tf.cos(pos / denom)], 1), 0)
    return tf.tile(enc, [tf.shape(inputs)[0], 1, 1])

class Model:
    def __init__(
        self,
        size_layer,
        num_layers,
        dimension_output,
        learning_rate,
        dropout,
        dict_size,
    ):
        def cells(size, reuse = False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(
                    size,
                    initializer = tf.orthogonal_initializer(),
                    reuse = reuse,
                ),
                state_keep_prob = dropout,
                output_keep_prob = dropout,
            )

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(
            tf.random_uniform([dict_size, size_layer], -1, 1)
        )
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        encoder_embedded += position_encoding(encoder_embedded)
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units = size_layer, memory = encoder_embedded
        )
        rnn_cells = tf.contrib.seq2seq.AttentionWrapper(
            cell = tf.nn.rnn_cell.MultiRNNCell(
                [cells(size_layer) for _ in range(num_layers)]
            ),
            attention_mechanism = attention_mechanism,
            attention_layer_size = size_layer,
            alignment_history = True,
        )
        outputs, last_state = tf.nn.dynamic_rnn(
            rnn_cells, encoder_embedded, dtype = tf.float32
        )
        self.alignments = tf.transpose(
            last_state.alignment_history.stack(), [1, 2, 0]
        )
        self.logits_seq = tf.layers.dense(outputs, dimension_output)
        self.logits_seq = tf.identity(self.logits_seq, name = 'logits_seq')
        self.logits = self.logits_seq[:, -1]
        self.logits = tf.identity(self.logits, name = 'logits')
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        self.attention = tf.nn.softmax(
            tf.reduce_sum(self.alignments[0], 1), name = 'alphas'
        )

In [10]:
size_layer = 256
num_layers = 2
dimension_output = 2
learning_rate = 1e-4
batch_size = 32
dropout = 0.8
maxlen = 100

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    size_layer,
    num_layers,
    dimension_output,
    learning_rate,
    dropout,
    len(dictionary),
)
sess.run(tf.global_variables_initializer())

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(
    texts, labels, test_size = 0.2
)

In [12]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = str_idx(train_X[i : min(i + batch_size, len(train_X))], dictionary, maxlen)
        batch_y = train_Y[i : min(i + batch_size, len(train_X))]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        batch_x = str_idx(test_X[i : min(i + batch_size, len(test_X))], dictionary, maxlen)
        batch_y = test_Y[i : min(i + batch_size, len(test_X))]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 250/250 [01:37<00:00,  2.48it/s, accuracy=0, cost=1.08]     
test minibatch loop: 100%|██████████| 63/63 [00:11<00:00,  4.35it/s, accuracy=0.667, cost=0.734]
train minibatch loop:   0%|          | 0/250 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.668005
time taken: 108.64598202705383
epoch: 0, training loss: 0.618148, training acc: 0.637345, valid loss: 0.618830, valid acc: 0.668005



train minibatch loop: 100%|██████████| 250/250 [02:41<00:00,  1.73it/s, accuracy=1, cost=0.211]    
test minibatch loop: 100%|██████████| 63/63 [00:14<00:00,  4.46it/s, accuracy=0.556, cost=0.733]
train minibatch loop:   0%|          | 0/250 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.668005, current acc: 0.753526
time taken: 175.7188491821289
epoch: 1, training loss: 0.474724, training acc: 0.775254, valid loss: 0.524445, valid acc: 0.753526



train minibatch loop: 100%|██████████| 250/250 [02:41<00:00,  1.74it/s, accuracy=1, cost=0.113]    
test minibatch loop: 100%|██████████| 63/63 [00:14<00:00,  4.43it/s, accuracy=0.667, cost=0.585]
train minibatch loop:   0%|          | 0/250 [00:00<?, ?it/s]

epoch: 2, pass acc: 0.753526, current acc: 0.808998
time taken: 175.92001390457153
epoch: 2, training loss: 0.402917, training acc: 0.817543, valid loss: 0.437974, valid acc: 0.808998



train minibatch loop: 100%|██████████| 250/250 [02:42<00:00,  1.74it/s, accuracy=1, cost=0.0551]   
test minibatch loop: 100%|██████████| 63/63 [00:14<00:00,  4.45it/s, accuracy=0.667, cost=0.596]
train minibatch loop:   0%|          | 0/250 [00:00<?, ?it/s]

epoch: 3, pass acc: 0.808998, current acc: 0.823047
time taken: 177.3283658027649
epoch: 3, training loss: 0.350113, training acc: 0.846781, valid loss: 0.416687, valid acc: 0.823047



train minibatch loop: 100%|██████████| 250/250 [02:40<00:00,  1.72it/s, accuracy=1, cost=0.0694]   
test minibatch loop: 100%|██████████| 63/63 [00:14<00:00,  4.43it/s, accuracy=0.778, cost=0.457]
train minibatch loop:   0%|          | 0/250 [00:00<?, ?it/s]

epoch: 4, pass acc: 0.823047, current acc: 0.842895
time taken: 175.29672193527222
epoch: 4, training loss: 0.309891, training acc: 0.868365, valid loss: 0.402665, valid acc: 0.842895



train minibatch loop: 100%|██████████| 250/250 [02:40<00:00,  1.74it/s, accuracy=1, cost=0.0098]   
test minibatch loop: 100%|██████████| 63/63 [00:14<00:00,  4.44it/s, accuracy=0.778, cost=0.467]
train minibatch loop:   0%|          | 0/250 [00:00<?, ?it/s]

epoch: 5, pass acc: 0.842895, current acc: 0.847410
time taken: 175.14018750190735
epoch: 5, training loss: 0.277412, training acc: 0.888317, valid loss: 0.399790, valid acc: 0.847410



train minibatch loop: 100%|██████████| 250/250 [02:40<00:00,  1.74it/s, accuracy=1, cost=0.0107]    
test minibatch loop: 100%|██████████| 63/63 [00:14<00:00,  4.41it/s, accuracy=0.667, cost=0.421]
train minibatch loop:   0%|          | 0/250 [00:00<?, ?it/s]

time taken: 175.12843656539917
epoch: 6, training loss: 0.255826, training acc: 0.899360, valid loss: 0.450100, valid acc: 0.838100



train minibatch loop: 100%|██████████| 250/250 [02:40<00:00,  1.73it/s, accuracy=1, cost=0.0173]    
test minibatch loop: 100%|██████████| 63/63 [00:14<00:00,  4.32it/s, accuracy=0.667, cost=0.407]
train minibatch loop:   0%|          | 0/250 [00:00<?, ?it/s]

time taken: 175.43931555747986
epoch: 7, training loss: 0.233549, training acc: 0.907266, valid loss: 0.430690, valid acc: 0.843118



train minibatch loop: 100%|██████████| 250/250 [02:40<00:00,  1.73it/s, accuracy=1, cost=0.00437]   
test minibatch loop: 100%|██████████| 63/63 [00:14<00:00,  4.52it/s, accuracy=0.778, cost=0.364] 
train minibatch loop:   0%|          | 0/250 [00:00<?, ?it/s]

epoch: 8, pass acc: 0.847410, current acc: 0.847912
time taken: 175.31820154190063
epoch: 8, training loss: 0.203132, training acc: 0.923955, valid loss: 0.441600, valid acc: 0.847912



train minibatch loop: 100%|██████████| 250/250 [01:40<00:00,  2.84it/s, accuracy=1, cost=0.00284]   
test minibatch loop: 100%|██████████| 63/63 [00:08<00:00,  7.58it/s, accuracy=0.778, cost=0.411] 
train minibatch loop:   0%|          | 0/250 [00:00<?, ?it/s]

time taken: 109.00232148170471
epoch: 9, training loss: 0.173978, training acc: 0.937131, valid loss: 0.479786, valid acc: 0.847410



train minibatch loop: 100%|██████████| 250/250 [01:34<00:00,  2.89it/s, accuracy=1, cost=0.00366]   
test minibatch loop: 100%|██████████| 63/63 [00:08<00:00,  7.70it/s, accuracy=0.778, cost=0.703]
train minibatch loop:   0%|          | 0/250 [00:00<?, ?it/s]

time taken: 102.65900731086731
epoch: 10, training loss: 0.165481, training acc: 0.938135, valid loss: 0.537419, valid acc: 0.831856



train minibatch loop: 100%|██████████| 250/250 [01:34<00:00,  2.88it/s, accuracy=1, cost=0.0163]    
test minibatch loop: 100%|██████████| 63/63 [00:08<00:00,  7.68it/s, accuracy=0.778, cost=0.459] 

time taken: 102.60771703720093
epoch: 11, training loss: 0.153757, training acc: 0.943406, valid loss: 0.512000, valid acc: 0.832358

break epoch:12






In [13]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = str_idx(test_X[i : min(i + batch_size, len(test_X))], dictionary, maxlen)
    batch_y = test_Y[i : min(i + batch_size, len(test_X))]
    predict_Y += np.argmax(
        sess.run(
            model.logits, feed_dict = {model.X: batch_x, model.Y: batch_y}
        ),
        1,
    ).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 63/63 [00:08<00:00,  7.73it/s]


In [14]:
print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['negative', 'positive']
    )
)

             precision    recall  f1-score   support

   negative       0.76      0.93      0.84       985
   positive       0.91      0.72      0.80      1008

avg / total       0.84      0.82      0.82      1993

