In [1]:
import os
import tensorflow as tf
import malaya
import json
import numpy as np
from tqdm import tqdm

In [2]:
negatives = ['negative/' + i for i in os.listdir('negative') if '.json' in i]
positives = ['positive/' + i for i in os.listdir('positive') if '.json' in i]
negatives

['negative/9.json',
 'negative/2.json',
 'negative/1.json',
 'negative/4.json',
 'negative/5.json',
 'negative/6.json',
 'negative/8.json',
 'negative/7.json']

In [3]:
import re

tokenizer = malaya.preprocessing._SocialTokenizer().tokenize

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    tokenized = tokenizer(string)
    tokenized = [w.lower() for w in tokenized if len(w) > 1]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

def clean_label(label):
    string = re.sub('[^A-Za-z\- ]+', ' ', label)
    return re.sub(r'[ ]+', ' ', string.lower()).strip()

In [4]:
X, Y = [], []

for n in negatives:
    with open(n) as fopen:
        x = json.load(fopen)
    processed = [preprocessing(s) for s in x]
    X.extend(processed)
    Y.extend([0] * len(processed))
    
len(X), len(Y)

(344733, 344733)

In [5]:
for p in positives:
    with open(p) as fopen:
        x = json.load(fopen)
    processed = [preprocessing(s) for s in x]
    X.extend(processed)
    Y.extend([1] * len(processed))
    
len(X), len(Y)

(658298, 658298)

In [6]:
def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [7]:
import itertools
import collections

concat = list(itertools.chain(*X))
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 131450
Most common words [('saya', 525204), ('yang', 188475), ('tidak', 158850), ('untuk', 124207), ('anda', 123928), ('di', 119593)]
Sample data [204, 18477, 35497, 488, 78, 4, 1432, 743, 85, 4] ['kenapa', 'solange', 'knowles', 'marah', 'kepada', 'saya', 'berbunyi', 'pecah', 'membuat', 'saya']


In [8]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

In [9]:
class Model:
    def __init__(self, embedded_size, dict_size, dimension_output, learning_rate):
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        self.logits = tf.layers.dense(tf.reduce_mean(encoder_embedded, 1), dimension_output)
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits,
            labels=self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1,output_type=tf.int32), self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [10]:
maxlen = 50
batch_size = 128
embedded_size = 128

In [11]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(embedded_size, len(dictionary), 2, 1e-3)
sess.run(tf.global_variables_initializer())

In [12]:
def str_idx(corpus, dic, maxlen, UNK = 3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen][::-1]):
            X[i, -1 - no] = dic.get(k, UNK)
    return X

vectors = str_idx(X,dictionary,maxlen)

In [13]:
from sklearn.cross_validation import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(vectors, 
                                                    Y,
                                                    test_size = 0.2)

In [14]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 4115/4115 [00:50<00:00, 81.85it/s, accuracy=0.826, cost=0.393]
test minibatch loop: 100%|██████████| 1029/1029 [00:02<00:00, 458.99it/s, accuracy=0.816, cost=0.53] 
train minibatch loop:   0%|          | 9/4115 [00:00<00:49, 82.72it/s, accuracy=0.75, cost=0.503] 

epoch: 0, pass acc: 0.000000, current acc: 0.777794
time taken: 52.52127766609192
epoch: 0, training loss: 0.526249, training acc: 0.753018, valid loss: 0.489578, valid acc: 0.777794



train minibatch loop: 100%|██████████| 4115/4115 [00:50<00:00, 82.26it/s, accuracy=0.848, cost=0.355]
test minibatch loop: 100%|██████████| 1029/1029 [00:02<00:00, 465.15it/s, accuracy=0.829, cost=0.531]
train minibatch loop:   0%|          | 9/4115 [00:00<00:49, 82.43it/s, accuracy=0.758, cost=0.485]

epoch: 1, pass acc: 0.777794, current acc: 0.782600
time taken: 52.243654012680054
epoch: 1, training loss: 0.469864, training acc: 0.791425, valid loss: 0.482992, valid acc: 0.782600



train minibatch loop: 100%|██████████| 4115/4115 [00:49<00:00, 82.33it/s, accuracy=0.87, cost=0.335] 
test minibatch loop: 100%|██████████| 1029/1029 [00:02<00:00, 462.19it/s, accuracy=0.829, cost=0.531]
train minibatch loop:   0%|          | 9/4115 [00:00<00:49, 83.01it/s, accuracy=0.758, cost=0.475]

epoch: 2, pass acc: 0.782600, current acc: 0.782661
time taken: 52.21110391616821
epoch: 2, training loss: 0.456914, training acc: 0.798993, valid loss: 0.482993, valid acc: 0.782661



train minibatch loop: 100%|██████████| 4115/4115 [00:49<00:00, 82.49it/s, accuracy=0.87, cost=0.323] 
test minibatch loop: 100%|██████████| 1029/1029 [00:02<00:00, 470.27it/s, accuracy=0.829, cost=0.531]
train minibatch loop:   0%|          | 9/4115 [00:00<00:50, 81.92it/s, accuracy=0.773, cost=0.466]

time taken: 52.0782995223999
epoch: 3, training loss: 0.447884, training acc: 0.804443, valid loss: 0.485083, valid acc: 0.781521



train minibatch loop: 100%|██████████| 4115/4115 [00:50<00:00, 82.29it/s, accuracy=0.87, cost=0.314] 
test minibatch loop: 100%|██████████| 1029/1029 [00:02<00:00, 465.54it/s, accuracy=0.816, cost=0.531]
train minibatch loop:   0%|          | 9/4115 [00:00<00:49, 83.10it/s, accuracy=0.773, cost=0.458]

time taken: 52.22002625465393
epoch: 4, training loss: 0.440469, training acc: 0.808788, valid loss: 0.488281, valid acc: 0.779891



train minibatch loop: 100%|██████████| 4115/4115 [00:49<00:00, 82.44it/s, accuracy=0.891, cost=0.307]
test minibatch loop: 100%|██████████| 1029/1029 [00:02<00:00, 469.07it/s, accuracy=0.816, cost=0.531]

time taken: 52.11386489868164
epoch: 5, training loss: 0.434015, training acc: 0.812294, valid loss: 0.492204, valid acc: 0.777635

break epoch:6






In [15]:
pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
y_predict = []
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    logits = sess.run(model.logits,
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x
            },
        )
    logits = np.argmax(logits,1).tolist()
    y_predict.extend(logits)

test minibatch loop: 100%|██████████| 1029/1029 [00:00<00:00, 1156.29it/s]


In [16]:
from sklearn import metrics

print(metrics.classification_report(test_Y, y_predict, target_names = ['-', '+']))

             precision    recall  f1-score   support

          -       0.76      0.85      0.80     68727
          +       0.81      0.70      0.75     62933

avg / total       0.78      0.78      0.78    131660

