In [0]:
import numpy as np
from pyvi import ViTokenizer as vitoken
import os
import io
import re
import string
import json
from collections import Counter

# Pattern matching every run of characters that are NOT in the whitelist of
# ASCII and Vietnamese letters listed below. Digits, punctuation and whitespace
# are not whitelisted, so they are matched as well; normalize_data replaces
# each matched run with a single space.
# NOTE(review): the upper-case letter listed just before the lower-case d-with-
# stroke looks like U+00D0 (ETH) rather than U+0110 - verify against the data.
regex = '[^A-Za-zÁáÀàẢảÃãẠạĂăẮắẰằẲẳẴẵẶặÂâẤấẦầẨẩẪẫẬậÐđÉéÈèẺẻẼẽẸẹÊêẾếỀềỂểỄễỆệĨÍíÌìỈỉĨĩỊịÓóÒòỎỏÕõỌọÔôỐốỒồỔổỖỗỘộƠơỚớỜờỞởỠỡỢợÙùÚúỦủŨũỤụƯưỨứỪừỬửỮữỰựÝýỲỳỶỷỸỹỴỵ]+'  # keeps letters only (digits are NOT whitelisted)
# Absolute Windows paths; they have to be adapted when running elsewhere.
STOP_WORD_DIR = "C:\\Users\\hungv\\OneDrive - VNU-HCMUS\\000@@@1.KL\\00002.data\\stop_word"
MODEL_DIR = "C:\\Users\\hungv\\OneDrive - VNU-HCMUS\\000@@@1.KL\\00003.models"
data_dir = "C:\\Users\\hungv\\OneDrive - VNU-HCMUS\\000@@@1.KL\\00002.data"
# Names of the per-split data folders.
TRAIN_FOLDER = 'data_train'
TEST_FOLDER = 'data_test'
VALIDATE_FOLDER = 'data_validation'
# File names used by the preprocessing steps below.
FILE_CLEANED_LABELED = "file_cleaned_label.txt"  # written by add_labels
WORD2INDEX = 'word2index.txt'  # JSON token -> index mapping (see word2index)
FINAL_FILE = "shuffled_data.txt"  # written by shuffle_file
VOCAB = 'vocab_new.txt'  # one token per line, read by clean_data
FILE_CLEANED_LABELED_TRAIN = 'file_cleaned_label_train.txt'
# Flags accepted by load_data to select which split to load.
TRAINING_DATA = 1
TESTING_DATA = 2
VALIDATION_DATA = 3


# vocabulary
def build_vocab(tokenized_data, min_count=3, output_path=None):
    """Build the vocabulary file from tokenized sentences.

    Counts every token over all sentences, keeps the tokens that occur at
    least ``min_count`` times and writes them one per line (UTF-8, each line
    terminated by a newline).

    :param tokenized_data: iterable of sentences, each an iterable of tokens.
    :param min_count: minimum number of occurrences for a token to be kept.
    :param output_path: destination file; defaults to ``VOCAB`` inside
        ``data_dir``.
    :return: the kept tokens, in order of first appearance.
    """
    counts = Counter(token for sentence in tokenized_data for token in sentence)
    tokens = [token for token, count in counts.items() if count >= min_count]

    if output_path is None:
        output_path = os.path.join(data_dir, VOCAB)
    # The context manager closes the file even if a write fails.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(token + '\n' for token in tokens)
    return tokens


def load_vocab(file_name):
    """Read a vocabulary file (one token per line, UTF-8).

    Empty lines are skipped, so the newline that terminates the last token
    does not add an empty-string entry to the vocabulary.

    :param file_name: path of the vocabulary file.
    :return: list of tokens in file order.
    """
    with open(file_name, 'r', encoding='utf-8') as file:
        text = file.read()
    return [token for token in text.split('\n') if token]


def save_vocab(vocab, file_name):
    """Write the vocabulary to ``file_name``, one token per line (UTF-8).

    Tokens are joined with newlines; no newline is written after the last one.

    :param vocab: iterable of string tokens.
    :param file_name: destination path (overwritten if it exists).
    """
    # The context manager guarantees the handle is closed even if write fails.
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write("\n".join(vocab))


def normalize_data(sentence: str, pattern: str = None):
    """Lower-case a sentence and strip everything that is not a whitelisted letter.

    The sentence is lower-cased first, so the result does not depend on
    whether the upper-case form of a letter is present in the whitelist. Every
    run of characters matched by ``pattern`` is then replaced by one space,
    repeated spaces are collapsed and surrounding whitespace is removed.

    :param sentence: raw text.
    :param pattern: regular expression matching the characters to drop;
        defaults to the module-level ``regex``.
    :return: the normalized sentence, or ``False`` when fewer than two
        characters remain (including empty input).
    """
    if not sentence:
        return False
    if pattern is None:
        pattern = regex

    cleaned = re.sub(pattern, ' ', sentence.lower())
    cleaned = re.sub(" +", " ", cleaned).strip()
    if len(cleaned) > 1:
        return cleaned
    return False


def clean_data(data: list):
    """Normalize, tokenize and filter raw sentences.

    For every string longer than one character the text is lower-cased,
    newlines are replaced by spaces, the result is passed through
    ``normalize_data`` and tokenized with pyvi. Tokens that are not in the
    vocabulary file (``VOCAB`` inside ``data_dir``) are dropped, and the
    remaining sentences go through ``remove_stop_words``.

    :param data: list of untokenized sentences; non-string items are skipped.
    :return: list of token lists, as returned by ``remove_stop_words``.
    """
    vocab = set(load_vocab(os.path.join(data_dir, VOCAB)))

    cleaned_data = []
    for raw in data:
        if not isinstance(raw, str) or len(raw) <= 1:
            continue
        normalized = normalize_data(raw.lower().replace("\n", " "))
        # normalize_data returns a non-string value when nothing usable is left.
        if not isinstance(normalized, str) or len(normalized) <= 1:
            continue
        tokens = vitoken.tokenize(normalized).split(" ")
        cleaned_data.append([w for w in tokens if w in vocab])

    return remove_stop_words(cleaned_data)


def add_labels(positive_list: list, negative_list: list):
    """Write labelled sentences to ``FILE_CLEANED_LABELED`` inside ``data_dir``.

    Each output line has the form ``<sentence>| <label>``, where the label is
    ``1`` for positive and ``-1`` for negative sentences. Positive sentences
    are written first.

    :param positive_list: positive sentences, each either a string or a list
        of tokens (tokens are joined with single spaces).
    :param negative_list: negative sentences, same format.
    """
    def as_text(sentence):
        # Tokenized sentences arrive as lists; plain strings are written as-is.
        return sentence if isinstance(sentence, str) else ' '.join(sentence)

    link = os.path.join(data_dir, FILE_CLEANED_LABELED)

    # The with-block closes the file; no explicit close() is needed.
    with open(link, 'w', encoding="utf-8") as writer:
        for label, sentences in ((1, positive_list), (-1, negative_list)):
            for sentence in sentences:
                writer.write(as_text(sentence) + '| ' + str(label) + '\n')


def remove_stop_words(tokenized_data: list, stop_words=None):
    """Drop stop words from every tokenized sentence.

    Sentences left with fewer than two tokens after filtering are discarded.

    :param tokenized_data: list of sentences, each a list of tokens.
    :param stop_words: optional iterable of stop words; when omitted the list
        is read from ``vietnamese-stopwords.txt`` in ``STOP_WORD_DIR`` (one
        entry per line).
    :return: new list of filtered token lists; the input is not modified.
    """
    if stop_words is None:
        path = os.path.join(STOP_WORD_DIR, "vietnamese-stopwords.txt")
        with open(path, 'r', encoding='utf-8') as stop_word_file:
            stop_words = stop_word_file.read().split('\n')
    # A set gives constant-time membership tests inside the loop.
    stop_words = set(stop_words)

    tokenized_sentences_stop_word = []
    for sentence in tokenized_data:
        kept = [word for word in sentence if word not in stop_words]
        if len(kept) > 1:
            tokenized_sentences_stop_word.append(kept)
    return tokenized_sentences_stop_word


def word2index(tokenized_data: list):
    """Assign an integer id to every distinct token and save the mapping.

    Id 0 is reserved for the ``'PAD'`` entry; other tokens receive consecutive
    ids starting at 1 in order of first appearance. The mapping is written as
    JSON (non-ASCII characters kept as-is) to ``WORD2INDEX`` inside
    ``data_dir``.

    :param tokenized_data: list of sentences, each a list of tokens.
    :return: dict mapping token -> id.
    """
    mapping = {'PAD': 0}
    for sentence in tokenized_data:
        for token in sentence:
            # len(mapping) is the next free id; known tokens keep their id.
            mapping.setdefault(token, len(mapping))

    target = os.path.join(data_dir, WORD2INDEX)
    with io.open(target, 'w', encoding='utf-8') as json_file:
        json.dump(mapping, json_file, ensure_ascii=False)
    return mapping


def pad_sequence(sequence, max_length):
    """Right-pad every inner sequence with zeros up to ``max_length``.

    Sequences that already have ``max_length`` or more items are returned
    unchanged in content (nothing is truncated). The input lists are not
    modified; new lists are returned.

    :param sequence: list of index sequences.
    :param max_length: minimum length of every returned sequence.
    :return: list of padded lists, in the same order as the input.
    """
    # [0] * n is empty for n <= 0, so long sequences get no padding.
    return [list(item) + [0] * (max_length - len(item)) for item in sequence]


def word_to_index(data: list, word2idx: dict):
    """Convert tokenized sentences to lists of vocabulary indices.

    Tokens for which ``word2idx.get`` returns ``None`` are skipped.

    :param data: list of sentences, each a list of tokens.
    :param word2idx: mapping token -> index.
    :return: list of index lists, one per input sentence.
    """
    encoded = []
    for tokens in data:
        ids = []
        for tok in tokens:
            idx = word2idx.get(tok)
            if idx is not None:
                ids.append(idx)
        encoded.append(ids)
    return encoded


def shuffle_file():
    """Write the lines of the labelled file in random order.

    Reads ``FILE_CLEANED_LABELED`` from ``data_dir``, pairs every line with a
    random number, sorts the pairs and writes the lines in that order to
    ``FINAL_FILE`` in the same directory.
    """
    source_path = os.path.join(data_dir, FILE_CLEANED_LABELED)
    keyed_lines = []
    with open(source_path, 'r', encoding='utf-8') as source:
        for line in source:
            keyed_lines.append((np.random.random(), line))

    # Sorting by the random key yields the shuffled order.
    keyed_lines.sort()

    target_path = os.path.join(data_dir, FINAL_FILE)
    with open(target_path, 'w', encoding='utf-8') as target:
        target.writelines(line for _, line in keyed_lines)


# def split_train_test_file():
#     link = os.path.join(data_dir, FINAL_FILE)
#     f = open(link, encoding='utf-8')
#     data = f.read()
#
#     n_len = len(data)
#     test_size = 0.2 * n_len
#     train_size = n_len - test_size
#     train_writer = open(TRAIN_FOLDER, 'w', encoding='utf-8')
#     test_writer = open(TEST_DATA, 'w', encoding='utf-8')
#
#     for i in range(n_len):
#         if i < test_size:
#             test_writer.write(data[i])
#         else:
#             train_writer.write(data[i])


def count_labels(labeled):
    """Print how many labels are positive, negative and neutral.

    A label equal to 1 counts as positive, a label equal to 0 as neutral and
    anything else as negative. The counts are printed in the order
    positive, negative, neutral.

    :param labeled: iterable of labels.
    """
    tally = {'pos': 0, 'neg': 0, 'neu': 0}
    for label in labeled:
        if label == 1:
            bucket = 'pos'
        elif label == 0:
            bucket = 'neu'
        else:
            bucket = 'neg'
        tally[bucket] += 1
    print("- ".join([str(tally['pos']), str(tally['neg']), str(tally['neu'])]))


def load_data_new_version():
    """Load the cleaned training sentences and rebuild the word index.

    Reads ``positive_cleaned.txt`` and ``negative_cleaned.txt`` from
    ``preprocessed/train`` under ``data_dir`` (one space-separated tokenized
    sentence per line, blank lines skipped) and calls ``word2index`` on the
    negative sentences followed by the positive ones, which rewrites the saved
    token -> index mapping.

    Labelling and shuffling (``add_labels`` / ``shuffle_file``) are not run
    here. An earlier revision kept, as commented-out code, a step that cleaned
    raw per-label files with ``clean_data`` and wrote the two cleaned files;
    that step has to be run separately.

    :return: ``(pos_cleaned_data, neg_cleaned_data)``, each a list of token
        lists.
    """
    def read_tokenized(file_name):
        # Path components are joined individually so the separator matches
        # the platform instead of being hard-coded as a backslash.
        path = os.path.join(data_dir, "preprocessed", "train", file_name)
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.read().split('\n')
        return [line.split(' ') for line in lines if line]

    pos_cleaned_data = read_tokenized("positive_cleaned.txt")
    neg_cleaned_data = read_tokenized("negative_cleaned.txt")
    word2index(neg_cleaned_data + pos_cleaned_data)
    print('done')
    return pos_cleaned_data, neg_cleaned_data


def get_word_index():
    """Load the token -> index mapping saved by ``word2index``.

    The mapping is read as JSON from ``WORD2INDEX`` inside ``data_dir``; both
    are constants defined at the top of this file.

    :return: dict mapping token -> index.
    """
    with open(os.path.join(data_dir, WORD2INDEX), encoding='utf-8') as json_file:
        return json.load(json_file)


def load_cleaned_data(file_path: str):
    """Read a labelled file whose lines look like ``<sentence>| <label>``.

    Blank lines are skipped, so the file may or may not end with a newline
    without losing its last record. The line is split at the last ``'| '`` so
    the label is always the final field.

    :param file_path: path of the labelled UTF-8 text file.
    :return: ``(x, y)`` where ``x`` is the list of sentence strings and ``y``
        the list of integer labels.
    :raises IndexError: if a non-blank line has no ``'| '`` separator.
    :raises ValueError: if a label is not an integer.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    x = []
    y = []
    for line in lines:
        if not line:
            continue
        parts = line.rsplit('| ', 1)
        x.append(parts[0])
        y.append(int(parts[1]))
    return x, y


def load_data(flag):
    """Load one data split and encode its sentences as index sequences.

    The split file and the ``WORD2INDEX`` JSON mapping are opened by bare file
    name, i.e. relative to the current working directory.

    :param flag: ``TRAINING_DATA``, ``TESTING_DATA`` or ``VALIDATION_DATA``.
    :return: ``(x, y)`` where ``x`` is a list of index lists and ``y`` the
        list of integer labels read from the file.
    :raises ValueError: if ``flag`` is not one of the three known flags.
    """
    split_files = {
        TRAINING_DATA: 'shuffled_data_train.txt',
        TESTING_DATA: 'shuffled_data_test.txt',
        VALIDATION_DATA: 'shuffled_data_validation.txt',
    }
    if flag not in split_files:
        # Fail with a clear message instead of an unbound local variable.
        raise ValueError("unknown data flag: %r" % (flag,))
    path = split_files[flag]

    x, y = load_cleaned_data(path)
    x = [sentence.split(' ') for sentence in x]
    with open(WORD2INDEX, encoding='utf-8') as json_file:
        word2idx = json.load(json_file)
    x = word_to_index(x, word2idx)
    return x, y


In [0]:
pip install pyvi

Collecting pyvi
[?25l  Downloading https://files.pythonhosted.org/packages/65/6d/09e335f8399507cfa2260d5f06b27e6a9399a79251101fa7a47e5d294029/pyvi-0.0.9.3.tar.gz (5.2MB)
[K     |████████████████████████████████| 5.3MB 3.1MB/s 
Collecting sklearn-crfsuite (from pyvi)
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite->pyvi)
[?25l  Downloading https://files.pythonhosted.org/packages/2f/86/cfcd71edca9d25d3d331209a20f6314b6f3f134c29478f90559cee9ce091/python_crfsuite-0.9.6-cp36-cp36m-manylinux1_x86_64.whl (754kB)
[K     |████████████████████████████████| 757kB 40.3MB/s 
Building wheels for collected packages: pyvi
  Building wheel for pyvi (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/49/44/c1/56344a2e33862991f04fdbacc8b8369bfc597723e63cdf17ea
Successfully built pyvi
Installing collecte

In [23]:
import keras as K
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
import os
import time
import tensorflow as tf
from matplotlib import pyplot as plt

# Ask TensorFlow's native (C++) layer to log errors only.
# NOTE(review): this is set after `import tensorflow`; presumably it has to be
# set before the import to take full effect - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Restrict TensorFlow's Python-side logging to errors (tf.logging is the
# TensorFlow 1.x API).
tf.logging.set_verbosity(tf.logging.ERROR)

def recall(y_true, y_pred):
    """Recall metric for Keras.

    Computed only over the tensors passed in, so as a Keras metric it is a
    batch-wise value: the rounded, clipped count of true positives divided by
    the rounded, clipped count of actual positives (plus epsilon to avoid
    division by zero).
    """
    backend = K.backend
    hits = backend.round(backend.clip(y_true * y_pred, 0, 1))
    actual = backend.round(backend.clip(y_true, 0, 1))
    return backend.sum(hits) / (backend.sum(actual) + backend.epsilon())


def precision(y_true, y_pred):
    """Precision metric for Keras.

    Computed only over the tensors passed in, so as a Keras metric it is a
    batch-wise value: the rounded, clipped count of true positives divided by
    the rounded, clipped count of predicted positives (plus epsilon to avoid
    division by zero).
    """
    backend = K.backend
    hits = backend.round(backend.clip(y_true * y_pred, 0, 1))
    predicted = backend.round(backend.clip(y_pred, 0, 1))
    return backend.sum(hits) / (backend.sum(predicted) + backend.epsilon())


def f1(y_true, y_pred):
    """F1 metric for Keras: harmonic mean of precision and recall.

    Reuses the module-level ``precision`` and ``recall`` metrics instead of
    duplicating their bodies; like them, the value is computed only over the
    tensors passed in. Epsilon in the denominator avoids division by zero when
    both precision and recall are zero.
    """
    precision_value = precision(y_true, y_pred)
    recall_value = recall(y_true, y_pred)
    return 2 * ((precision_value * recall_value)
                / (precision_value + recall_value + K.backend.epsilon()))


class SAV_Model:
    """LSTM sentiment classifier built on a Keras ``Sequential`` model.

    Layers: Embedding -> LSTM -> Dense(256) -> Dropout -> Dense(1).
    """

    def __init__(self, activation, max_words=20000, lstm_unit=512, output_dim=32, drop_out=0.2,
                 optimizer='rmsprop', loss_func='binary_crossentropy'):
        """Build and compile the network.

        :param activation: two activation names: ``activation[0]`` for the
            hidden dense layer, ``activation[1]`` for the output layer.
        :param max_words: ``input_dim`` of the embedding layer.
        :param lstm_unit: number of LSTM units.
        :param output_dim: currently unused, see the note below.
        :param drop_out: rate used for the LSTM input dropout and the Dropout
            layer.
        :param optimizer: optimizer passed to ``compile``.
        :param loss_func: loss passed to ``compile``.
        """
        self.model = Sequential()
        # NOTE(review): `output_dim` is ignored; the embedding size is fixed at
        # 256. Left as-is so existing callers keep building the same network.
        # NOTE(review): the word index is not capped at `max_words`; verify the
        # vocabulary size stays below it.
        self.model.add(Embedding(input_dim=max_words, output_dim=256, mask_zero=True))
        self.model.add(LSTM(units=lstm_unit, dropout=drop_out, recurrent_dropout=0.2))
        self.model.add(Dense(256, activation=activation[0]))
        self.model.add(Dropout(drop_out))
        self.model.add(Dense(1, activation=activation[1]))
        self.model.compile(optimizer=optimizer, loss=loss_func, metrics=[f1, recall, precision, 'accuracy'])

    def summary(self):
        """Print the Keras model summary."""
        self.model.summary()

    def train(self, x_train, y_train, batch_size=100, n_epochs=3, verbose=1):
        """Fit the model on the given data."""
        self.model.fit(x_train, y_train, batch_size=batch_size, epochs=n_epochs, verbose=verbose)

    def save_model(self, file_name):
        """Save the whole Keras model to ``file_name``."""
        self.model.save(file_name)
        print("model is saved!!!")

    @staticmethod
    def load_model(model_path):
        """Load a saved Keras model.

        The model is compiled with the custom ``f1`` / ``recall`` /
        ``precision`` metrics, so they are passed as ``custom_objects``.

        :return: the Keras model itself (not a ``SAV_Model`` wrapper).
        """
        custom_objects = {'f1': f1, 'recall': recall, 'precision': precision}
        return K.models.load_model(model_path, custom_objects=custom_objects)

    def evaluate_model(self, x_valid, y_valid):
        """Evaluate on the given data and print loss and accuracy.

        ``evaluate`` returns the loss followed by one value per compiled
        metric, so the accuracy entry is looked up by name instead of assuming
        it is at index 1.

        :return: the full list returned by ``evaluate``.
        """
        loss_acc = self.model.evaluate(x_valid, y_valid, verbose=0)
        names = list(self.model.metrics_names)
        # Accuracy is the last compiled metric; used as fallback if the name
        # reported by this Keras version is neither 'acc' nor 'accuracy'.
        acc_index = next((i for i, name in enumerate(names) if name in ('acc', 'accuracy')),
                         len(loss_acc) - 1)
        print("Test data: loss = %0.6f  accuracy = %0.2f%% " % (loss_acc[0], loss_acc[acc_index] * 100))
        return loss_acc

    def predict_(self, sentences: list):
        """
        :param sentences: list of raw sentence strings to score.
        :return: list with one entry per sentence: the model output for the
            sentence, or ``None`` when nothing is left after cleaning.
        """
        prob = []
        word2idx = get_word_index()
        for sentence in sentences:
            # clean_data expects a list of sentences, so wrap the single string.
            review = clean_data([sentence])
            review = word_to_index(review, word2idx)
            if not review:
                prob.append(None)
                continue
            # NOTE(review): maxlen is fixed at 80 here while the training
            # script pads to the longest training sentence - confirm.
            review = pad_sequences(review, truncating='post', padding='post', maxlen=80)
            prediction = self.model.predict(review)
            prob.append(prediction[0][0])

        return prob


if __name__ == '__main__':
    # Load the index-encoded training and validation splits (the test split
    # is not used by this script).
    x_train, y_train = load_data(TRAINING_DATA)
    x_valid, y_valid = load_data(VALIDATION_DATA)
    print('load data done')

    # add_labels writes negative sentences with label -1, but a sigmoid output
    # trained with binary cross-entropy needs targets in {0, 1}. Mapping -1 to
    # 0 is a no-op for labels that are already 0/1.
    y_train = np.asarray([max(label, 0) for label in y_train])
    y_valid = np.asarray([max(label, 0) for label in y_valid])

    # Pad or truncate both splits to the longest training sentence.
    max_review_len = max(len(s) for s in x_train)
    x_train = pad_sequences(x_train, maxlen=max_review_len, padding='post',
                            truncating='post')
    x_valid = pad_sequences(x_valid, maxlen=max_review_len, padding='post',
                            truncating='post')

    model = SAV_Model(activation=["tanh", 'sigmoid'])
    model.summary()
    start = time.time()
    model.train(x_train=x_train, y_train=y_train, n_epochs=10)
    end = time.time()
    print('training time: %.4f seconds' % (end - start))
    model.save_model("model_lstm_new_0405.h5")
    model.evaluate_model(x_valid, y_valid)


# load model and test
#     sentence = "Quán nấu ăn ngon, KHÔNG GIAN chật."
#     sentence2 = "T7 mình ghé đây. Nhân viên không nhiệt tính.\nMang beer ra rồi để mình ngồi 1 đống. Rót beer thì đổ. \nThật không chuyên nghiệp ."
#     sentence2 = "trái_cây nhỏ_xíu . bể nát . . loai rẻ tien . nho chua_lè . thanh_long chua_lè . dừa bào mỏng lét .\n ăn xong 2mej con bi tào tháo dí luon . 1to.30k.bằng tô phở rui . kg dám quay lai luon . goi xe thì xa"
#     d = preprocessing.get_word_index()
#
#     #
#     review = preprocessing.clean_data([sentence2])
#     # print(review)
#     review = preprocessing.word_to_index(review, d)
#     # print(review)
#     review = K.preprocessing.sequence.pad_sequences(review, truncating='post', padding='post', maxlen=80)
#     path = os.path.join(constant.MODEL_DIR, "model_lstm_new_2704.h5")
#     model = SAV_Model.load_model(model_path=path)
#     probs = model.predict(review)
#
#     print("Sentence: \n\t""%s""" % sentence2)
#     print("Predict:", end=" ")
#     if probs[0][0] >= 0.5:
#         print("positive (prob=%0.4f)" % probs[0][0])
#     else:
#         print("negative (prob=%0.4f)" % (1 - probs[0][0]))


load data done
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 256)         5120000   
_________________________________________________________________
lstm_4 (LSTM)                (None, 512)               1574912   
_________________________________________________________________
dense_7 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 257       
Total params: 6,826,497
Trainable params: 6,826,497
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10

KeyboardInterrupt: ignored

In [24]:
pip install matplotlib



# New Section