In [1]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


# INSTALL & IMPORT LIBRARY

In [2]:
# Install into the running kernel's environment (%pip), not whichever `pip` is first on PATH.
# underthesea is pinned to the release this notebook was last executed with.
%pip install -q underthesea==6.2.0
%pip install -q transformers
%pip install -q vncorenlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting underthesea
  Downloading underthesea-6.2.0-py3-none-any.whl (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.5/993.5 kB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
Collecting underthesea-core==1.0.0 (from underthesea)
  Downloading underthesea_core-1.0.0-cp310-cp310-manylinux2010_x86_64.whl (599 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m599.6/599.6 kB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: underthesea-core, python-crfsuite, underthesea
Successfully installed python-crfsuite-0.9.9 underthesea-6.2.0 underthesea-core-1.0.

In [3]:
from bs4 import BeautifulSoup
import numpy as np
import re
# from underthesea import word_tokenize
from keras.utils import to_categorical
from transformers import AutoTokenizer
from tensorflow.data import Dataset
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences
import pandas as pd

# PREPROCESSING

## LOAD DATA

In [4]:
class TextNormalize:
    """Normalisation helpers for lower-case Vietnamese text.

    Two independent facilities are provided:

    * ``WordStandardized`` moves the tone mark of a single word onto the vowel
      selected by the placement rules implemented below.
    * ``normalize`` cleans a whole review (lower-casing, collapsing elongated
      characters, stripping emoji, mentions/hashtags and punctuation).

    Only lower-case vowels are listed in ``vowels_table``; any other character
    is treated as a consonant by the word-level helpers.
    """

    # Compiled once for the class instead of on every ``normalize`` call.
    _EMOJI_PATTERN = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            "]+", flags=re.UNICODE)

    # Rows of ``vowels_table`` holding ê, ô and ơ: these vowels always carry the tone mark.
    _TONE_PRIORITY_ROWS = (4, 7, 8)

    def __init__(self):
        # Maps every vowel form (with or without tone) to (row, tone column) in ``vowels_table``.
        self.vowels_to_ids = {}
        # One row per base vowel. Columns 0-5: no tone plus the five tone marks.
        # The last column is a key string that is never indexed by the code in this class.
        self.vowels_table = [
            ['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a' ],
            ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
            ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
            ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e' ],
            ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
            ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i' ],
            ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o' ],
            ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
            ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
            ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u' ],
            ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
            ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y' ]
        ]
        # Build the lookup eagerly: without it every vowel test below fails and the
        # word-level helpers silently return their input unchanged.
        self.createVowelsTable()

    def createVowelsTable(self):
        """Fill ``vowels_to_ids`` from ``vowels_table`` (safe to call more than once)."""
        for i in range(len(self.vowels_table)):
            # The last column is the key string, not a vowel form, so it is skipped.
            for j in range(len(self.vowels_table[i]) - 1):
                self.vowels_to_ids[self.vowels_table[i][j]] = (i, j)

    def IsValidVietnameseWord(self,word):
        """Return True when all vowels of ``word`` form one contiguous run.

        Vowels may only stand next to other vowels: a word with two vowels
        separated by a consonant is rejected.
        """
        chars = list(word)
        # Position of the most recently seen vowel (-1 while none has been seen).
        vowel_index = -1
        for i in range(len(chars)):
            idx_vowel_table = self.vowels_to_ids.get(chars[i],(-1,-1))[0]
            if idx_vowel_table != -1:
                if vowel_index == -1:
                    vowel_index = i
                else:
                    if i - vowel_index != 1:
                        return False
                    vowel_index = i
        return True

    def _with_tone(self, chars, position, tone):
        """Put tone column ``tone`` on the base vowel at ``position`` and return the joined word."""
        row = self.vowels_to_ids[chars[position]][0]
        chars[position] = self.vowels_table[row][tone]
        return ''.join(chars)

    def WordStandardized(self,word):
        """Return ``word`` with its tone mark moved to the standard vowel.

        Rules, in order:
          1. a single vowel carries the tone;
          2. ê, ô or ơ carries the tone wherever it appears in the vowel run;
          3. after "qu"/"gi" with 2-3 vowels, the second vowel carries it;
          4. 2 vowels: the first if the word ends with a vowel, else the last;
          5. 3 vowels: the middle one if the word ends with a vowel, else the last.

        Words rejected by ``IsValidVietnameseWord``, words without vowels and
        words no rule applies to are returned unchanged.
        """
        if not self.IsValidVietnameseWord(word):
            return word

        chars = list(word)
        vowel_indexes = []

        # Locate the vowels, strip their tone marks and remember the tone that was seen.
        qu_or_gi = False
        thanh_dieu = 0  # tone column (0 = no tone mark)
        for i in range(len(chars)):
            vowel_table_row, vowel_table_col = self.vowels_to_ids.get(chars[i],(-1,-1))
            if vowel_table_row == -1 :
                continue
            # "qu": the u belongs to the initial consonant cluster
            if vowel_table_row == 9:
                if i != 0 and chars[i-1] == 'q':
                    chars[i] = 'u'
                    qu_or_gi = True
            # "gi": the i belongs to the initial consonant cluster
            elif vowel_table_row == 5:
                if i != 0 and chars[i-1] == 'g':
                    chars[i] = 'i'
                    qu_or_gi = True

            # The vowel carries a tone mark: remember it and reduce the vowel to its base form.
            if vowel_table_col != 0:
                thanh_dieu = vowel_table_col
                chars[i] = self.vowels_table[vowel_table_row][0]

            vowel_indexes.append(i)

        if not vowel_indexes:
            return word

        # Rule 1: a single vowel.
        if len(vowel_indexes) == 1:
            return self._with_tone(chars, vowel_indexes[0], thanh_dieu)

        # Rule 2: scan the whole vowel run before any positional rule is tried, so that
        # e.g. "thuở" / "giường" keep the tone on ơ even though ơ is not the first vowel.
        for idx_vowel in vowel_indexes:
            if self.vowels_to_ids[chars[idx_vowel]][0] in self._TONE_PRIORITY_ROWS:
                return self._with_tone(chars, idx_vowel, thanh_dieu)

        vowel_count = len(vowel_indexes)
        if vowel_count not in (2, 3):
            # No rule covers longer vowel runs; keep the word (and its tone) as written.
            return word

        # Rule 3: after qu/gi the second vowel carries the tone.
        if qu_or_gi:
            return self._with_tone(chars, vowel_indexes[1], thanh_dieu)

        # Rules 4 and 5.
        if vowel_indexes[-1] != len(chars) - 1:
            # The word ends with a consonant: tone goes on the last vowel.
            target = vowel_indexes[-1]
        elif vowel_count == 2:
            target = vowel_indexes[0]
        else:
            target = vowel_indexes[1]
        return self._with_tone(chars, target, thanh_dieu)

    def normalize(self,text):
        """Return a cleaned, lower-cased copy of ``text``.

        Steps: lower-case; collapse runs of a repeated word character to one;
        drop emoji; replace @mentions and #hashtags with a space; strip the
        listed punctuation; collapse runs of spaces.
        """
        # Lower-case everything.
        text = text.lower()

        # Shorten elongated words ("ngonnnn" -> "ngon").
        # NOTE(review): \w also matches digits, so "1000" becomes "10" — confirm this is acceptable.
        text = re.sub(r'(\w)\1+',r'\1',text)

        # Remove emoji.
        text = self._EMOJI_PATTERN.sub(r'',text)

        # Remove mentions and hashtags. This must run before the punctuation pass,
        # which deletes "@" and would leave the mention's name behind.
        text = re.sub("(@[A-Za-z0-9]+)|(#[0-9A-Za-z]+)"," ", text)

        # Strip punctuation and emoticon characters.
        text = re.sub(r"[:)^@!`~%;?(\+\-\'\"]+",r'',text)

        # Collapse repeated spaces last, since the two steps above can create them.
        text = re.sub(r"( )\1+",r'\1',text)
        return text

In [5]:
def convert_unicode(text):
  """Return ``text`` with every character in precomposed (NFC) Unicode form.

  Vietnamese text is often stored with tone marks as separate combining
  characters (for example "a" followed by U+0300). NFC normalisation merges
  each such sequence into the single precomposed code point, so that the
  same word always yields the same string.

  This replaces a hand-maintained lookup table that was rebuilt on every
  call; the standard-library normalisation covers every Vietnamese
  letter/tone combination without needing literal tables in the source.

  Args:
    text: the string to normalise.

  Returns:
    The NFC-normalised string (unchanged when it is already precomposed).
  """
  # Function-scope import keeps this cell runnable on its own.
  import unicodedata
  return unicodedata.normalize("NFC", text)


# ASPECT SENTIMENT CLASSIFICATION

In [6]:
# Hugging Face checkpoint name; used for both the tokenizer and the TFAutoModel encoder.
PRETRAINED_MODEL = "bert-base-multilingual-cased"
# Separator token literal. NOTE(review): not referenced by any code visible in this notebook.
SEP = "[SEP]"
# Every encoded pair is padded/truncated to this many tokens; also the width of the model inputs.
MAX_LEN = 256
# Mini-batch size for the train/dev tf.data pipelines.
BATCH_SIZE = 8

## SENTENCE ASPECT PAIR

In [7]:
class LoadSentenceAspectData():
    """Flatten an annotated JSON file into parallel (sentence, aspect, polarity) lists.

    The file is read with ``pandas.read_json`` and rows containing missing
    values are dropped. Columns are addressed by position: column 0 is read as
    the review text and column 2 as the list of aspect annotations, each a
    mapping with a ``'text'`` entry (the aspect term) and a ``'labels'`` list
    whose first item is the polarity.
    NOTE(review): the positional column layout is assumed from the indexing
    below — confirm against the annotation export format.
    """
    def __init__(self, file_path):
        self.file_path = file_path

        self.data = pd.read_json(file_path)
        self.data = self.data.dropna()
        self.X = []
        self.aspect = []
        self.polarity = []

    def load(self,):
        """Build and return ``(sentences, aspects, polarities)``.

        One entry is produced per annotated aspect, so a sentence with several
        aspects is repeated. Aspects labelled ``'Conflict'`` are skipped. Both
        the sentence and the aspect text are passed through ``convert_unicode``.

        The result lists are rebuilt from scratch on every call, so calling
        ``load`` more than once does not duplicate samples.
        """
        self.X, self.aspect, self.polarity = [], [], []

        for row in range(len(self.data)):
            # The sentence is the same for all of its aspects: convert it once.
            sentence = convert_unicode(self.data.iloc[row, 0].strip())
            for term in self.data.iloc[row, 2]:
                label = term['labels'][0]
                if label == 'Conflict':
                    continue

                self.aspect.append(convert_unicode(term['text']))
                self.polarity.append(label)
                self.X.append(sentence)
        return self.X,self.aspect,self.polarity

In [None]:
sent_asp_data = LoadSentenceAspectData("/content/drive/MyDrive/Đồ án KHDL/data.json")
sentences, aspects,polarities = sent_asp_data.load()

In [None]:
len(sentences),len(polarities)

(18023, 18023)

In [None]:
np.unique(polarities)

array(['Negative', 'Neutral', 'Positive'], dtype='<U8')

In [23]:
# If the opinions on an aspect lean slightly more positive than neutral, round up to Positive.
# The ids are ordered Negative < Neutral < Positive so that the rounded-up mean computed in
# remove_duplicate_aspects resolves ties towards the higher (more positive) class.
tag2idx = {"Neutral":1,"Positive":2,"Negative":0}
# Inverse lookup: class id -> polarity name.
idx2tag = {v:k for k,v in tag2idx.items()}


In [8]:
def remove_duplicate_aspects(sentences,aspects,polarities):
  """
    Merge repeated (sentence, aspect) pairs into a single sample.

    Each polarity name is mapped to its class id through the module-level
    ``tag2idx``; the ids of every identical (sentence, aspect) pair are then
    averaged and rounded up, and the result is mapped back to a name through
    ``idx2tag``.

    Returns three aligned arrays (sentences, aspects, polarities), ordered by
    the group keys as produced by ``groupby``.
  """
  def _rounded_up_mean(ids):
    # Rounding up resolves ties towards the higher class id.
    return np.ceil(pd.Series.mean(ids))

  frame = pd.DataFrame({"Sentences":sentences,"Aspect":aspects,"Polarity":polarities})
  frame['Polarity'] = frame['Polarity'].apply(lambda name: tag2idx[name])

  grouped = frame.groupby(by = ['Sentences',"Aspect"])['Polarity']
  merged = grouped.agg(_rounded_up_mean).reset_index()
  merged['Polarity'] = merged['Polarity'].astype(int).apply(lambda class_id: idx2tag[class_id])

  return merged['Sentences'].values, merged['Aspect'].values, merged['Polarity'].values


In [None]:
sentences, aspects,polarities = remove_duplicate_aspects(sentences,aspects,polarities)
len(sentences),len(aspects),len(polarities)

In [None]:
sentences[:5],aspects[:5],polarities[:5]

In [None]:
text_normalize = TextNormalize()
sentences_normalized = np.asarray(list(map(lambda x:text_normalize.normalize(x),sentences)))
aspects_normalized = np.asarray(list(map(lambda x:text_normalize.normalize(x),aspects)))

In [None]:
X_pair = list(zip(sentences_normalized,aspects_normalized))
X_pair[:5],polarities[:5]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the (sentence, aspect) pairs as the dev set; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified by polarity — confirm that this is intended.
X_train, X_dev, y_train, y_dev = train_test_split(X_pair, polarities, test_size=0.2, random_state=42)

NameError: ignored

In [None]:
X_train[:5],y_train[:5]

In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
tokenizer.model_input_names

['input_ids', 'token_type_ids', 'attention_mask']

In [None]:
# Encode one (sentence, aspect) pair and hand-build segment ids to eyeball against the token ids.
b = tokenizer(X_train[2][0],X_train[2][1],padding = 'max_length',max_length = MAX_LEN,truncation=True)
a = np.asarray(b['input_ids'])
# Ask the tokenizer for its special-token ids instead of hard-coding 2 (separator) and 1 (padding):
# those values only hold for vocabularies using the <s> ... </s> layout, and with any other
# tokenizer the separator search comes back empty.
sep_positions = np.argwhere(a==tokenizer.sep_token_id)
first,end = sep_positions[0],sep_positions[-1]
print(first[0],end[0])
# A single separator sits between the two segments: everything up to and including it is 0,
# the tokens of the second segment are 1, the closing separator and the padding are 0.
c = [[0]* (first[0]+1) + [1] * (end[0]-first[0]-1) + [0]*(MAX_LEN - end[0])]
for k,v in zip(b['input_ids'],c[0]):
  if k == tokenizer.pad_token_id:
    break
  print(k,"=>",v)
len(a),len(c[0])

In [None]:
# # <s> context </s> </s> aspect </s>
# def sentence_aspect_pair(X,polarities,tokenizer,batch_size,is_shuffle = False):
#   tag2idx = {"Neutral":1,"Positive":0,"Negative":2}
#   idx2tag = {v:k for k,v in tag2idx.items()}

#   features = {k:[] for k in tokenizer.model_input_names}

#   for i in range(len(X)):
#     token = tokenizer(X[i][0],X[i][1],padding = 'max_length',max_length = MAX_LEN,truncation=True)
#     for name in tokenizer.model_input_names:
#       if name == 'token_type_ids':
#         a = np.asarray(token['input_ids'])
#         first,end = np.argwhere(a==2)[0],np.argwhere(a==2)[-1]
#         token_type_id = [[0]* (first[0]+2) + [1] * (end[0]-first[0]-2) + [0]*(MAX_LEN - end[0])]
#         features[name].append(np.asarray(token_type_id[0]))
#         continue
#       temp = np.asarray(token[name]).astype('float')
#       features[name].append(temp)

#   for k,v in features.items():
#     features[k] = np.asarray(features[k])

#   y_encode = list(map(lambda x:tag2idx[x],polarities))
#   y_new = [to_categorical(y_encode[i],num_classes = len(tag2idx)) for i in range(len(y_encode))]

#   dataset = Dataset.from_tensor_slices((features, y_new))
#   if is_shuffle:
#     dataset = dataset.shuffle(buffer_size = len(y_new))
#   dataset = dataset.batch(batch_size).cache().prefetch(buffer_size=tf.data.AUTOTUNE)

#   return dataset

In [10]:
# <s> aspect </s> </s> context </s>
def sentence_aspect_pair(X,polarities,tokenizer,batch_size,is_shuffle = False):
  """Encode (sentence, aspect) pairs and wrap them in a batched ``tf.data`` pipeline.

  Each pair is tokenised with the aspect as the first segment and the sentence
  as the second one, padded/truncated to ``MAX_LEN``. Only the sentence is
  truncated, so the aspect term always reaches the model intact.

  Args:
    X: sequence of ``(sentence, aspect)`` pairs.
    polarities: polarity names ("Negative", "Neutral", "Positive"), aligned with ``X``.
    tokenizer: Hugging Face tokenizer; its ``model_input_names`` define the feature keys.
    batch_size: number of samples per batch.
    is_shuffle: reshuffle the samples on every pass over the dataset.

  Returns:
    A dataset yielding ``(features, one_hot_labels)`` batches, where ``features``
    maps each model input name to an int32 array of width ``MAX_LEN``.
  """
  tag2idx = {"Neutral":1,"Positive":2,"Negative":0}

  features = {name:[] for name in tokenizer.model_input_names}

  for pair in X:
    encoded = tokenizer(pair[1],pair[0],padding = 'max_length',max_length = MAX_LEN,truncation="only_second")
    for name in tokenizer.model_input_names:
      features[name].append(encoded[name])

  # Token ids, segment ids and masks are integers; keep them as int32 (the dtype the
  # model's Input layers declare) instead of converting them to floats.
  features = {name:np.asarray(rows,dtype = 'int32') for name,rows in features.items()}

  y_encode = [tag2idx[polarity] for polarity in polarities]
  y_new = to_categorical(y_encode,num_classes = len(tag2idx))

  # Cache the encoded samples first and shuffle afterwards: caching downstream of
  # shuffle/batch would store the first epoch's order and replay it on every epoch.
  dataset = Dataset.from_tensor_slices((features, y_new)).cache()
  if is_shuffle:
    dataset = dataset.shuffle(buffer_size = len(y_new))
  dataset = dataset.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)

  return dataset

In [None]:
train = sentence_aspect_pair(X_train,y_train,tokenizer,BATCH_SIZE,is_shuffle=True)
dev = sentence_aspect_pair(X_dev,y_dev,tokenizer,BATCH_SIZE)


In [None]:
for i in train.take(1):
  print(i[0]['input_ids'][0])
  print(i[0]['token_type_ids'][0])

  print(tokenizer.decode(tf.cast(i[0]['input_ids'][0],tf.int32)))
  print(tokenizer.decode(tf.cast(i[0]['input_ids'][1],tf.int32)))
  print(tokenizer.decode(tf.cast(i[0]['input_ids'][2],tf.int32)))

In [11]:
import transformers
from tensorflow.keras.layers import Input, Dropout, Dense,concatenate,Bidirectional,LSTM,TimeDistributed,Lambda, Embedding
from tensorflow.keras.models import Model


## ORIGINAL MODEL

In [12]:
def original_model():
  """Build the aspect-sentiment classifier on top of the pretrained encoder.

  The encoder named by ``PRETRAINED_MODEL`` is loaded with hidden states
  exposed. The last four hidden states are concatenated along the feature
  axis, the vector at sequence position 0 is taken, and it is passed through
  dropout (rate 0.1) and a 3-way softmax layer.

  Returns:
    An uncompiled Keras ``Model`` whose inputs are the ``input_ids``,
    ``token_type_ids`` and ``attention_mask`` tensors, each of width ``MAX_LEN``.
  """
  # ``shape`` has to be a tuple; ``(MAX_LEN)`` without the trailing comma is a plain int.
  inputs = {
      "input_ids" : Input(shape = (MAX_LEN,),dtype = 'int32', name = 'input_ids'),
      "token_type_ids": Input(shape = (MAX_LEN,),dtype = 'int32', name = 'token_type_ids'),
      "attention_mask": Input(shape = (MAX_LEN,),dtype = 'int32', name = 'attention_mask')
  }

  encoder = transformers.TFAutoModel.from_pretrained(PRETRAINED_MODEL,output_hidden_states=True)
  hidden_states = encoder(inputs).hidden_states
  # Concatenate the last four layers, then keep only the first sequence position.
  concat = concatenate(
      tuple(hidden_states[-4:]),axis = -1
  ) [:,0,:]

  dropout = Dropout(0.1) (concat)
  output = Dense(3,activation = 'softmax') (dropout)
  return Model(inputs = inputs,outputs = output)

In [13]:
org_model = original_model()
org_model.summary()

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  177853440   ['attention_mask[0][0]',         
                                thPoolingAndCrossAt               'input_ids[0][0]',          

In [None]:
org_model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5),loss = 'categorical_crossentropy',metrics = "categorical_accuracy")
num_train_epochs = 50


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(
    monitor = 'val_loss',
    patience = 5,
    restore_best_weights = True
    )

In [None]:
org_model.fit(
    train,
    validation_data = dev,
    epochs = num_train_epochs,
    callbacks = [early_stop],
    verbose = 1
)

Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


<keras.callbacks.History at 0x7f82473cbdc0>

In [None]:
org_model.save_weights("/content/drive/MyDrive/Đồ án/ae_checkpoint/bert-multi-asp.h5")

In [16]:
# NOTE(review): this restores a different file than the one written by the save_weights cell
# (".../Đồ án/ae_checkpoint/bert-multi-asp.h5") — confirm that this checkpoint is the intended one.
org_model.load_weights("/content/drive/MyDrive/weights/bert-multi.h5")

## EVALUATION

In [17]:
def convert_unicode(text):
  """Return ``text`` with every character in precomposed (NFC) Unicode form.

  Vietnamese text is often stored with tone marks as separate combining
  characters (for example "a" followed by U+0300). NFC normalisation merges
  each such sequence into the single precomposed code point, so that the
  same word always yields the same string.

  This replaces a hand-maintained lookup table that was rebuilt on every
  call; the standard-library normalisation covers every Vietnamese
  letter/tone combination without needing literal tables in the source.

  Args:
    text: the string to normalise.

  Returns:
    The NFC-normalised string (unchanged when it is already precomposed).
  """
  # Function-scope import keeps this cell runnable on its own.
  import unicodedata
  return unicodedata.normalize("NFC", text)


In [18]:
sent_asp_data = LoadSentenceAspectData("/content/drive/MyDrive/Đồ án KHDL/test_300_Quy.json")
sentences, aspects,polarities = sent_asp_data.load()

In [None]:
tag2idx = {"Neutral":1,"Positive":2,"Negative":0}
idx2tag = {v:k for k,v in tag2idx.items()}


In [None]:
sentences, aspects,polarities = remove_duplicate_aspects(sentences,aspects,polarities)


In [None]:
text_normalize = TextNormalize()
sentences_normalized = np.asarray(list(map(lambda x:text_normalize.normalize(x),sentences)))
aspects_normalized = np.asarray(list(map(lambda x:text_normalize.normalize(x),aspects)))

In [None]:
sentences_normalized[:5],aspects_normalized[:5]

(array(['bánh bèo chén ở đây rất ấn tượng.bánh beo nhỏ xinh mỏng mỏng ăn kèm tôm cháy đúng điệu phong cách miền trung.quán có nhiều thực đơn món ăn phong phú mà giá cả lại rất hợp lí. mình hay tới đây ăn vào dịp cuối tuần lắm',
        'bánh bèo chén ở đây rất ấn tượng.bánh beo nhỏ xinh mỏng mỏng ăn kèm tôm cháy đúng điệu phong cách miền trung.quán có nhiều thực đơn món ăn phong phú mà giá cả lại rất hợp lí. mình hay tới đây ăn vào dịp cuối tuần lắm',
        'bánh bèo chén ở đây rất ấn tượng.bánh beo nhỏ xinh mỏng mỏng ăn kèm tôm cháy đúng điệu phong cách miền trung.quán có nhiều thực đơn món ăn phong phú mà giá cả lại rất hợp lí. mình hay tới đây ăn vào dịp cuối tuần lắm',
        'bánh bèo chén ở đây rất ấn tượng.bánh beo nhỏ xinh mỏng mỏng ăn kèm tôm cháy đúng điệu phong cách miền trung.quán có nhiều thực đơn món ăn phong phú mà giá cả lại rất h

In [None]:
X_pair = list(zip(sentences_normalized,aspects_normalized))
X_pair[:5],polarities[:5]

([('bánh bèo chén ở đây rất ấn tượng.bánh beo nhỏ xinh mỏng mỏng ăn kèm tôm cháy đúng điệu phong cách miền trung.quán có nhiều thực đơn món ăn phong phú mà giá cả lại rất hợp lí. mình hay tới đây ăn vào dịp cuối tuần lắm',
   'bánh beo'),
  ('bánh bèo chén ở đây rất ấn tượng.bánh beo nhỏ xinh mỏng mỏng ăn kèm tôm cháy đúng điệu phong cách miền trung.quán có nhiều thực đơn món ăn phong phú mà giá cả lại rất hợp lí. mình hay tới đây ăn vào dịp cuối tuần lắm',
   'bánh bèo chén'),
  ('bánh bèo chén ở đây rất ấn tượng.bánh beo nhỏ xinh mỏng mỏng ăn kèm tôm cháy đúng điệu phong cách miền trung.quán có nhiều thực đơn món ăn phong phú mà giá cả lại rất hợp lí. mình hay tới đây ăn vào dịp cuối tuần lắm',
   'giá cả'),
  ('bánh bèo chén ở đây rất ấn tượng.bánh beo nhỏ xinh mỏng mỏng ăn kèm tôm cháy đúng điệu phong cách miền trung.quán có nhiều thực đơn món ă

In [None]:
# <s> aspect </s> </s> context </s>
def sentence_aspect_pair(X,polarities,tokenizer,batch_size,is_shuffle = False):
  """Encode (sentence, aspect) pairs and wrap them in a batched ``tf.data`` pipeline.

  Each pair is tokenised with the aspect as the first segment and the sentence
  as the second one, padded/truncated to ``MAX_LEN``. Only the sentence is
  truncated, so the aspect term always reaches the model intact.

  Args:
    X: sequence of ``(sentence, aspect)`` pairs.
    polarities: polarity names ("Negative", "Neutral", "Positive"), aligned with ``X``.
    tokenizer: Hugging Face tokenizer; its ``model_input_names`` define the feature keys.
    batch_size: number of samples per batch.
    is_shuffle: reshuffle the samples on every pass over the dataset.

  Returns:
    A dataset yielding ``(features, one_hot_labels)`` batches, where ``features``
    maps each model input name to an int32 array of width ``MAX_LEN``.
  """
  tag2idx = {"Neutral":1,"Positive":2,"Negative":0}

  features = {name:[] for name in tokenizer.model_input_names}

  for pair in X:
    encoded = tokenizer(pair[1],pair[0],padding = 'max_length',max_length = MAX_LEN,truncation="only_second")
    for name in tokenizer.model_input_names:
      features[name].append(encoded[name])

  # Token ids, segment ids and masks are integers; keep them as int32 (the dtype the
  # model's Input layers declare) instead of converting them to floats.
  features = {name:np.asarray(rows,dtype = 'int32') for name,rows in features.items()}

  y_encode = [tag2idx[polarity] for polarity in polarities]
  y_new = to_categorical(y_encode,num_classes = len(tag2idx))

  # Cache the encoded samples first and shuffle afterwards: caching downstream of
  # shuffle/batch would store the first epoch's order and replay it on every epoch.
  dataset = Dataset.from_tensor_slices((features, y_new)).cache()
  if is_shuffle:
    dataset = dataset.shuffle(buffer_size = len(y_new))
  dataset = dataset.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)

  return dataset

In [None]:
test_data = sentence_aspect_pair(X_pair,polarities,tokenizer,2)


In [None]:
y_temp = np.asarray(list(map(lambda x: tag2idx[x],polarities)))
y_temp

array([2, 2, 2, ..., 1, 1, 2])

In [None]:
y_test_pred = np.argmax(org_model.predict(test_data,batch_size = BATCH_SIZE),axis=-1)
y_test_pred = y_test_pred.reshape((1,-1))[0]



In [None]:
print(y_test_pred)

[2 2 1 ... 1 2 2]


In [None]:
y_temp = np.asarray(list(map(lambda x: tag2idx[x],polarities)))
y_temp

array([2, 2, 2, ..., 1, 1, 2])

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_temp, y_test_pred,digits = 4 ))

              precision    recall  f1-score   support

           0     0.5533    0.6646    0.6039       164
           1     0.8158    0.5322    0.6442       932
           2     0.7404    0.9037    0.8139      1319

    accuracy                         0.7441      2415
   macro avg     0.7032    0.7002    0.6873      2415
weighted avg     0.7568    0.7441    0.7341      2415



In [30]:
rd_sample_text = "Thịt heo 2 đầu da, miếng nào cũng to, phải xé ra ăn mới hết, chứ 1 miếng to quá, ăn 1 lần cũng hơi phí. Dùng bánh tráng phơi sương nên ko cần nhúng nước. Đồ chua giòn, dĩa rau siêu to, đủ loại, có cả lá rừng luôn"
rd_sample_aspect = "lá rừng"

# Apply the same preprocessing as the training/evaluation data: Unicode conversion followed
# by TextNormalize.normalize. The normalised value must not be overwritten with the raw
# text afterwards, otherwise the model is fed un-normalised (e.g. mixed-case) input.
normal_text = TextNormalize().normalize(convert_unicode(rd_sample_text))
# Re-attach angle-bracket tokens that were split by spaces ("< name >" -> "<name>").
normal_text = re.sub(r"< (\w+) >",r"<\1>",normal_text)

normal_aspect = TextNormalize().normalize(convert_unicode(rd_sample_aspect))
normal_aspect = re.sub(r"< (\w+) >",r"<\1>",normal_aspect)

token = tokenizer(normal_aspect,normal_text,padding = 'max_length',max_length = MAX_LEN,truncation=True)

features = {i : [[token[i]]] for i in tokenizer.model_input_names}

te = Dataset.from_tensor_slices(features)

y_pred_polarity = org_model.predict(te,batch_size=1)
# y_pred = np.argmax(y_pred,axis=-1)
y_pred_polarity = idx2tag[np.argmax(y_pred_polarity,axis=-1)[0]]
y_pred_polarity



'Positive'