# INSTALL LIBRARIES

In [None]:
!pip install underthesea
!pip install transformers
!pip install vncorenlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting underthesea
  Downloading underthesea-6.2.0-py3-none-any.whl (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.5/993.5 kB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
Collecting underthesea-core==1.0.0 (from underthesea)
  Downloading underthesea_core-1.0.0-cp310-cp310-manylinux2010_x86_64.whl (599 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m599.6/599.6 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: underthesea-core, python-crfsuite, underthesea
Successfully installed python-crfsuite-0.9.9 underthesea-6.2.0 underthesea-core-1.0.

In [None]:
from bs4 import BeautifulSoup
import numpy as np
import re
# from underthesea import word_tokenize
from keras.utils import to_categorical
from transformers import AutoTokenizer
from tensorflow.data import Dataset
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences
import pandas as pd

In [None]:
from vncorenlp import VnCoreNLP


# LOAD DATA

## PREPROCESSING

In [None]:
class TextNormalize:
    """Rule-based clean-up for Vietnamese review text.

    ``normalize`` is the entry point. The remaining methods implement the
    tone-mark repositioning that ``normalize`` applies to every word.
    """

    # Emoji code-point ranges removed by ``normalize``; compiled once for all instances.
    _EMOJI_PATTERN = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                        "]+", flags=re.UNICODE)

    # Rows of ``vowels_table`` (ê, ô, ơ) that take the tone mark whenever they occur.
    _TONE_PRIORITY_ROWS = (4, 7, 8)

    def __init__(self):
        # Maps every vowel character (with or without tone) to (row, column) in vowels_table.
        self.vowels_to_ids = {}
        # Columns 0-5: the bare vowel followed by its five toned forms.
        # Column 6 looks like a keyboard (Telex-style) spelling of the vowel; it is never indexed.
        self.vowels_table = [
            ['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a' ],
            ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
            ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
            ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e' ],
            ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
            ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i' ],
            ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o' ],
            ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
            ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
            ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u' ],
            ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
            ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y' ]
        ]
        # Without this call the lookup stays empty and WordStandardized silently does nothing.
        self.createVowelsTable()

    def createVowelsTable(self):
        """Fill ``vowels_to_ids`` from ``vowels_table`` (last column excluded). Safe to call again."""
        for i in range(len(self.vowels_table)):
            for j in range(len(self.vowels_table[i]) - 1):
                self.vowels_to_ids[self.vowels_table[i][j]] = (i, j)

    def IsValidVietnameseWord(self,word):
        """Return True when all vowels of ``word`` form one contiguous run.

        Vowels may only sit next to other vowels: a word with two vowels
        separated by a consonant is rejected (and is then left untouched by
        ``WordStandardized``). A word without vowels is accepted.
        """
        last_vowel = -1
        for i, ch in enumerate(word):
            if ch not in self.vowels_to_ids:
                continue
            if last_vowel != -1 and i - last_vowel != 1:
                return False
            last_vowel = i
        return True

    def WordStandardized(self,word):
        """Move the tone mark of ``word`` to its canonical vowel.

        The tone is stripped from every vowel and re-applied to exactly one:
        the only vowel; else the first ê/ô/ơ; else, after a leading "qu"/"gi",
        the second vowel; else by the 2-/3-vowel position rules below.
        Words that fail ``IsValidVietnameseWord``, have no vowel, or match no
        rule are returned unchanged so a tone is never dropped.
        """
        if not self.IsValidVietnameseWord(word):
            return word

        chars = list(word)
        vowel_indexes = []
        qu_or_gi = False
        thanh_dieu = 0  # tone column (0 = no tone)

        # Locate the vowels, remember the tone and strip it from the characters.
        for i, ch in enumerate(chars):
            row, col = self.vowels_to_ids.get(ch, (-1, -1))
            if row == -1:
                continue
            if row == 9 and i != 0 and chars[i - 1] == 'q':      # "qu"
                qu_or_gi = True
            elif row == 5 and i != 0 and chars[i - 1] == 'g':    # "gi"
                qu_or_gi = True
            if col != 0:
                thanh_dieu = col
                chars[i] = self.vowels_table[row][0]
            vowel_indexes.append(i)

        if not vowel_indexes:
            return word

        def place_tone(position):
            """Put the remembered tone on the vowel at ``position`` and rebuild the word."""
            row = self.vowels_to_ids[chars[position]][0]
            chars[position] = self.vowels_table[row][thanh_dieu]
            return ''.join(chars)

        vowel_count = len(vowel_indexes)
        if vowel_count == 1:
            return place_tone(vowel_indexes[0])

        # ê, ô, ơ win over every positional rule, so scan ALL vowels before applying those.
        for idx_vowel in vowel_indexes:
            if self.vowels_to_ids[chars[idx_vowel]][0] in self._TONE_PRIORITY_ROWS:
                return place_tone(idx_vowel)

        # After "qu"/"gi" the first vowel belongs to the onset: the second vowel takes the tone.
        if qu_or_gi:
            if vowel_count in (2, 3):
                return place_tone(vowel_indexes[1])
            return word

        ends_with_vowel = vowel_indexes[-1] == len(chars) - 1
        if vowel_count == 2:
            return place_tone(vowel_indexes[0] if ends_with_vowel else vowel_indexes[-1])
        if vowel_count == 3:
            return place_tone(vowel_indexes[1] if ends_with_vowel else vowel_indexes[-1])

        # Four or more vowels: no rule applies, keep the word as written.
        return word

    def normalize(self,text):
        """Return a cleaned copy of ``text``.

        Steps, in order: lower-case; collapse elongated letters
        ("đẹpppp" -> "đẹp", digits are left alone so "5000" survives); drop
        emoji; canonicalise tone marks word by word; remove @mentions and
        #hashtags; strip punctuation; collapse repeated spaces.
        Leading/trailing spaces may remain.
        """
        # Lower-case
        text = text.lower()

        # Shorten elongated words; [^\W\d] = word characters except digits
        text = re.sub(r'([^\W\d])\1+',r'\1',text)

        # Remove emoji
        text = self._EMOJI_PATTERN.sub(r'',text)

        # Canonicalise tone marks
        text = ' '.join(self.WordStandardized(w) for w in text.split())

        # Remove mentions / hashtags. This must precede the punctuation strip,
        # which deletes the '@' the mention pattern depends on.
        text = re.sub("(@[A-Za-z0-9]+)|(#[0-9A-Za-z]+)"," ", text)

        # Strip punctuation
        text = re.sub(r"[:)^@!`~%;?(\+\-\'\"]+",r'',text)

        # Collapse the runs of spaces left behind by the removals above
        text = re.sub(r"( )\1+",r'\1',text)
        return text

In [None]:
def convert_unicode(text):
  """Return ``text`` with every character in canonical composed form (Unicode NFC).

  Vietnamese text often arrives with tone marks stored as separate combining
  characters (e.g. "a" + U+0300). NFC folds each such sequence into the single
  precomposed character, covering the hand-written toned-vowel table this
  function used before as well as any other composable sequence.
  """
  import unicodedata  # local import keeps this cell runnable on its own

  return unicodedata.normalize("NFC", text)


In [None]:
class LoadData():
    """Read a JSON-lines span-annotation file and turn it into token-level BIO data.

    Column 0 of the file is read as the review text and column 1 as a list of
    ``[start, end, label]`` character spans.
    """

    def __init__(self, file_path):
        """Parse ``file_path`` (one JSON object per line) into ``self.data``."""
        self.file_path = file_path

        self.data = pd.read_json(file_path, lines = True)
        self.X = []
        self.y = []

    def transform(self,x,label):
        """Re-space ``x`` around the labelled spans and tag every whitespace token.

        Args:
            x: the raw text.
            label: array of ``[start, end, tag]`` rows sorted by start (as built by
                ``ExtractAspectTermPosition``); may be empty.

        Returns:
            ``(text, tags)``: ``text`` has a space inserted on both sides of each
            span so span boundaries always coincide with token boundaries, and
            ``tags`` holds one of ``O`` / ``B-<tag>`` / ``I-<tag>`` per
            ``text.split()`` token.
        """
        if len(label) == 0:
            return x, np.array(["O"] * len(x.split()))

        starts = label[:,0].astype(int)
        ends = label[:,1].astype(int)
        tags = label[:,2]

        pieces = []
        y = []
        cursor = 0  # end offset of the previous span
        for start, end, tag in zip(starts, ends, tags):
            front = x[cursor:start]
            middle = x[start:end]
            cursor = end

            pieces.append(front + " " + middle + " ")
            y.extend(["O"] * len(front.split()))
            y.extend(f"B-{tag}" if j == 0 else f"I-{tag}" for j in range(len(middle.split())))

        # Text after the last span
        if cursor != len(x):
            tail = x[cursor:]
            pieces.append(tail)
            y.extend(["O"] * len(tail.split()))

        return "".join(pieces), np.array(y)

    def ExtractAspectTermPosition(self,span_labels):
        """Return the spans as an array of ``[start, end, label]`` rows sorted by start.

        Offsets are converted to ``int`` before sorting. Because each row mixes
        numbers and text, NumPy stores every cell as a string; ``transform``
        converts the offsets back with ``astype(int)``.
        """
        labels = [[int(ls[0]), int(ls[1]), ls[2]] for ls in span_labels]
        return np.array(sorted(labels,key = lambda x: x[0]))

    def load(self,):
        """Build and return ``(X, y)`` for every row of ``self.data``.

        The result lists are rebuilt from scratch on each call, so re-running
        the cell does not append a second copy of the data.
        """
        self.X = []
        self.y = []
        for i in range(len(self.data)):
            # NOTE(review): span offsets are applied to the stripped text; confirm the
            # annotations were made on text without leading whitespace.
            x = self.data.iloc[i,0].strip() #text
            span_labels = self.ExtractAspectTermPosition(self.data.iloc[i,1]) #label

            x,y = self.transform(x,span_labels)
            # Unicode normalisation happens after slicing so character offsets stay valid.
            self.X.append(convert_unicode(x))
            self.y.append(y)

        return self.X,self.y

In [None]:
class AlignLabel():
  """Keep token-level BIO tags aligned while the text is re-tokenised."""

  def __init__(self):
    pass

  def Convert2LabelPosition(self,label):
    """Collapse a BIO tag sequence into ``[first_token, last_token, B-tag]`` rows.

    A span starts at every ``B-`` tag (including one at index 0) and extends
    over the ``I-`` tags that follow it directly. Because each row mixes
    numbers and text, NumPy stores every cell as a string; callers convert the
    positions back with ``astype(int)``. Returns an empty array when there is
    no ``B-`` tag.
    """
    labels_position = []
    n_tokens = len(label)
    for start, tag in enumerate(label):
      if not tag.startswith("B-"):
        continue
      end = start
      # Extend over the contiguous I- run only; an O ends the span.
      while end + 1 < n_tokens and label[end + 1].startswith("I-"):
        end += 1
      labels_position.append([start, end, tag])

    return np.array(labels_position)

  def segment_and_alignLabel(self,x,y,tokenizer, SEP):
    """Normalise and word-segment ``x`` and re-derive its BIO tags.

    Args:
      x: text whose ``split()`` tokens line up one-to-one with ``y``.
      y: BIO tags for ``x``.
      tokenizer: object whose ``tokenize(text)`` returns a list of sentences,
        each a list of tokens.
      SEP: accepted for interface compatibility; not used.

    Returns:
      ``[text, tags]``: ``text`` is the segmented sentence with single spaces
      and ``tags`` has one entry per ``text.split()`` token.
    """
    preprocess = TextNormalize()

    def clean(seg):
      """Normalise ``seg``, run the segmenter and flatten its output to one string."""
      seg = preprocess.normalize(seg)
      if not seg.strip():
        return ""
      return " ".join(" ".join(sent) for sent in tokenizer.tokenize(seg))

    label = self.Convert2LabelPosition(y)

    if len(label) == 0:
      x_temp = " ".join(clean(x).split())
      return  [x_temp,np.array(["O"] * len(x_temp.split()))]

    first_index = label[:,0].astype(int)
    second_index = label[:,1].astype(int)
    asp_cate_pola = label[:,2]

    words = x.split()
    pieces = []
    y_new = []
    s = 0  # index of the first word not yet consumed
    for i in range(len(label)):
      front = clean(" ".join(words[s:first_index[i]]))
      middle = clean(" ".join(words[first_index[i]:second_index[i]+1]))
      s = second_index[i] + 1

      pieces.append(front)
      pieces.append(middle)
      y_new.extend(["O"] * len(front.split()))

      # Count the tokens the segmenter actually produced: a one-word span can
      # come back as several tokens (or none), and the tags must follow.
      aspect = asp_cate_pola[i][2:]  # drop the "B-" prefix
      y_new.extend(f"B-{aspect}" if j == 0 else f"I-{aspect}" for j in range(len(middle.split())))

    if s != len(words):
      tail = clean(" ".join(words[s:]))
      pieces.append(tail)
      y_new.extend(["O"] * len(tail.split()))

    a = " ".join(" ".join(pieces).split())
    return [a,np.array(y_new)]

  def tokenize_and_alignlabel(self,x,y,tag2idx,tokenizer,max_len=None):
    """Expand word-level BIO tags to sub-word positions of a transformer tokenizer.

    Args:
      x: text whose ``split()`` tokens line up with ``y``.
      y: BIO tags for ``x``.
      tag2idx: mapping from tag string to class id; must contain ``"O"``.
      tokenizer: callable returning a mapping with ``'input_ids'`` when called
        as ``tokenizer(text, add_special_tokens=False)``.
      max_len: length of the returned vector; defaults to the notebook-level
        ``MAX_LEN``.

    Returns:
      Float array of length ``max_len``. Position 0 (leading special token)
      and the position after the last sub-word (trailing special token) are
      ``O``. The first sub-word of a span is ``B-``, its other sub-words ``I-``,
      all remaining sub-words ``O``. Unused trailing positions stay 0.
      Anything beyond ``max_len`` is dropped.
    """
    if max_len is None:
      max_len = MAX_LEN

    # split() keeps the words aligned with y even when x contains repeated spaces.
    words = x.split()
    spans = self.Convert2LabelPosition(y)
    y_new = np.zeros(max_len)

    def n_subwords(tokens):
      """Number of sub-word ids the tokenizer produces for ``tokens``."""
      return len(tokenizer(" ".join(tokens),add_special_tokens = False)['input_ids'])

    if max_len > 0:
      y_new[0] = tag2idx['O']  # leading special token
    position = 1
    s = 0  # index of the first word not yet consumed
    for start, end, tag in spans:
      start, end = int(start), int(end)
      aspect = tag[2:]  # drop the "B-" prefix

      front_len = n_subwords(words[s:start])
      y_new[position:position + front_len] = tag2idx["O"]
      position += front_len

      span_len = n_subwords(words[start:end+1])
      if span_len >= 1 and position < max_len:
        y_new[position] = tag2idx[f'B-{aspect}']
      if span_len >= 2:
        y_new[position+1:position+span_len] = tag2idx[f'I-{aspect}']
      position += span_len
      s = end + 1

    # Remaining words plus one extra slot for the trailing special token.
    tail_len = n_subwords(words[s:])
    y_new[position:position + tail_len + 1] = tag2idx["O"]

    return y_new

  def transform(self,x_raw,y_raw,rdrsegmenter,SEP):
    """Apply ``segment_and_alignLabel`` to every sample.

    Returns ``(X, Y)``: ``X`` is an array of segmented sentences and ``Y`` a
    1-D object array holding one tag array per sentence (the tag arrays have
    different lengths, so they cannot be stacked).
    """
    X = []
    Y = np.empty(len(x_raw), dtype=object)
    for i in range(len(x_raw)):
      x, y = self.segment_and_alignLabel(x_raw[i],y_raw[i],rdrsegmenter,SEP)
      X.append(x)
      Y[i] = y
    return np.asarray(X),Y

## READ DATA

In [None]:
# Parse the training JSONL into re-spaced texts (X_raw) and per-token BIO tag arrays (y_raw).
# NOTE(review): absolute Google Drive path; it only resolves in a Colab session with that Drive mounted.
data = LoadData("/content/drive/MyDrive/Nhóm - Tiến + Quý + Khanh + Văn/IE403 - Khai thác dữ liệu truyền thông xã hội/Đồ án/dataset/train.jsonl")
X_raw,y_raw = data.load()


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the labelled training file as a dev set; the fixed seed makes the split repeatable.
X_train, X_dev, y_train, y_dev = train_test_split(X_raw, y_raw, test_size=0.3, random_state=42)

In [None]:
# Read the pre-trained word2vec text file: a header line, then one "<word> <v1> <v2> ..." line per word.
# `words` keeps the file order (it defines the vocabulary indices); `embedding_words` maps word -> vector.
words = []
embedding_words = {}

with open('/content/drive/MyDrive/Đồ án KHDL/word2vec_vi_words_100dims.txt','r',encoding='utf-8') as f:
  next(f, None)  # skip the header line
  for line in f:
    # rstrip() first: a trailing space before the newline would otherwise leave an
    # unparsable last field and the whole vector would be discarded.
    value = line.rstrip().split(' ')
    word = value[0]
    if not word:
      continue  # blank line
    words.append(word)
    try:
      embedding_words[word] = np.asarray(value[1:],dtype=np.float32)
    except ValueError:
      # Malformed vector: the word stays in the vocabulary without a pre-trained vector.
      pass


In [None]:
# Width of each embedding row; the word2vec file loaded above is named "..._100dims".
embedding_dim = 100

In [None]:
# Number of words read from the embedding file (the PAD/UNK slots of word2idx are not counted).
num_word = len(words)

In [None]:
# Vocabulary lookup: real words start at index 2 so that 0 and 1 stay reserved.
word2idx = {w:i for i,w in enumerate(words,start = 2)}
word2idx['PAD'] = 0  # padding id used by pad_sequences
word2idx['UNK'] = 1  # id for out-of-vocabulary words

In [None]:
# Reverse lookup: vocabulary index -> word.
idx2word = {i:w for w,i in word2idx.items()}

In [None]:
# One row per vocabulary entry INCLUDING the PAD/UNK slots: the Embedding layer is built with
# input_dim=len(word2idx), and a constant initializer must have exactly that many rows.
embedding_matrix = np.ones((len(word2idx),embedding_dim))

In [None]:
# Copy the pre-trained vector of every vocabulary word into its row. Words without a usable
# vector (PAD, UNK, malformed lines) get a random normal row instead.
# Every index is filled: with a cutoff, rarer words that do occur in the data would all
# share the same constant all-ones row.
for w,i in word2idx.items():
  if i >= embedding_matrix.shape[0]:
    continue  # index has no row in the table
  embedding_vector = embedding_words.get(w)
  if embedding_vector is not None and embedding_vector.shape == (embedding_dim,):
    embedding_matrix[i] = embedding_vector
  else:
    embedding_matrix[i] = np.random.randn(embedding_dim)


In [None]:
embedding_matrix

array([[ 5.74881658e-02, -8.54510441e-02,  7.12853372e-02, ...,
        -1.55041525e-02, -9.15896967e-02, -4.40163277e-02],
       [-1.33968771e-01,  7.32150301e-02,  9.38539568e-04, ...,
        -5.94867505e-02,  9.17428359e-02, -6.27832860e-02],
       [-1.34450188e-02,  8.29119608e-02,  9.51128900e-02, ...,
        -1.38094872e-01,  7.76888207e-02, -2.10761756e-01],
       ...,
       [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
         1.00000000e+00,  1.00000000e+00,  1.00000000e+00]])

In [None]:
MAX_LEN = 60
def encoded(X):
  """Turn sentences into fixed-length rows of vocabulary ids.

  Each sentence is split on whitespace; words missing from ``word2idx`` map to
  the ``UNK`` id. Rows are right-padded with the ``PAD`` id or right-truncated
  to ``MAX_LEN``.
  """
  unk = word2idx["UNK"]
  x_new = [[word2idx.get(word, unk) for word in sent.split()] for sent in X]
  return pad_sequences(x_new,maxlen=MAX_LEN,padding='post',truncating = 'post',value = word2idx['PAD'])

In [None]:
def getTag2idx():
  """
    Build the tag vocabulary and return (tag2idx, idx2tag).

    Tags are "<B|I>-<ASPECT>#<POLARITY>". Ids start at 2 and run through all B- tags
    (positive, neutral, negative block by block) and then all I- tags in the same
    order; "O" is 1 and "PAD" is 0.
  """
  aspects = ["SCREEN","CAMERA","FEATURES","BATTERY","PERFORMANCE","STORAGE","DESIGN","PRICE","GENERAL","SER&ACC"]

  # One list per (prefix, polarity) pair, in id order.
  ordered_groups = []
  for prefix in ("B", "I"):
    for polarity in ("POSITIVE", "NEUTRAL", "NEGATIVE"):
      ordered_groups.append([f"{prefix}-{name}#{polarity}" for name in aspects])

  all_labels = np.concatenate(ordered_groups)
  tag2idx = dict(zip(all_labels, range(2, len(all_labels) + 2)))
  tag2idx["O"] = 1
  tag2idx["PAD"] = 0
  idx2tag = dict((index, tag) for tag, index in tag2idx.items())
  return tag2idx, idx2tag

In [None]:
# Tag vocabulary shared by label encoding, the model's output size and decoding.
tag2idx, idx2tag = getTag2idx()

In [None]:
# Sentences -> (n_samples, MAX_LEN) integer arrays of vocabulary ids.
X_train_encoded = encoded(X_train)
X_dev_encoded = encoded(X_dev)


In [None]:
X_train_encoded

array([[ 3477,  6178,   742, ...,     0,     0,     0],
       [ 4449,   256,     2, ...,   611,  1262,  1458],
       [ 4449,   162,   131, ...,     0,     0,     0],
       ...,
       [ 4551,   186,   564, ...,     0,     0,     0],
       [40912,    26,  7147, ...,     0,     0,     0],
       [ 5869,     1,  4576, ...,     0,     0,     0]], dtype=int32)

In [None]:
def _encode_tags(tag_sequences):
  """Tag strings -> one-hot array of shape (n_samples, MAX_LEN, len(tag2idx)).

  Sequences are right-padded with the PAD id or right-truncated to MAX_LEN,
  then one-hot encoded in a single vectorised call.
  """
  ids = [[tag2idx[w] for w in y] for y in tag_sequences]
  ids = pad_sequences(ids,maxlen=MAX_LEN,padding='post',truncating = 'post',value = tag2idx['PAD'])
  return to_categorical(ids,num_classes = len(tag2idx))

y_train_encoded = _encode_tags(y_train)
y_dev_encoded = _encode_tags(y_dev)


# MODEL

## CRF CLASS

In [None]:
# tensorflow-addons supplies the crf_log_likelihood / crf_decode ops imported for the CRF layer below.
!pip install tensorflow-addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (591 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m591.0/591.0 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.20.0 typeguard-2.13.3


In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
from tensorflow_addons.text import crf_log_likelihood, crf_decode


class CRF(L.Layer):
    """Linear-chain CRF layer on top of ``tensorflow_addons.text``.

    The layer owns one trainable ``(output_dim, output_dim)`` transition
    matrix. In training mode ``call`` passes its input through unchanged (the
    scores are consumed by ``loss``); otherwise it returns the Viterbi-decoded
    path as one-hot vectors.
    """

    def __init__(self,
                 output_dim,
                 sparse_target=True,
                 **kwargs):
        """
        Args:
            output_dim (int): the number of labels to tag each temporal input.
            sparse_target (bool): when True, ``y_true`` is reduced with an
                argmax over its last axis before use (i.e. it is expected
                one-hot); when False it is used as given.
        Input shape:
            (batch_size, sentence length, output_dim)
        Output shape:
            (batch_size, sentence length, output_dim)
        """
        super(CRF, self).__init__(**kwargs)
        self.output_dim = int(output_dim)
        self.sparse_target = sparse_target
        self.input_spec = L.InputSpec(min_ndim=3)
        self.supports_masking = False
        # Set by call(); read by the loss closure.
        self.sequence_lengths = None
        # Created in build().
        self.transitions = None

    def build(self, input_shape):
        """Validate the input's last dimension and create the transition matrix."""
        assert len(input_shape) == 3
        f_shape = tf.TensorShape(input_shape)
        input_spec = L.InputSpec(min_ndim=3, axes={-1: f_shape[-1]})

        if f_shape[-1] is None:
            raise ValueError('The last dimension of the inputs to `CRF` '
                             'should be defined. Found `None`.')
        if f_shape[-1] != self.output_dim:
            raise ValueError('The last dimension of the input shape must be equal to output'
                             ' shape. Use a linear layer if needed.')
        self.input_spec = input_spec
        self.transitions = self.add_weight(name='transitions',
                                           shape=[self.output_dim, self.output_dim],
                                           initializer='glorot_uniform',
                                           trainable=True)
        self.built = True

    def compute_mask(self, inputs, mask=None):
        # Just pass the received mask from previous layer, to the next layer or
        # manipulate it if this layer changes the shape of the input
        return mask

    def call(self, inputs, sequence_lengths=None, training=None, **kwargs):
        """Return ``inputs`` in training mode, else the one-hot Viterbi path.

        ``sequence_lengths``, when given, must be an int32 tensor of shape
        (batch, 1); otherwise every sequence is treated as full length.
        """
        sequences = tf.convert_to_tensor(inputs, dtype=self.dtype)
        if sequence_lengths is not None:
            assert len(sequence_lengths.shape) == 2
            assert tf.convert_to_tensor(sequence_lengths).dtype == 'int32'
            seq_len_shape = tf.convert_to_tensor(sequence_lengths).get_shape().as_list()
            assert seq_len_shape[1] == 1
            self.sequence_lengths = K.flatten(sequence_lengths)
        else:
            self.sequence_lengths = tf.ones(tf.shape(inputs)[0], dtype=tf.int32) * (
                tf.shape(inputs)[1]
            )

        viterbi_sequence, _ = crf_decode(sequences,
                                         self.transitions,
                                         self.sequence_lengths)
        output = K.one_hot(viterbi_sequence, self.output_dim)
        # Honour an explicit `training` flag; None falls back to the global learning phase.
        return K.in_train_phase(sequences, output, training=training)

    @property
    def loss(self):
        """Loss function for ``model.compile``: mean negative CRF log-likelihood."""
        def crf_loss(y_true, y_pred):
            y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
            # Discard the returned transition tensor: assigning it to
            # self.transitions would replace the trainable weight with a plain tensor.
            log_likelihood, _ = crf_log_likelihood(
                y_pred,
                tf.cast(K.argmax(y_true), dtype=tf.int32) if self.sparse_target else y_true,
                self.sequence_lengths,
                transition_params=self.transitions,
            )
            return tf.reduce_mean(-log_likelihood)
        return crf_loss

    @property
    def accuracy(self):
        """Metric function for ``model.compile``: token accuracy of the Viterbi path."""
        def viterbi_accuracy(y_true, y_pred):
            # -1e10 to avoid zero at sum(mask)
            mask = K.cast(
                K.all(K.greater(y_pred, -1e10), axis=2), K.floatx())
            shape = tf.shape(y_pred)
            sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
            y_pred, _ = crf_decode(y_pred, self.transitions, sequence_lengths)
            if self.sparse_target:
                y_true = K.argmax(y_true, 2)
            y_pred = K.cast(y_pred, 'int32')
            y_true = K.cast(y_true, 'int32')
            corrects = K.cast(K.equal(y_true, y_pred), K.floatx())
            return K.sum(corrects * mask) / K.sum(mask)
        return viterbi_accuracy

    def compute_output_shape(self, input_shape):
        tf.TensorShape(input_shape).assert_has_rank(3)
        return input_shape[:2] + (self.output_dim,)

    def get_config(self):
        """Return the constructor arguments.

        Only values ``__init__`` accepts are included, so the config
        round-trips through ``from_config``. The transition matrix is a layer
        weight and is saved with the model's weights, not in the config.
        """
        config = {
            'output_dim': self.output_dim,
            'sparse_target': self.sparse_target,
        }
        base_config = super(CRF, self).get_config()
        return dict(base_config, **config)



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
from keras.initializers import Constant
from keras.layers import Dense,Input,GlobalAveragePooling1D ,concatenate,Dropout,GRU,Bidirectional,TimeDistributed, Embedding, Attention, LSTM,Convolution1D,MaxPooling1D,Flatten,SpatialDropout1D,LeakyReLU,AveragePooling1D,MultiHeadAttention,GlobalMaxPooling1D,Dropout
from keras.models import Model
from keras.optimizers import Adamax,Adam
from keras.losses import CategoricalCrossentropy,BinaryCrossentropy
from keras.regularizers import L1,L2
# from tensorflow_addons.layers import CRF
# from tensorflow_addons.losses import SigmoidFocalCrossEntropy
from keras.initializers import Orthogonal
from keras.callbacks import EarlyStopping

In [None]:
# BiLSTM tagger: pre-trained embeddings -> bidirectional LSTM -> per-token softmax over the tags.
# The Embedding dimensions are read from embedding_matrix itself because a constant
# initializer must match the layer's weight shape exactly.
word_ids = Input(shape=(MAX_LEN,))
embedded = Embedding(input_dim= embedding_matrix.shape[0], output_dim=embedding_matrix.shape[1],
                  input_length=MAX_LEN, embeddings_initializer = Constant(embedding_matrix))(word_ids)
bi_lstm = Bidirectional(LSTM(units=100, return_sequences=True,
                           recurrent_dropout=0.1))(embedded)

tag_probs = TimeDistributed(Dense(len(tag2idx), activation="softmax"))(bi_lstm)

w_model = Model(word_ids,tag_probs)




Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 60)]              0         
                                                                 
 embedding_2 (Embedding)     (None, 60, 100)           158750700 
                                                                 
 bidirectional_2 (Bidirectio  (None, 60, 200)          160800    
 nal)                                                            
                                                                 
 time_distributed_2 (TimeDis  (None, 60, 62)           12462     
 tributed)                                                       
                                                                 
Total params: 158,923,962
Trainable params: 158,923,962
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Categorical cross-entropy pairs with the one-hot label tensors and the softmax output.
w_model.compile(optimizer=Adamax(learning_rate = 0.005),loss = 'categorical_crossentropy')
w_model.summary()


Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 60)]              0         
                                                                 
 embedding_2 (Embedding)     (None, 60, 100)           158750700 
                                                                 
 bidirectional_2 (Bidirectio  (None, 60, 200)          160800    
 nal)                                                            
                                                                 
 time_distributed_2 (TimeDis  (None, 60, 62)           12462     
 tributed)                                                       
                                                                 
Total params: 158,923,962
Trainable params: 158,923,962
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Stop once val_loss has not improved for 2 consecutive epochs and restore the best weights seen.
callback = EarlyStopping('val_loss',patience = 2,restore_best_weights=True)

In [None]:
# Training pipeline: cache the encoded samples, reshuffle them every epoch, then batch.
# Keras does not shuffle a tf.data.Dataset itself, so without this every epoch would see
# the batches in the same order.
train = tf.data.Dataset.from_tensor_slices((X_train_encoded,y_train_encoded))
train = train.cache().shuffle(len(X_train_encoded), seed=42).batch(8).prefetch(buffer_size=tf.data.AUTOTUNE)

# Validation pipeline: order does not affect the loss, so no shuffle.
dev = tf.data.Dataset.from_tensor_slices((X_dev_encoded,y_dev_encoded))
dev = dev.batch(8).cache().prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
train

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 60), dtype=tf.int32, name=None), TensorSpec(shape=(None, 60, 62), dtype=tf.float32, name=None))>

In [None]:
# Train for at most 50 epochs; the EarlyStopping callback normally ends the run sooner.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so the metrics
# recorded further down may come from a manually interrupted run.
w_model.fit(
    train,
    validation_data = dev,
    epochs = 50,
    callbacks = [callback],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50

KeyboardInterrupt: ignored

# EVALUATION

In [None]:
# Parse the test JSONL the same way as the training file.
# NOTE(review): absolute Google Drive path; it only resolves in a Colab session with that Drive mounted.
test_data = LoadData("/content/drive/MyDrive/Nhóm - Tiến + Quý + Khanh + Văn/IE403 - Khai thác dữ liệu truyền thông xã hội/Đồ án/dataset/test.jsonl")
X_test,y_test = test_data.load()

In [None]:
# Test sentences -> (n_samples, MAX_LEN) integer array of vocabulary ids.
X_test_encoded = encoded(X_test)


In [None]:
# Gold tags as integer ids, padded/truncated to MAX_LEN. Unlike the train/dev labels these
# stay as ids (no one-hot) because they are only decoded back to strings for evaluation.
y_test_encoded = [[tag2idx[w] for w in y] for y in y_test]
y_test_encoded = pad_sequences(y_test_encoded,maxlen=MAX_LEN,padding='post',truncating = 'post',value = tag2idx['PAD'])

In [None]:
# All gold tag ids flattened into one 1-D array.
# NOTE(review): y_temp is not referenced again in the visible part of the notebook.
y_temp = y_test_encoded.reshape(1,-1)[0]

In [None]:
def pred2tag(y):
  """Return a copy of the 2-D id array ``y`` with every id replaced by its tag string.

  The copy has dtype ``object``; ``y`` itself is left untouched. Ids are looked
  up in the notebook-level ``idx2tag`` mapping.
  """
  tagged = y.astype('object')
  for r in range(tagged.shape[0]):
    row_view = tagged[r]  # view into `tagged`, so writes land in the copy
    for c in range(tagged.shape[1]):
      row_view[c] = idx2tag[row_view[c]]
  return tagged

In [None]:
# Gold tag strings, same shape as the padded id array.
true = pred2tag(y_test_encoded)
# list(...) is required: np.array on a dict view yields a 0-d object array, not an array of tags.
tag = np.array(list(tag2idx.keys()))

In [None]:
# Entity types handed to the evaluator: every "<ASPECT>#<POLARITY>" pair, without B-/I- prefixes.
# The aspect list repeats the one inside getTag2idx; the two must be kept in sync by hand.
aspect = np.array(["SCREEN","CAMERA","FEATURES","BATTERY","PERFORMANCE","STORAGE","DESIGN","PRICE","GENERAL","SER&ACC"])
func_add_pola = lambda aspect,pola: [aspect[i] + "#" + pola for i in range(len(aspect))]

aspect_pos = func_add_pola(aspect,"POSITIVE")
aspect_neu = func_add_pola(aspect,"NEUTRAL")
aspect_neg = func_add_pola(aspect,"NEGATIVE")

tags = np.concatenate([aspect_pos,aspect_neu,aspect_neg])

In [None]:
# Batched test pipeline. It is only passed to predict() in this notebook, so the label half
# of each element (integer ids here, not one-hot) is not consumed by the model.
test = tf.data.Dataset.from_tensor_slices((X_test_encoded,y_test_encoded))
test = test.batch(8).cache().prefetch(buffer_size=tf.data.AUTOTUNE)


## BiLSTM

In [None]:
# Most probable tag id at every token position (argmax over the class axis).
y_test_pred = np.argmax(w_model.predict(test),axis=-1)



In [None]:
# Predicted ids -> tag strings.
pred = pred2tag(y_test_pred)

In [None]:
pred

array([['B-GENERAL#POSITIVE', 'I-GENERAL#POSITIVE', 'O', ..., 'PAD',
        'PAD', 'PAD'],
       ['O', 'O', 'O', ..., 'PAD', 'PAD', 'PAD'],
       ['B-GENERAL#POSITIVE', 'I-GENERAL#POSITIVE', 'I-GENERAL#POSITIVE',
        ..., 'PAD', 'PAD', 'PAD'],
       ...,
       ['O', 'O', 'O', ..., 'PAD', 'PAD', 'PAD'],
       ['O', 'O', 'O', ..., 'O', 'O', 'I-SER&ACC#POSITIVE'],
       ['B-PERFORMANCE#POSITIVE', 'I-PERFORMANCE#POSITIVE',
        'I-PERFORMANCE#POSITIVE', ..., 'PAD', 'PAD', 'PAD']], dtype=object)

In [None]:
# https://github.com/MantisAI/nervaluate
from nervaluate import Evaluator

# Span-level evaluation restricted to the entity types listed in `tags`.
# NOTE(review): `true` and `pred` still contain 'PAD' entries for positions beyond each
# sentence's real length; confirm the evaluator treats them the way you intend.
evaluator = Evaluator(true, pred, tags=tags, loader="list")

results, results_by_tag = evaluator.evaluate()


In [None]:
results

{'ent_type': {'correct': 4869,
  'incorrect': 1472,
  'partial': 0,
  'missed': 480,
  'spurious': 4164,
  'possible': 6821,
  'actual': 10505,
  'precision': 0.4634935744883389,
  'recall': 0.7138249523530275,
  'f1': 0.5620454807803301},
 'partial': {'correct': 3294,
  'incorrect': 0,
  'partial': 3047,
  'missed': 480,
  'spurious': 4164,
  'possible': 6821,
  'actual': 10505,
  'precision': 0.45859114707282245,
  'recall': 0.7062747397742266,
  'f1': 0.55610065797068},
 'strict': {'correct': 2988,
  'incorrect': 3353,
  'partial': 0,
  'missed': 480,
  'spurious': 4164,
  'possible': 6821,
  'actual': 10505,
  'precision': 0.28443598286530225,
  'recall': 0.4380589356399355,
  'f1': 0.3449151564123283},
 'exact': {'correct': 3294,
  'incorrect': 3047,
  'partial': 0,
  'missed': 480,
  'spurious': 4164,
  'possible': 6821,
  'actual': 10505,
  'precision': 0.31356496906235126,
  'recall': 0.48292039290426625,
  'f1': 0.3802377929123861}}

## BiLSTM-CRF

In [None]:
# NOTE(review): this section is titled BiLSTM-CRF, but no model using the CRF layer is built
# in the visible notebook; this line predicts with the same `w_model` as the BiLSTM section.
# The differing saved outputs therefore depend on kernel state not shown here; verify.
y_test_pred = np.argmax(w_model.predict(test),axis=-1)



In [None]:
# Predicted ids -> tag strings (overwrites the `pred` of the BiLSTM section).
pred = pred2tag(y_test_pred)

In [None]:
pred

array([['O', 'O', 'O', ..., 'PAD', 'PAD', 'PAD'],
       ['I-PERFORMANCE#NEGATIVE', 'I-PERFORMANCE#NEGATIVE',
        'I-PERFORMANCE#NEGATIVE', ..., 'PAD', 'PAD', 'PAD'],
       ['I-PERFORMANCE#NEGATIVE', 'I-PERFORMANCE#NEGATIVE',
        'I-PERFORMANCE#NEGATIVE', ..., 'PAD', 'PAD', 'PAD'],
       ...,
       ['I-PERFORMANCE#NEGATIVE', 'I-PERFORMANCE#NEGATIVE',
        'I-PERFORMANCE#NEGATIVE', ..., 'PAD', 'PAD', 'PAD'],
       ['I-PERFORMANCE#NEGATIVE', 'I-PERFORMANCE#NEGATIVE',
        'I-PERFORMANCE#NEGATIVE', ..., 'I-PERFORMANCE#NEGATIVE',
        'I-PERFORMANCE#NEGATIVE', 'I-PERFORMANCE#NEGATIVE'],
       ['I-PERFORMANCE#NEGATIVE', 'I-PERFORMANCE#NEGATIVE',
        'I-PERFORMANCE#NEGATIVE', ..., 'PAD', 'PAD', 'PAD']], dtype=object)

In [None]:
# NOTE(review): nervaluate is already imported in an earlier cell of this notebook; on a
# fresh kernel the package has to be installed before that import runs.
!pip install nervaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nervaluate
  Downloading nervaluate-0.1.8-py3-none-any.whl (24 kB)
Installing collected packages: nervaluate
Successfully installed nervaluate-0.1.8


In [None]:
# https://github.com/MantisAI/nervaluate
from nervaluate import Evaluator

# Same span-level evaluation as in the BiLSTM section, applied to the current `pred`.
evaluator = Evaluator(true, pred, tags=tags, loader="list")

results, results_by_tag = evaluator.evaluate()


In [None]:
results['strict']

{'correct': 8,
 'incorrect': 0,
 'partial': 1640,
 'missed': 5173,
 'spurious': 26,
 'possible': 6821,
 'actual': 1674,
 'precision': 0.4946236559139785,
 'recall': 0.12138982553877731,
 'f1': 0.19493819894055325}