<a href="https://colab.research.google.com/github/lavatus/CS321/blob/main/ABSA_Hotel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone -q https://github.com/DNThuan/AspectBasedSentimentAnalysis.git
!pwd
!ls

/content
AspectBasedSentimentAnalysis  sample_data


# 1) Read data

In [2]:
import numpy as np
import json
import pandas as pd


In [3]:
def read_data(path):
  with open(path) as f:
    data = f.read().split("\n\n")
  f.close()
  for i in range(len(data)):
    temp=data[i].split("\n")
    data[i]=temp[1:3]
  return data

In [4]:
path_train = "/content/AspectBasedSentimentAnalysis/VLSP2018_SA_Hotel/1-VLSP2018-SA-Hotel-train (7-3-2018).txt"
path_dev =  "/content/AspectBasedSentimentAnalysis/VLSP2018_SA_Hotel/2-VLSP2018-SA-Hotel-dev (7-3-2018).txt"
path_test = "/content/AspectBasedSentimentAnalysis/VLSP2018_SA_Hotel/3-VLSP2018-SA-Hotel-test (8-3-2018).txt"

train = np.array(read_data(path_train))
dev = np.array(read_data(path_dev))
test = np.array(read_data(path_test))

print("Train: ",train.shape)
print("Dev: ",dev.shape)
print("Test: ",test.shape)


Train:  (3000, 2)
Dev:  (2000, 2)
Test:  (600, 2)


#2) Preprocessing data

## 2.1 Review data

### 2.1.1 Delete emoji

In [5]:
!pip install -q emoji
import emoji
from sklearn.preprocessing import  FunctionTransformer

def Delete_emoji(texts):
    return np.array([emoji.get_emoji_regexp().sub('', text) for text in texts])

delete_emoji = FunctionTransformer(Delete_emoji)

[?25l[K     |██                              | 10 kB 11.8 MB/s eta 0:00:01[K     |███▉                            | 20 kB 17.1 MB/s eta 0:00:01[K     |█████▉                          | 30 kB 20.4 MB/s eta 0:00:01[K     |███████▊                        | 40 kB 23.4 MB/s eta 0:00:01[K     |█████████▋                      | 51 kB 26.0 MB/s eta 0:00:01[K     |███████████▋                    | 61 kB 27.8 MB/s eta 0:00:01[K     |█████████████▌                  | 71 kB 26.4 MB/s eta 0:00:01[K     |███████████████▍                | 81 kB 26.8 MB/s eta 0:00:01[K     |█████████████████▍              | 92 kB 28.4 MB/s eta 0:00:01[K     |███████████████████▎            | 102 kB 30.1 MB/s eta 0:00:01[K     |█████████████████████▏          | 112 kB 30.1 MB/s eta 0:00:01[K     |███████████████████████▏        | 122 kB 30.1 MB/s eta 0:00:01[K     |█████████████████████████       | 133 kB 30.1 MB/s eta 0:00:01[K     |███████████████████████████     | 143 kB 30.1 MB/s eta 0:

### 2.1.2 Replace value of money by special character

In [6]:
import re

def Replace_Symbol(texts):
  texts_result = []
  for text in texts:
    distance_pattern = "([0-9.,]{1,9}?.km)|([0-9.,]{1,9}?.cây số)|([0-9.,]{1,9}?.cây)|([0-9.,]{1,9}?.mét)|([0-9.,]{1,3}?.m)"
    text_result = re.sub(distance_pattern, 'khoang_cach', text)
    money_pattern = "(\d{1,3}k.{0})|([0-9.]{1,9}?.vnd)|([0-9.]{1,9}?.việt nam đồng)|([0-9.]{1,9}?.đồng)"
    text_result = re.sub(money_pattern, 'gia_tien', text_result)
    texts_result.append(text_result)
  return texts_result
replace_symbol = FunctionTransformer(Replace_Symbol)

### 2.1.3 Delete special character

In [7]:
def Delete_Special_Character(texts):
  texts_result = []
  for text in texts:
    special_character_pattern = "[+=<>@#$%^&~]"
    text_result = re.sub(special_character_pattern, '', text)
    words = text_result.split()
    text_result = ' '.join(words)
    texts_result.append(text_result)
  return texts_result
delete_special_character = FunctionTransformer(Delete_Special_Character)

### 2.1.4 Normalize elongate words

In [8]:
def Normalize_Elongate_Words(texts):
  texts_result = []
  for text in texts:
    elongate_pattern = r"(\w)\1*"
    text_result = re.sub(elongate_pattern, r'\1', text)
    texts_result.append(text_result)
  return texts_result
normalize_elongate_words = FunctionTransformer(Normalize_Elongate_Words)

### 2.1.5 Replace negative words

In [144]:

def Replace_Negative_Words(texts):
  texts_result = []
  for text in texts:

    hotel_pattern = r"\bksạn\b|\bk sạn\b|\bks\b|\bKS\b|\bKs\b"
    new = re.sub(hotel_pattern, 'khách sạn', text)
    new = re.sub(r"\bnc\b", 'nước', new)
    new = re.sub(r"\bnvs\b|\bnhà vs\b", 'nhà vệ sinh', new)
    new = re.sub(r"\bnv\b", 'nhân viên', new)
    new = re.sub(r"\bvs\b", 'vệ sinh', new)
    
    negative_pattern = r"\bkh\b|\bko\b|\bkhg\b|\bkhong\b|\bk\b|\bhông\b|\bhem\b|\bk0\b"
    new = re.sub(negative_pattern, 'không', new)
    new = re.sub(r"\bdc\b|\bdk\b", 'được', new)

    new = re.sub(r" 1 ", " một ", new)

    texts_result.append(new)
  return texts_result
replace_negative_words = FunctionTransformer(Replace_Negative_Words)

### 2.1.6 Pos tagging

In [181]:
!pip install pyvi
from pyvi import ViTokenizer



In [170]:
def Pos_Tagging(texts):
  result = []
  for text in texts:
    postag =  ViPosTagger.postagging(ViTokenizer.tokenize(text))
    list_drop = ["F","E","P","L","T"]
    new_result = ""
    for index, catalog in enumerate(postag[1]):
      if catalog not in list_drop:
        new_result +=postag[0][index] + " "
    result.append(new_result)
  return result
pos_tagging = FunctionTransformer(Pos_Tagging)

## 2.2 Tag data

### 2.2.1 Label separation

In [10]:
# Tìm vị trí các cặp dấu ngoặc
# Input: một nhãn dạng string
def find_start_end(label):
  start = 0
  end = 0
  lst_start=[]
  lst_end=[]
  for index ,char in enumerate(label):
    if char == "{":
      start = index
      lst_start.append(start)
    elif char == "}":
      end = index
      lst_end.append(end)
  return tuple(zip(lst_start,lst_end))

In [11]:
def Label_to_ListDict(labels):
  list_dict_label = list()
  for label in labels:
    lst = []
    index = tuple(find_start_end(label))
    for i in index:
      dict_label = dict()
      aspect, polarity = label[i[0]+1:i[1]].replace(" ","").split(",")
      dict_label[aspect] = polarity
      lst.append(dict_label)
    list_dict_label.append(lst)
  return list_dict_label

In [12]:
def Label_str_to_list(label):
  index = tuple(find_start_end(label))
  aspect_temp=[]
  polarity_temp=[]
  for i in index:
    temp = label[i[0]+1:i[1]].replace(" ","").split(",")
    aspect_temp.append(temp[0])
    polarity_temp.append(temp[1])
  return aspect_temp, polarity_temp

In [13]:
def separate_label(labels):
  aspect= []
  polarity = []
  SA = []
  for label in labels:
    temp = Label_str_to_list(label)
    aspect.append(temp[0])
    polarity.append(temp[1])

    sa_temp= []
    for i in range(len(temp[0])):
      sa = "{"+temp[0][i]+", "+temp[1][i]+"}"
      sa_temp.append(sa)
    SA.append(sa_temp)

  return np.array(aspect, dtype=object), np.array(polarity, dtype=object), np.array(SA, dtype=object)

### 2.2.2 Binary Label

In [14]:
# Load list label

def read_label(path):
  with open(path) as f:
    data = json.load(f)
  f.close()
  return data

aspect_path = "/content/AspectBasedSentimentAnalysis/Label/aspect.json"
SA_path = "/content/AspectBasedSentimentAnalysis/Label/SA.json"
aspect_labels = read_label(aspect_path)
AS_labels = read_label(SA_path)

In [15]:
from sklearn.preprocessing import MultiLabelBinarizer
transform_label= MultiLabelBinarizer().fit([aspect_labels])
list_label = transform_label.classes_
print(list_label)

['FACILITIES#CLEANLINESS' 'FACILITIES#COMFORT'
 'FACILITIES#DESIGN&FEATURES' 'FACILITIES#GENERAL'
 'FACILITIES#MISCELLANEOUS' 'FACILITIES#PRICES' 'FACILITIES#QUALITY'
 'FOOD&DRINKS#MISCELLANEOUS' 'FOOD&DRINKS#PRICES' 'FOOD&DRINKS#QUALITY'
 'FOOD&DRINKS#STYLE&OPTIONS' 'HOTEL#CLEANLINESS' 'HOTEL#COMFORT'
 'HOTEL#DESIGN&FEATURES' 'HOTEL#GENERAL' 'HOTEL#MISCELLANEOUS'
 'HOTEL#PRICES' 'HOTEL#QUALITY' 'LOCATION#GENERAL' 'ROOMS#CLEANLINESS'
 'ROOMS#COMFORT' 'ROOMS#DESIGN&FEATURES' 'ROOMS#GENERAL'
 'ROOMS#MISCELLANEOUS' 'ROOMS#PRICES' 'ROOMS#QUALITY'
 'ROOM_AMENITIES#CLEANLINESS' 'ROOM_AMENITIES#COMFORT'
 'ROOM_AMENITIES#DESIGN&FEATURES' 'ROOM_AMENITIES#GENERAL'
 'ROOM_AMENITIES#MISCELLANEOUS' 'ROOM_AMENITIES#PRICES'
 'ROOM_AMENITIES#QUALITY' 'SERVICE#GENERAL']


## 2.3 Make dataFrame

In [16]:

def data_Frame(label_y):
  aspect, polarity,_ =  separate_label(label_y)
  dic = Label_to_ListDict(label_y)
  aspect_tf = transform_label.transform(aspect)

  for index1,label in enumerate(aspect_tf):
    count = 0
    for index2,a in enumerate(label):
      if a == 1:
        if polarity[index1][count] == "positive":
          aspect_tf[index1][index2] = 10
         
        elif polarity[index1][count] == "negative":
          aspect_tf[index1][index2] = 20
    
        else:
          aspect_tf[index1][index2] = 30
        count+=1

  return aspect_tf

In [17]:
def make_data_frame(texts,labels,list_label):
  data = {"Review":texts}
  df = pd.DataFrame(data)

  label = data_Frame(labels)
  for i in range(len(aspect_labels)):
    new_col = label[:,i]
    df[transform_label.classes_[i]] = new_col.tolist()
  return df

### 2.3.1 Aspect dataFrame

In [18]:
def get_aspect_data_frame(df,list_label):
    df_ =df.copy()
    for aspect in aspect_labels:
        df_[aspect]=df_[aspect].replace(10,1)
        df_[aspect]=df_[aspect].replace(20,1)
        df_[aspect]=df_[aspect].replace(30,1)
    df_ = df_.fillna(0)
    return df_

### 2.3.2 Positive dataFrame

In [19]:
def get_positive_data_frame(df,list_label):
    df_ =df.copy()
    for aspect in aspect_labels:
        df_[aspect]=df_[aspect].replace(10,1)
        df_[aspect]=df_[aspect].replace(20,0)
        df_[aspect]=df_[aspect].replace(30,0)
    df_ = df_.fillna(0)
    return df_

### 2.3.3 Negative dataFrame

In [20]:
def get_negative_data_frame(df,list_label):
    df_ =df.copy()
    for aspect in aspect_labels:
        df_[aspect]=df_[aspect].replace(10,0)
        df_[aspect]=df_[aspect].replace(20,1)
        df_[aspect]=df_[aspect].replace(30,0)
    df_ = df_.fillna(0)
    return df_

### 2.3.4 Neutral dataFrame

In [21]:
def get_neutral_data_frame(df,list_label):
    df_ =df.copy()
    for aspect in aspect_labels:
        df_[aspect]=df_[aspect].replace(10,0)
        df_[aspect]=df_[aspect].replace(20,0)
        df_[aspect]=df_[aspect].replace(30,1)
    df_ = df_.fillna(0)
    return df_

# 3) Training

## 3.1 Get data

In [22]:
X_train, y_train = train[:,0], train[:,1]
X_dev,   y_dev   = dev[:,0],   dev[:,1]
X_test,  y_test  = test[:,0],  test[:,1]

In [23]:
def getdata(df,list_label,kind):
  if kind == "aspect":
    data = get_aspect_data_frame(df,list_label)
  elif kind == "positive":
    data= get_positive_data_frame(df,list_label)
  elif kind =="negative":
    data = get_negative_data_frame(df,list_label)
  elif kind =="neutral":
    data = get_neutral_data_frame(df,list_label)  

  X = data.Review
  y = data.drop("Review",1)
  return X,y

In [39]:
def prepare_data(X,y,list_label):
  data = make_data_frame(X,y,list_label)

  X_aspect, y_aspect = getdata(data, list_label, "aspect")
  X_positive, y_positive = getdata(data, list_label, "positive")
  X_negative, y_negative = getdata(data, list_label, "negative")
  X_neutral, y_neutral = getdata(data, list_label, "neutral")

  X_aspect_tf = preproceesing_data.transform(X_aspect).toarray()
  X_positive_tf = preproceesing_data.transform(X_positive).toarray()
  X_negative_tf = preproceesing_data.transform(X_negative).toarray()
  X_neutral_tf = preproceesing_data.transform(X_neutral).toarray()
  dic = {
      "X_aspect_tf":X_aspect_tf,
      "X_positive_tf":X_positive_tf,
      "X_negative_tf":X_negative_tf,
      "X_neutral_tf":X_neutral_tf,
      "y_aspect":y_aspect,
      "y_positive":y_positive,
      "y_negative":y_negative,
      "y_neutral":y_neutral
  }

  return dic

## 3.2 Binary label

In [25]:
transform_label_SA= MultiLabelBinarizer().fit([AS_labels])
#print(transform_label_SA.classes_)

In [26]:
def show_label(pred_as, pred_pos, pred_neg, pred_neu):
  labels = []
  for index, value in enumerate(pred_as):
    if value == 1:
      label = "{"
      if pred_pos[index] ==1:
        label += str(list_label[index])+", "+"positive"
      elif pred_neg[index] ==1:
        label += str(list_label[index])+", "+"negative"
      elif pred_neu[index] ==1:
        label += str(list_label[index])+", "+"neutral"
      label+="}"
      labels.append(label)
  return labels

## 3.3 Make Pipeline

In [171]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

preproceesing_data = make_pipeline(delete_emoji,
                              replace_symbol,
                              delete_special_character,
                              normalize_elongate_words,
                              replace_negative_words,
                              pos_tagging,
                              TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1,2), stop_words='english')).fit(X_train)
                              


In [28]:
from sklearn import metrics

def score(y_true, y_pred):
  print("Precison: ",metrics.precision_score(y_true, y_pred, average='micro'))
  print("Recall: ",metrics.recall_score(y_true, y_pred, average='micro'))
  print("F1: ",metrics.f1_score(y_true, y_pred, average='micro'))


## 3.4 Get data training

In [172]:
data_train = prepare_data(X_train,y_train,list_label)

In [173]:
data_dev = prepare_data(X_dev,y_dev,list_label)

In [174]:
data_test = prepare_data(X_test,y_test,list_label)

## 3.5 Model

In [175]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

clf_aspect = OneVsRestClassifier(LinearSVC(), n_jobs=1)
clf_positive = OneVsRestClassifier(LinearSVC(), n_jobs=1)         
clf_negative = OneVsRestClassifier(LinearSVC(), n_jobs=1)
clf_neutral = OneVsRestClassifier(LinearSVC(), n_jobs=1)   
          

In [176]:
clf_aspect.fit(data_train["X_aspect_tf"], data_train["y_aspect"])
clf_positive.fit(data_train["X_positive_tf"], data_train["y_positive"])
clf_negative.fit(data_train["X_negative_tf"], data_train["y_negative"])
clf_neutral.fit(data_train["X_neutral_tf"], data_train["y_neutral"])


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


OneVsRestClassifier(estimator=LinearSVC(), n_jobs=1)

In [177]:
predict_aspect =  clf_aspect.predict(data_dev["X_aspect_tf"])
predict_positive = clf_positive.predict(data_dev["X_positive_tf"])
predict_negative = clf_negative.predict(data_dev["X_negative_tf"])
predict_neutral = clf_neutral.predict(data_dev["X_neutral_tf"])

In [178]:
score(data_dev["y_aspect"],predict_aspect)

Precison:  0.8041918429003021
Recall:  0.5989312333005203
F1:  0.6865479164987507


In [55]:
def model(X):
  predict_aspect =  clf_aspect.predict(X)
  predict_positive = clf_positive.predict(X)
  predict_negative = clf_negative.predict(X)
  predict_neutral = clf_neutral.predict(X)
  
  pred =[]
  for i in range(len(X)):
    rs = show_label(predict_aspect[i],predict_aspect[i],predict_negative[i],predict_neutral[i])
    pred.append(rs)
  return np.array(pred, dtype=object)

## 3.6 Predict in dev-dataset

In [179]:
_,_, y_true = separate_label(y_dev)
pre =  model(data_dev["X_aspect_tf"])
y_true_tf = transform_label_SA.transform(y_true)
pre_tf = transform_label_SA.transform(pre)

In [180]:
score(y_true_tf,pre_tf)

Precison:  0.6306646525679759
Recall:  0.4696948389818591
F1:  0.5384057386958975


## 3.7 Predict in test-dataset

In [182]:
_,_, y_true = separate_label(y_test)
pre =  model(data_test["X_aspect_tf"])
y_true_tf = transform_label_SA.transform(y_true)
pre_tf = transform_label_SA.transform(pre)

In [183]:
score(y_true_tf,pre_tf)

Precison:  0.6169334021683015
Recall:  0.46246130030959753
F1:  0.528644105286441


## 4) predict with txt file

In [201]:
path_file = "/content/test.txt"
texts = read_data(path_file)
texts = [x[0] for x in texts]
text_tf = preproceesing_data.transform(texts).toarray()

In [226]:
predicts = model(text_tf)
file = open("file_predict.txt","w")
with file as f:
  for index,predict in enumerate(predicts):
    f.write("#"+str(index+1)+"\n")
    f.write(texts[index]+"\n")
    for i in predict:
      f.write(i+", ")
    f.write("\n")
    
   
f.close()