#  Install and Import required Libs



In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive/MyDrive
# referenced later in this notebook are readable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install pyvi

In [None]:
import tensorflow as tf
import pandas as pd 
import numpy as np
from string import digits
from collections import Counter
from pyvi import ViTokenizer
from gensim.models.word2vec import Word2Vec
from keras.utils.np_utils import to_categorical
%matplotlib inline

# Load train and test dataset

In [None]:
# Folder that holds the VLSP sentiment CSVs and the augment_data sub-folder.
DATA_DIR = "/content/drive/MyDrive/Learning/Natural Language Processing/Exercises/Bai Tap Lon"


def load_data(data_dir=DATA_DIR):
  """Load the training set (original + augmented rows) and the test set.

  Args:
    data_dir: Directory containing ``vlsp_sentiment_train.csv``,
      ``vlsp_sentiment_test.csv`` and ``augment_data/train_augment.csv``.
      Defaults to the Google Drive project folder.

  Returns:
    Tuple ``(data_train, data_test)`` of DataFrames whose columns are
    ``['Class', 'Data']``.
  """
  import os  # local import: keeps this cell runnable without touching the imports cell

  train_path = os.path.join(data_dir, "vlsp_sentiment_train.csv")
  augment_path = os.path.join(data_dir, "augment_data", "train_augment.csv")
  test_path = os.path.join(data_dir, "vlsp_sentiment_test.csv")

  # The VLSP files are tab separated; their header row is replaced by our own names.
  data_train = pd.read_csv(train_path, sep='\t')
  data_train.columns = ['Class', 'Data']

  # NOTE(review): the augmented file is read as a comma-separated CSV and is
  # assumed to already use the 'Class'/'Data' column names - confirm, otherwise
  # concat will produce extra NaN-filled columns.
  data_aug = pd.read_csv(augment_path)

  # ignore_index avoids duplicated index labels (both frames start at 0).
  data_train = pd.concat((data_train, data_aug), ignore_index=True)

  data_test = pd.read_csv(test_path, sep='\t')
  data_test.columns = ['Class', 'Data']
  return data_train, data_test

data_train, data_test = load_data()

In [None]:
print(data_train.shape)
print(data_test.shape)

In [None]:
# Column 0 is 'Class' (the sentiment label) and column 1 is 'Data' (the review
# text), following the column names assigned in load_data.
labels = data_train.iloc[:, 0].values
reviews = data_train.iloc[:, 1].values

# Preprocess Data

In [None]:
def encode_labels(labels):
  """One-hot encode sentiment labels into an ``(n, 3)`` integer array.

  The column order is [negative, neutral, positive]:
  ``-1`` -> ``[1, 0, 0]``, ``0`` -> ``[0, 1, 0]`` and every other value
  (including anything unexpected) -> ``[0, 0, 1]``.

  Args:
    labels: Iterable of raw class labels.

  Returns:
    ``np.ndarray`` of shape ``(len(labels), 3)``; an empty input yields shape
    ``(0, 3)`` so downstream code always sees three class columns.
  """
  # Comparisons stay in plain Python so mixed or object-typed labels behave
  # exactly like a chain of ``==`` checks.
  class_ids = [0 if label == -1 else 1 if label == 0 else 2 for label in labels]
  # Row i of the identity matrix is the one-hot vector for class i.
  return np.eye(3, dtype=int)[np.asarray(class_ids, dtype=np.intp)]

encoded_labels = encode_labels(labels)

In [None]:
def remove_digits(reviews_input):
  """Return a new list in which every review has its ASCII digits removed.

  Args:
    reviews_input: Iterable of review strings.

  Returns:
    List of strings, in input order, without any character from
    ``string.digits``.
  """
  def strip_digits(text):
    # Keep every character that is not one of '0'..'9'.
    return ''.join(ch for ch in text if ch not in digits)

  return [strip_digits(text) for text in reviews_input]

reviews_processed = remove_digits(reviews)
unlabeled_processed = [] 

In [None]:
def tokenize(reviews_processed_input):
  """Split each review into a list of tokens.

  Every review is lower-cased, passed through ``ViTokenizer.tokenize`` and
  then split on whitespace.

  Args:
    reviews_processed_input: Iterable of review strings.

  Returns:
    List of token lists, one per review, in input order.
  """
  return [
      ViTokenizer.tokenize(text.lower()).split()
      for text in reviews_processed_input
  ]

word_reviews = tokenize(reviews_processed)
all_words = []

In [None]:
EMBEDDING_DIM = 400 # how big is each word vector
MAX_VOCAB_SIZE = 20000 # how many unique words to use (i.e num rows in embedding vector)
MAX_SEQUENCE_LENGTH = 300 # max number of words in a comment to use

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# Build the vocabulary from the tokenized training reviews and convert each
# review into a sequence of integer word indices.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(word_reviews)
sequences_train = tokenizer.texts_to_sequences(word_reviews)
# word_index is reused later to build the embedding matrix, which skips any
# index >= MAX_VOCAB_SIZE.
word_index = tokenizer.word_index


In [None]:
# Bring every index sequence to exactly MAX_SEQUENCE_LENGTH entries so the
# model receives a rectangular array.
data = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
# NOTE: rebinds `labels` from the raw class values to the one-hot encoded array.
labels = encoded_labels

In [None]:
print('Shape of X train and X validation tensor:',data.shape)
print('Shape of label train and validation tensor:', labels.shape)

# Build and train model

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors
from keras.layers import Embedding


def build_embedding_layer(word_index, word2vec_path='/content/drive/MyDrive/Learning/Natural Language Processing/Exercises/Bai Tap Lon/vi-model-CBOW.bin'):
  """Create a trainable Keras ``Embedding`` layer initialised from word2vec vectors.

  Args:
    word_index: Mapping ``word -> integer index`` (e.g. ``tokenizer.word_index``).
    word2vec_path: Path of the binary word2vec file to load. Defaults to the
      project's ``vi-model-CBOW.bin`` on Google Drive.

  Returns:
    ``Embedding`` layer with ``min(len(word_index) + 1, MAX_VOCAB_SIZE)`` rows
    of ``EMBEDDING_DIM`` columns. Rows of words present in the word2vec model
    hold the pretrained vector, rows of unknown words are drawn from a normal
    distribution, and rows never assigned (such as index 0) stay all-zero.
  """
  word_vectors = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

  vocabulary_size = min(len(word_index) + 1, MAX_VOCAB_SIZE)
  embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
  # Standard deviation for the random vectors given to out-of-vocabulary words.
  oov_std = np.sqrt(0.25)

  for word, i in word_index.items():
      # vocabulary_size never exceeds MAX_VOCAB_SIZE, so this also enforces the
      # vocabulary cap while guaranteeing the row exists in the matrix.
      if i >= vocabulary_size:
          continue
      try:
          embedding_matrix[i] = word_vectors[word]
      except KeyError:
          # Word is missing from the pretrained model: start from random noise.
          embedding_matrix[i] = np.random.normal(0, oov_std, EMBEDDING_DIM)

  # Drop the reference to the loaded vectors once the matrix is filled.
  del word_vectors

  return Embedding(vocabulary_size,
                   EMBEDDING_DIM,
                   weights=[embedding_matrix],
                   trainable=True)
  


embedding_layer = build_embedding_layer(word_index)

In [None]:
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, Permute
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate,ZeroPadding2D
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
from keras.metrics import Precision
from keras.metrics import Recall

filter_sizes = [3,4,5]
num_filters = 100
drop = 0.5
def build_model(sequence_length, embedding_layer):
  """Build and compile the CNN + stacked-LSTM sentiment classifier.

  Architecture: embedding -> one Conv2D branch per entry of ``filter_sizes``
  (each zero-padded back to ``sequence_length`` time steps) -> feature-wise
  concatenation -> LSTM(1024) -> LSTM(512) -> LSTM(256) -> dropout ->
  3-way softmax.

  Args:
    sequence_length: Number of token indices per input sample.
    embedding_layer: Keras layer mapping token indices to vectors of size
      ``EMBEDDING_DIM``.

  Returns:
    Compiled ``Model`` (categorical cross-entropy, Adam, metrics accuracy /
    precision / recall). The model summary is printed as a side effect.
  """
  inputs = Input(shape=(sequence_length,))
  embedding = embedding_layer(inputs)

  # An LSTM-only variant would feed `embedding` straight into the LSTM stack
  # below instead of going through the convolution branches.

  # Add a trailing channel axis so Conv2D can treat each sample as a
  # (sequence_length x EMBEDDING_DIM) single-channel image.
  reshape = Reshape((sequence_length, EMBEDDING_DIM, 1))(embedding)

  branches = []
  for filter_size in filter_sizes:
    # The kernel spans the full embedding width, so it only slides along time.
    conv = Conv2D(num_filters, (filter_size, EMBEDDING_DIM),
                  activation='relu',
                  kernel_regularizer=regularizers.l2(0.01))(reshape)
    # The convolution drops filter_size - 1 time steps; pad zeros at the end so
    # every branch is sequence_length long again and can be concatenated.
    conv = ZeroPadding2D(((0, filter_size - 1), (0, 0)))(conv)
    # Collapse the width-1 axis: (time, 1, num_filters) -> (time, num_filters).
    branches.append(Reshape((-1, num_filters))(conv))

  # concatenate() needs at least two tensors; a single branch is used as is.
  concat = branches[0] if len(branches) == 1 else concatenate(branches)

  # Stacked LSTMs: all but the last return the full sequence for the next layer.
  lstm_2 = LSTM(1024, return_sequences=True)(concat)
  lstm_1 = LSTM(512, return_sequences=True)(lstm_2)
  lstm_0 = LSTM(256)(lstm_1)

  dropout = Dropout(drop)(lstm_0)
  output = Dense(units=3, activation='softmax',
                 kernel_regularizer=regularizers.l2(0.01))(dropout)

  model = Model(inputs, output)

  # `decay` is intentionally not passed: 0.0 was the default and newer Keras
  # optimizers reject the argument.
  adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
  model.compile(loss='categorical_crossentropy', optimizer=adam,
                metrics=['accuracy', Precision(), Recall()])
  model.summary()
  return model

model = build_model(data.shape[1], embedding_layer)

Zero padding: zeros are appended to each convolution output so that every branch is padded back to the full sequence length of 300.

In [None]:
### IF YOU HAVE MODEL WEIGHT AND WANNA LOAD IT
#model.load_weights("lstm_only.h5")

In [None]:
# Define callbacks.
# Stop when val_loss has not improved by at least 0.01 for 5 consecutive
# epochs, and roll the model back to the weights of the best epoch so that the
# later evaluation and saved weights do not come from an over-trained epoch.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5,
                               verbose=1, restore_best_weights=True)
callbacks_list = [early_stopping]

# NOTE(review): Keras takes the validation_split slice from the END of the
# arrays before shuffling. load_data appends the augmented rows last, so the
# validation set may consist mostly of augmented samples - confirm this is
# intended.
history = model.fit(data, labels, validation_split=0.2,
          epochs=100, batch_size=256, callbacks=callbacks_list, shuffle=True, verbose=1)

# Plot training history

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def show_graph(history):
  """Plot every series recorded in ``history.history`` on one figure.

  Args:
    history: Object exposing a ``history`` dict of per-epoch values (as
      returned by ``model.fit``).

  The y axis is limited to [0, 1], so values above 1 are cut off.
  """
  curves = pd.DataFrame(history.history)
  ax = curves.plot(figsize=(16, 10))
  ax.grid(True)
  ax.set_ylim(0, 1)
  ax.set_xlabel('Epoch')
  ax.set_ylabel('Score')
  plt.show()

show_graph(history)

# Evaluate Model

In [None]:
labels_test = data_test.iloc[:, 0].values
reviews_test = data_test.iloc[:, 1].values

In [None]:
encoded_labels_test = encode_labels(labels_test)

In [None]:
# Strip digits with the same helper used for the training reviews so both
# splits go through identical preprocessing.
reviews_processed_test = remove_digits(reviews_test)
unlabeled_processed_test = []

In [None]:
# Use PyVi for Vietnamese word tokenization, through the same helper that
# tokenized the training reviews.
word_reviews_test = tokenize(reviews_processed_test)
all_words = []

In [None]:
# Encode the test reviews with the tokenizer fitted on the training reviews,
# then pad them to the same length as the training data.
sequences_test = tokenizer.texts_to_sequences(word_reviews_test)
# NOTE: rebinds `data_test` from the raw DataFrame to the padded array, so the
# cell that reads data_test.iloc cannot be re-run after this one.
data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
# Likewise `labels_test` now holds the one-hot labels, not the raw class values.
labels_test = encoded_labels_test

In [None]:
# These are the test tensors, so label them as such.
print('Shape of X test tensor:', data_test.shape)
print('Shape of label test tensor:', labels_test.shape)

In [None]:
score = model.evaluate(data_test, labels_test)

In [None]:
# Pair every reported metric name with its value instead of hard-coding indices.
for metric_name, value in zip(model.metrics_names, score):
    if metric_name == 'loss':
        # The loss is not a ratio, so it is shown as a plain number.
        print("%s: %.4f" % (metric_name, value))
    else:
        print("%s: %.2f%%" % (metric_name, value * 100))


In [None]:
model.save_weights("lstm_only.h5")

# Test model

## Review 5*

In [None]:
test = "Áo hơi mỏng nhưng rất đẹp hình thêu các thứ rất đẹp còn 1 vài chỗ có chỉ thừa phần 2 túi áo nên làm kiểu zip chất lương hơn tí hoặc có thể k cần zip cũng đc phần bo chun ống tay và cổ áo cần cải thiện thêm. Nói chung với giá đc sale xuống và mình dùng voucher nữa nên như này mình cũng hài lòng rồi"

reviews_processed_test = []

review_not_contain_digit = ''.join([char for char in test if char not in digits])
reviews_processed_test.append(review_not_contain_digit)

word_reviews_test = tokenize(reviews_processed_test)


tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(word_reviews_test)
sequences_train = tokenizer.texts_to_sequences(word_reviews_test)

sampleToPredit = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)

class_names = ["Negative", "Neutral", "Positive"]
pre = model.predict(sampleToPredit)
print(pre)
print(class_names[np.argmax(pre)])

## Review 3*

In [None]:
test = "Áo mỏng hơn so với mình nghĩ.... Ko xứng đáng với giá 450k sz L mà như M vậy"

reviews_processed_test = []

review_not_contain_digit = ''.join([char for char in test if char not in digits])
reviews_processed_test.append(review_not_contain_digit)


tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(word_reviews_test)
sequences_train = tokenizer.texts_to_sequences(word_reviews_test)

sampleToPredit = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)

pre = model.predict(sampleToPredit)
print(pre)
print(class_names[np.argmax(pre)])

## Review 1*

In [None]:
test = "Như cái rẻ lau chân chán kinh khủng, mua phí tiền.mặc được chết liền"

reviews_processed_test = []

review_not_contain_digit = ''.join([char for char in test if char not in digits])
reviews_processed_test.append(review_not_contain_digit)


tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(word_reviews_test)
sequences_train = tokenizer.texts_to_sequences(word_reviews_test)

sampleToPredit = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)

pre = model.predict(sampleToPredit)
print(pre)
print(class_names[np.argmax(pre)])