In [1]:
import numpy as np
import pandas as pd 
import tensorflow as tf
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, Dense, Dropout, Bidirectional, LSTM, GRU, Input, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pyvi import ViTokenizer
from pyvi import ViUtils

In [6]:
## Load Data
dataSents= pd.read_fwf(r'sents.txt', header=None)
dataLabels= pd.read_fwf(r'sentiments.txt', header=None)
dataTopics= pd.read_fwf(r'topics.txt', header=None)
dataSents.head()
# dataLabels.head()
# dataTopics.head()

Unnamed: 0,0,1,2,3
0,slide giáo trình đầy đủ .,,,
1,"nhiệt tình giảng dạy , gần gũi với sinh viên .",,,
2,đi học đầy đủ full điểm chuyên cần .,,,
3,chưa áp dụng công nghệ thông tin và các thiết ...,,,
4,"thầy giảng bài hay , có nhiều bài tập ví dụ ng...",,,


In [7]:
# Create Data Sentences + Labels + Topics
sentiment_data = pd.DataFrame({'sentence':dataSents[0], 'label': dataLabels[0], 'topic': dataTopics[0]})
sentiment_data = sentiment_data.dropna()
sentiment_data = sentiment_data.reset_index(drop=True)
sentiment_data.head()

Unnamed: 0,sentence,label,topic
0,slide giáo trình đầy đủ .,2,1
1,"nhiệt tình giảng dạy , gần gũi với sinh viên .",2,0
2,đi học đầy đủ full điểm chuyên cần .,0,1
3,chưa áp dụng công nghệ thông tin và các thiết ...,0,0
4,"thầy giảng bài hay , có nhiều bài tập ví dụ ng...",2,0


In [11]:
# Proprocessing Data and Prepare Data For Training
input_data_sentence = sentiment_data['sentence'].values
input_data_label = sentiment_data['label'].values
input_data_topic = sentiment_data['topic'].values

input_preprocessing = []
label_with_accent = []
topic_with_accent = []

for index, dt in enumerate(input_data_sentence):
    input_text_preprocess = list(tf.keras.preprocessing.text.text_to_word_sequence(dt))
    input_text_preprocess = " ".join(input_text_preprocess)
    input_text_preprocess_no_accent = str(ViUtils.remove_accents(input_text_preprocess).decode("utf-8"))
    input_text_preprocess_accent = ViTokenizer.tokenize(input_text_preprocess)
    input_text_preprocess_no_accent = ViTokenizer.tokenize(input_text_preprocess_no_accent)
    input_preprocessing.append(input_text_preprocess_accent)
    input_preprocessing.append(input_text_preprocess_no_accent)
    label_with_accent.append(input_data_label[index])
    label_with_accent.append(input_data_label[index])
    topic_with_accent.append(input_data_topic[index])
    topic_with_accent.append(input_data_topic[index])

[2,
 2,
 1,
 1,
 2,
 2,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 1,
 1,
 3,
 3,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 2,
 2,
 0,
 0,
 0,
 0,
 3,
 3,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 2,
 2,
 3,
 3,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 2,
 2,
 1,
 1,
 0,
 0,
 1,
 1,
 2,
 2,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 2,
 2,
 3,
 3,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 2,
 2,
 1,
 1,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 1,
 1,
 0,
 0,
 2,
 2,
 1,
 1,
 0,
 0,
 2,
 2,
 0,
 0,
 1,
 1,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [15]:
# Training:
label_tf = tf.keras.utils.to_categorical(label_with_accent,num_classes = 4, dtype="float32")

# topic_tf = tf.keras.utils.to_categorical(topic_with_accent,num_classes = 3, dtype="float32")

tokenizer_data = Tokenizer(oov_token='<OOV>', filters = '', split= ' ')
tokenizer_data.fit_on_texts(input_preprocessing)

tokenized_data_text = tokenizer_data.texts_to_sequences(input_preprocessing)
vec_data = pad_sequences(tokenized_data_text, padding='post', maxlen = 512)

X_train, X_val, y_train, y_val = train_test_split(vec_data,label_tf,test_size = 0.2, random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X_train,y_train,test_size = 0.1, random_state = 42)

print("training sample:", len(X_train))
print("validation sample:", len(X_val))
print("test sample:", len(X_test))


ValueError: Found input variables with inconsistent numbers of samples: [22852, 45704]