In [None]:
!pip install underthesea
!pip install transformers
!pip install underthesea
!pip install torch
!pip install scikit-learn

Collecting underthesea
  Downloading underthesea-6.5.0-py3-none-any.whl (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m75.2 MB/s[0m eta [36m0:00:00[0m
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.5/993.5 kB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl (657 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: underthesea-core, python-crfsuite, underthesea
Successfully installed python-crfsuite-0.9.9 underthesea-6.5.0 underthesea-core-1.0.4
Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90

In [None]:
from google.colab import drive
import csv
import random
import pickle
from scipy.stats import linregress
import underthesea
import io
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import pandas as pd
from tensorflow.keras.utils import to_categorical
import re
from sklearn import svm

In [None]:
# Mount Google Drive at /content/gdrive; the dataset and embedding paths used below live under it.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Width of every word vector; matches the 100-dimensional GloVe file loaded later in the notebook.
EMBEDDING_DIM = 100
# Length every token sequence is padded or truncated to.
MAXLEN = 30
# Side on which over-long sequences are cut (passed to pad_sequences).
TRUNCATING = 'post'
# Side on which short sequences are padded (passed to pad_sequences).
PADDING = 'post'
# Placeholder token handed to the Keras Tokenizer for out-of-vocabulary words.
OOV_TOKEN = "<OOV>"
# NOTE(review): not referenced by any cell visible in this part of the notebook.
MAX_EXAMPLES = 160000
# Fraction of the rows that train_val_split assigns to the training set.
TRAINING_SPLIT = 0.9

In [None]:
# Every punctuation mark listed here is replaced by one space: , . ; : ! ? - " ' and
# the curly double quotes U+201C / U+201D.
_PUNCTUATION_TO_SPACE = str.maketrans(
    {ch: " " for ch in ",.;:!?-\"'\u201c\u201d"}
)


def remove_special_characters(row):
    """Replace punctuation in ``row`` with spaces and trim the result.

    Each punctuation character becomes exactly one space, so runs of spaces
    can appear in the middle of the returned text; only leading and trailing
    whitespace is stripped.

    The previous implementation also ran ``re.sub(r"[\\.,\\?]+$-", "", row)``.
    That pattern demands a ``-`` *after* the end of the string and therefore
    could never match, so it has been dropped together with a duplicated
    ``replace("?", " ")`` call. The output for string inputs is unchanged.

    Args:
        row: The text to clean (a ``str``).

    Returns:
        The cleaned text.
    """
    return row.translate(_PUNCTUATION_TO_SPACE).strip()


# Default location of the stop-word list (one entry per line, no header row).
_STOPWORDS_PATH = "/content/gdrive/MyDrive/codelab1/vietnamese-stopwords.txt"

# Stop-word sets already read from disk, keyed by file path.
_STOPWORDS_CACHE = {}


def _load_stopwords(path=_STOPWORDS_PATH):
    """Read the stop-word file at ``path`` once and return its first column as a frozenset."""
    if path not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[path] = frozenset(pd.read_csv(path, header=None)[0])
    return _STOPWORDS_CACHE[path]


def remove_stopwords(sentence, stopwords=None):
    """Lower-case ``sentence`` and drop every whitespace-separated word that is a stop word.

    The stop-word file used to be re-read on every call and searched as a
    list; it is now loaded once, cached, and looked up as a set.

    Args:
        sentence: The text to filter (a ``str``).
        stopwords: Optional collection of stop words. When omitted, the list
            stored at ``_STOPWORDS_PATH`` is loaded (and cached).

    Returns:
        The remaining words joined by single spaces.

    NOTE(review): words are compared one token at a time, so a stop-word
    entry that itself contains a space can never match - confirm whether the
    list has such entries.
    """
    if stopwords is None:
        stopwords = _load_stopwords()

    kept_words = (word for word in sentence.lower().split() if word not in stopwords)
    return " ".join(kept_words)


def parse__data_from_file(filename, num_classes=3):
    """Load cleaned sentences and one-hot encoded labels from a CSV file.

    The file must provide a ``sents`` column (text) and a ``sentiment``
    column (class ids accepted by ``to_categorical``).

    Args:
        filename: Path of the CSV file to read.
        num_classes: Number of sentiment classes used for the one-hot
            encoding. Defaults to 3, the value that was previously hard-coded.

    Returns:
        A ``(sentences, labels)`` tuple: a list of strings with punctuation
        removed by ``remove_special_characters`` and the array returned by
        ``to_categorical``.
    """
    frame = pd.read_csv(filename)

    cleaned = frame["sents"].apply(remove_special_characters)
    sentences = [str(text) for text in cleaned]

    labels = to_categorical(frame["sentiment"], num_classes)

    return sentences, labels

In [None]:
def train_val_split(sentences, labels, training_split):
    """Cut ``sentences`` and ``labels`` into a leading training part and a trailing validation part.

    The data is split in its existing order (no shuffling). The cut index is
    ``int(len(sentences) * training_split)``.

    Args:
        sentences: Sliceable sequence of examples.
        labels: Sliceable sequence of labels, aligned with ``sentences``.
        training_split: Fraction of the examples assigned to the training part.

    Returns:
        ``(train_sentences, validation_sentences, train_labels, validation_labels)``.
    """
    cut = int(len(sentences) * training_split)
    head, tail = slice(None, cut), slice(cut, None)

    return sentences[head], sentences[tail], labels[head], labels[tail]


def fit_tokenizer(train_sentences, oov_token):
    """Create a Keras ``Tokenizer`` with the given OOV token and fit it on ``train_sentences``.

    Args:
        train_sentences: Texts the vocabulary is built from.
        oov_token: Token passed to ``Tokenizer(oov_token=...)``.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer(oov_token=oov_token)
    fitted.fit_on_texts(train_sentences)
    return fitted


In [None]:
def seq_pad_and_trunc(sentences, tokenizer, padding, truncating, maxlen):
    """Turn ``sentences`` into token-id sequences of one common length.

    Args:
        sentences: Texts to convert.
        tokenizer: Object providing ``texts_to_sequences``.
        padding: Padding side forwarded to ``pad_sequences``.
        truncating: Truncation side forwarded to ``pad_sequences``.
        maxlen: Target sequence length forwarded to ``pad_sequences``.

    Returns:
        Whatever ``pad_sequences`` returns for the tokenised sentences.
    """
    pad_options = dict(maxlen=maxlen, padding=padding, truncating=truncating)
    return pad_sequences(tokenizer.texts_to_sequences(sentences), **pad_options)



def create_model(vocab_size, embedding_dim, maxlen, embeddings_matrix, dropout_rate=0.75):
    """Build and compile the BiLSTM + Conv1D sentiment classifier.

    Layer stack: frozen pre-trained embedding -> bidirectional LSTM (returning
    the full sequence) -> dropout -> Conv1D -> global max pooling -> dropout ->
    dense ReLU layer -> 3-way softmax.

    The dropout layers used to be written ``Dropout(0,75)``, which passes
    ``rate=0`` (and 75 as the second positional argument) and so disabled
    dropout entirely. The rate is now really 0.75 and can be overridden.

    Args:
        vocab_size: Number of words in the tokenizer vocabulary; the embedding
            has ``vocab_size + 1`` rows.
        embedding_dim: Width of each embedding vector.
        maxlen: Length of the padded input sequences.
        embeddings_matrix: Initial (frozen) embedding weights.
        dropout_rate: Fraction of units dropped by both dropout layers.

    Returns:
        The compiled ``tf.keras.Sequential`` model (categorical cross-entropy
        loss, Adam optimizer, accuracy metric).
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=maxlen, weights=[embeddings_matrix], trainable=False),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(3, activation='softmax')
    ])

    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=['accuracy'])

    return model

def create_bi_lstm_model(vocab_size, embedding_dim, maxlen, embeddings_matrix):
    """Build and compile a bidirectional-LSTM sentiment classifier.

    Layer stack: frozen pre-trained embedding -> bidirectional LSTM (64 units,
    ReLU, input and recurrent dropout 0.5) -> Dense 64 -> Dense 32 -> 3-way softmax.

    Args:
        vocab_size: Number of words in the tokenizer vocabulary; the embedding
            has ``vocab_size + 1`` rows.
        embedding_dim: Width of each embedding vector.
        maxlen: Length of the padded input sequences.
        embeddings_matrix: Initial (frozen) embedding weights.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    keras_layers = tf.keras.layers

    frozen_embedding = keras_layers.Embedding(
        vocab_size+1, embedding_dim,
        input_length=maxlen, weights=[embeddings_matrix], trainable=False)
    recurrent = keras_layers.Bidirectional(
        keras_layers.LSTM(64, activation='relu', dropout=0.5, recurrent_dropout=0.5))
    dense_head = [
        keras_layers.Dense(units, activation=activation)
        for units, activation in ((64, 'relu'), (32, 'relu'), (3, 'softmax'))
    ]

    model = tf.keras.Sequential([frozen_embedding, recurrent, *dense_head])
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])
    return model


def create_rnn_model(vocab_size, embedding_dim, maxlen, embeddings_matrix):
    """Build and compile a SimpleRNN sentiment classifier.

    Layer stack: frozen pre-trained embedding -> SimpleRNN (64 units, ReLU,
    dropout 0.5) -> Dense 64 -> Dense 32 -> 3-way softmax.

    Args:
        vocab_size: Number of words in the tokenizer vocabulary; the embedding
            has ``vocab_size + 1`` rows.
        embedding_dim: Width of each embedding vector.
        maxlen: Length of the padded input sequences.
        embeddings_matrix: Initial (frozen) embedding weights.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    keras_layers = tf.keras.layers

    frozen_embedding = keras_layers.Embedding(
        vocab_size+1, embedding_dim,
        input_length=maxlen, weights=[embeddings_matrix], trainable=False)
    recurrent = keras_layers.SimpleRNN(64, activation='relu', dropout=0.5)
    dense_head = [
        keras_layers.Dense(units, activation=activation)
        for units, activation in ((64, 'relu'), (32, 'relu'), (3, 'softmax'))
    ]

    model = tf.keras.Sequential([frozen_embedding, recurrent, *dense_head])
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])
    return model

def creat_svm_model(vocab_size, embedding_dim, maxlen, embeddings_matrix):
    """Return an unfitted linear-kernel ``svm.SVC`` with ``C=1000``.

    All four parameters are accepted for signature parity with the Keras
    model builders in this notebook; none of them is used.
    """
    return svm.SVC(kernel='linear', C=1000)


In [None]:
# Load the cleaned sentences with their one-hot labels, then split them (in file order) into
# training and validation parts.
sentences, labels = parse__data_from_file('/content/gdrive/MyDrive/codelab1/test_data.csv')
train_sentences, val_sentences, train_labels, val_labels = train_val_split(sentences, labels, TRAINING_SPLIT)
print(f"There are {len(sentences)} sentences in the dataset.\n")
# NOTE(review): the message says "after removing stopwords", but parse__data_from_file only strips
# punctuation; remove_stopwords is not applied anywhere in this pipeline.
print(f"First sentence has {len((sentences[0]).split())} words (after removing stopwords).\n")
print(f"There are {len(labels)} labels in the dataset.\n")
print(f"The first 5 labels are {labels[:5]}")

There are 3166 sentences in the dataset.

First sentence has 5 words (after removing stopwords).

There are 3166 labels in the dataset.

The first 5 labels are [[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]]


In [None]:
# Preview the cleaned sentences at indices 1 through 9.
sentences[1:10]

['giáo viên rất vui tính',
 'cô max có tâm',
 'giảng bài thu hút   dí dỏm',
 'giáo viên không giảng dạy kiến thức   hướng dẫn thực hành trong quá trình học',
 'thầy dạy nhiệt tình và tâm huyết',
 'tính điểm thi đua các nhóm',
 'thầy nhiệt tình giảng lại cho học sinh',
 'có đôi lúc nói hơi nhanh làm sinh viên không theo kịp',
 'giảng dạy nhiệt tình   liên hệ thực tế khá nhiều   tương tác với sinh viên tương đối tốt']

In [None]:
# Build the vocabulary from the training sentences only.
tokenizer = fit_tokenizer(train_sentences,OOV_TOKEN)

# word_index maps each word to its integer id; VOCAB_SIZE is the number of entries in it.
# NOTE(review): presumably this count includes the OOV token itself - confirm against the Keras docs.
word_index = tokenizer.word_index
VOCAB_SIZE = len(word_index)

print(f"Vocabulary contains {VOCAB_SIZE} words\n")


Vocabulary contains 1496 words



In [None]:
# Convert both splits to integer sequences of length MAXLEN using the tokenizer fitted on the training split.
train_pad_trunc_seq = seq_pad_and_trunc(train_sentences, tokenizer, PADDING, TRUNCATING, MAXLEN)
val_pad_trunc_seq = seq_pad_and_trunc(val_sentences, tokenizer, PADDING, TRUNCATING, MAXLEN)

print(f"Padded and truncated training sequences have shape: {train_pad_trunc_seq.shape}\n")
print(f"Padded and truncated validation sequences have shape: {val_pad_trunc_seq.shape}")

Padded and truncated training sequences have shape: (2849, 30)

Padded and truncated validation sequences have shape: (317, 30)


In [None]:
# Make sure both label splits are NumPy arrays before they are handed to model.fit.
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)

In [None]:
!unzip "/content/gdrive/MyDrive/AI & mcln & deep ln/NLP in tensorflow/glove.6B.100d.txt.zip"

Archive:  /content/gdrive/MyDrive/AI & mcln & deep ln/NLP in tensorflow/glove.6B.100d.txt.zip
  inflating: glove.6B.100d.txt       


In [None]:
# Define path to file containing the embeddings
GLOVE_FILE = '/content/glove.6B.100d.txt'

# Initialize an empty embeddings index dictionary
GLOVE_EMBEDDINGS = {}

# Read file and fill GLOVE_EMBEDDINGS with its contents.
# Each line is "<word> <v1> <v2> ...". The encoding is given explicitly so that
# parsing does not depend on the platform's default locale encoding.
with open(GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        GLOVE_EMBEDDINGS[word] = coefs

In [None]:
# One row per tokenizer index (plus one extra row), initialised to zeros.
EMBEDDINGS_MATRIX = np.zeros((VOCAB_SIZE+1, EMBEDDING_DIM))

# Copy the GloVe vector of every vocabulary word that has one; words missing from GloVe keep an
# all-zero row.
# NOTE(review): glove.6B is presumably an English embedding set while the sentences shown above are
# Vietnamese - check how many rows actually receive a vector, since the embedding layers are frozen.
for word, i in word_index.items():
    embedding_vector = GLOVE_EMBEDDINGS.get(word)
    if embedding_vector is not None:
        EMBEDDINGS_MATRIX[i] = embedding_vector

In [None]:
# Build the BiLSTM + Conv1D classifier and train it for 20 epochs, validating on the held-out split.
model1 = create_model(VOCAB_SIZE, EMBEDDING_DIM, MAXLEN, EMBEDDINGS_MATRIX)

history = model1.fit(train_pad_trunc_seq, train_labels, epochs=20, validation_data=(val_pad_trunc_seq, val_labels))

SIMPLE RNN

In [None]:
# Build the SimpleRNN classifier on the same frozen embedding matrix.
rnn =create_rnn_model(VOCAB_SIZE, EMBEDDING_DIM, MAXLEN, EMBEDDINGS_MATRIX)

In [None]:
# Train the SimpleRNN model for 20 epochs.
# NOTE: rebinding `history` discards the training history of the previously trained model.
history = rnn.fit(train_pad_trunc_seq, train_labels, epochs=20, validation_data=(val_pad_trunc_seq, val_labels))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Bidirectional LSTM

In [None]:
# Build the stand-alone bidirectional LSTM classifier for this section. The cell used to call
# create_model, which rebuilt the same architecture as model1 and left create_bi_lstm_model unused.
Bi_lstm = create_bi_lstm_model(VOCAB_SIZE, EMBEDDING_DIM, MAXLEN, EMBEDDINGS_MATRIX)

In [None]:
# Train the Bi_lstm model for 20 epochs.
# NOTE: rebinding `history` discards the training history of the previously trained model.
history = Bi_lstm.fit(train_pad_trunc_seq, train_labels, epochs=20, validation_data=(val_pad_trunc_seq, val_labels))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
