In [922]:
import re

import nltk
import numpy as np
from keras import Sequential
from keras import layers
from keras.src.utils import pad_sequences
from pandas import get_dummies
from pandas import read_csv
from sklearn.model_selection import train_test_split

In [924]:
# Input CSV file and the maximum number of normalized words kept per review.
reviews_csv_name = 'reviews.csv'
max_count_words = 150

# NLTK tokenizer and lemmatizer instances shared by the text-normalization code.
lemmatizer = nltk.WordNetLemmatizer()
tokenizer = nltk.WordPunctTokenizer()

In [925]:
# Word count of each review processed by normalize_review (one entry per call).
sizes_reviews = list()
# Word -> integer index vocabulary; replaced by to_num_matrix.
mp = dict()


def normalize_review(review):
    """Convert raw review text into a capped list of lemmatized lowercase words.

    The text is split with the module-level ``tokenizer``; each token is
    lowercased and kept only if it consists solely of the letters a-z. Kept
    tokens are lemmatized with the module-level ``lemmatizer``. Processing
    stops as soon as ``max_count_words`` words have been collected.

    Side effect: the number of collected words is appended to the
    module-level ``sizes_reviews`` list on every call.

    Args:
        review: the review text to normalize.

    Returns:
        list of lemmatized words, in their original order.
    """
    lemmas = []

    for token in tokenizer.tokenize(review):
        lowered = token.lower()

        # Skip punctuation, digits and anything else that is not purely a-z.
        if not re.search("^[a-z]+$", lowered):
            continue

        lemmas.append(lemmatizer.lemmatize(lowered))

        # The cap is checked after appending, so at least one word can be kept.
        if len(lemmas) >= max_count_words:
            break

    sizes_reviews.append(len(lemmas))
    return lemmas

In [926]:
def map_word_to_num(matrix):
    """Build a word -> integer index vocabulary from a matrix of words.

    Indices are assigned in first-seen order (row by row, left to right), so
    the same input always produces the same mapping.

    Args:
        matrix: iterable of rows, each row an iterable of hashable words.

    Returns:
        dict mapping every distinct word to a unique index in
        ``range(number_of_distinct_words)``.
    """
    # dict.fromkeys de-duplicates while preserving insertion order. A set would
    # also de-duplicate, but its iteration order for strings changes between
    # interpreter runs (hash randomization), making the indices irreproducible.
    vocabulary = dict.fromkeys(word for row in matrix for word in row)
    return {word: index for index, word in enumerate(vocabulary)}


def to_num_matrix(matrix):
    """Replace every word in ``matrix`` with its integer vocabulary index.

    Side effect: the module-level ``mp`` vocabulary is rebuilt from ``matrix``
    via ``map_word_to_num`` on every call, discarding any previous mapping.

    Args:
        matrix: iterable of rows of words; it is iterated more than once.

    Returns:
        list of lists of integer indices, with the same layout as ``matrix``.
    """
    global mp
    mp = map_word_to_num(matrix)

    return [[mp[word] for word in row] for row in matrix]

In [927]:
df = read_csv(reviews_csv_name)

# Stage 1: normalize every review (coerced to str) into a list of lemmas.
X_words = df['review'].map(lambda text: normalize_review(str(text)))

# Stage 2: right-pad each list with the filler token up to max_count_words.
X_padded = pad_sequences(
    X_words,
    maxlen=max_count_words,
    padding='post',
    value='#FILLER#',
    dtype=object,
)

# Stage 3: build the vocabulary (stored in `mp`) and convert words to indices.
X = to_num_matrix(X_padded)

# One indicator column per distinct value of `voted_up`.
Y = get_dummies(df['voted_up'].values).values

# Hold out 20% of the rows for the final evaluation; the split is seeded.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

In [928]:
# Embedding -> Dropout -> LSTM -> BatchNormalization -> 2-unit softmax output.
# The embedding vocabulary size is taken from `mp`, so the data-preparation
# cell must have run before this one.
model = Sequential([
    layers.Embedding(input_dim=len(mp), output_dim=100, input_length=max_count_words),
    layers.Dropout(rate=0.1),
    layers.LSTM(units=100, dropout=0.1),
    layers.BatchNormalization(),
    layers.Dense(units=2, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [929]:
epochs = 16
batch_size = 128

# Train on the training split, holding back 10% of it for per-epoch validation.
model.fit(
    np.array(X_train),
    np.array(Y_train),
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
)

# Evaluate on the held-out test split; the result holds the loss followed by
# the compiled metric (accuracy).
tests = model.evaluate(np.array(X_test), np.array(Y_test))
test_loss, test_accuracy = tests[0], tests[1]
print(f'\nloss: {test_loss!s}\naccuracy: {test_accuracy!s}')

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16

loss: 0.40847718715667725
accuracy: 0.8209999799728394


In [967]:
def predict(review):
    """Print the model's output for a single raw review.

    The review is normalized with ``normalize_review``. Words missing from the
    ``mp`` vocabulary are dropped, and the rest is right-padded with
    ``'#FILLER#'`` up to ``max_count_words``. The words are then converted to
    indices through ``mp`` and passed to ``model.predict`` as a batch of one.

    Args:
        review: raw review text.

    Raises:
        KeyError: if padding is needed and ``'#FILLER#'`` is not in ``mp``.
    """
    # Words that were not present when `mp` was built have no index; drop them
    # instead of failing with KeyError on the lookup below.
    known_words = [word for word in normalize_review(review) if word in mp]

    # A non-positive count yields an empty list, so no padding is added then.
    filler = ['#FILLER#'] * (max_count_words - len(known_words))
    num_normalized_review = [mp[word] for word in known_words + filler]

    # Shape the single review as a (1, max_count_words) batch, using the same
    # length constant the model was built with rather than a literal.
    batch = np.array(num_normalized_review).reshape(1, max_count_words)
    answer = model.predict(batch)
    print(answer)

In [968]:
# Run the classifier on two one-word sample reviews.
for sample_review in ("Excellent", "Terrible"):
    predict(sample_review)

[[0.08944421 0.9105558 ]]
[[0.85134304 0.14865696]]
