# Imports

In [None]:
import pandas as pd
import numpy as np
import tensorflow.compat.v2 as tf 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow.keras as keras
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sb

# Bare expression: evaluates to the installed TensorFlow version string
# (shown as the cell output when run as the last statement of a notebook cell).
tf.version.VERSION

In [None]:
## matplotlib configuration
# Four-step font-size scale referenced by the rcParams entries below.
SMALL_SIZE, MEDIUM_SIZE, BIG_SIZE, LARGE_SIZE = 12, 14, 16, 20

# Notebook-wide plotting defaults, grouped by the size step they use.
params = {
    # Default figure dimensions (width, height).
    'figure.figsize': (14, 6),
    # Base font.
    'font.size': SMALL_SIZE,
    # Tick labels and per-axes titles.
    'xtick.labelsize': MEDIUM_SIZE,
    'ytick.labelsize': MEDIUM_SIZE,
    'axes.titlesize': MEDIUM_SIZE,
    # Legend entries and axis labels.
    'legend.fontsize': BIG_SIZE,
    'axes.labelsize': BIG_SIZE,
    # Figure-level title.
    'figure.titlesize': LARGE_SIZE,
}
plt.rcParams.update(params)

In [None]:
def show_history(history):
    """Draw every recorded series of a training history on one new figure.

    Args:
        history: Object exposing ``history`` (a mapping from series name to a
            sequence of values) and ``epoch`` (the x positions), e.g. the
            value returned by a Keras ``model.fit`` call.

    Each entry of ``history.history`` becomes one line labelled with its key.
    The figure is created and laid out but not explicitly shown.
    """
    plt.figure()
    x_positions = history.epoch
    for series_name, values in history.history.items():
        plt.plot(x_positions, values, label=series_name)
    plt.legend()
    plt.tight_layout()

# YELP

In [None]:
# Read the tab-separated Yelp file; the first line is the header row.
path = "../data/YelpLemmatized.txt"
yelpData = pd.read_csv(path, sep='\t', header=0, encoding="utf-8")

# Whitespace-separated token count per review, taken from the text as loaded
# (before it is lower-cased below).
review_text = yelpData['SentimentText']
row_sizes = review_text.str.split().str.len()

# Lower-case the review text in place for everything that follows.
yelpData['SentimentText'] = review_text.str.lower()

print(f"Words count: {row_sizes.sum()}")
yelpData

## Preprocessing

In [None]:
df = yelpData

# Number of whitespace-separated words in each review. The column key
# (including its spelling) is read by later cells, so it is kept as is.
df['review_lenght'] = np.array([len(text.split()) for text in df['SentimentText']])

# Summary statistics used by the histogram and by the padding length.
lengths = df['review_lenght']
mean = lengths.mean()
median = lengths.median()
_max_length = lengths.max()

In [None]:
# Histogram + KDE of review lengths, one bin per possible word count.
fig, ax = plt.subplots()
sb.distplot(
    df['review_lenght'],
    bins=int(_max_length),
    hist_kws={"alpha": 0.9, "color": "blue"},
    kde_kws={"color": "black", 'linewidth': 3},
    ax=ax,
)
ax.set_xlim(left=0, right=_max_length)
ax.set_xlabel('Počet slov v recenzi')

# Fixed y-range; the vertical reference lines span it completely.
ymax = 0.1
ax.set_ylim(0, ymax)

# (x position, legend label, line width). The "max" entry has width 0, so it
# is invisible in the plot and only contributes its label to the legend.
reference_lines = [
    (mean, f'průměr = {mean:.2f}', 3),
    (median, f'median = {median:.2f}', 3),
    (_max_length, f'max = {_max_length}', 0),
]
for x_pos, legend_text, width in reference_lines:
    ax.plot([x_pos, x_pos], [0, ymax], '--', label=legend_text, linewidth=width)

# (title disabled) ax.set_title('Distribuce slov v recenzích', fontsize=20)
ax.legend()
plt.show()

In [None]:
# Settings shared by every Yelp model below: the tokenizer's num_words limit
# and the padded sequence length (the longest review measured earlier).
max_dictionary_size = 2071
max_length = _max_length

df = yelpData
y = yelpData['Sentiment']
texts = df['SentimentText']

# Fit the tokenizer on all reviews and convert them to integer sequences.
tokenizer = Tokenizer(num_words=max_dictionary_size)
tokenizer.fit_on_texts(texts)
list_tokenized_train = tokenizer.texts_to_sequences(texts)

# Pad at the end ('post') so every row has exactly max_length entries.
X_t = pad_sequences(list_tokenized_train, maxlen=max_length, padding='post')

# Number of distinct words the tokenizer has indexed.
len(tokenizer.index_word)

## LSTM 1

In [None]:
# 5-fold stratified cross-validation of the single-layer bidirectional LSTM
# on the Yelp data; prints the mean held-out accuracy over the folds.
#
# NOTE(review): the held-out fold is passed as validation_data (driving early
# stopping / best-weight restoration) and is also the fold that is scored, so
# the reported accuracy is likely optimistic. Consider monitoring a split
# carved out of the training fold instead.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
results = []

# Stop once val_accuracy has not improved for 4 epochs and roll back to the
# best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=4,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)

for train, test in kfold.split(df['SentimentText'], y):
    # Fresh model per fold so no weights carry over between folds.
    model = keras.models.Sequential([
        keras.layers.Embedding(max_dictionary_size, 64, input_length=max_length),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.4),
        keras.layers.Bidirectional(keras.layers.LSTM(100, return_sequences=True)),
        keras.layers.GlobalMaxPooling1D(),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(100),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])

    # `train` / `test` are positional row numbers; select labels with .iloc so
    # the lookup does not depend on y having a default 0..n-1 index.
    y_train, y_test = y.iloc[train], y.iloc[test]
    model.fit(
        X_t[train], y_train,
        batch_size=8,
        epochs=12,
        validation_data=(X_t[test], y_test),
        callbacks=[early_stopping],
    )

    # evaluate() yields the loss first, then the compiled 'accuracy' metric.
    scores = model.evaluate(X_t[test], y_test)
    results.append(scores[1])

# Average over the scores actually collected rather than a separate counter.
print(f"Average accuracy = {sum(results) / len(results) * 100:0.2f} %")

## LSTM 2

In [None]:
# 5-fold stratified cross-validation of the stacked (two-layer) bidirectional
# LSTM on the Yelp data; prints the mean held-out accuracy over the folds.
#
# NOTE(review): the held-out fold is passed as validation_data (driving early
# stopping / best-weight restoration) and is also the fold that is scored, so
# the reported accuracy is likely optimistic. Consider monitoring a split
# carved out of the training fold instead.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
results = []

# Stop once val_accuracy has not improved for 4 epochs and roll back to the
# best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=4,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)

for train, test in kfold.split(df['SentimentText'], y):
    # Fresh model per fold so no weights carry over between folds.
    model = keras.models.Sequential([
        # mask_zero=True: index 0 (the padding value) is treated as a mask.
        keras.layers.Embedding(max_dictionary_size, 64, input_length=max_length, mask_zero=True),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.4),
        keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
        keras.layers.BatchNormalization(),
        keras.layers.MaxPooling1D(),
        keras.layers.Dropout(0.3),
        keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
        keras.layers.GlobalMaxPooling1D(),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(100),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])

    # `train` / `test` are positional row numbers; select labels with .iloc so
    # the lookup does not depend on y having a default 0..n-1 index.
    y_train, y_test = y.iloc[train], y.iloc[test]
    model.fit(
        X_t[train], y_train,
        batch_size=8,
        epochs=12,
        validation_data=(X_t[test], y_test),
        callbacks=[early_stopping],
    )

    # evaluate() yields the loss first, then the compiled 'accuracy' metric.
    scores = model.evaluate(X_t[test], y_test)
    results.append(scores[1])

# Average over the scores actually collected rather than a separate counter.
print(f"Average accuracy = {sum(results) / len(results) * 100:0.2f} %")

## CNN 1

In [None]:
# 5-fold stratified cross-validation of the single-convolution CNN on the
# Yelp data; prints the mean held-out accuracy over the folds.
#
# NOTE(review): the held-out fold is passed as validation_data (driving early
# stopping / best-weight restoration) and is also the fold that is scored, so
# the reported accuracy is likely optimistic. Consider monitoring a split
# carved out of the training fold instead.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
results = []

# Convolution hyper-parameters: number of filters and kernel width.
filters = 250
kernel_size = 3

# Stop once val_accuracy has not improved for 4 epochs and roll back to the
# best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=4,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)

for train, test in kfold.split(df['SentimentText'], y):
    # Fresh model per fold so no weights carry over between folds.
    model = keras.models.Sequential([
        keras.layers.Embedding(max_dictionary_size, 50, input_length=max_length),
        keras.layers.Dropout(0.3),
        keras.layers.Conv1D(filters, kernel_size, activation="relu"),
        keras.layers.GlobalMaxPooling1D(),
        # Dense -> Dropout -> ReLU: the activation is applied after dropout.
        keras.layers.Dense(250),
        keras.layers.Dropout(0.2),
        keras.layers.Activation("relu"),
        keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])

    # `train` / `test` are positional row numbers; select labels with .iloc so
    # the lookup does not depend on y having a default 0..n-1 index.
    y_train, y_test = y.iloc[train], y.iloc[test]
    model.fit(
        X_t[train], y_train,
        batch_size=8,
        epochs=10,
        validation_data=(X_t[test], y_test),
        callbacks=[early_stopping],
    )

    # evaluate() yields the loss first, then the compiled 'accuracy' metric.
    scores = model.evaluate(X_t[test], y_test)
    results.append(scores[1])

# Average over the scores actually collected rather than a separate counter.
print(f"Average accuracy = {sum(results) / len(results) * 100:0.2f} %")

## CNN 2


In [None]:
# 5-fold stratified cross-validation of the deeper (five-convolution) CNN on
# the Yelp data; prints the mean held-out accuracy over the folds.
#
# NOTE(review): the held-out fold is passed as validation_data (driving early
# stopping / best-weight restoration) and is also the fold that is scored, so
# the reported accuracy is likely optimistic. Consider monitoring a split
# carved out of the training fold instead.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
results = []

# Stop once val_accuracy has not improved for 4 epochs and roll back to the
# best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=4,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)

for train, test in kfold.split(df['SentimentText'], y):
    # Fresh model per fold so no weights carry over between folds.
    model = keras.models.Sequential([
        keras.layers.Embedding(max_dictionary_size, 64, input_length=max_length),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),
        # Four 'same'-padded convolutions keep the sequence length unchanged.
        keras.layers.Conv1D(32, 7, padding='same', activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),
        keras.layers.Conv1D(32, 3, padding='same', activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),
        keras.layers.Conv1D(32, 3, padding='same', activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),
        keras.layers.Conv1D(32, 3, padding='same', activation='relu'),
        keras.layers.Dropout(0.3),
        # Final 2-filter convolution (no activation), averaged over positions.
        keras.layers.Conv1D(2, 2),
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])

    # `train` / `test` are positional row numbers; select labels with .iloc so
    # the lookup does not depend on y having a default 0..n-1 index.
    y_train, y_test = y.iloc[train], y.iloc[test]
    model.fit(
        X_t[train], y_train,
        batch_size=8,
        epochs=10,
        validation_data=(X_t[test], y_test),
        callbacks=[early_stopping],
    )

    # evaluate() yields the loss first, then the compiled 'accuracy' metric.
    scores = model.evaluate(X_t[test], y_test)
    results.append(scores[1])

# Average over the scores actually collected rather than a separate counter.
print(f"Average accuracy = {sum(results) / len(results) * 100:0.2f} %")

# IMDB

In [None]:
# Read the tab-separated IMDB file; the first line is the header row. Quotes
# are escaped with a backslash rather than doubled.
path = "../data/Imdb50KLemmatized.tsv"
imdbData = pd.read_csv(
    path,
    sep='\t',
    header=0,
    encoding="utf-8",
    doublequote=False,
    escapechar="\\",
).drop(columns=['id'])

# Whitespace-separated token count per review, taken from the text as loaded
# (before it is lower-cased below).
review_text = imdbData['SentimentText']
row_sizes = review_text.str.split().str.len()

# Lower-case the review text in place for everything that follows.
imdbData['SentimentText'] = review_text.str.lower()

print(f"Words count: {row_sizes.sum()}")
imdbData

## Preprocessing

In [None]:
df = imdbData

# Number of whitespace-separated words in each review. The column key
# (including its spelling) is read by later cells, so it is kept as is.
df['review_lenght'] = np.array([len(text.split()) for text in df['SentimentText']])

# Summary statistics used by the histogram below.
lengths = df['review_lenght']
mean = lengths.mean()
median = lengths.median()
_max_length = lengths.max()

In [None]:
# Histogram + KDE of review lengths. Both the bin count and the visible
# x-range are limited to a quarter of the longest review.
fig, ax = plt.subplots()
sb.distplot(
    df['review_lenght'],
    bins=int(_max_length/4),
    hist_kws={"alpha": 0.9, "color": "blue"},
    kde_kws={"color": "black", 'linewidth': 3},
    ax=ax,
)
ax.set_xlim(left=0, right=_max_length/4)
ax.set_xlabel('Počet slov v recenzi')

# Fixed y-range; the vertical reference lines span it completely.
ymax = 0.008
ax.set_ylim(0, ymax)

# (x position, legend label, line width). The "max" entry has width 0 (and
# lies beyond the x-limit), so it only contributes its label to the legend.
reference_lines = [
    (mean, f'průměr = {mean:.2f}', 3),
    (median, f'median = {median:.2f}', 3),
    (_max_length, f'max = {_max_length}', 0),
]
for x_pos, legend_text, width in reference_lines:
    ax.plot([x_pos, x_pos], [0, ymax], '--', label=legend_text, linewidth=width)

# (title disabled) ax.set_title('Distribuce slov v recenzích', fontsize=20)
ax.legend()
plt.show()

In [None]:
# Tokenization and padding.
# Settings shared by every IMDB model below: the tokenizer's num_words limit
# and a fixed padded sequence length of 400.
max_dictionary_size = 10000
max_length = 400

y = imdbData['Sentiment']
texts = df['SentimentText']

# Fit the tokenizer on all reviews and convert them to integer sequences.
tokenizer = Tokenizer(num_words=max_dictionary_size)
tokenizer.fit_on_texts(texts)
list_tokenized_train = tokenizer.texts_to_sequences(texts)

# Pad at the end ('post') so every row has exactly max_length entries.
X_t = pad_sequences(list_tokenized_train, maxlen=max_length, padding='post')

# Number of distinct words the tokenizer has indexed.
len(tokenizer.index_word)

## LSTM 1

In [None]:
# 5-fold stratified cross-validation of the single-layer bidirectional LSTM
# on the IMDB data; prints the mean held-out accuracy over the folds.
#
# NOTE(review): the held-out fold is passed as validation_data (driving early
# stopping / best-weight restoration) and is also the fold that is scored, so
# the reported accuracy is likely optimistic. Consider monitoring a split
# carved out of the training fold instead.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
results = []

# Stop once val_accuracy has not improved for 3 epochs and roll back to the
# best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=3,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)

# `fold` is 1-based and only used for the progress banner.
for fold, (train, test) in enumerate(kfold.split(df['SentimentText'], y), start=1):
    print(f"******* Fold {fold} ***********")
    # Fresh model per fold so no weights carry over between folds.
    model = keras.models.Sequential([
        keras.layers.Embedding(max_dictionary_size, 64, input_length=max_length),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.4),
        keras.layers.Bidirectional(keras.layers.LSTM(100, return_sequences=True)),
        keras.layers.GlobalMaxPooling1D(),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(100),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])

    # `train` / `test` are positional row numbers; select labels with .iloc so
    # the lookup does not depend on y having a default 0..n-1 index.
    y_train, y_test = y.iloc[train], y.iloc[test]
    model.fit(
        X_t[train], y_train,
        batch_size=64,
        epochs=10,
        validation_data=(X_t[test], y_test),
        callbacks=[early_stopping],
    )

    # evaluate() yields the loss first, then the compiled 'accuracy' metric.
    scores = model.evaluate(X_t[test], y_test)
    results.append(scores[1])

# Average over the scores actually collected rather than a separate counter.
print(f"Average accuracy = {sum(results) / len(results) * 100:0.2f} %")

## LSTM 2

In [None]:
# 5-fold stratified cross-validation of the stacked (two-layer) bidirectional
# LSTM on the IMDB data; prints the mean held-out accuracy over the folds.
#
# NOTE(review): the held-out fold is passed as validation_data (driving early
# stopping / best-weight restoration) and is also the fold that is scored, so
# the reported accuracy is likely optimistic. Consider monitoring a split
# carved out of the training fold instead.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
results = []

# Stop once val_accuracy has not improved for 3 epochs and roll back to the
# best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=3,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)

# `fold` is 1-based and only used for the progress banner.
for fold, (train, test) in enumerate(kfold.split(df['SentimentText'], y), start=1):
    print(f"******* Fold {fold} ***********")
    # Fresh model per fold so no weights carry over between folds.
    model = keras.models.Sequential([
        # mask_zero=True: index 0 (the padding value) is treated as a mask.
        keras.layers.Embedding(max_dictionary_size, 64, input_length=max_length, mask_zero=True),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.4),
        keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
        keras.layers.BatchNormalization(),
        keras.layers.MaxPooling1D(),
        keras.layers.Dropout(0.3),
        keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
        keras.layers.GlobalMaxPooling1D(),
        keras.layers.BatchNormalization(),
        keras.layers.Dense(100),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])

    # `train` / `test` are positional row numbers; select labels with .iloc so
    # the lookup does not depend on y having a default 0..n-1 index.
    y_train, y_test = y.iloc[train], y.iloc[test]
    model.fit(
        X_t[train], y_train,
        batch_size=64,
        epochs=10,
        validation_data=(X_t[test], y_test),
        callbacks=[early_stopping],
    )

    # evaluate() yields the loss first, then the compiled 'accuracy' metric.
    scores = model.evaluate(X_t[test], y_test)
    results.append(scores[1])

# Average over the scores actually collected rather than a separate counter.
print(f"Average accuracy = {sum(results) / len(results) * 100:0.2f} %")

## CNN 1

In [None]:
# 5-fold stratified cross-validation of the single-convolution CNN on the
# IMDB data; prints the mean held-out accuracy over the folds.
#
# NOTE(review): the held-out fold is passed as validation_data (driving early
# stopping / best-weight restoration) and is also the fold that is scored, so
# the reported accuracy is likely optimistic. Consider monitoring a split
# carved out of the training fold instead.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
results = []

# Convolution hyper-parameters: number of filters and kernel width.
filters = 250
kernel_size = 3

# Stop once val_accuracy has not improved for 3 epochs and roll back to the
# best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=3,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)

# `fold` is 1-based and only used for the progress banner.
for fold, (train, test) in enumerate(kfold.split(df['SentimentText'], y), start=1):
    print(f"******* Fold {fold} ***********")
    # Fresh model per fold so no weights carry over between folds.
    model = keras.models.Sequential([
        keras.layers.Embedding(max_dictionary_size, 50, input_length=max_length),
        keras.layers.Dropout(0.3),
        keras.layers.Conv1D(filters, kernel_size, activation="relu"),
        keras.layers.GlobalMaxPooling1D(),
        # Dense -> Dropout -> ReLU: the activation is applied after dropout.
        keras.layers.Dense(250),
        keras.layers.Dropout(0.2),
        keras.layers.Activation("relu"),
        keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])

    # `train` / `test` are positional row numbers; select labels with .iloc so
    # the lookup does not depend on y having a default 0..n-1 index.
    y_train, y_test = y.iloc[train], y.iloc[test]
    model.fit(
        X_t[train], y_train,
        batch_size=32,
        epochs=10,
        validation_data=(X_t[test], y_test),
        callbacks=[early_stopping],
    )

    # evaluate() yields the loss first, then the compiled 'accuracy' metric.
    scores = model.evaluate(X_t[test], y_test)
    results.append(scores[1])

# Average over the scores actually collected rather than a separate counter.
print(f"Average accuracy = {sum(results) / len(results) * 100:0.2f} %")


## CNN 2

In [None]:
# 5-fold stratified cross-validation of the deeper (five-convolution) CNN on
# the IMDB data; prints the mean held-out accuracy over the folds.
#
# NOTE(review): the held-out fold is passed as validation_data (driving early
# stopping / best-weight restoration) and is also the fold that is scored, so
# the reported accuracy is likely optimistic. Consider monitoring a split
# carved out of the training fold instead.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
results = []

# NOTE(review): these two values are not referenced by the model below (its
# convolutions use literal sizes); kept so the notebook globals are unchanged.
filters = 250
kernel_size = 3

# Stop once val_accuracy has not improved for 3 epochs and roll back to the
# best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=3,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)

# `fold` is 1-based and only used for the progress banner.
for fold, (train, test) in enumerate(kfold.split(df['SentimentText'], y), start=1):
    print(f"******* Fold {fold} ***********")
    # Fresh model per fold so no weights carry over between folds.
    model = keras.models.Sequential([
        keras.layers.Embedding(max_dictionary_size, 64, input_length=max_length),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),
        # Four 'same'-padded convolutions keep the sequence length unchanged.
        keras.layers.Conv1D(32, 7, padding='same', activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),
        keras.layers.Conv1D(32, 3, padding='same', activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),
        keras.layers.Conv1D(32, 3, padding='same', activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),
        keras.layers.Conv1D(32, 3, padding='same', activation='relu'),
        keras.layers.Dropout(0.3),
        # Final 2-filter convolution (no activation), averaged over positions.
        keras.layers.Conv1D(2, 2),
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])

    # `train` / `test` are positional row numbers; select labels with .iloc so
    # the lookup does not depend on y having a default 0..n-1 index.
    y_train, y_test = y.iloc[train], y.iloc[test]
    model.fit(
        X_t[train], y_train,
        batch_size=32,
        epochs=10,
        validation_data=(X_t[test], y_test),
        callbacks=[early_stopping],
    )

    # evaluate() yields the loss first, then the compiled 'accuracy' metric.
    scores = model.evaluate(X_t[test], y_test)
    results.append(scores[1])

# Average over the scores actually collected rather than a separate counter.
print(f"Average accuracy = {sum(results) / len(results) * 100:0.2f} %")