# Imports

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import tensorflow.compat.v2 as tf 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow.keras as keras
from sklearn.model_selection import StratifiedKFold
from gensim.models.phrases import Phrases, Phraser

# Bare expression: when run as a notebook cell this shows the installed
# TensorFlow version as the cell's output.
tf.version.VERSION

In [None]:
import nltk
# Fetch the NLTK "stopwords" corpus that stopwords.words("english") reads below.
nltk.download('stopwords')

# Preprocessing methods

In [None]:
# English stop words from NLTK, kept in a set for fast membership tests.
stop_words = set(stopwords.words("english"))


def remove_stop_words(text):
    """Return ``text`` with every stop word dropped.

    The text is split on whitespace, tokens that appear in the module-level
    ``stop_words`` set are discarded, and the remaining tokens are re-joined
    with single spaces. Matching is an exact string comparison.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return " ".join(kept_tokens)

def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace.

    Letters, digits and underscores are kept, as is all whitespace; any other
    character is removed outright (it is not replaced by a space).
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

In [None]:
## Tokenization and padding
def vectorize_text(_data, _dict_size: int, _max_length: int):
    """Turn the ``SentimentText`` column into a padded matrix of word ids.

    Args:
        _data: table with a ``SentimentText`` column of review strings.
        _dict_size: passed to the Keras ``Tokenizer`` as ``num_words``.
        _max_length: passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``.

    Prints the requested maximum length and the size of the tokenizer's
    ``index_word`` mapping as progress output.
    """
    reviews = _data['SentimentText']

    # Fit the word index on the reviews, then encode the same reviews with it.
    word_indexer = Tokenizer(num_words=_dict_size)
    word_indexer.fit_on_texts(reviews)
    encoded_reviews = word_indexer.texts_to_sequences(reviews)

    print(f"Max length = {_max_length}")
    padded_reviews = pad_sequences(encoded_reviews, maxlen=_max_length, padding='post')
    print(len(word_indexer.index_word))
    return padded_reviews

In [None]:
def join_phrases(text: str, ngrams):
    """Run ``text`` through a phrase model and return the result as one string.

    ``text`` is split on whitespace, the token list is handed to ``ngrams`` by
    indexing (``ngrams[tokens]``), and the tokens it yields are joined with
    single spaces.
    """
    tokens = text.split()
    phrased_tokens = ngrams[tokens]
    return " ".join(phrased_tokens)

In [None]:
## Create phrases of bigrams / trigrams and vectorize text + padding 
def vectorize_ngrams(_data, _dict_size: int, _max_length: int, ngram_size: int, threshold: int):
    """Merge detected bigram/trigram phrases, then tokenize and pad the reviews.

    Args:
        _data: table with a ``SentimentText`` column of whitespace-separated
            review strings.
        _dict_size: passed to the Keras ``Tokenizer`` as ``num_words``.
        _max_length: passed to ``pad_sequences`` as ``maxlen``.
        ngram_size: 3 builds trigrams (a second phrase pass on top of the
            bigrams); any smaller value builds bigrams only.
        threshold: passed to every gensim ``Phrases`` model as ``threshold``.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``.

    Raises:
        ValueError: if ``ngram_size`` is greater than 3.
    """
    # Reject unsupported sizes before any phrase model is trained.
    if ngram_size > 3:
        raise ValueError("Not implemented for this ngram size!")

    # Keep the corpus as a plain list of token lists. Reviews have different
    # lengths, and wrapping such a ragged list in np.array raises a ValueError
    # on current NumPy releases.
    token_lists = [review.split() for review in _data['SentimentText']]

    bigram_model = Phrases(sentences=token_lists, threshold=threshold)
    phrasers = [Phraser(bigram_model)]
    if ngram_size == 3:
        # The trigram model is trained on the bigram-merged corpus, so it must
        # also be *applied* to bigram-merged tokens; it uses the same threshold
        # as the first pass so the threshold sweep affects both levels.
        trigram_model = Phrases(sentences=bigram_model[token_lists], threshold=threshold)
        phrasers.append(Phraser(trigram_model))

    def _to_phrased_text(tokens):
        # Apply the phrase passes in training order (bigram, then trigram).
        for phraser in phrasers:
            tokens = phraser[tokens]
        return " ".join(tokens)

    text_ngrams = [_to_phrased_text(tokens) for tokens in token_lists]

    # The filter list contains no "_" — presumably so that phrases joined by
    # the phraser survive tokenization as single tokens (confirm against the
    # gensim delimiter in use).
    tokenizer = Tokenizer(num_words=_dict_size, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
    tokenizer.fit_on_texts(text_ngrams)
    list_tokenized_train = tokenizer.texts_to_sequences(text_ngrams)
    X_t = pad_sequences(list_tokenized_train, maxlen=_max_length, padding='post')
    # Report the size of the tokenizer's index, as vectorize_text does.
    print(len(tokenizer.index_word))
    return X_t

# Process method

In [None]:
## Text preprocessing + model training + evaluation
def preprocess_train(data, functions, classifier, _dict_size, word_ngrams=1, words_per_review=None, threshold=10):
    """Clean the review text, vectorize it, then train and score a classifier.

    Args:
        data: table with ``SentimentText`` (reviews) and ``Sentiment`` (labels).
        functions: text-cleaning callables, applied to every review in order.
        classifier: called as ``classifier(X, y, max_length)``.
        _dict_size: dictionary size forwarded to the vectorizer.
        word_ngrams: 1 selects ``vectorize_text``; any other value selects
            ``vectorize_ngrams``.
        words_per_review: sequence length to pad to; when ``None`` the word
            count of the longest cleaned review is used.
        threshold: phrase threshold forwarded to ``vectorize_ngrams``.

    Returns:
        Whatever ``classifier`` returns.
    """
    # Separate single-column frame: the columns assigned below are set on this
    # frame, not on ``data``.
    _data = pd.DataFrame(data['SentimentText'])
    y = data['Sentiment']

    # Run the cleaning steps one after another over every review.
    for function in functions:
        _data['SentimentText'] = _data['SentimentText'].apply(function)

    # Progress output: total word count, the cleaned frame, the n-gram size.
    words_per_row = _data['SentimentText'].str.split().str.len()
    print(f"Words count: {words_per_row.sum()}")
    print(_data)
    print(f"Words ngrams: {word_ngrams}")

    # Word count of each cleaned review, stored next to the text.
    _data['review_lenght'] = np.array([len(review.split()) for review in _data['SentimentText']])

    # Pad to the caller's length if given, else to the longest review.
    max_length = _data['review_lenght'].max() if words_per_review is None else words_per_review

    if word_ngrams != 1:
        X_data = vectorize_ngrams(_data, _dict_size, max_length, word_ngrams, threshold)
    else:
        X_data = vectorize_text(_data, _dict_size, max_length)

    # Train and evaluate; the classifier's result is handed straight back.
    return classifier(X_data, y, max_length)

# YELP

## Load Data

In [None]:
# Yelp reviews: tab-separated file whose first row is the header.
path = "../data/yelp_labelled.txt"
yelpData = pd.read_csv(path, sep='\t', header=0, encoding="utf-8")
# Per-review word counts (whitespace split), taken before lower-casing.
row_sizes = yelpData['SentimentText'].str.split().str.len()
yelpData['SentimentText'] = yelpData['SentimentText'].str.lower()
print(f"Words count: {pd.Series.sum(row_sizes)}")
# Dictionary size for the Yelp runs. lstm_yelp / cnn_yelp also read this
# global for their Embedding size, and the IMDB section rebinds it to 10000.
max_dictionary_size = 2071
# Bare expression: shows the frame as the cell's output.
yelpData

## Load lemmatized Data

In [None]:
# Lemmatized Yelp reviews: tab-separated file whose first row is the header.
# NOTE(review): unlike the raw Yelp cell, the text is not lower-cased here —
# presumably the lemmatized file is already lower-case; confirm.
path = "../data/YelpLemmatized.txt"
yelpDataLem = pd.read_csv(path, sep='\t', header=0, encoding="utf-8")
# Per-review word counts (whitespace split).
row_sizes = yelpDataLem['SentimentText'].str.split().str.len()
print(f"Words count: {pd.Series.sum(row_sizes)}")
# Bare expression: shows the frame as the cell's output.
yelpDataLem

## LSTM create and train model for yelp

In [None]:
def lstm_yelp(_data, _targets, max_length):
    """Cross-validate a bidirectional-LSTM sentiment classifier (Yelp settings).

    Runs stratified 5-fold cross-validation (shuffled, ``random_state=1``).
    Every fold builds a fresh model (Embedding(16) -> Bidirectional LSTM(16,
    returning sequences) -> global max pooling -> Dense(16) -> sigmoid unit),
    trains it with Adam and binary cross-entropy at batch size 8 for at most
    10 epochs, and stops early on ``val_accuracy`` (patience 4, best weights
    restored).

    Args:
        _data: padded word-id matrix, one row per review; rows are selected
            with the fold indices.
        _targets: one label per row; a pandas Series or any array-like.
        max_length: sequence length given to the Embedding layer.

    Returns:
        Mean of the per-fold accuracies, in percent.

    The Embedding input size is read from the module-level
    ``max_dictionary_size`` at call time.

    NOTE(review): each fold's test split is also the validation set that
    drives early stopping and weight restoring, so the reported accuracy is
    selected on the data it is measured on and is likely optimistic.
    """
    # kfold.split yields positions, but indexing a pandas Series with an
    # integer array selects by label. Plain arrays make the selection
    # positional regardless of the index the caller's data carries.
    features = np.asarray(_data)
    labels = np.asarray(_targets)

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                                   min_delta=0,
                                                   patience=4,
                                                   verbose=1,
                                                   mode='auto',
                                                   restore_best_weights=True)

    results = []
    for fold, (train, test) in enumerate(kfold.split(features, labels), start=1):
        print(f"******* Fold {fold} ***********")
        model = keras.models.Sequential([
            keras.layers.Embedding(max_dictionary_size, 16, input_length=max_length),
            keras.layers.Bidirectional(keras.layers.LSTM(16, return_sequences=True)),
            keras.layers.GlobalMaxPooling1D(),
            keras.layers.Dense(16),
            keras.layers.Dense(1, activation="sigmoid"),
        ])

        model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
        model.fit(features[train], labels[train], batch_size=8, epochs=10, verbose=0,
                  validation_data=(features[test], labels[test]), callbacks=[early_stopping])
        # evaluate returns the loss followed by the compiled metrics; keep accuracy.
        scores = model.evaluate(features[test], labels[test])
        results.append(scores[1])

    avg = sum(results) / len(results) * 100
    print(f"Average accuracy = {avg:0.2f} %")
    return avg

## CNN create and train model for yelp

In [None]:
def cnn_yelp(_data, _targets, max_length):
    """Cross-validate a 1-D convolutional sentiment classifier (Yelp settings).

    Runs stratified 5-fold cross-validation (shuffled, ``random_state=1``).
    Every fold builds a fresh model (Embedding(16) -> Conv1D(64 filters,
    kernel 3, ReLU) -> global max pooling -> Dense(64) + ReLU -> sigmoid unit),
    trains it with Adam and binary cross-entropy at batch size 8 for at most
    10 epochs, and stops early on ``val_accuracy`` (patience 4, best weights
    restored).

    Args:
        _data: padded word-id matrix, one row per review; rows are selected
            with the fold indices.
        _targets: one label per row; a pandas Series or any array-like.
        max_length: sequence length given to the Embedding layer.

    Returns:
        Mean of the per-fold accuracies, in percent.

    The Embedding input size is read from the module-level
    ``max_dictionary_size`` at call time.

    NOTE(review): each fold's test split is also the validation set that
    drives early stopping and weight restoring, so the reported accuracy is
    selected on the data it is measured on and is likely optimistic.
    """
    # kfold.split yields positions, but indexing a pandas Series with an
    # integer array selects by label. Plain arrays make the selection
    # positional regardless of the index the caller's data carries.
    features = np.asarray(_data)
    labels = np.asarray(_targets)

    filters = 64
    kernel_size = 3
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                                   min_delta=0,
                                                   patience=4,
                                                   verbose=1,
                                                   mode='auto',
                                                   restore_best_weights=True)

    results = []
    for fold, (train, test) in enumerate(kfold.split(features, labels), start=1):
        print(f"******* Fold {fold} ***********")
        model = keras.models.Sequential([
            keras.layers.Embedding(max_dictionary_size, 16, input_length=max_length),
            keras.layers.Conv1D(filters, kernel_size, activation="relu"),
            keras.layers.GlobalMaxPooling1D(),
            keras.layers.Dense(64),
            keras.layers.Activation("relu"),
            keras.layers.Dense(1, activation="sigmoid"),
        ])

        model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
        model.fit(features[train], labels[train], batch_size=8, epochs=10, verbose=0,
                  validation_data=(features[test], labels[test]), callbacks=[early_stopping])
        # evaluate returns the loss followed by the compiled metrics; keep accuracy.
        scores = model.evaluate(features[test], labels[test])
        results.append(scores[1])

    avg = sum(results) / len(results) * 100
    print(f"Average accuracy = {avg:0.2f} %")
    return avg

## Remove punctuation

In [None]:
# Yelp, punctuation removed, bidirectional LSTM; dictionary size max_dictionary_size.
lstm_result = preprocess_train(yelpData, [remove_punctuation], lstm_yelp, max_dictionary_size)

In [None]:
# Yelp, punctuation removed, CNN; dictionary size max_dictionary_size.
cnn_result = preprocess_train(yelpData, [remove_punctuation], cnn_yelp, max_dictionary_size)

## Remove stopwords

In [None]:
# Yelp, stop words removed, bidirectional LSTM; dictionary size max_dictionary_size.
lstm_result = preprocess_train(yelpData, [remove_stop_words], lstm_yelp, max_dictionary_size)

In [None]:
# Yelp, stop words removed, CNN; dictionary size max_dictionary_size.
cnn_result = preprocess_train(yelpData, [remove_stop_words], cnn_yelp, max_dictionary_size)

## Lemmatization

In [None]:
# Lemmatized Yelp data, no further cleaning, bidirectional LSTM; dictionary size 1771.
lstm_result = preprocess_train(yelpDataLem, [], lstm_yelp, 1771)

In [None]:
# Lemmatized Yelp data, no further cleaning, CNN; dictionary size 1771.
cnn_result = preprocess_train(yelpDataLem, [], cnn_yelp, 1771)

## Remove stop words AND remove punctuation

In [None]:
# Yelp, stop words then punctuation removed, bidirectional LSTM; dictionary
# size 1691. This experiment has no lemmatization step, so it runs on the raw
# Yelp frame — the same input as the CNN run of this section.
lstm_result = preprocess_train(yelpData, [remove_stop_words, remove_punctuation], lstm_yelp, 1691)

In [None]:
# Yelp, stop words then punctuation removed, CNN; dictionary size 1691.
cnn_result = preprocess_train(yelpData, [remove_stop_words, remove_punctuation], cnn_yelp, 1691)

## Remove stop words AND Lemmatization

In [None]:
# Lemmatized Yelp data, stop words removed, bidirectional LSTM; dictionary size 1693.
lstm_result = preprocess_train(yelpDataLem, [remove_stop_words], lstm_yelp, 1693)

In [None]:
# Lemmatized Yelp data, stop words removed, CNN; dictionary size 1693.
# CNN counterpart of the LSTM run in the previous cell (this cell used to
# repeat the LSTM run, leaving the section without a CNN result).
cnn_result = preprocess_train(yelpDataLem, [remove_stop_words], cnn_yelp, 1693)

## N-GRAMS

In [None]:
# Yelp bidirectional-LSTM sweep: n-gram size 1-3 x phrase threshold 10/20/40.
# For i == 1 preprocess_train takes the vectorize_text path, which does not
# use the threshold, so the three unigram runs repeat the same configuration.
# NOTE(review): this sweep passes max_dictionary_size while the CNN sweep on
# the same data passes 1771 — confirm which dictionary size is intended.
# `scores` is overwritten on every iteration; only the printed output remains.
for i in range(1, 4):
    for j in [10, 20, 40]:
        scores = preprocess_train(yelpDataLem, [], lstm_yelp, max_dictionary_size, word_ngrams=i, threshold=j)

In [None]:
# Yelp CNN sweep: n-gram size 1-3 x phrase threshold 10/20/40.
# For i == 1 preprocess_train takes the vectorize_text path, which does not
# use the threshold, so the three unigram runs repeat the same configuration.
# NOTE(review): this sweep passes 1771 while the LSTM sweep on the same data
# passes max_dictionary_size — confirm which dictionary size is intended.
# `scores` is overwritten on every iteration; only the printed output remains.
for i in range(1, 4):
    for j in [10, 20, 40]:
        scores = preprocess_train(yelpDataLem, [], cnn_yelp, 1771, word_ngrams=i, threshold=j)

# IMDB

## Load data

In [None]:
# IMDB reviews: tab-separated, first row is the header; quotes are not
# doubled and a backslash is the escape character.
path = "../data/imdb_50k.tsv"
imdbData = pd.read_csv(path, sep='\t', header=0, encoding="utf-8", doublequote=False, escapechar="\\")
# The id column is not used by the experiments.
imdbData = imdbData.drop(['id'], axis=1)
# Per-review word counts (whitespace split), taken before lower-casing.
row_sizes = imdbData['SentimentText'].str.split().str.len()
imdbData['SentimentText'] = imdbData['SentimentText'].str.lower()
print(f"Words count: {pd.Series.sum(row_sizes)}")
# Rebinds the global that every classifier function reads for its Embedding
# size; from here on the Yelp classifiers would see 10000 as well.
max_dictionary_size = 10000
# Passed as words_per_review in the IMDB runs, i.e. the padded review length.
max_review_words = 400
# Bare expression: shows the frame as the cell's output.
imdbData

## Load lemmatized data

In [None]:
# Lemmatized IMDB reviews, read with the same CSV options as the raw file.
path = "../data/Imdb50KLemmatized.tsv"
imdbDataLem = pd.read_csv(path, sep='\t', header=0, encoding="utf-8", doublequote=False, escapechar="\\")
# The id column is not used by the experiments.
imdbDataLem = imdbDataLem.drop(['id'], axis=1)
# Per-review word counts (whitespace split), taken before lower-casing.
row_sizes = imdbDataLem['SentimentText'].str.split().str.len()
imdbDataLem['SentimentText'] = imdbDataLem['SentimentText'].str.lower()
print(f"Words count: {pd.Series.sum(row_sizes)}")
# Bare expression: shows the frame as the cell's output.
imdbDataLem

## LSTM method for imdb

In [None]:
def lstm_imdb(_data, _targets, max_length):
    """Cross-validate a bidirectional-LSTM sentiment classifier (IMDB settings).

    Runs stratified 5-fold cross-validation (shuffled, ``random_state=1``).
    Every fold builds a fresh model (Embedding(16) -> Bidirectional LSTM(16,
    returning sequences) -> global max pooling -> Dense(16) -> sigmoid unit),
    trains it with Adam and binary cross-entropy at batch size 64 for at most
    10 epochs, and stops early on ``val_accuracy`` (patience 3, best weights
    restored).

    Args:
        _data: padded word-id matrix, one row per review; rows are selected
            with the fold indices.
        _targets: one label per row; a pandas Series or any array-like.
        max_length: sequence length given to the Embedding layer.

    Returns:
        Mean of the per-fold accuracies, in percent.

    The Embedding input size is read from the module-level
    ``max_dictionary_size`` at call time.

    NOTE(review): each fold's test split is also the validation set that
    drives early stopping and weight restoring, so the reported accuracy is
    selected on the data it is measured on and is likely optimistic.
    """
    # kfold.split yields positions, but indexing a pandas Series with an
    # integer array selects by label. Plain arrays make the selection
    # positional regardless of the index the caller's data carries.
    features = np.asarray(_data)
    labels = np.asarray(_targets)

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                                   min_delta=0,
                                                   patience=3,
                                                   verbose=1,
                                                   mode='auto',
                                                   restore_best_weights=True)

    results = []
    for fold, (train, test) in enumerate(kfold.split(features, labels), start=1):
        print(f"******* Fold {fold} ***********")
        model = keras.models.Sequential([
            keras.layers.Embedding(max_dictionary_size, 16, input_length=max_length),
            keras.layers.Bidirectional(keras.layers.LSTM(16, return_sequences=True)),
            keras.layers.GlobalMaxPooling1D(),
            keras.layers.Dense(16),
            keras.layers.Dense(1, activation="sigmoid"),
        ])

        model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
        model.fit(features[train], labels[train], batch_size=64, epochs=10, verbose=0,
                  validation_data=(features[test], labels[test]), callbacks=[early_stopping])
        # evaluate returns the loss followed by the compiled metrics; keep accuracy.
        scores = model.evaluate(features[test], labels[test])
        results.append(scores[1])

    avg = sum(results) / len(results) * 100
    print(f"Average accuracy = {avg:0.2f} %")
    return avg

## CNN method for imdb

In [None]:
def cnn_imdb(_data, _targets, max_length):
    """Cross-validate a 1-D convolutional sentiment classifier (IMDB settings).

    Runs stratified 5-fold cross-validation (shuffled, ``random_state=1``).
    Every fold builds a fresh model (Embedding(16) -> Conv1D(64 filters,
    kernel 3, ReLU) -> global max pooling -> Dense(64) + ReLU -> sigmoid unit),
    trains it with Adam and binary cross-entropy at batch size 32 for at most
    10 epochs, and stops early on ``val_accuracy`` (patience 3, best weights
    restored).

    Args:
        _data: padded word-id matrix, one row per review; rows are selected
            with the fold indices.
        _targets: one label per row; a pandas Series or any array-like.
        max_length: sequence length given to the Embedding layer.

    Returns:
        Mean of the per-fold accuracies, in percent.

    The Embedding input size is read from the module-level
    ``max_dictionary_size`` at call time.

    NOTE(review): each fold's test split is also the validation set that
    drives early stopping and weight restoring, so the reported accuracy is
    selected on the data it is measured on and is likely optimistic.
    """
    # kfold.split yields positions, but indexing a pandas Series with an
    # integer array selects by label. Plain arrays make the selection
    # positional regardless of the index the caller's data carries.
    features = np.asarray(_data)
    labels = np.asarray(_targets)

    filters = 64
    kernel_size = 3
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                                   min_delta=0,
                                                   patience=3,
                                                   verbose=1,
                                                   mode='auto',
                                                   restore_best_weights=True)

    results = []
    for fold, (train, test) in enumerate(kfold.split(features, labels), start=1):
        print(f"******* Fold {fold} ***********")
        model = keras.models.Sequential([
            keras.layers.Embedding(max_dictionary_size, 16, input_length=max_length),
            keras.layers.Conv1D(filters, kernel_size, activation="relu"),
            keras.layers.GlobalMaxPooling1D(),
            keras.layers.Dense(64),
            keras.layers.Activation("relu"),
            keras.layers.Dense(1, activation="sigmoid"),
        ])

        model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
        model.fit(features[train], labels[train], batch_size=32, epochs=10, verbose=0,
                  validation_data=(features[test], labels[test]), callbacks=[early_stopping])
        # evaluate returns the loss followed by the compiled metrics; keep accuracy.
        scores = model.evaluate(features[test], labels[test])
        results.append(scores[1])

    avg = sum(results) / len(results) * 100
    print(f"Average accuracy = {avg:0.2f} %")
    return avg

## Remove punctuation

In [None]:
# IMDB, punctuation removed, bidirectional LSTM; reviews padded to max_review_words.
lstm_result = preprocess_train(imdbData, [remove_punctuation], lstm_imdb, max_dictionary_size, words_per_review=max_review_words)

In [None]:
# IMDB, punctuation removed, CNN; reviews padded to max_review_words.
cnn_result = preprocess_train(imdbData, [remove_punctuation], cnn_imdb, max_dictionary_size, words_per_review=max_review_words)

## Remove stopwords

In [None]:
# IMDB, stop words removed, bidirectional LSTM; reviews padded to max_review_words.
lstm_result = preprocess_train(imdbData, [remove_stop_words], lstm_imdb, max_dictionary_size, words_per_review=max_review_words)

In [None]:
# IMDB, stop words removed, CNN; reviews padded to max_review_words.
cnn_result = preprocess_train(imdbData, [remove_stop_words], cnn_imdb, max_dictionary_size, words_per_review=max_review_words)

## Lemmatization

In [None]:
# Lemmatized IMDB data, no further cleaning, bidirectional LSTM.
lstm_result = preprocess_train(imdbDataLem, [], lstm_imdb, max_dictionary_size, words_per_review=max_review_words)
# Bare expression: shows the returned average accuracy as the cell's output.
lstm_result

In [None]:
# Lemmatized IMDB data, no further cleaning, CNN.
cnn_result = preprocess_train(imdbDataLem, [], cnn_imdb, max_dictionary_size, words_per_review=max_review_words)

## Remove stopwords AND remove punctuation

In [None]:
# IMDB, stop words then punctuation removed, bidirectional LSTM.
lstm_result = preprocess_train(imdbData, [remove_stop_words, remove_punctuation], lstm_imdb, max_dictionary_size, words_per_review=max_review_words)

In [None]:
# IMDB, stop words then punctuation removed, CNN.
cnn_result = preprocess_train(imdbData, [remove_stop_words, remove_punctuation], cnn_imdb, max_dictionary_size, words_per_review=max_review_words)

## Remove stopwords AND Lemmatization

In [None]:
# Lemmatized IMDB data, stop words removed, bidirectional LSTM.
lstm_result = preprocess_train(imdbDataLem, [remove_stop_words], lstm_imdb, max_dictionary_size, words_per_review=max_review_words)

In [None]:
# Lemmatized IMDB data, stop words removed, CNN.
cnn_result = preprocess_train(imdbDataLem, [remove_stop_words], cnn_imdb, max_dictionary_size, words_per_review=max_review_words)

## N-grams

In [None]:
# IMDB bidirectional-LSTM sweep: n-gram size 1-3 x phrase threshold 10/20/40.
# For i == 1 preprocess_train takes the vectorize_text path, which does not
# use the threshold, so the three unigram runs repeat the same configuration.
# `scores` is overwritten on every iteration; only the printed output remains.
for i in range(1, 4):
    for j in [10, 20, 40]:
        # Progress label (the model name was misspelled "LTSM" here).
        print(f"LSTM ngram {i}, threshold {j}")
        scores = preprocess_train(imdbDataLem, [], lstm_imdb, max_dictionary_size, words_per_review=max_review_words, word_ngrams=i, threshold=j)

In [None]:
# IMDB CNN sweep: n-gram size 1-3 x phrase threshold 10/20/40.
# For i == 1 preprocess_train takes the vectorize_text path, which does not
# use the threshold, so the three unigram runs repeat the same configuration.
# `scores` is overwritten on every iteration; only the printed output remains.
for i in range(1, 4):
    for j in [10, 20, 40]:
        print(f"CNN ngram {i}, threshold {j}")
        scores = preprocess_train(imdbDataLem, [], cnn_imdb, max_dictionary_size, words_per_review=max_review_words, word_ngrams=i, threshold=j)