In [17]:
import tensorflow as tf
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.utils.np_utils import to_categorical


from sklearn.model_selection import train_test_split

import os
import csv
import re

## Preprocess

In [18]:
# os.getcwd()

sentences = []
labels = []

stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
    return reviews


with open('../data/IMDB Dataset.csv') as file:
    csv_reader = csv.reader(file, delimiter=',')
    next(csv_reader, None)

    for row in csv_reader:
        labels.append(row[1])
        sentence = row[0]
        
        for word in stopwords:
            token = " " + word + " "
            sentence = sentence.replace(token, " ").replace("  ", " ")
            
        # Remove punctuations and numbers
        sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)
        # Single character removal
        sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
        # Removing multiple spaces
        sentence = re.sub(r'\s+', ' ', sentence)
        
        sentences.append(sentence)

sentences = preprocess_reviews(sentences)
        

In [19]:
# Find max length in sentences
longest_string = max(sentences, key=len)
print(longest_string)


match tag team table match bubba ray spike dudley vs eddie guerrero chris benoit bubba ray spike dudley started things off tag team table match eddie guerrero chris benoit according rules match opponents go tables order get win benoit guerrero heated early taking turns hammering first spike bubba ray german suplex benoit bubba took wind dudley brother spike tried help brother referee restrained benoit guerrero ganged corner with benoit stomping away bubba guerrero set table outside spike dashed ring somersaulted top rope onto guerrero outside after recovering taking care spike guerrero slipped table ring helped wolverine set up the tandem set double superplex middle rope put bubba table spike knocked table right brother came crashing down guerrero benoit propped another table corner tried irish whip spike it bubba dashed blocked brother bubba caught fire lifted opponents back body drops bubba slammed guerrero spike stomped wolverine off top rope bubba held benoit bay spike soar wassup 

In [20]:
'''
2. Hyperparameters
'''
# vocab_size = 5000
embedding_dim = 128
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"


In [67]:
# Train Test split
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.20, random_state=42) 

In [68]:
tokenizer = Tokenizer(num_words=5000, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
# tokenizer.fit_on_texts(X_test)

word_index = tokenizer.word_index

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = pad_sequences(X_train, maxlen=max_length, padding=padding_type, truncating=trunc_type)
X_test = pad_sequences(X_test, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Labels tokenizer - DO NOT DO THIS
# labels_tokenizer = Tokenizer()
# labels_tokenizer.fit_on_texts(labels)
# # # Convert labels to integer encoding
# y_train = labels_tokenizer.texts_to_sequences(y_train)
# y_test = labels_tokenizer.texts_to_sequences(y_test)

y_train = np.array(list(map(lambda x: 1 if x=='positive' else 0, y_train)))
y_test = np.array(list(map(lambda x: 1 if x=='positive' else 0, y_test)))

# Convert sequences to numpy array
X_train, X_test = np.array(X_train), np.array(X_test)
# y_train, y_test = np.array(y_train), np.array(y_test)

In [65]:
# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1
vocab_size

51621

# NOTE: 
Difference between (i) tokenizing labels and converting it to sequences vs. (ii) simply converting it to 0s and 1s.

- Model was getting weird result if (i), loss was decreasing to negatives but accuracy does not increase. 

Fit on text on train, test, or both?
- You MUST use the same tokenizer in training and test data. It's not guaranteed that train and test data will have the same words with same frequencies, so each dataset will create a different dictionary when fit_on_texts is ran. The main idea of dividing your dataset into train and test is to evaluate your model for future unkown situations in a objetive way. That's said, if you fit your tokenizer on whole dataset you are somehow biasing your model. For a good evaluation of your model, you have to take in account the UNK tokens. So as any other kind of "feature extraction" the best practices are to ONLY FIT ON TRAIN and apply to all.



In [50]:
X_train.shape

(40000, 100)

## Create an LSTM model
Sentiment analysis is a seq2vec problem. Keep this in mind when creating the input and output dimensions

In [69]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid')    
])

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 128)          6607488   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_5 (Dense)              (None, 24)                3096      
_________________________________________________________________
flatten_3 (Flatten)          (None, 24)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 25        
Total params: 6,709,425
Trainable params: 6,709,425
Non-trainable params: 0
_________________________________________________________________


In [70]:
num_epochs = 5

history = model.fit(X_train, y_train, batch_size=128, validation_split=0.2, epochs=num_epochs, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Evaluate on Test set

In [71]:
# Evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results = model.evaluate(X_test, y_test, batch_size=128)
print("test loss, test acc:", results)

# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
print("Generate predictions for all samples")
predictions = model.predict(X_test)
predictions = [1 if pred >= 0.5 else 0 for pred in predictions]
# print(predictions)
print("predictions shape:", predictions.shape)

Evaluate on test data
test loss, test acc: [0.4877476990222931, 0.8443999886512756]
Generate predictions for all samples


AttributeError: 'list' object has no attribute 'shape'

## Save model


In [72]:
model.save('bi-lstm-50k-movie-reviews.h5')

In [74]:
from sklearn import metrics
print('Metrics for 50000 IMDb reviews')
# Confusion Matrix - Validation predict with gt y_test
print('Confusion Matrix: \n', metrics.confusion_matrix(y_test, predictions))

# Classification Report
print(metrics.classification_report(y_test, predictions))

# Accuracy score
print('Test Accuracy score: ', metrics.accuracy_score(y_test, predictions))

Confusion Matrix: 
 [[4129  832]
 [ 724 4315]]
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      4961
           1       0.84      0.86      0.85      5039

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000

Test Accuracy score:  0.8444


In [31]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

plot_graphs(history, 'accuracy')
plot_graphs(history, 'loss')

ModuleNotFoundError: No module named 'matplotlib'