# Bag of Words Meets Bags of Popcorn

## Imports & Reading the Data

In [1]:
import re
import numpy as np
import pandas as pd
from string import punctuation, printable, digits

from keras.preprocessing.text import Tokenizer
from keras.layers import Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential

from matplotlib import pyplot as plt
plt.style.use('ggplot')

import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
# Read the training and test reviews (zipped, tab-separated files)
train_df = pd.read_csv(
    '../input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip',
    sep = "\t",
    encoding = 'utf-8',
)
test_df = pd.read_csv(
    '../input/word2vec-nlp-tutorial/testData.tsv.zip',
    sep = "\t",
    encoding = 'utf-8',
)

# Keep both frames in one list so they can be processed together
combined_dfs = [train_df, test_df]

Since the reviews are divided into two balanced sentiment groups (each containing 12,500 reviews), we need not worry about which metric to use. (*Accuracy*, for instance, can be misleading on an imbalanced dataset!)

## Preprocessing

In [3]:
# Strip HTML tags
def remove_html(text: str, replacement = ' ') -> str:
    """Replace every ``<...>`` span found in ``text`` with ``replacement``.

    The match is non-greedy, so each tag is replaced separately. By default
    a single space is substituted for every tag.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, replacement, text)

# Keep only the characters listed in string.printable
def filter_printables(text: str) -> str:
    """Return ``text`` without any character that is absent from ``string.printable``."""
    return ''.join(char for char in text if char in printable)

# Delete the digits 0-9 from the string
def remove_numbers(string: str) -> str:
    """Return ``string`` with every character of ``string.digits`` removed."""
    digit_deleter = str.maketrans('', '', digits)
    return string.translate(digit_deleter)

# Remove backslash characters
def remove_double_backslashes(text: str) -> str:
    """Return ``text`` with every backslash character deleted.

    Note: despite the function name, the search string is a single
    (escaped) backslash, so all backslashes are removed, not only pairs.
    """
    pieces = text.split('\\')
    return ''.join(pieces)

# Remove (or replace) all punctuation characters
def remove_punctuation(string: str, repl: str = '') -> str:
    """Replace every character of ``string.punctuation`` in ``string`` with ``repl``.

    Args:
        string: Text to clean.
        repl: Replacement inserted for each punctuation character. The
            default (empty string) simply deletes the punctuation.

    Returns:
        The text with each punctuation character substituted by ``repl``.
    """
    # Previously ``repl`` was accepted but ignored; build the table from it
    # so that e.g. repl=' ' keeps words such as "well-known" separated.
    table = dict.fromkeys(map(ord, punctuation), repl)
    return string.translate(table)

# Collapse runs of whitespace into single spaces
def fix_spacing(string: str) -> str:
    """Split ``string`` on whitespace and re-join the pieces with one space.

    Leading and trailing whitespace disappears as a result of the split.
    """
    words = string.split()
    return ' '.join(words)

# Replace each token by its lemma
def lemmatize(string: str) -> str:
    """Run ``string`` through the module-level ``nlp`` pipeline and join the
    ``lemma_`` of every resulting token with single spaces."""
    doc = nlp(string)
    lemmas = [token.lemma_ for token in doc]
    return ' '.join(lemmas)

In [4]:
# Clean the 'review' column of every frame; the steps run in the order listed
for df in combined_dfs:
    df['review'] = (
        df['review']
        .apply(remove_html)
        .apply(remove_numbers)
        .apply(filter_printables)
        .apply(remove_double_backslashes)
        .apply(remove_punctuation)
        # .apply(lemmatize)   # lemmatization is currently disabled
        .apply(fix_spacing)
    )

In [5]:
# Number of whitespace-separated words in each training review
words_counts = train_df['review'].apply(lambda review: len(review.split())).values

# Histogram of review lengths; labelled so the figure can be read on its own
plt.hist(words_counts)
plt.title('Review length distribution (training set)')
plt.xlabel('Words per review')
plt.ylabel('Number of reviews')
plt.show()

In [18]:
# `Tokenizer` is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
tokenizer = Tokenizer(
  num_words = 1000,       # the maximum number of words to keep (Only the most common num_words-1 words will be kept)
  lower = True,           # boolean. Whether to convert the texts to lowercase.
  split = ' ',            # str. Separator for word splitting.
  char_level = False,     # if True, every character will be treated as a token.
  oov_token = None        # replaces out-of-vocabulary words during text_to_sequence calls with oov_token
)
# The vocabulary is fitted on the training reviews only; the test reviews
# are encoded with that same vocabulary below.
tokenizer.fit_on_texts(train_df['review'])   # can be a list of strings

# Encode each review as one fixed-width row.
# Modes: 'binary', 'count', 'freq', 'tfidf'
X_train = tokenizer.texts_to_matrix(train_df['review'], mode = 'binary')
X_test = tokenizer.texts_to_matrix(test_df['review'], mode = 'binary')

# Target labels for the training reviews
y_train = train_df['sentiment']

In [19]:
# Small feed-forward classifier on top of the bag-of-words matrix
model = Sequential()
model.add(Dense(units = 64, activation = 'relu', input_shape = (X_train.shape[1],)))
# The output must be a probability in (0, 1) for binary_crossentropy, hence
# 'sigmoid'. The previous 'relu' output is unbounded above and can be exactly
# 0, which does not match what the loss expects.
model.add(Dense(units = 1, activation = 'sigmoid'))

model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)
model.summary()

In [20]:
# Callbacks used during training
early_stopping = EarlyStopping(patience = 3)   # uses EarlyStopping's default monitored quantity
checkpoint = ModelCheckpoint(
    filepath = 'model.h5',
    save_best_only = True,      # only overwrite the file when the monitored value improves
    monitor = 'val_accuracy',
    verbose = 1
)
callbacks = [early_stopping, checkpoint]

# Fit the model, holding out 20% of the training data for validation
history = model.fit(
    x = X_train,
    y = y_train,
    batch_size = 32,
    epochs = 10,
    verbose = 2,
    callbacks = callbacks,
    validation_split = 0.2
)

In [21]:
# Training curves: loss on the left panel, accuracy on the right panel
fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 5))
epochs = list(range(len(history.history['loss'])))

# Each panel shows the training series and its 'val_'-prefixed counterpart
for axis, metric in zip(ax, ('loss', 'accuracy')):
    axis.plot(epochs, history.history[metric], label = metric)
    axis.plot(epochs, history.history['val_' + metric], label = 'val_' + metric)
    # Label every panel so the figure can be read on its own
    axis.set_title('Training vs. validation ' + metric)
    axis.set_xlabel('epoch')
    axis.set_ylabel(metric)
    axis.legend()

plt.show()