# Bag of Words Meets Bags of Popcorn

In [2]:
# General Python packages
import re
import spacy
import string
import pandas as pd
from collections import Counter

# Keras packages
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense, Embedding

# Load the spaCy English pipeline once at module level; `clean` calls it on every
# review to get lemmas and the stop-word / punctuation flags of each token.
nlp = spacy.load('en_core_web_sm')   # Also 'en_core_web_md' and 'en_core_web_lg'.

In [3]:
# Locations of the raw, tab-separated Kaggle files.
train_path = '/content/drive/MyDrive/Kaggle/Bag-of-Words-Meets-Bags-of-Popcorn/Data/Raw/labeledTrainData.tsv'
test_path = '/content/drive/MyDrive/Kaggle/Bag-of-Words-Meets-Bags-of-Popcorn/Data/Raw/testData.tsv'

# Read both splits with identical parser settings.
train_df, test_df = (
    pd.read_csv(path, delimiter = "\t")
    for path in (train_path, test_path)
)

# Both frames are kept in one list so the same preprocessing can be applied to each in a loop.
combined = [train_df, test_df]

# Preview the first rows of the labelled training data.
train_df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


# Text Preprocessing

In [4]:
def remove_punctuations(text):
    """Return ``text`` with every ASCII punctuation character replaced by a space.

    ``str.replace(string.punctuation, ' ')`` would only match the complete
    32-character punctuation string as one substring, so a per-character
    translation table is used instead.

    Args:
        text: The string to process.

    Returns:
        A new string of the same length in which each character found in
        ``string.punctuation`` has become a single space.
    """
    punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    return text.translate(punctuation_to_space)

# Maps each ASCII punctuation character to a space; built once and reused for every review.
_PUNCTUATION_TO_SPACE = str.maketrans(dict.fromkeys(string.punctuation, ' '))


def clean(review):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps, in order:
      1. drop backslashes;
      2. replace self-closing HTML tags (e.g. ``<br />``), parenthesised text,
         non-ASCII runs and digit runs with a space;
      3. lowercase;
      4. run the module-level spaCy pipeline ``nlp`` and keep the lemma of
         every token that is neither punctuation nor a stop word;
      5. strip leftover ``'s`` and ``-PRON-`` fragments;
      6. replace any remaining punctuation character with a space;
      7. drop words shorter than three characters.

    Args:
        review: Raw review text.

    Returns:
        The cleaned review as a single string (possibly empty).
    """

    # Early manual fix
    review = review.replace("\\", '')

    # Combine filters for HTML, parentheses, non-ASCII and numbers
    filters = [r'<\w+ ?/>', r'\([^()]*\)', r'[^\x00-\x7F]+', r'(\d+)']
    review = re.sub(r'|'.join(filters), ' ', review)

    # Convert to lowercase
    review = review.lower()

    # Lemmatize and remove punctuation / stop-word tokens
    review = ' '.join([token.lemma_ for token in nlp(review) if (not token.is_punct) and (not token.is_stop)])

    # Late manual fix: possessive suffix and the placeholder lemma that some
    # spaCy versions emit for pronouns.
    review = review.replace("'s", '')
    review = review.replace("-PRON-", '')

    # Remove remaining punctuation character by character. The previous
    # `review.replace(string.punctuation, ' ')` looked for the whole punctuation
    # string as one substring and therefore never changed anything.
    review = review.translate(_PUNCTUATION_TO_SPACE)

    # Filter out words with fewer than 3 characters
    review = ' '.join([word for word in review.split() if len(word) > 2])

    return review

In [None]:
# Add a `cleaned_review` column to each frame in `combined`, in place, and
# report progress after each one.
for i, df in enumerate(combined):
    cleaned_column = df['review'].map(clean)
    df['cleaned_review'] = cleaned_column
    print(f'{i + 1}th dataset cleaned.')

# Building the vocabulary

In [None]:
# Word-frequency table over the cleaned training reviews only.
vocab = Counter()

# Iterate the column directly: `iterrows()` builds a full Series for every row
# just to read one field, which is needlessly slow on a large frame.
for review in train_df['cleaned_review']:
    vocab.update(review.split())

# Print most common words
print('Top 5 most common words:', vocab.most_common(5))

# Number of distinct words seen in the training reviews.
vocab_size = len(vocab)
print('Vocabulary size:', vocab_size)

In [None]:
# Tokenize reviews
def tokenize(sentences, vocab_size, mode, tokenizer = None):
    """Encode ``sentences`` as a document-term matrix.

    Args:
        sentences: Iterable of cleaned review strings.
        vocab_size: Passed to ``Tokenizer(num_words=...)`` when a new tokenizer
            is created.
        mode: One of 'binary', 'count', 'freq', 'tfidf'; forwarded to
            ``texts_to_matrix``.
        tokenizer: Optional, already-fitted ``Tokenizer``. When given it is used
            as-is (no refitting), so several datasets can share one word index.
            When omitted, a new tokenizer is fitted on ``sentences``.

    Returns:
        The matrix produced by ``tokenizer.texts_to_matrix(sentences, mode)``.
    """
    if tokenizer is None:
        tokenizer = Tokenizer(num_words = vocab_size, oov_token = '?')
        tokenizer.fit_on_texts(sentences)
    return tokenizer.texts_to_matrix(sentences, mode = mode)

# Fit the word index on the training reviews only and reuse it for the test
# reviews. Fitting a second tokenizer on the test set would assign different
# column indices to the same words, so the model's learned weights would not
# line up with the test features.
tokenizer = Tokenizer(num_words = vocab_size, oov_token = '?')
tokenizer.fit_on_texts(train_df['cleaned_review'])

# Modes: 'binary', 'count', 'freq', 'tfidf'
X_train = tokenize(train_df['cleaned_review'], vocab_size, 'binary', tokenizer)
X_test = tokenize(test_df['cleaned_review'], vocab_size, 'binary', tokenizer)

In [None]:
# Padding the encodings
def pad(encoding, maxlen, padding = 'pre', truncating = 'post'):
    """Pad or truncate every row of ``encoding`` to ``maxlen`` using zeros.

    Thin wrapper around Keras ``pad_sequences``; ``padding`` and ``truncating``
    are forwarded unchanged and the fill value is fixed at 0.
    """
    options = {
        'maxlen': maxlen,
        'padding': padding,
        'truncating': truncating,
        'value': 0,
    }
    return pad_sequences(sequences = encoding, **options)

# Target row length for both splits.
maxlen = 500

# `pad` has no default for `maxlen`, so it must be passed explicitly; calling
# `pad(encoding = X_train)` raises a TypeError.
# NOTE(review): X_train / X_test come from `texts_to_matrix`, so with the
# default truncating = 'post' this keeps only the first `maxlen` columns of
# each row — confirm that this is the intended feature reduction.
X_train = pad(encoding = X_train, maxlen = maxlen)
X_test = pad(encoding = X_test, maxlen = maxlen)

In [None]:
# Model structure: a small fully-connected binary classifier.
model = Sequential()

# `input_dim` is the number of features per sample, i.e. the column count of
# X_train (shape[1]) as a plain integer — not the number of samples (shape[0])
# and not a tuple. The original keyword was also misspelled as `inpu_dim`.
model.add(Dense(units = 64, activation = 'relu', input_dim = X_train.shape[1]))
model.add(Dense(units = 32, activation = 'relu'))

# Single sigmoid unit: one value in (0, 1) per review, paired with binary cross-entropy.
model.add(Dense(units = 1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

# Render the layer graph with input/output shapes.
plot_model(model = model, show_shapes = True)

In [None]:
# Train the Model
# Features are the encoded training reviews; targets are the 0/1 `sentiment`
# labels from the training frame (the original left both arguments blank,
# which is a syntax error). The last 20% of the samples are held out for
# validation by `validation_split`.
history = model.fit(
    x = X_train,
    y = train_df['sentiment'].values,
    epochs = 10,
    batch_size = 64,
    validation_split = 0.2
)

In [None]:
# Plot the Model
# x-axis values: one index per recorded training epoch.
epochs = list(range(len(history.history['loss'])))

def plot_subplot(axs, metric, val_metric):
    """Draw the training and validation curves of one metric on ``axs``.

    Both series are read from the module-level ``history`` and plotted against
    the module-level ``epochs``.
    """
    axs.set_title('Analysis of ' + metric)
    for series_name in (metric, val_metric):
        axs.plot(epochs, history.history[series_name], label = series_name)
    axs.legend()

# Metrics to chart, one subplot each. The original referenced `metrics` in
# `len(metrics)` without ever defining it, which raises a NameError.
metrics = ['accuracy', 'loss']

# NOTE(review): `plt` is expected to be `matplotlib.pyplot`, but no import for
# it is visible in this file — make sure it is imported before running this cell.
fig, axs = plt.subplots(1, len(metrics), figsize = (18, 5))

# Pair each axis with its metric and plot it alongside its `val_` counterpart.
for i, metric in enumerate(metrics):
    plot_subplot(axs[i], metric, 'val_' + metric)

In [None]:
# Predict and Submission
# Score the encoded test reviews with the trained network. The output layer is
# a single sigmoid unit, so each prediction is a value in (0, 1), not a hard 0/1 label.
# NOTE(review): no thresholding or submission-file writing is visible here — confirm it follows.
y_pred = model.predict(X_test)