In [2]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
sns.set_context('notebook')

import gensim
import spacy
import string
import re
from bs4 import BeautifulSoup 
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

from time import time

import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv( "data/testData.tsv", header=0, delimiter="\t", quoting=3)

print("Train: %d, Test: %d\n"
      % (train["review"].size, test["review"].size))

Train: 25000, Test: 25000



In [4]:
def clean_text(text, remove_stopwords=True):
    '''Clean the text, with the option to remove stopwords'''
    text = BeautifulSoup(text).get_text()
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        text = [w for w in text if not w in stopwords]
    
    text = " ".join(text)

    text = re.sub(r"[^a-z]", " ", text)
    text = re.sub(r"   ", " ", text) # Remove any extra spaces
    text = re.sub(r"  ", " ", text)
    
    return(text)

def extract_sentiment(s):
    s = re.sub(r"[^0-9]", " ", s)
    n = int(s.split()[1])
    if n <= 5:
        return 0
    else:
        return 1
train['review'] = train.apply(lambda x: clean_text(x['review']), axis=1)
test['review'] = test.apply(lambda x: clean_text(x['review']), axis=1)
test['sentiment'] = test.apply(lambda x: extract_sentiment(x['id']), axis=1)

In [5]:
from gensim.models.word2vec import Word2Vec

word2vec_model = Word2Vec.load("model/word2vec_imdb_reviews_300d.bin")

In [6]:
def create_average_vec(doc):
    average = np.zeros((text_dim,), dtype='float32')
    num_words = 0.
    for word in doc.split():
        if word in word2vec_model.wv.vocab:
            average = np.add(average, word2vec_model[word])
            num_words += 1.
    if num_words != 0.:
        average = np.divide(average, num_words)
    return average

In [7]:
# Create word vectors
text_dim = 300

data = np.zeros((train.shape[0], text_dim), dtype="float32")
for i in range(len(train['review'])):
    data[i] = create_average_vec(train['review'][i])
    
labels = np.asarray(train["sentiment"])
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (25000, 300)
Shape of label tensor: (25000,)


In [8]:
data = np.expand_dims(data, axis=2)

In [9]:
# Split the data into a training set and a validation set
training_samples = 20000
validation_samples = 5000

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

In [10]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, Conv1D, LSTM, MaxPooling1D, GlobalMaxPooling1D, Dropout
from keras.initializers import glorot_normal
model = Sequential()

model.add(Conv1D(32, kernel_size=4, activation='relu', kernel_initializer='glorot_normal', input_shape=(300,1)))
model.add(MaxPooling1D(2))
model.add(Dropout(0.3))
model.add(LSTM(16, dropout=0.2, recurrent_dropout=0.5))

model.add(Dense(1, activation='sigmoid'))

model.summary()

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_2 (Conv1D)            (None, 297, 32)           160       
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 148, 32)           0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 148, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 16)                3136      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 3,313
Trainable params: 3,313
Non-trainable params: 0
_________________________________________________________________


In [11]:
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=100,
                    validation_data=(x_val, y_val))

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.figure()
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.savefig("cnn-acc-w2v.png")

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.savefig("cnn-loss-w2v.png")

In [12]:
def extract_sentiment(s):
    s = re.sub(r"[^0-9]", " ", s)
    n = int(s.split()[1])
    if n <= 5:
        return 0
    else:
        return 1

test['sentiment'] = test.apply(lambda x: extract_sentiment(x['id']), axis=1)
test_prep = preprocess_text(test['review'])
sequences = tokenizer.texts_to_sequences(test_prep)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(test["sentiment"])
model.evaluate(x_test, y_test)



[0.34844527755737303, 0.84899999999999998]