In [4]:
import re
from tqdm import tqdm
from sklearn.utils import shuffle
import numpy as np
from tqdm import tqdm
import bz2
from keras.layers import *
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


In [5]:
def splitReviewsLabels(lines):
    """Separate the label and the rest of the reviews.

    Args:
        lines(list of str): List of Amazon product reviews, including the label.

    Return:
        reviews(list of str): List of text reviews, without the labels.
        labels(list of one-hot arrays): List of one-hot encoded labels.
    """
    reviews = []
    labels = []
    for review in tqdm(lines):
        rev = reviewToX(review)
        label = reviewToY(review)
        reviews.append(rev[:512])
        labels.append(label)
    return reviews, labels

In [6]:
def reviewToY(review):
    """Isolates the label from a review and one-hot encodes it.

    Example:
        "__label__1 Disappointing: Boring game." -> [1,0]

    Args:
        review(str): Text of the review, including the label.

    Return:
        One-hot encoded label.
    """    
    return [1,0] if review.split(' ')[0] == '__label__1' else [0,1] 

In [7]:
def reviewToX(review):
    """Returns the review after excluding the label.

    Exclude the label, turn the text into lower case and
    remove some of the symbols of url that can be found in it.

    Example:
        "__label__1 Disappointing: Boring game." -> "Disappointing: Boring game."

    Args:
        review(str): Text of the review, including the label.

    Return:
        Text of the review without the label.
    """ 
    review = review.split(' ', 1)[1][:-1].lower()
    review = re.sub('\d','0',review)
    if 'www.' in review or 'http:' in review or 'https:' in review or '.com' in review:
        review = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", review)
    return review

In [15]:
# Loading the reviews from the file.
train_lines = bz2.BZ2File('../input/train.ft.txt.bz2').readlines()
test_lines = bz2.BZ2File('../input/test.ft.txt.bz2').readlines()
train_lines[0]

b'__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n'

In [13]:
# Separating the reviews from the labels.
reviews_train, y_train = splitReviewsLabels(train_lines)
reviews_test, y_test = splitReviewsLabels(test_lines)
y_train[0], reviews_train[0]

100%|██████████| 3600000/3600000 [01:03<00:00, 56472.31it/s]
100%|██████████| 400000/400000 [00:06<00:00, 61113.90it/s]


([0, 1],
 'stuning even for the non-gamer: this sound track was beautiful! it paints the senery in your mind so well i would recomend it even to people who hate vid. game music! i have played the game chrono cross but out of all of the games i have ever played it has the best music! it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. it would impress anyone who cares to listen! ^_^')

In [16]:
# Shuffling in case the reviews are put in a specific order.
reviews_train, y_train = shuffle(reviews_train, y_train)
reviews_test, y_test = shuffle(reviews_test, y_test)

In [17]:
y_train = np.array(y_train)
y_test = np.array(y_test)

In [18]:
# Max number of different words to keep, based on word frequency.
max_features = 8192
# Max number of words in a review. If the review is long, it will be cut.
maxlen = 128
# Size of the embedding vector for each word.
embed_size = 64

In [19]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(reviews_train)

In [21]:
token_train = tokenizer.texts_to_sequences(reviews_train)
token_test = tokenizer.texts_to_sequences(reviews_test)
token_train[0], reviews_train[0]

([14,
  763,
  594,
  994,
  4367,
  86,
  16,
  52,
  171,
  329,
  8,
  105,
  43,
  50,
  4528,
  32,
  2560,
  20,
  2131,
  10,
  165,
  12,
  92,
  5595,
  40,
  7,
  54,
  1154,
  51,
  1684,
  72,
  99,
  80,
  763,
  594,
  994,
  576,
  16,
  10,
  20,
  681,
  9,
  14,
  22,
  31,
  22,
  53,
  53,
  9,
  97,
  335,
  5,
  335,
  3,
  40,
  7,
  54,
  47,
  57,
  25,
  41,
  121,
  31,
  155,
  464,
  3,
  23,
  1,
  121,
  86,
  155,
  3,
  21,
  1910,
  265,
  4,
  246,
  7,
  204,
  10],
 "not r.e.m.'s best, but it's worth buying.: this album has no chart topper like losing my religion in 0000. that cd, entitled out of time sold more copies than any other r.e.m. release, but in my opinion is not as good as up. up is better cover to cover and out of time had only one or two good songs. hope and lotus are the two best songs and have recieved quite a bit of airplay here in greensboro.")

In [22]:
x_train = pad_sequences(token_train, maxlen=maxlen, padding='post')
x_test = pad_sequences(token_test, maxlen=maxlen, padding='post')
x_train, reviews_train

(array([[  14,  763,  594, ...,    0,    0,    0],
        [ 477, 2609,    8, ...,    0,    0,    0],
        [  78,    6,  251, ...,    0,    0,    0],
        ...,
        [  68, 1463,    8, ...,    0,    0,    0],
        [  30, 3421,   14, ...,    0,    0,    0],
        [   4,  125,  922, ...,    0,    0,    0]], dtype=int32),
 ["not r.e.m.'s best, but it's worth buying.: this album has no chart topper like losing my religion in 0000. that cd, entitled out of time sold more copies than any other r.e.m. release, but in my opinion is not as good as up. up is better cover to cover and out of time had only one or two good songs. hope and lotus are the two best songs and have recieved quite a bit of airplay here in greensboro.",
  'throrough, complete explanations: this book is an excellent introduction to hypnotism, with thorough and complete step-by step explanations. it also suggests many funny aplications of hyponis for stage shows. the book is poorly written and has several spelli

In [23]:
# Fully-convolutional neural network.
input = Input(shape=(maxlen,))
net = Embedding(max_features, embed_size)(input)
net = Dropout(0.2)(net)
net = BatchNormalization()(net)

net = Conv1D(32, 7, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net1 = BatchNormalization()(net)

net = Conv1D(2, 1)(net)
net = GlobalAveragePooling1D()(net)
output = Activation('softmax')(net)
model = Model(inputs = input, outputs = output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 128, 64)           524288    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128, 64)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 128, 64)           256       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 128, 32)           14368     
_________________________________________________________________
batch_normalization_2 (Batch (None, 128, 32)           128       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 128, 32)           3104      
__________

In [24]:
model.fit(x_train, y_train, batch_size=2048, epochs=5, validation_split=0.1)

Train on 3240000 samples, validate on 360000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f57297079b0>

In [25]:
model.evaluate (x_test, y_test)



[0.15263902871131896, 0.944105]