In [None]:
import re
from tqdm import tqdm
from sklearn.utils import shuffle
import numpy as np
from tqdm import tqdm
import bz2
from keras.layers import *
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


In [None]:
def splitReviewsLabels(lines, max_chars=512):
    """Separate the label and the rest of the reviews.

    Each line is cleaned with ``reviewToX`` and its label is one-hot encoded
    with ``reviewToY``. The cleaned text is then cut to ``max_chars``
    characters.

    Args:
        lines(list of str or bytes): List of Amazon product reviews, including
            the label. ``bytes`` lines (what a binary file handle such as
            ``bz2.BZ2File`` yields) are decoded as UTF-8 first.
        max_chars(int or None): Maximum number of characters kept per cleaned
            review. ``None`` disables the cut. Defaults to 512.

    Return:
        reviews(list of str): List of text reviews, without the labels.
        labels(list of one-hot arrays): List of one-hot encoded labels.
    """
    reviews = []
    labels = []
    for review in tqdm(lines):
        if isinstance(review, bytes):
            # The helpers below use str operations and would raise TypeError
            # on raw bytes lines.
            review = review.decode('utf-8')
        reviews.append(reviewToX(review)[:max_chars])
        labels.append(reviewToY(review))
    return reviews, labels

In [None]:
def reviewToY(review):
    """One-hot encode the label found at the start of a review.

    The label is whatever precedes the first space of the line.

    Example:
        "__label__1 Disappointing: Boring game." -> [1,0]

    Args:
        review(str): Text of the review, including the label.

    Return:
        [1,0] when the label is '__label__1', [0,1] for anything else.
    """
    label, _, _ = review.partition(' ')
    if label == '__label__1':
        return [1,0]
    return [0,1]

In [None]:
def reviewToX(review):
    """Returns the cleaned text of a review, without its label.

    Exclude the label, drop the trailing newline (if any), turn the text into
    lower case, replace every digit with '0' and, when the text looks like it
    contains a web address, replace each space-delimited token ending in a
    dot followed by three lower-case letters with "<url>".

    Example:
        "__label__1 Disappointing: Boring game." -> "disappointing: boring game."

    Args:
        review(str): Text of the review, including the label.

    Return:
        Cleaned text of the review without the label. An empty string when the
        line contains nothing after the label.
    """
    # Everything after the first space is the review text; a line without a
    # space has no text at all.
    _, _, text = review.partition(' ')
    # Remove only the line terminator. Blindly slicing off the last character
    # would eat a real character on lines that do not end with a newline.
    if text.endswith('\n'):
        text = text[:-1]
    text = text.lower()
    text = re.sub(r'\d', '0', text)
    # Cheap substring checks first so the regex only runs on candidate reviews.
    if 'www.' in text or 'http:' in text or 'https:' in text or '.com' in text:
        text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)
    return text

In [None]:
# Loading the reviews from the file.
def readReviewLines(path):
    """Read a bz2-compressed text file and return its lines as str.

    The file is opened in binary mode, so lines are split on '\\n' only and
    each one keeps its trailing newline. Every line is decoded as UTF-8
    because the review helpers work on str, not bytes. The handle is closed
    when reading finishes.

    Args:
        path(str): Path of the .bz2 file.

    Return:
        List of decoded lines.
    """
    with bz2.BZ2File(path) as compressed:
        return [line.decode('utf-8') for line in compressed]


train_lines = readReviewLines('../input/train.ft.txt.bz2')
test_lines = readReviewLines('../input/test.ft.txt.bz2')
# Show the first raw line as a sanity check.
train_lines[0]

In [None]:
# Separating the reviews from the labels.
# Each call returns a pair: the cleaned review texts and their one-hot labels.
reviews_train, y_train = splitReviewsLabels(train_lines)
reviews_test, y_test = splitReviewsLabels(test_lines)
# Display the first label together with its review text as a sanity check.
y_train[0], reviews_train[0]

In [None]:
# Shuffling in case the reviews are put in a specific order.
# Texts and labels are shuffled together so they stay aligned. The seed is
# fixed so the resulting order (and any split taken from it afterwards) is
# the same on every fresh run of the notebook.
shuffle_seed = 42
reviews_train, y_train = shuffle(reviews_train, y_train, random_state=shuffle_seed)
reviews_test, y_test = shuffle(reviews_test, y_test, random_state=shuffle_seed)

In [None]:
# Turn the lists of one-hot labels into NumPy arrays before they are handed
# to the model.
y_train, y_test = np.asarray(y_train), np.asarray(y_test)

In [None]:
# Max number of different words to keep, based on word frequency.
# Passed to the Tokenizer as `num_words` and to the Embedding layer as its
# vocabulary size, so the two must stay in sync.
max_features = 8192
# Max number of words in a review. If the review is long, it will be cut.
# Also the fixed input length of the network.
maxlen = 128
# Size of the embedding vector for each word.
embed_size = 64

In [None]:
# Build the vocabulary from the training reviews only, so nothing from the
# test set leaks into the word index. `num_words` limits how many words are
# used when texts are converted to sequences later on.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(reviews_train)

In [None]:
# Encode both sets with the tokenizer fitted on the training reviews.
token_train = tokenizer.texts_to_sequences(reviews_train)
token_test = tokenizer.texts_to_sequences(reviews_test)
# Show the first encoded review next to its text as a sanity check.
token_train[0], reviews_train[0]

In [None]:
# Pad the end of the sequences with 0 so that they are all the same length.
# No `truncating` argument is passed, so sequences longer than maxlen are cut
# on whichever side Keras uses by default (the start, per the Keras docs;
# confirm for the installed version).
x_train = pad_sequences(token_train, maxlen=maxlen, padding='post')
x_test = pad_sequences(token_test, maxlen=maxlen, padding='post')

In [None]:
# Fully-convolutional neural network.
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because other cells may refer to it.
input = Input(shape=(maxlen,))
net = Embedding(max_features, embed_size)(input)
net = Dropout(0.2)(net)
net = BatchNormalization()(net)

# Four Conv1D -> BatchNormalization stages: one wide kernel (7) followed by
# three narrow ones (3), all with 32 filters.
net = Conv1D(32, 7, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
# This result used to be bound to an unused `net1`, which silently left the
# last normalization out of the graph. It is now chained like the others.
net = BatchNormalization()(net)

# A kernel-size-1 convolution produces two scores per position; they are
# averaged over the sequence and turned into class probabilities by softmax.
net = Conv1D(2, 1)(net)
net = GlobalAveragePooling1D()(net)
output = Activation('softmax')(net)

model = Model(inputs = input, outputs = output)
# categorical_crossentropy matches the two-element one-hot labels.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

In [None]:
# Train for 5 epochs with batches of 2048 samples; validation_split=0.1 asks
# Keras to hold out 10% of the training data for validation.
model.fit(x_train, y_train, batch_size=2048, epochs=5, validation_split=0.1)

In [None]:
# Score the trained model on the held-out test set; the result is shown as
# the cell output.
model.evaluate(x_test, y_test)