In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from sklearn.preprocessing import OneHotEncoder
from keras.layers import Activation, Dense

Using TensorFlow backend.


In [2]:
# Hyper-parameters shared by the cells below.
# Largest n-gram size added as extra features; values <= 1 skip the n-gram step.
NGRAM_RANGE = 2
# Vocabulary cap handed to the Tokenizer; n-gram ids are allocated from MAX_FEAT + 1 upwards.
MAX_FEAT = 20000
# Length every sequence is padded/truncated to before it is fed to the model.
MAX_LEN = 400
# Mini-batch size passed to model.fit.
BATCH_SIZE = 32
# Output dimension of the Embedding layer.
EMBEDDING_DIM = 50
# Number of training epochs passed to model.fit.
EPOCHS = 5

In [3]:
# Load the training split. The CSV has no header row, so the column names are
# supplied here; rows with a missing value in any column are discarded.
train_data = pd.read_csv(
    "amazon_review_full_csv/train.csv",
    header=None,
    names=['category', 'title', 'text'],
).dropna(axis=0, how='any')
train_data.head()

Unnamed: 0,category,title,text
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...


In [4]:
# Class balance check: number of training rows per category label.
train_data['category'].value_counts()

5    599995
1    599991
4    599988
3    599980
2    599979
Name: category, dtype: int64

In [5]:
# Fold the review title into the body so each row carries a single text field,
# then discard the now-redundant title column.
train_data['text'] = train_data['title'] + " " + train_data['text']
train_data = train_data.drop('title', axis=1)

print(train_data.shape)
train_data.head()

(2999933, 2)


Unnamed: 0,category,text
0,3,more like funchuck Gave this to my dad for a g...
1,5,Inspiring I hope a lot of people hear this cd....
2,5,The best soundtrack ever to anything. I'm read...
3,4,Chrono Cross OST The music of Yasunori Misuda ...
4,5,Too good to be true Probably the greatest soun...


In [6]:
# Extract data
# Plain Python list of the merged review strings, as consumed by the tokenizer.
train_texts = train_data.text.tolist()
# `Series.as_matrix()` was deprecated and then removed from pandas (1.0);
# `.values` yields the same NumPy array and works on old and new versions alike.
y_train = train_data.category.values

In [7]:
%%time
# Fit the tokenizer on the training texts only; MAX_FEAT caps the vocabulary
# that will be used when texts are converted to index sequences.
tokenizer = Tokenizer(num_words=MAX_FEAT)
tokenizer.fit_on_texts(train_texts)

CPU times: user 3min 58s, sys: 232 ms, total: 3min 58s
Wall time: 3min 58s


In [8]:
%%time
# Transform training-data
# Convert every training text into a sequence of word indices using the
# tokenizer fitted in the previous cell.
x_train = tokenizer.texts_to_sequences(train_texts)

CPU times: user 2min 48s, sys: 864 ms, total: 2min 49s
Wall time: 2min 49s


In [9]:
# Load the test split with the same column layout as the training file;
# rows with a missing value in any column are discarded.
test_data = pd.read_csv(
    "amazon_review_full_csv/test.csv",
    header=None,
    names=['category', 'title', 'text'],
).dropna(axis=0, how='any')
test_data.head()

Unnamed: 0,category,title,text
0,1,mens ultrasheer,"This model may be ok for sedentary types, but ..."
1,4,Surprisingly delightful,This is a fast read filled with unexpected hum...
2,2,"Works, but not as advertised",I bought one of these chargers..the instructio...
3,2,Oh dear,I was excited to find a book ostensibly about ...
4,2,Incorrect disc!,"I am a big JVC fan, but I do not like this mod..."


In [10]:
# Same preparation as for the training split: prepend the title to the body
# and drop the separate title column.
test_data['text'] = test_data['title'] + " " + test_data['text']
test_data = test_data.drop('title', axis=1)

print(test_data.shape)
test_data.head()

(649990, 2)


Unnamed: 0,category,text
0,1,mens ultrasheer This model may be ok for seden...
1,4,Surprisingly delightful This is a fast read fi...
2,2,"Works, but not as advertised I bought one of t..."
3,2,Oh dear I was excited to find a book ostensibl...
4,2,"Incorrect disc! I am a big JVC fan, but I do n..."


In [11]:
# Extract data
# Plain Python list of the merged review strings, as consumed by the tokenizer.
test_texts = test_data.text.tolist()
# `Series.as_matrix()` was deprecated and then removed from pandas (1.0);
# `.values` yields the same NumPy array and works on old and new versions alike.
y_test = test_data.category.values

In [12]:
%%time
# Transform test-data
# Reuses the tokenizer fitted on the training texts, so the test split is
# encoded with the training vocabulary.
x_test = tokenizer.texts_to_sequences(test_texts)

CPU times: user 37.9 s, sys: 184 ms, total: 38.1 s
Wall time: 38.1 s


In [13]:
def create_ngram_set(input_list, ngram_value=2):
    """Return the set of distinct contiguous n-grams found in a sequence.

    Args:
        input_list: sequence of tokens (e.g. a list of word indices).
        ngram_value: size of the n-grams to extract.

    Returns:
        A set of tuples, each of length ``ngram_value``.

    Example:
        >>> sorted(create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2))
        [(1, 4), (4, 1), (4, 9), (9, 4)]
    """
    # One copy of the sequence per position inside the n-gram, each shifted
    # one step further; zipping them lines up the members of every n-gram
    # and stops at the shortest (most shifted) copy.
    shifted_views = []
    for offset in range(ngram_value):
        shifted_views.append(input_list[offset:])
    return set(zip(*shifted_views))

In [14]:
def add_ngram(sequences, token_indice, ngram_range=2):
    """Append the ids of known n-grams to a copy of every sequence.

    For each sequence, every contiguous n-gram of size 2..``ngram_range`` is
    looked up in ``token_indice``; when present, its id is appended to the
    end of the copied sequence. The input sequences are left untouched.

    Args:
        sequences: iterable of token sequences (lists of word indices).
        token_indice: dict mapping n-gram tuples to their integer id.
        ngram_range: largest n-gram size to look up (sizes start at 2).

    Returns:
        A list with one augmented list per input sequence.

    Example:
        >>> seqs = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
        >>> table = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
        >>> add_ngram(seqs, table, ngram_range=3)
        [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 2018, 42]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = list(input_list)
        seq_len = len(input_list)
        # Start positions run up to the last bigram. Bounding the scan by
        # ngram_range (as before) dropped the trailing shorter n-grams
        # whenever ngram_range > 2.
        for i in range(seq_len - 1):
            for ngram_value in range(2, ngram_range + 1):
                if i + ngram_value > seq_len:
                    # Larger n-grams starting here would run off the end.
                    break
                # Slice the original sequence so appended ids are never
                # mistaken for tokens.
                ngram = tuple(input_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences

In [15]:
def _mean_length(seqs):
    """Mean number of tokens per sequence, truncated to an integer."""
    return np.mean([len(seq) for seq in seqs], dtype=int)


# Sanity check: corpus sizes and average sequence length before n-grams are added.
print('{} train sequences'.format(len(x_train)))
print('{} test sequences'.format(len(x_test)))
print('Average train sequence length: {}'.format(_mean_length(x_train)))
print('Average test sequence length: {}'.format(_mean_length(x_test)))

2999933 train sequences
649990 test sequences
Average train sequence length: 78
Average test sequence length: 78


In [16]:
# Reference: Keras fastText example (n-gram features + averaged embeddings):
# https://github.com/fchollet/keras/blob/master/examples/imdb_fasttext.py

In [17]:
%%time
if NGRAM_RANGE > 1:
    print('Adding {}-gram features'.format(NGRAM_RANGE))
    # Collect every distinct n-gram (sizes 2..NGRAM_RANGE) seen in the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, NGRAM_RANGE + 1):
            ngram_set.update(create_ngram_set(input_list, ngram_value=i))

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values start above MAX_FEAT in order
    # to avoid collision with existing word indices.
    start_index = MAX_FEAT + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    # Reverse lookup (id -> n-gram), kept for inspection.
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is one more than the highest id that can occur in the data,
    # i.e. the number of rows an embedding table needs. Ids run from
    # start_index to start_index + len(ngram_set) - 1, so no scan over the
    # (very large) id collection is required.
    max_features = start_index + len(ngram_set)

    # Augmenting x_train and x_test with n-grams features
    x_train = add_ngram(x_train, token_indice, NGRAM_RANGE)
    x_test = add_ngram(x_test, token_indice, NGRAM_RANGE)
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))
else:
    # No n-gram ids were added: the tokenizer cap bounds every index, and
    # max_features stays defined for later cells regardless of NGRAM_RANGE.
    max_features = MAX_FEAT

Adding 2-gram features
Average train sequence length: 156
Average test sequence length: 154
CPU times: user 8min 28s, sys: 5.51 s, total: 8min 33s
Wall time: 8min 34s


In [18]:
# Bring every sequence to exactly MAX_LEN entries so each split becomes a
# 2-D (samples x MAX_LEN) array, as confirmed by the printed shapes.
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (2999933, 400)
x_test shape: (649990, 400)


In [19]:
# One-hot encode y for keras
# `categorical_features` was deprecated in scikit-learn 0.20 and removed in
# 0.22; 'all' was its default, so omitting it keeps the behaviour on old
# versions and lets the cell run on current ones.
enc = OneHotEncoder()
# The encoder expects a 2-D (samples x features) input, hence the added axis.
one_hot = enc.fit(np.expand_dims(y_train, axis=-1))
# Transform both splits with the encoder fitted on the training labels and
# densify the sparse result so Keras can consume it.
y_train = one_hot.transform(np.expand_dims(y_train, axis=-1)).toarray()
y_test = one_hot.transform(np.expand_dims(y_test, axis=-1)).toarray()

In [20]:
print('Build model...')
#https://arxiv.org/pdf/1607.01759.pdf
# fastText, h = 10, bigram -> should get 60.2 on validation
# The embedding table needs one row per possible input id. Once n-gram
# features have been appended, the sequences contain ids from MAX_FEAT + 1
# upwards, so sizing the table with MAX_FEAT would leave every n-gram id out
# of range. max_features (set by the n-gram cell) covers them.
vocab_size = max_features if NGRAM_RANGE > 1 else MAX_FEAT
model = Sequential()
model.add(Embedding(vocab_size,
                    EMBEDDING_DIM,
                    input_length=MAX_LEN))
# Average the embeddings of all tokens in a document into a single vector.
model.add(GlobalAveragePooling1D())
# Hidden=10, what's the activation?
# TODO(review): tanh is this notebook's choice; confirm against the paper.
model.add(Dense(10, activation='tanh'))
# 5-class mutually-exclusive output
model.add(Dense(5, activation='softmax'))

Build model...


In [21]:
# Categorical cross-entropy: pairs with the 5-way softmax output and the
# one-hot encoded labels; accuracy is tracked as an additional metric.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# Confirm both label arrays are 2-D (samples x classes) after one-hot encoding.
print(y_train.shape, y_test.shape)

(2999933, 5) (649990, 5)


In [None]:
# Train for EPOCHS epochs with mini-batches of BATCH_SIZE.
# NOTE(review): the test split is passed as validation data, so the reported
# validation metrics are measured on the test set; there is no separate hold-out.
model.fit(x_train,
          y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_data=(x_test, y_test))

Train on 2999933 samples, validate on 649990 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f372fc270b8>