In [1]:
import itertools
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer
import string
from collections import Counter
string.punctuation
stopwords.words('english')

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM

from sklearn.model_selection import train_test_split
from sklearn import metrics 

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
set_session(tf.Session(config=config))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
review_raw = pd.read_csv('amazon-fine-food-reviews/Reviews.csv')

In [3]:
review = review_raw.dropna()

In [4]:
def remove_punc(s):
    return s.translate(None, string.punctuation)

In [5]:
def remove_stopword(text):
    return [word for word in text if word not in stopwords.words('english')]

In [6]:
def lem(tokens):
    lemmatizer = WordNetLemmatizer()
    lem_tokens = [lemmatizer.lemmatize(token.decode('utf-8')) for token in tokens]
    return lem_tokens

In [7]:
review.loc[:, 'token'] = review['Text'].apply(lambda s : str(s).lower()).apply(remove_punc).apply(word_tokenize)
review.loc[:, 'token'] = review['token'].apply(lem)
review.loc[:, 'token'] = review['token'].map(lambda s : ' '.join(s))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [11]:
max_vocab = 20000
vocab_size = max_vocab + 1
maxlen = 50
batch_size = 128

In [14]:
def prepare_features(df):

    Y = df['Score']

    sentences = []
    for s in df['token']:
        sentences.append(s)

    sentences = [s.split(' ') for s in sentences]
    word_counts = Counter(itertools.chain(*sentences))
    vocabulary_inv = [x[0] for x in word_counts.most_common(max_vocab)]
    vocabulary_inv = list(sorted(vocabulary_inv))
    vocabulary = {x : i + 1 for i, x in enumerate(vocabulary_inv)}

    X = np.array([[vocabulary.get(word) for word in sentence if word in vocabulary] for sentence in sentences])
    X = sequence.pad_sequences(X, maxlen = maxlen)

    return X, Y, vocabulary

In [16]:
X, Y, vocabulary = prepare_features(review)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)

In [27]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=maxlen))
model.add(LSTM(25))
model.add(Dense(12))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('linear'))

In [28]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

model.fit(X_train, y_train, batch_size=batch_size, epochs=1, validation_data=(X_test, y_test), verbose=1)

Train on 397888 samples, validate on 170524 samples
Epoch 1/1


<keras.callbacks.History at 0x7f303dfee190>

In [29]:
y_test_pred = model.predict(X_test, verbose = 0)
y_test_pred.shape = (X_test.shape[0], )

In [30]:
acc = metrics.accuracy_score((y_test > 3.0).astype(int), (y_test_pred > 3.0).astype(int))
recall = metrics.recall_score((y_test > 3.0).astype(int), (y_test_pred > 3.0).astype(int))
precision = metrics.precision_score((y_test > 3.0).astype(int), (y_test_pred > 3.0).astype(int))
f1score = metrics.f1_score((y_test > 3.0).astype(int), (y_test_pred > 3.0).astype(int))
auc = metrics.roc_auc_score((y_test > 3.0).astype(int), (y_test_pred > 3.0).astype(int))
print 'Accuracy: %.2f, Recall: %.2f, Precision: %.2f, F1 Score: %.2f, AUC: %.2f' % (acc, recall, precision, f1score, auc)

Accuracy: 0.89, Recall: 0.96, Precision: 0.90, F1 Score: 0.93, AUC: 0.79


In [31]:
metrics.confusion_matrix((y_test > 3.0).astype(int), (y_test_pred > 3.0).astype(int))

array([[ 23185,  14197],
       [  4705, 128437]])