In [7]:
"""
An attempt to analyse food reviews with a simple Neural Network.
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
import string
import re
from collections import Counter

from keras import models
from keras import layers
from keras import regularizers

"""
The reviews file is rather large,
so it may be convenient to use only part of it,
depending on the computer's performance and the task at hand.
On a GeForce RTX 2000-series GPU the run takes a couple of minutes with the following parameters,
but on a laptop it might take up to 20 minutes, so please adjust them if necessary.
"""

# Number of shuffled review rows kept for the whole run (train + test).
DATA_SET_SIZE = 100000
# Number of those rows held out as the test set; the rest is used for training.
TEST_SIZE = 2000
# Vocabulary size: how many of the most common words get an index. Also the
# default width of the 0/1 review encoding and the model's input dimension.
WORD_INDEX_SIZE = 10000

def main():
    """
    Train the review classifier and print its score on held-out reviews.

    Reads 'Reviews.csv' from the working directory (the file has to be
    downloaded beforehand), keeps the 'Score' and 'Text' columns, shuffles
    the rows and keeps the first DATA_SET_SIZE of them. The first TEST_SIZE
    rows of that sample form the test set, the remaining rows the training set.
    """

    # get_word_index(), get_sequences() and get_model() look up `data`,
    # `word_index`, `X_train` and `one_hot_train_labels` as module-level names.
    # Binding them only as locals here raises NameError in a fresh interpreter
    # or, in a notebook, silently reuses stale values from an earlier cell
    # (e.g. a training matrix that contains rows of the current test set).
    # Publishing them keeps the helpers in sync with this run.
    global data, word_index, X_train, one_hot_train_labels

    # load data, take the size to be fed to the model and make a word index dictionary
    data = pd.read_csv('Reviews.csv')
    data = data.loc[:, ['Score', 'Text']]
    data = shuffle(data)
    data = data.iloc[:DATA_SET_SIZE, :]
    # Built before the split below, so words are counted over train and test reviews alike.
    word_index = get_word_index(WORD_INDEX_SIZE)

    # break dataset on train and test data
    test = data.iloc[:TEST_SIZE, :]
    data = data.iloc[TEST_SIZE:, :]

    # prepare train and test data and labels for the model
    test_data_seq = get_sequences(test['Text'])
    X_test = encode_review(test_data_seq)
    one_hot_test_labels = to_one_hot(get_labels(test['Score']))

    train_data_seq = get_sequences(data['Text'])
    X_train = encode_review(train_data_seq)
    one_hot_train_labels = to_one_hot(get_labels(data['Score']))

    # get and print the model score
    model = get_model()
    test_score = model.evaluate(X_test, one_hot_test_labels)
    print("Model metrics names: {}".format(model.metrics_names))
    print("Test score: {}".format(test_score))

def clean_review(review):
    """
    Split a raw review into a list of lower-case words.

    HTML tags are replaced by a space, digit characters are removed,
    the text is split on whitespace and every token is lower-cased and
    stripped of leading/trailing punctuation. Tokens that end up empty
    are dropped.

    Example:
    "Product recieved is as advertised. <br />" =>
    ['product', 'recieved', 'is', 'as', 'advertised']
    """

    without_tags = re.sub("<.*?>", " ", review)
    without_digits = ''.join(ch for ch in without_tags if not ch.isdigit())
    tokens = (
        token.lower().strip(string.punctuation)
        for token in without_digits.split()
    )
    # a token made only of punctuation is an empty string by now
    return [token for token in tokens if token]

def get_word_index(num_most_common=1000, reviews=None):
    """
    Rank the most frequent words of the reviews.

    Every review is tokenised with clean_review(); the num_most_common most
    frequent words are kept and each one is mapped to its frequency rank
    (0 is the most frequent word). The values are ranks, not occurrence counts.

    num_most_common: how many words to keep in the index.
    reviews: iterable of raw review strings. When omitted, the 'Text' column
        of the module-level `data` frame is used, so that name has to be
        bound before the call.

    Return the dictionary word -> rank.

    Example:
    ["Good product", "good price"] =>
    dict({"good": 0, "product": 1, "price": 2})
    """

    if reviews is None:
        reviews = data['Text']

    # Count review by review instead of first collecting every token of the
    # corpus in one big list.
    counts = Counter()
    for review in reviews:
        counts.update(clean_review(review))

    ranked = counts.most_common(num_most_common)
    return {word: rank for rank, (word, _) in enumerate(ranked)}
    
def get_sequences(data, index=None):
    """
    Encode each review as the list of index numbers of its known words.

    data: iterable of raw review strings (a pandas Series of texts works,
        its values are iterated).
    index: dictionary word -> number. When omitted, the module-level
        `word_index` is used, so that name has to be bound before the call.

    Words that are not in the index are skipped. Return a one-dimensional
    object array holding one list of numbers per review.

    Example:
    "Product recieved is as advertised." =>
    [43, 12, 66, 102, 4]
    """

    lookup = word_index if index is None else index

    # Iterate the values directly: Series.iteritems() no longer exists in
    # pandas 2.x and the labels it yielded were never used.
    sequences = [
        [lookup[word] for word in clean_review(rev) if word in lookup]
        for rev in data
    ]

    # Reviews differ in length, and np.array() refuses ragged nested lists on
    # recent NumPy versions. Fill an object array explicitly so the result is
    # always "one list per review".
    result = np.empty(len(sequences), dtype=object)
    for pos, sequence in enumerate(sequences):
        result[pos] = sequence
    return result
    
def encode_review(sequences, dimension=WORD_INDEX_SIZE):
    """
    Turn index sequences into rows of 0s and 1s (one row per review).

    Row i gets a 1. in every column whose number occurs in sequences[i];
    all other cells stay 0. The result is a dense float array of shape
    (len(sequences), dimension), so its size grows with both values.

    Example (dimension=6):
    [1, 4] =>
    [0, 1, 0, 0, 1, 0]
    """

    n_reviews = len(sequences)
    encoded = np.zeros(shape=(n_reviews, dimension))
    for row_no, word_numbers in enumerate(sequences):
        # list index: switches on all listed columns of this row at once
        encoded[row_no, word_numbers] = 1.0
    return encoded

def get_labels(data):
    """
    Convert review scores 1 - 5 into zero-based class labels 0 - 4.
    (score 1 is a negative, score 5 is a positive review)

    data: the scores, e.g. the 'Score' column (any array-like of numbers).

    Return a NumPy array with every score reduced by one.
    """

    # Vectorised instead of looping over Series.iteritems(), which no longer
    # exists in pandas 2.x.
    return np.asarray(data) - 1

def to_one_hot(labels, dimension=5):
    """
    One-hot encode class labels.

    Each label selects the column that is set to 1. in its row, so the
    labels are expected to be column numbers (0 .. dimension - 1).
    Return a float array of shape (len(labels), dimension).

    Example:
    0
    2
    =>
    1, 0, 0, 0, 0
    0, 0, 1, 0, 0
    """

    n_samples = len(labels)
    encoded = np.zeros(shape=(n_samples, dimension))
    for row_no, class_id in enumerate(labels):
        encoded[row_no, class_id] = 1.0
    return encoded

def get_model(features=None, targets=None):
    """
    Build and fit a Keras Sequential model for the 5-class review score.

    The network has three Dense layers (8, 64 and 5 units) with a Dropout
    of 0.5 after each of the first two. The first two Dense layers use relu
    and an L1/L2 kernel regularizer, the last one uses softmax as advised
    for multiclass classification. It is compiled with rmsprop and
    categorical crossentropy and trained for 5 epochs in batches of 125.

    features: training matrix with WORD_INDEX_SIZE columns. When omitted,
        the module-level `X_train` is used.
    targets: one-hot training labels with 5 columns. When omitted, the
        module-level `one_hot_train_labels` is used.

    Return the fitted model.
    """

    # Fall back to the module-level names so existing no-argument calls keep
    # working; passing the data explicitly avoids training on stale state.
    if features is None:
        features = X_train
    if targets is None:
        targets = one_hot_train_labels

    model = models.Sequential()
    model.add(layers.Dense(8, kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.001),
                           activation='relu', input_shape=(WORD_INDEX_SIZE, )))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(64, kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.001),
                           activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(5, activation='softmax'))
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(features, targets, epochs=5, batch_size=125, verbose=1)
    return model

# Run the pipeline only when this file is executed directly (script or
# notebook cell), not when it is imported for its helper functions.
if __name__ == "__main__":
    main()

   

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model metrics names: ['loss', 'acc']
Test score: [1.5492809972763062, 0.6275]
