In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image
from keras.utils.vis_utils import plot_model
from keras import Model
%matplotlib inline

In [None]:
import pandas as pd
import gzip

def parse(path):
  """Lazily yield one record per line of a gzip-compressed file.

  Args:
    path: Path to a gzip file in which every line is a Python literal
      (the code evaluates each line as an expression).

  Yields:
    The object obtained by evaluating each line, in file order.
  """
  # The context manager closes the gzip handle once the generator is
  # exhausted or closed; the previous version never closed it.
  with gzip.open(path, "rb") as g:
    for l in g:
      # NOTE(security): eval() executes arbitrary code contained in the file.
      # Only use this on trusted data; consider ast.literal_eval (or
      # json.loads for strict-JSON dumps) if the source is not trusted.
      yield eval(l)

def getDF(path):
  """Load every record produced by parse(path) into a DataFrame.

  Records are numbered 0, 1, 2, ... in the order they are read; that number
  becomes the row index and each record's keys become the columns.
  """
  records_by_row = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_row, orient="index")

# Read the gzip-compressed review dump into a DataFrame (one row per record).
df = getDF(path="data/reviews_Musical_Instruments_5.gz")

In [None]:
# Keep only the columns used below and drop reviews with any missing value.
df = df[["reviewerID","asin","overall", "reviewText", "unixReviewTime"]].dropna()

# assign() returns a new frame, which avoids setting columns on a slice of the
# raw frame (the attribute-assignment form can trigger SettingWithCopyWarning).
df = df.assign(
    overall=df.overall.astype(int),
    reviewText=df.reviewText.str.lower(),
    # Replace the raw reviewer ids with dense integer codes.
    reviewerID=df.reviewerID.astype("category").cat.codes.values,
    # Dense integer item codes, shifted by +1 so that 0 is never a real item:
    # a later cell pads the item sequences with zeros and feeds them to an
    # Embedding with mask_zero=True, which treats index 0 as padding. The
    # downstream embedding sizes (max code + 1, n_items + 1) still cover the
    # shifted codes.
    asin=df.asin.astype("category").cat.codes.values.astype("int64") + 1,
)

# Order each reviewer's rows chronologically and renumber the index.
df = df.sort_values(["reviewerID","unixReviewTime"]).reset_index(drop=True)

In [None]:
# Build the training examples for the item-sequence model.
# For every user and every review position i >= 1, the input is the prefix of
# item codes up to and including position i, and the target is the `overall`
# value of the review at position i.
seqs = []
tars = []
# Largest number of prefixes produced for a single user. The longest prefix
# therefore holds maxlen + 1 items, which is what the model cell relies on.
maxlen = 0

# One groupby pass visits each user's rows exactly once, instead of boolean-
# filtering the whole frame twice per user. sort=False keeps users in order of
# first appearance, the same order df.reviewerID.unique() produced.
for _, user_rows in df.groupby("reviewerID", sort=False):
    i_list = user_rows.asin.values
    prefixes = [i_list[:end] for end in range(2, len(i_list) + 1)]
    seqs.extend(prefixes)
    tars.extend(user_rows.overall.values[1:])
    maxlen = max(maxlen, len(prefixes))

In [None]:
from keras.preprocessing import sequence
# Left-pad every item sequence with zeros up to the length of the longest one
# (the pad_sequences defaults, spelled out here).
seqs = sequence.pad_sequences(sequences=seqs, padding="pre", value=0)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense

In [None]:
# Width of the learned item vectors.
embed_size = 100
# Embedding input dimension: largest item code plus one, so that every code
# is a valid row index.
item_size = df["asin"].max() + 1

In [None]:
# Item-sequence model: embedded item codes -> two stacked GRUs -> dense head
# producing a single value per sequence.
model = Sequential()
# mask_zero=True makes the downstream layers skip positions holding index 0,
# i.e. the zero padding added to the sequences.
model.add(Embedding(item_size, embed_size, input_length=maxlen+1, mask_zero = True))
# The first GRU returns the full sequence so the second GRU can consume it;
# the second returns only its final state.
model.add(GRU(32, recurrent_dropout=0.2, return_sequences=True))
model.add(GRU(32, recurrent_dropout=0.2, return_sequences=False))
model.add(Dense(32))
model.add(Dense(1))
# The output layer is linear and the target is the integer `overall` rating,
# so this is a regression problem. binary_crossentropy expects probabilities
# in [0, 1] and is not meaningful here; use mean squared error instead.
model.compile(loss="mean_squared_error", optimizer="adam", metrics=["mae"])
model.summary()
plot_model(model, to_file="model.png", show_shapes=True)
Image("model.png")

In [None]:
# Train on the padded item sequences; validation_split=0.2 asks Keras to hold
# out a 0.2 fraction of the samples for validation.
history = model.fit(
    x=seqs,
    y=np.asarray(tars),
    batch_size=256,
    epochs=30,
    validation_split=0.2,
)

In [None]:
# Validation MAE per epoch for the item-sequence model.
# The history key for the "mae" metric differs between Keras releases
# ("val_mean_absolute_error" vs. "val_mae"), so accept either.
history_log = history.history
val_mae_key = (
    "val_mean_absolute_error"
    if "val_mean_absolute_error" in history_log
    else "val_mae"
)
plt.plot(history_log[val_mae_key])
plt.xlabel("epoch")
plt.ylabel("validation MAE")
plt.title("Item-sequence GRU: validation MAE");

# 2. Ask the GRU

In [None]:
# Settings for the review-text model.
max_features = 2000  # how many of the most frequent tokens get their own id
embed_size = 100     # width of the word and item embedding vectors
# Number of distinct reviewers and items in the cleaned frame.
n_users = df["reviewerID"].nunique(dropna=False)
n_items = df["asin"].nunique(dropna=False)

In [None]:
import collections
import os 
import nltk

# Tokenize every review once to collect corpus statistics:
#   word_freqs - how often each token occurs,
#   maxlen     - token count of the longest review (note: this rebinds the
#                `maxlen` used by the item-sequence model above),
#   num_recs   - number of reviews processed.
word_freqs = collections.Counter()
maxlen = 0
num_recs = 0
for review_text in df.reviewText.values:
    tokens = nltk.word_tokenize(review_text)
    maxlen = max(maxlen, len(tokens))
    word_freqs.update(tokens)
    num_recs += 1

In [None]:
# Vocabulary lookup tables. The max_features most frequent tokens receive ids
# 2, 3, ... in frequency order; ids 0 and 1 are set aside for the padding and
# unknown-word markers.
word2index = {
    token: token_id
    for token_id, (token, _) in enumerate(word_freqs.most_common(max_features), start=2)
}
word2index.update({"PAD": 0, "UNK": 1})
# Reverse mapping: id -> token.
index2word = dict(zip(word2index.values(), word2index.keys()))

In [None]:
# Turn every review into a list of vocabulary ids; tokens outside the
# vocabulary map to the "UNK" id. A dedicated name is used for the encoded
# reviews so that the padded item sequences in `seqs` (built for the first
# model) are no longer overwritten by this cell.
unk_id = word2index["UNK"]
encoded_reviews = [
    [word2index.get(token, unk_id) for token in nltk.word_tokenize(review)]
    for review in df.reviewText.values
]
# Pad (or truncate) every encoded review to exactly maxlen ids.
x = sequence.pad_sequences(encoded_reviews, maxlen=maxlen)

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Embedding, GRU, Add

In [None]:
# GRU branch: encoded review text -> word embedding -> two stacked GRUs ->
# dense projection to embed_size so it can be added to the item vector.
gru_in = Input(shape=(maxlen,))
# max_features + 2 rows: the kept tokens plus the "PAD" (0) and "UNK" (1) ids.
# mask_zero=True makes the GRUs skip the zero padding.
gru_em = Embedding(max_features+2, embed_size, input_length=maxlen, mask_zero=True)(gru_in)
gru1 = GRU(32, recurrent_dropout=0.2, return_sequences=True)(gru_em)
gru2 = GRU(32, recurrent_dropout=0.2, return_sequences=False)(gru1)
g_func = Dense(embed_size)(gru2)

# Item branch: a single item code -> embedding -> flat vector of embed_size.
item_input = Input(shape=[1],name="Item")
item_embed = Embedding(n_items + 1, embed_size)(item_input)
item_vec = Flatten()(item_embed)

# Combine both branches by element-wise addition and map to a single output.
merge = Add()([g_func, item_vec])
out = Dense(1)(merge)

model = Model([gru_in, item_input], out)
# The output layer is linear and the target is the integer `overall` rating,
# so this is a regression problem. binary_crossentropy expects probabilities
# in [0, 1] and is not meaningful here; use mean squared error instead.
model.compile(loss="mean_squared_error", optimizer="adam", metrics=["mae"])
model.summary()
plot_model(model, to_file="model.png", show_shapes=True)
Image("model.png")

In [None]:
# Train the two-input model: padded review token ids plus the item code of
# each review, against the review's `overall` value.
history = model.fit(
    x=[x, df.asin.values],
    y=df.overall.values,
    batch_size=256,
    epochs=30,
    validation_split=0.2,
)

In [None]:
# Validation MAE per epoch for the review-text + item model.
# The history key for the "mae" metric differs between Keras releases
# ("val_mean_absolute_error" vs. "val_mae"), so accept either.
history_log = history.history
val_mae_key = (
    "val_mean_absolute_error"
    if "val_mean_absolute_error" in history_log
    else "val_mae"
)
plt.plot(history_log[val_mae_key])
plt.xlabel("epoch")
plt.ylabel("validation MAE")
plt.title("Review-text GRU + item embedding: validation MAE");