# imports

In [1]:
from gensim.models import Word2Vec
from gensim.parsing.porter import PorterStemmer
from gensim.utils import simple_preprocess
import math
import numpy as np
import os
import pandas as pd
# Select the Keras backend through the environment. This assignment is placed
# before `import keras` so the variable is already set when Keras is imported;
# do not reorder these two lines.
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras
# Single stemmer instance, reused by the preprocessing cell to stem every token.
porter_stemmer = PorterStemmer()

# constants

In [2]:
# --- input artifacts (neither file is stored in the repository) ---
# Review data; see the accompanying article for how to create it.
REVIEW = "../data/review.json"
# Trained Word2Vec model; generated by a notebook.
W2V_MODEL = "../data/word2vec.model"

# --- training hyperparameters ---
BATCH_SIZE = 100
EPOCHS = 30
LEARNING_RATE = 1e-3

# --- decision thresholds passed to the confusion-matrix metrics ---
THRESHOLDS = [
    0.2,
    0.3,
    0.4,
    0.5,
    0.6,
    0.7,
]

# fetch

In [3]:
# Load the review data from the JSON file named by REVIEW into a DataFrame.
working_df = pd.read_json(path_or_buf=REVIEW)
# Preview the first five rows as the cell output.
working_df.head(n=5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


# prepare

In [4]:
# Label each review: two stars or fewer counts as "unhappy" (1), anything else
# as 0. A vectorized comparison replaces a row-wise apply over the whole frame.
working_df["is_unhappy"] = (working_df["stars"] <= 2).astype("int64")

# Build a class-balanced working set from the first SAMPLES_PER_CLASS reviews
# of each label (unhappy first, then the rest), keeping the original row index.
SAMPLES_PER_CLASS = 10000
working_df = pd.concat([
    working_df[working_df["is_unhappy"] == label].head(SAMPLES_PER_CLASS)
    for label in (1, 0)
])

# Tokenize the review text with gensim's simple_preprocess (deacc=True), then
# stem every token with the shared Porter stemmer. Only the needed column is
# passed to apply instead of materializing every full row.
working_df["tokens"] = working_df["text"].apply(
    lambda text: simple_preprocess(text, deacc=True)
)
working_df["stemmed_tokens"] = working_df["tokens"].apply(
    lambda tokens: [porter_stemmer.stem(token) for token in tokens]
)
working_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,is_unhappy,tokens,stemmed_tokens
5,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1,1,2,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31,1,"[am, long, term, frequent, customer, of, this,...","[am, long, term, frequent, custom, of, thi, es..."
29,elqRpX9T3YwL07uLNtN3Bg,-sryo4gDYxbZ1T5Bz4l5Bw,ltBBYdNzkeKdCNPDAsxwAA,2,0,0,0,I at least have to give this restaurant two st...,2015-02-02 04:29:13,1,"[at, least, have, to, give, this, restaurant, ...","[at, least, have, to, give, thi, restaur, two,..."
41,RB8UpF_kT2xoOC51OzXEeA,EZjT2qJN0mOXypMAqZdSrQ,A2q7d-CBM2-81tVkmS4JMw,2,1,1,0,"Straight to the point, it's cheap, it tastes a...",2017-07-08 18:58:42,1,"[straight, to, the, point, it, cheap, it, tast...","[straight, to, the, point, it, cheap, it, tast..."
46,Ki90-hauSIfW_7sBuBBqfA,Z2cOL3n9V8NoguJ-uU_Nvw,j8JOZvfeHEfUWq3gEz6ABQ,2,2,3,0,NEVER AGAIN. This is a so called restaurant th...,2014-06-11 14:55:14,1,"[never, again, this, is, so, called, restauran...","[never, again, thi, is, so, call, restaur, tha..."
47,TcCcHzc3L6Aboq3DteEfZA,OuatwND396ZQxm2zK8WlUQ,jNL5KUPz2-tHUJM__ysSaw,1,1,0,0,If you want to pay for everything a la carte t...,2014-08-24 20:14:12,1,"[if, you, want, to, pay, for, everything, la, ...","[if, you, want, to, pai, for, everyth, la, car..."


In [5]:
# NOTE(review): Word2Vec.load unpickles the file; only load models you generated
# yourself or otherwise trust.
w2v_model = Word2Vec.load(W2V_MODEL)


def review_vector(stemmed_tokens):
    """Return the mean Word2Vec embedding of a review as a plain list of floats.

    Tokens that are absent from the model vocabulary are skipped instead of
    raising KeyError. If no token is known (including an empty token list), a
    zero vector of the model's dimensionality is returned, so every review
    yields a vector of the same length rather than a scalar NaN.
    """
    known_vectors = [
        w2v_model.wv[token] for token in stemmed_tokens if token in w2v_model.wv
    ]
    if not known_vectors:
        return np.zeros(w2v_model.wv.vector_size, dtype=np.float32).tolist()
    return np.mean(known_vectors, axis=0).tolist()


working_df["vector"] = working_df["stemmed_tokens"].apply(review_vector)
working_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,is_unhappy,tokens,stemmed_tokens,vector
5,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1,1,2,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31,1,"[am, long, term, frequent, customer, of, this,...","[am, long, term, frequent, custom, of, thi, es...","[0.02060765027999878, 0.011286740191280842, 0...."
29,elqRpX9T3YwL07uLNtN3Bg,-sryo4gDYxbZ1T5Bz4l5Bw,ltBBYdNzkeKdCNPDAsxwAA,2,0,0,0,I at least have to give this restaurant two st...,2015-02-02 04:29:13,1,"[at, least, have, to, give, this, restaurant, ...","[at, least, have, to, give, thi, restaur, two,...","[0.04443991929292679, -0.00906819012016058, 0...."
41,RB8UpF_kT2xoOC51OzXEeA,EZjT2qJN0mOXypMAqZdSrQ,A2q7d-CBM2-81tVkmS4JMw,2,1,1,0,"Straight to the point, it's cheap, it tastes a...",2017-07-08 18:58:42,1,"[straight, to, the, point, it, cheap, it, tast...","[straight, to, the, point, it, cheap, it, tast...","[0.03929130733013153, -0.01877831481397152, 0...."
46,Ki90-hauSIfW_7sBuBBqfA,Z2cOL3n9V8NoguJ-uU_Nvw,j8JOZvfeHEfUWq3gEz6ABQ,2,2,3,0,NEVER AGAIN. This is a so called restaurant th...,2014-06-11 14:55:14,1,"[never, again, this, is, so, called, restauran...","[never, again, thi, is, so, call, restaur, tha...","[0.006395045202225447, -0.0010961530497297645,..."
47,TcCcHzc3L6Aboq3DteEfZA,OuatwND396ZQxm2zK8WlUQ,jNL5KUPz2-tHUJM__ysSaw,1,1,0,0,If you want to pay for everything a la carte t...,2014-08-24 20:14:12,1,"[if, you, want, to, pay, for, everything, la, ...","[if, you, want, to, pai, for, everyth, la, car...","[0.0489298552274704, 0.001193392788991332, 0.0..."


In [6]:
# Shuffle all rows with a fixed seed so the split is reproducible, and renumber
# the index so positions and labels coincide.
working_df = working_df.sample(frac=1, random_state=42).reset_index(drop=True)

# 80/20 train/test split by position.
length_all = len(working_df)
length_train = math.trunc(length_all * 0.8)

# iloc slice stops are exclusive, so the training slice must stop AT
# length_train. Stopping at length_train - 1 left the row at that position in
# neither the training nor the test set.
end_train = length_train
start_test = length_train
train_df = working_df.iloc[:end_train]
test_df = working_df.iloc[start_test:]

# Every row must land in exactly one of the two partitions.
assert len(train_df) + len(test_df) == length_all

# train

In [23]:
# Stack the per-review vectors into a 2-D feature matrix (one row per review).
# Series.tolist() yields the same list of lists as an iterrows loop, without
# building a full row object for every record.
train_vectors = train_df["vector"].tolist()
train_x = np.array(train_vectors)
train_y = train_df["is_unhappy"].values
if train_x.ndim != 2:
    raise ValueError(
        f"expected a 2-D feature matrix, got an array of shape {train_x.shape}"
    )

# Network: input -> Dense(128, ReLU) -> Dense(1, sigmoid) for binary
# classification. The input width is taken from the feature matrix so the
# model always matches the dimensionality of the loaded Word2Vec model.
inputs = keras.Input(shape=(train_x.shape[1],))
hidden = keras.layers.Dense(128, activation=keras.activations.relu)(inputs)
outputs = keras.layers.Dense(1, activation=keras.activations.sigmoid)(hidden)
model = keras.Model(inputs, outputs)

# Track the confusion-matrix counts at every threshold in THRESHOLDS.
model.compile(
    optimizer=keras.optimizers.RMSprop(learning_rate=LEARNING_RATE),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=[
        keras.metrics.FalseNegatives(THRESHOLDS),
        keras.metrics.FalsePositives(THRESHOLDS),
        keras.metrics.TrueNegatives(THRESHOLDS),
        keras.metrics.TruePositives(THRESHOLDS),
    ],
)

history = model.fit(
    x=train_x,
    y=train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

Epoch 1/30
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - false_negatives_15: 1170.9596 - false_positives_15: 1861.1088 - loss: 0.6010 - true_negatives_15: 2178.4070 - true_positives_15: 2888.8914
Epoch 2/30
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - false_negatives_15: 694.3675 - false_positives_15: 1201.6666 - loss: 0.4339 - true_negatives_15: 2835.1223 - true_positives_15: 3368.2102
Epoch 3/30
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - false_negatives_15: 590.4544 - false_positives_15: 1033.6915 - loss: 0.3897 - true_negatives_15: 2980.7122 - true_positives_15: 3494.5085
Epoch 4/30
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - false_negatives_15: 559.0538 - false_positives_15: 943.8613 - loss: 0.3712 - true_negatives_15: 3084.9648 - true_positives_15: 3511.4866
Epoch 5/30
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - false_ne

# evaluate