# imports

In [1]:
from gensim.models import Word2Vec
from gensim.parsing.porter import PorterStemmer
from gensim.utils import simple_preprocess
import math
import pandas as pd
porter_stemmer = PorterStemmer()

# constants

In [2]:
# NOT STORED IN REPOSITORY; SEE ARTICLE TO CREATE
REVIEW = "../data/review.json"

# fetch

In [3]:
working_df = pd.read_json(REVIEW)
working_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


# prepare

In [9]:
working_df["sentiment"] = working_df.apply(lambda row: -1 if row["stars"] <= 2 else 0 if row["stars"] == 3 else 1, axis=1)
working_df = pd.concat([
    working_df[working_df["sentiment"] == 1].head(10000),
    working_df[working_df["sentiment"] == 0].head(10000),
    working_df[working_df["sentiment"] == -1].head(10000),
])
working_df["tokenized_text"] = working_df.apply(lambda row: simple_preprocess(row["text"], deacc=True), axis=1)
working_df["stemmed_tokens"] = working_df.apply(lambda row: [porter_stemmer.stem(token) for token in row["tokenized_text"]], axis=1)
working_df["stemmed_tokens"].head()

0    [sooooo, good, fav, spot, downtown, classi, in...
4    [realli, clean, super, cute, cafe, veri, frien...
5    [doug, havlin, is, terrif, he, gave, me, so, m...
6    [veri, knowledg, staff, can, help, you, do, ju...
7    [love, be, abl, to, stop, in, and, order, juic...
Name: stemmed_tokens, dtype: object

In [5]:
# working_df = working_df.sample(frac=1, random_state=42).reset_index(drop=True)
# length_all = len(working_df)
# length_train = math.trunc(length_all * 0.8)
# end_train = length_train - 1
# start_test = length_train
# train_df = working_df.iloc[:end_train]
# test_df = working_df.iloc[start_test:]

In [13]:
OUTPUT = "../data/word2vec.model"
stemmed_tokens = pd.Series(working_df['stemmed_tokens']).values
model = Word2Vec(sentences=stemmed_tokens, vector_size=1000, window=3, min_count=1, workers=3, sg=1)

In [14]:
model.save(OUTPUT)

In [23]:
len(model.wv)

26780

In [26]:
model.wv.key_to_index["clean"] 

219

In [30]:
model.wv.get_vecattr("clean", "count")

2494

In [35]:
vector = model.wv["clean"]
# print(vector)

In [36]:
# working_df = working_df.sample(frac=1, random_state=42).reset_index(drop=True)
# length_all = len(working_df)
# length_train = math.trunc(length_all * 0.8)
# end_train = length_train - 1
# start_test = length_train
# train_df = working_df.iloc[:end_train]
# test_df = working_df.iloc[start_test:]

In [37]:
# train_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,sentiment,tokenized_text,stemmed_tokens
0,b1EOL_C6TN6vFSrIhP_Q2w,ZJXnI2XgKHYbgha1bW7zcw,o7cD_sDA-Evj3Oh0zeH9nQ,4,1,0,1,I personally prefer a clean broth without too ...,2016-01-04 03:35:29,1,"[personally, prefer, clean, broth, without, to...","[person, prefer, clean, broth, without, too, m..."
1,L8emSDH_bKSCrANEpdY-Sg,EW3wSJd9WXWkPEXjKlBwWA,qWZimITbVwNSGJZsuM47-Q,2,0,0,0,"I've been here a few times now, each time I'm ...",2016-01-20 00:50:38,-1,"[ve, been, here, few, times, now, each, time, ...","[ve, been, here, few, time, now, each, time, i..."
2,_RNgAZn0FmjcXKBoq2WGHg,9AP02BSPR6ro7AIM9U5_Jw,qDkbudGRfmJvf7trQZea4g,1,0,0,0,We showed up for our reservation and were shov...,2013-10-07 18:10:22,-1,"[we, showed, up, for, our, reservation, and, w...","[we, show, up, for, our, reserv, and, were, sh..."
3,1NHVZJbBKkciIbhUUL3krA,mk7YUpkF3CZfkYmD_P3PUA,_ic0Q-ptC259Kpu8lIWxTw,2,1,0,1,The salad dressings come in little plastic cas...,2015-06-09 18:11:57,-1,"[the, salad, dressings, come, in, little, plas...","[the, salad, dress, come, in, littl, plastic, ..."
4,yB2nf_-H1lilnqCtKr9I9Q,3310ONFwWDOskab3qXW3eQ,i7syG2Gyr_qVPR1GwehzuA,4,1,0,1,Above average steaks - great service - excelle...,2013-05-29 00:54:17,1,"[above, average, steaks, great, service, excel...","[abov, averag, steak, great, servic, excel, wi..."


In [64]:
# working_df["tokenized_text"] = working_df.apply(lambda row: simple_preprocess(row["text"], deacc=True), axis=1)

#         model_vector = (np.mean([sg_w2v_model[token] for token in row['stemmed_tokens']], axis=0)).tolist()
specific_row = working_df.iloc[0]
print(specific_row["stemmed_tokens"])

['person', 'prefer', 'clean', 'broth', 'without', 'too', 'much', 'spice', 'or', 'herb', 'so', 'it', 'hard', 'for', 'me', 'to', 'find', 'the', 'right', 'pho', 'tri', 'about', 'other', 'place', 'befor', 'stumbl', 'upon', 'thi', 'on', 'month', 'ago', 'want', 'to', 'try', 'sever', 'time', 'befor', 'write', 'review', 'sinc', 'consist', 'is', 'import', 'their', 'broth', 'is', 'awesom', 'enough', 'flavor', 'to', 'keep', 'dig', 'in', 'tender', 'meat', 'fresh', 'munk', 'bean', 'basil', 'and', 'not', 'so', 'quick', 'yet', 'effici', 'servic', 'also', 'tri', 'their', 'charbroil', 'pork', 'and', 'it', 'realli', 'good', 'just', 'right', 'wish', 'the', 'place', 'is', 'tad', 'cleaner', 'henc', 'star', 'rather', 'than', 'overal', 'it', 'is', 'great', 'place', 'if', 'you', 're', 'in', 'the', 'mood', 'for', 'pho']


In [65]:
stuff = ["person", "prefer"]
print(stuff)

['person', 'prefer']


In [66]:
# vector = model.wv["clean"]
import numpy as np
wow = (np.mean([model.wv[token] for token in stuff], axis=0)).tolist()

In [68]:
working_df["vector"] = working_df.apply(lambda row: (np.mean([model.wv[token] for token in row["stemmed_tokens"]], axis=0)).tolist(), axis=1)
working_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,sentiment,tokenized_text,stemmed_tokens,vector
0,b1EOL_C6TN6vFSrIhP_Q2w,ZJXnI2XgKHYbgha1bW7zcw,o7cD_sDA-Evj3Oh0zeH9nQ,4,1,0,1,I personally prefer a clean broth without too ...,2016-01-04 03:35:29,1,"[personally, prefer, clean, broth, without, to...","[person, prefer, clean, broth, without, too, m...","[0.054152704775333405, 0.008001298643648624, 0..."
1,L8emSDH_bKSCrANEpdY-Sg,EW3wSJd9WXWkPEXjKlBwWA,qWZimITbVwNSGJZsuM47-Q,2,0,0,0,"I've been here a few times now, each time I'm ...",2016-01-20 00:50:38,-1,"[ve, been, here, few, times, now, each, time, ...","[ve, been, here, few, time, now, each, time, i...","[0.014958715997636318, 0.006754523608833551, 0..."
2,_RNgAZn0FmjcXKBoq2WGHg,9AP02BSPR6ro7AIM9U5_Jw,qDkbudGRfmJvf7trQZea4g,1,0,0,0,We showed up for our reservation and were shov...,2013-10-07 18:10:22,-1,"[we, showed, up, for, our, reservation, and, w...","[we, show, up, for, our, reserv, and, were, sh...","[0.0454595722258091, 0.023290393874049187, 0.0..."
3,1NHVZJbBKkciIbhUUL3krA,mk7YUpkF3CZfkYmD_P3PUA,_ic0Q-ptC259Kpu8lIWxTw,2,1,0,1,The salad dressings come in little plastic cas...,2015-06-09 18:11:57,-1,"[the, salad, dressings, come, in, little, plas...","[the, salad, dress, come, in, littl, plastic, ...","[0.044708579778671265, 0.010944676585495472, 0..."
4,yB2nf_-H1lilnqCtKr9I9Q,3310ONFwWDOskab3qXW3eQ,i7syG2Gyr_qVPR1GwehzuA,4,1,0,1,Above average steaks - great service - excelle...,2013-05-29 00:54:17,1,"[above, average, steaks, great, service, excel...","[abov, averag, steak, great, servic, excel, wi...","[0.03659708425402641, 0.03086364082992077, 0.0..."


In [72]:
working_df["label"] = working_df.apply(lambda row: 0 if row["stars"] <= 2 else 1 if row["stars"] == 3 else 2, axis=1)

In [75]:
train_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,sentiment,tokenized_text,stemmed_tokens,vector,label
0,MilGJaBTFhHDWtFXAlaVRw,v5EtqtueF8XLFAhvOlRKPw,DXYhpEv17mWqSqFjau7vhg,1,1,0,0,Horrible !!! I love to eat and very seldom com...,2016-07-16 11:52:05,-1,"[horrible, love, to, eat, and, very, seldom, c...","[horribl, love, to, eat, and, veri, seldom, co...","[0.05428465083241463, 0.012657237239181995, 0....",0
1,ppOlRi1FaRM9C3zqkVhdTw,m63EheDvB63iTYq4yq0R9g,ZpXZgnBrNodHH8pKpOK_Kg,2,0,0,0,Wouldn't honor a Local Flavor deal for their S...,2017-07-09 23:30:17,-1,"[wouldn, honor, local, flavor, deal, for, thei...","[wouldn, honor, local, flavor, deal, for, thei...","[0.042723655700683594, 0.027790984138846397, 0...",0
2,RRZ6pRYa9WXV6nWoiwWAYg,3Bbv8SxJDgaIt61W-76CxQ,01Vnpln40zgSK6H6pAcd5Q,3,6,0,0,First off the display is beautifully done and ...,2011-05-27 22:23:16,0,"[first, off, the, display, is, beautifully, do...","[first, off, the, displai, is, beautifulli, do...","[0.05113774165511131, 0.0077386945486068726, 0...",1
3,0tqEZSwZH8lZE5t3PXf4QA,9pj8GGO_d83EQvf5GWJcIQ,Z8uwE4woiskwAWLyPTs5lw,4,1,0,0,I decided it was finally time to get my eyebro...,2015-07-16 21:17:26,1,"[decided, it, was, finally, time, to, get, my,...","[decid, it, wa, final, time, to, get, my, eyeb...","[0.026883767917752266, 0.02633053995668888, 0....",2
4,g80lH23MpHPl82decJsmMw,El_0YeWmqFfNtrkp0hCV3g,1UNorskttXSWuEcgzKzXcA,5,0,0,0,The food was delicious . The staff was friendl...,2018-05-29 03:51:30,1,"[the, food, was, delicious, the, staff, was, f...","[the, food, wa, delici, the, staff, wa, friend...","[0.04506313055753708, 0.01628216914832592, 0.0...",2


In [106]:
working_df["is_in"] = working_df.apply(lambda row: 0 if row["stars"] <= 3 else 1, axis=1)

working_df.head()




Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,sentiment,tokenized_text,stemmed_tokens,vector,label,is_in
0,MilGJaBTFhHDWtFXAlaVRw,v5EtqtueF8XLFAhvOlRKPw,DXYhpEv17mWqSqFjau7vhg,1,1,0,0,Horrible !!! I love to eat and very seldom com...,2016-07-16 11:52:05,-1,"[horrible, love, to, eat, and, very, seldom, c...","[horribl, love, to, eat, and, veri, seldom, co...","[0.05428465083241463, 0.012657237239181995, 0....",0,0
1,ppOlRi1FaRM9C3zqkVhdTw,m63EheDvB63iTYq4yq0R9g,ZpXZgnBrNodHH8pKpOK_Kg,2,0,0,0,Wouldn't honor a Local Flavor deal for their S...,2017-07-09 23:30:17,-1,"[wouldn, honor, local, flavor, deal, for, thei...","[wouldn, honor, local, flavor, deal, for, thei...","[0.042723655700683594, 0.027790984138846397, 0...",0,0
2,RRZ6pRYa9WXV6nWoiwWAYg,3Bbv8SxJDgaIt61W-76CxQ,01Vnpln40zgSK6H6pAcd5Q,3,6,0,0,First off the display is beautifully done and ...,2011-05-27 22:23:16,0,"[first, off, the, display, is, beautifully, do...","[first, off, the, displai, is, beautifulli, do...","[0.05113774165511131, 0.0077386945486068726, 0...",1,0
3,0tqEZSwZH8lZE5t3PXf4QA,9pj8GGO_d83EQvf5GWJcIQ,Z8uwE4woiskwAWLyPTs5lw,4,1,0,0,I decided it was finally time to get my eyebro...,2015-07-16 21:17:26,1,"[decided, it, was, finally, time, to, get, my,...","[decid, it, wa, final, time, to, get, my, eyeb...","[0.026883767917752266, 0.02633053995668888, 0....",2,1
4,g80lH23MpHPl82decJsmMw,El_0YeWmqFfNtrkp0hCV3g,1UNorskttXSWuEcgzKzXcA,5,0,0,0,The food was delicious . The staff was friendl...,2018-05-29 03:51:30,1,"[the, food, was, delicious, the, staff, was, f...","[the, food, wa, delici, the, staff, wa, friend...","[0.04506313055753708, 0.01628216914832592, 0.0...",2,1


In [126]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras

inputs = keras.Input(shape=(1000,))
hidden = keras.layers.Dense(128, activation=keras.activations.relu)(inputs)
outputs = keras.layers.Dense(1, activation=keras.activations.sigmoid)(hidden)
model = keras.Model(inputs, outputs)


LEARNING_RATE = 0.001
THRESHOLDS = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]

model.compile(
    optimizer=keras.optimizers.RMSprop(learning_rate=LEARNING_RATE),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=[
        keras.metrics.FalseNegatives(THRESHOLDS),
        keras.metrics.FalsePositives(THRESHOLDS),
        keras.metrics.TrueNegatives(THRESHOLDS),
        keras.metrics.TruePositives(THRESHOLDS),
    ],
)


BATCH_SIZE = 100

train_pairs=[]
for index, row in working_df.iterrows():
    train_pairs.append(row["vector"])

EPOCHS = 30

history = model.fit(
    x=np.array(train_pairs),
    y=working_df["is_in"].values,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

Epoch 1/30
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - false_negatives_3: 2894.1543 - false_positives_3: 2487.4470 - loss: 0.5784 - true_negatives_3: 7552.6626 - true_positives_3: 2165.4038
Epoch 2/30
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - false_negatives_3: 1863.4374 - false_positives_3: 1547.4624 - loss: 0.4286 - true_negatives_3: 8551.7861 - true_positives_3: 3136.9812
Epoch 3/30
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - false_negatives_3: 1576.1157 - false_positives_3: 1431.1860 - loss: 0.3970 - true_negatives_3: 8628.5117 - true_positives_3: 3463.8542
Epoch 4/30
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - false_negatives_3: 1448.5797 - false_positives_3: 1380.8068 - loss: 0.3818 - true_negatives_3: 8704.6279 - true_positives_3: 3565.6531
Epoch 5/30
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - false_negatives_3: 1