# imports

In [1]:
from gensim.models import Word2Vec
from gensim.parsing.porter import PorterStemmer
from gensim.utils import simple_preprocess
import math
import numpy as np
import os
import pandas as pd
# Choose the Keras backend through the environment. The assignment sits ahead
# of `import keras` on purpose -- presumably so the variable is already set
# when Keras reads it at import time; keep this ordering.
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras
# Single shared stemmer instance; the stemming cell further down calls
# `porter_stemmer.stem(...)` on every token.
porter_stemmer = PorterStemmer()

# constants

In [2]:
# Input reviews file. NOT STORED IN REPOSITORY; SEE ARTICLE TO CREATE
REVIEW = "../data/review.json"
# Destination of the trained Word2Vec model. NOT STORED IN REPOSITORY; GENERATED BY NOTEBOOK
W2V_MODEL = "../data/word2vec.model"
# Worker threads used for Word2Vec training. Derived from the machine running
# the notebook instead of being edited by hand; `os.cpu_count()` can return
# None when the count is undeterminable, in which case the previous
# hard-coded value of 8 is used.
W2V_WORKERS = os.cpu_count() or 8

# fetch

In [3]:
# Read the whole reviews file into a DataFrame, then preview its first rows.
working_df = pd.read_json(path_or_buf=REVIEW)
working_df.head(n=5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


# prepare

In [4]:
# Label each review: 1 ("unhappy") when it has two stars or fewer, otherwise 0.
# The comparison is done on the whole "stars" column at once; the previous
# row-wise `apply(axis=1)` built a Series for every row just to read one value.
working_df["is_unhappy"] = (working_df["stars"] <= 2).astype("int64")

# Balance the two classes by keeping at most the first 10000 rows of each
# label, unhappy reviews first. Index labels from the loaded frame are kept
# (no reset), so rows stay traceable to the source data.
unhappy_mask = working_df["is_unhappy"] == 1
working_df = pd.concat([
    working_df[unhappy_mask].head(10000),
    working_df[~unhappy_mask].head(10000),
])

In [5]:
# Tokenize every review text with `simple_preprocess` (same arguments as
# before). Mapping over the "text" column directly avoids materialising a full
# row Series per review, which `DataFrame.apply(axis=1)` does, and yields an
# empty Series rather than an empty DataFrame when there are no rows.
working_df["tokens"] = working_df["text"].map(
    lambda text: simple_preprocess(text, deacc=True)
)
working_df["tokens"].head()

5     [am, long, term, frequent, customer, of, this,...
29    [at, least, have, to, give, this, restaurant, ...
41    [straight, to, the, point, it, cheap, it, tast...
46    [never, again, this, is, so, called, restauran...
47    [if, you, want, to, pay, for, everything, la, ...
Name: tokens, dtype: object

In [6]:
# Stem each token list with the shared `porter_stemmer`. Only the "tokens"
# column is needed, so map over it directly instead of iterating whole rows
# with `DataFrame.apply(axis=1)`.
working_df["stemmed_tokens"] = working_df["tokens"].map(
    lambda tokens: [porter_stemmer.stem(token) for token in tokens]
)
working_df["stemmed_tokens"].head()

5     [am, long, term, frequent, custom, of, thi, es...
29    [at, least, have, to, give, thi, restaur, two,...
41    [straight, to, the, point, it, cheap, it, tast...
46    [never, again, thi, is, so, call, restaur, tha...
47    [if, you, want, to, pai, for, everyth, la, car...
Name: stemmed_tokens, dtype: object

# train

In [8]:
# Train Word2Vec on the stemmed reviews, then persist the model to W2V_MODEL.
# Hyperparameters are gathered in one mapping so they can be read at a glance.
w2v_params = {
    "vector_size": 1000,
    "window": 3,
    "min_count": 1,
    "workers": W2V_WORKERS,
    "sg": 1,
}
stemmed_corpus = working_df["stemmed_tokens"].values
w2v_model = Word2Vec(sentences=stemmed_corpus, **w2v_params)
w2v_model.save(W2V_MODEL)