In [8]:
import collections
import numpy as np
import pandas as pd
import random

from gensim.models import Doc2Vec, Phrases
from gensim.models.doc2vec import LabeledSentence
from pprint import pprint

from gensim.parsing.preprocessing import STOPWORDS as stop_words
# Extend gensim's built-in stop-word set with every single ASCII lowercase
# letter and every single digit, so one-character tokens can be filtered out.
letters = list('abcdefghijklmnopqrstuvwxyz')
numbers = list('0123456789')  # all ten digits, '0' included
# frozenset.union accepts several iterables at once and returns a new frozenset.
stop_words = stop_words.union(letters, numbers)

In [9]:
# Read the combined review file; lines=True means one JSON record per line.
REVIEWS_PATH = './data/combined_review.json'
df = pd.read_json(REVIEWS_PATH, lines=True)

In [10]:
import string
import re
# Matches a run of one or more characters from string.punctuation.
RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)


def preprocess(text):
    """Turn a raw string into a list of lowercase tokens.

    Each run of punctuation characters is replaced by a single space, the
    result is lowercased, and the string is split on whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens; empty if ``text`` holds only punctuation
        and/or whitespace.
    """
    without_punct = RE_PUNCT.sub(" ", text)
    lowered = without_punct.lower()
    return lowered.split()

In [11]:
def make_movie_doc(text, title):
    """Build one tagged training document from a piece of text and its title.

    The tag is the tokenized title joined with underscores, so two inputs
    whose titles tokenize identically receive the same tag.

    Args:
        text: Raw text whose tokens become the document's words.
        title: Raw title from which the document tag is derived.

    Returns:
        A ``LabeledSentence`` holding the token list and a one-element tag list.
    """
    tag = '_'.join(preprocess(title))
    return LabeledSentence(preprocess(text), [tag])

# One document per row, built from the ``text`` column and tagged with the
# normalized ``name`` column. Rows that share a name therefore share a tag.
# Bracket indexing is used so the lookup can only ever resolve to the column.
DOCS = [
    make_movie_doc(text, title)
    for text, title in zip(df['text'].tolist(), df['name'].tolist())
]

# Shuffle with a dedicated, seeded generator: the document order is then the
# same on every Restart & Run All, and the global ``random`` state is untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(DOCS)

In [12]:
# Sanity check: show the first ten shuffled documents (tokens and tags).
first_docs = DOCS[:10]
print(first_docs)

[LabeledSentence(words=['like', 'jason', 'i', 'picked', 'up', 'some', 'noshbox', 'today', 'flatbread', 'chicken', 'and', 'a', 'side', 'of', 'the', 'rosemary', 'tots', 'the', 'fb', 'chicken', 'was', 'divine', 'char', 'grilled', 'chicken', 'chunks', 'with', 'charred', 'red', 'peppers', 'and', 'onions', 'on', 'a', 'semi', 'sweet', 'flatbread', 'topped', 'with', 'melted', 'cheese', 'and', 'a', 'sweet', 'yet', 'tangy', 'ranch', 'ish', 'sauce', 'winner', 'i', 'had', 'my', 'tots', 'to', 'go', 'so', 'they', 'were', 'also', 'a', 'bit', 'mushy', 'when', 'opened', 'at', 'the', 'office', 'such', 'is', 'the', 'fate', 'that', 'befalls', 'many', 'a', 'fried', 'item', 'my', 'only', 'request', 'is', 'that', 'nb', 'offer', 'a', 'half', 'side', 'of', 'tots', 'for', '2', '00', 'or', 'even', '3', '00', 'the', '4', '00', 'tots', 'were', 'a', 'bit', 'much', 'in', 'quantity', 'and', 'price', 'when', 'ordered', 'with', 'a', 'sandwich', 'ps', 'next', 'time', 'noshbox', 'offers', 'their', 'fish', 'hoagie', 'get'

In [16]:
# Doc2Vec hyperparameters, gathered in one place so they are easy to tune.
# NOTE(review): per the gensim docs, dm=0 selects the distributed-bag-of-words
# mode and dbow_words=1 trains word vectors alongside the document vectors;
# the later cells rely on both kinds of vectors being available.
DOC2VEC_PARAMS = dict(
    dm=0,
    dbow_words=1,
    min_count=4,
    negative=3,
    hs=0,
    sample=1e-4,
    window=10,
    size=100,
    workers=4,
)
model = Doc2Vec(**DOC2VEC_PARAMS)

# Build the vocabulary from the documents, then make a single training pass.
model.build_vocab(DOCS)
n_docs = len(DOCS)
model.train(DOCS, total_examples=n_docs, epochs=1)

13600731

In [24]:
# Find words similar to a query word.
# The lookup goes through ``model.wv`` (the word vectors); calling
# ``most_similar`` directly on the model is a deprecated alias for this.
pprint(model.wv.most_similar('asian'))

[('oriental', 0.8147902488708496),
 ('chinese', 0.8061826229095459),
 ('koreans', 0.7873468399047852),
 ('ethnic', 0.7829676866531372),
 ('chan', 0.7819612622261047),
 ('changs', 0.7797924280166626),
 ('japanese', 0.7785859107971191),
 ('philippines', 0.7761833667755127),
 ('indo', 0.7690566182136536),
 ('westernized', 0.767048716545105)]


In [20]:
# Find documents (identified by their name-derived tags) similar to a query
# word: fetch the word's vector from ``model.wv``, then search the
# document-vector space for the tags closest to it.
vec = model.wv['korean']
pprint(model.docvecs.most_similar([vec]))

[('lim_ga_ne', 0.7808727622032166),
 ('taste_of_korea', 0.7759152054786682),
 ('korean_bbq', 0.7651158571243286),
 ('mi_hyang', 0.7572174072265625),
 ('sizzle_korean_barbeque', 0.7529715299606323),
 ('new_seoul_korean', 0.7525289058685303),
 ('sharon_garden', 0.7517707347869873),
 ('ka_chi', 0.7493730783462524),
 ('nuri_village', 0.7468250393867493),
 ('la_maison_bulgogi', 0.7466059923171997)]


In [22]:
# Find documents similar to a query document, using the first name in the data.
query_names = df.name.tolist()[:1]
for name in query_names:
    print("\nQuery: %s" % name)
    # Normalize the name the same way the training tags were built.
    doctag = '_'.join(preprocess(name))
    nearest_docs = model.docvecs.most_similar(doctag)
    pprint(nearest_docs)


Query: The Tea Emporium
[('fahrenheit_coffee', 0.9294430017471313),
 ('bonsai_hill_fine_tea_gifts', 0.9266071319580078),
 ('golden_mint_coffee_tea_company', 0.9241845011711121),
 ('ephemeris_tea_room', 0.924126148223877),
 ('peter_s_yard', 0.9225086569786072),
 ('tea_time', 0.9147256016731262),
 ('arabica_coffee_house', 0.911689043045044),
 ('hex_coffee', 0.9109300374984741),
 ('the_common', 0.9075560569763184),
 ('coffee_tea_express', 0.9073807001113892)]


In [23]:
# Find words similar to a query document, using the first name in the data.
for name in df.name.tolist()[:1]:
    print("\nQuery: %s" % name)
    # Normalize the name the same way the training tags were built.
    doctag = '_'.join(preprocess(name))
    # Take the document's vector and look for the closest *word* vectors.
    vec = model.docvecs[doctag]
    # Query via ``model.wv``; ``model.most_similar`` is a deprecated alias.
    pprint(model.wv.most_similar([vec]))


Query: The Tea Emporium
[('caffeinated', 0.9258222579956055),
 ('intelligentsia', 0.9104077816009521),
 ('thermos', 0.9008877873420715),
 ('davids', 0.895863950252533),
 ('mochas', 0.8921820521354675),
 ('steeping', 0.8899515271186829),
 ('girly', 0.8795261383056641),
 ('capp', 0.8789957165718079),
 ('frappes', 0.8786147832870483),
 ('tazo', 0.8757609128952026)]
