In [2]:
import gensim
import os
from gensim.models.word2vec import LineSentence
from gensim.models.fasttext import FastText

## The Yelp Dataset
[**The Yelp Dataset**](https://www.yelp.com/dataset_challenge/) is a dataset published by the business review service [Yelp](http://yelp.com) for academic research and educational purposes. I really like the Yelp dataset as a subject for machine learning and natural language processing demos, because it's big (but not so big that you need your own data center to process it), well-connected, and anyone can relate to it &mdash; it's largely about food, after all!

In [5]:
import os

start_data_dir = '/Users/aliosha/Development/nlp/'
data_directory = os.path.join(start_data_dir, 'data', 'yelp_dataset')

businesses_filepath = os.path.join(data_directory, 'business.json')

with open(businesses_filepath, encoding='utf_8') as f:
    first_business_record = f.readline() 

print(first_business_record)

{"business_id": "FYWN1wneV18bWNgQjJ2GNg", "name": "Dental by Design", "neighborhood": "", "address": "4855 E Warner Rd, Ste B9", "city": "Ahwatukee", "state": "AZ", "postal_code": "85044", "latitude": 33.3306902, "longitude": -111.9785992, "stars": 4.0, "review_count": 22, "is_open": 1, "attributes": {"AcceptsInsurance": true, "ByAppointmentOnly": true, "BusinessAcceptsCreditCards": true}, "categories": ["Dentists", "General Dentistry", "Health & Medical", "Oral Surgeons", "Cosmetic Dentists", "Orthodontists"], "hours": {"Friday": "7:30-17:00", "Tuesday": "7:30-17:00", "Thursday": "7:30-17:00", "Wednesday": "7:30-17:00", "Monday": "7:30-17:00"}}



In [6]:
review_json_filepath = os.path.join(data_directory, 'review.json')

with open(review_json_filepath, encoding='utf_8') as f:
    first_review_record = f.readline()
    
print(first_review_record)

{"review_id":"v0i_UHJMo_hPBq9bxWvW4w","user_id":"bv2nCi5Qv5vroFiqKGopiw","business_id":"0W4lkclzZThpx3V65bVgig","stars":5,"date":"2016-05-28","text":"Love the staff, love the meat, love the place. Prepare for a long line around lunch or dinner hours. \n\nThey ask you how you want you meat, lean or something maybe, I can't remember. Just say you don't want it too fatty. \n\nGet a half sour pickle and a hot pepper. Hand cut french fries too.","useful":0,"funny":0,"cool":0}



In [7]:
import json

restaurant_ids = set()

# open the businesses file
with open(businesses_filepath, encoding='utf_8') as f:
    
    # iterate through each line (json record) in the file
    for business_json in f:
        
        # convert the json record to a Python dict
        business = json.loads(business_json)
        
        # if this business is not a restaurant, skip to the next one
        if u'Restaurants' not in business[u'categories']:
            continue
            
        # add the restaurant business id to our restaurant_ids set
        restaurant_ids.add(business[u'business_id'])

# turn restaurant_ids into a frozenset, as we don't need to change it anymore
restaurant_ids = frozenset(restaurant_ids)

# print the number of unique restaurant ids in the dataset
print('{:,}'.format(len(restaurant_ids)), u'restaurants in the dataset.')

54,618 restaurants in the dataset.


In [7]:
intermediate_directory = os.path.join(data_directory, 'intermediate')

review_txt_filepath = os.path.join(intermediate_directory, 'review_text_all.txt')

In [9]:
if not os.path.exists(intermediate_directory):
    os.makedirs(intermediate_directory)

In [45]:
%%time

max_rev = 50000
# set process to False if you don't want to process the data
process = True
# process = False

review_count = 0

if process:
    with open(review_txt_filepath, 'w+', encoding='utf_8') as review_txt_file:

        with open(review_json_filepath, encoding='utf_8') as review_json_file:

            for review_json in review_json_file:
                review = json.loads(review_json)

                # if this review is not about a restaurant, skip to the next one
                if review[u'business_id'] not in restaurant_ids:
                    continue

                # write the restaurant review as a line in the new file
                # escape newline characters in the original review text
                review_txt_file.write(review[u'text'].replace('\n', '\\n') + '\n')
                review_count += 1
                if review_count >= max_rev:
                    break

    print(u'''Text from {:,} restaurant reviews
              written to the new txt file.'''.format(review_count))
    
else:
    with open(review_txt_filepath, encoding='utf_8') as review_txt_file:
        for review_count, line in enumerate(review_txt_file):
            pass
        
    print(u'Text from {:,} restaurant reviews in the txt file.'.format(review_count + 1))

Text from 50,000 restaurant reviews
              written to the new txt file.
CPU times: user 1.28 s, sys: 156 ms, total: 1.43 s
Wall time: 1.65 s


In [8]:
review_txt_filepath

'/Users/aliosha/Development/nlp/data/yelp_dataset/intermediate/review_text_all.txt'

In [23]:
# Set file names for train and test data
data = LineSentence(review_txt_filepath)

model_gensim = FastText(size=100)

# build the vocabulary
model_gensim.build_vocab(data)

In [25]:
%%time

# train the model
model_gensim.train(data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)

print(model_gensim)

  This is separate from the ipykernel package so we can avoid doing imports until


FastText(vocab=13161, size=100, alpha=0.025)
CPU times: user 1min 44s, sys: 1.03 s, total: 1min 45s
Wall time: 1min 6s


In [46]:
%%time

from gensim.models.wrappers.fasttext import FastText as FT_wrapper

# Set FastText home to the path to the FastText executable
ft_home = '/Users/aliosha/Development/nlp/fastText/fasttext'

# train the model
model_wrapper = FT_wrapper.train(ft_home, review_txt_filepath)

print(model_wrapper)

FastText(vocab=32586, size=100, alpha=0.025)
CPU times: user 7.2 s, sys: 917 ms, total: 8.12 s
Wall time: 2min 6s


In [47]:
# saving a model trained via Gensim's fastText implementation
model_gensim.save('saved_model_gensim')
loaded_model = FT_gensim.load('saved_model_gensim')
print(loaded_model)


FastText(vocab=13161, size=100, alpha=0.025)


In [48]:
# saving a model trained via fastText wrapper
model_wrapper.save('saved_model_wrapper')
loaded_model = FT_wrapper.load('saved_model_wrapper')
print(loaded_model)

FastText(vocab=32586, size=100, alpha=0.025)


In [49]:
print('night' in model_wrapper.wv.vocab)
print('nights' in model_wrapper.wv.vocab)
print(model_wrapper['night'])
print(model_wrapper['nights'])

True
True
[-2.7185776   1.4365803  -1.6437114   2.0584633  -1.4260191  -4.0511084
 -0.9091086   0.8413393   0.39338198 -0.24548973 -1.1307178   0.52726805
  0.8045797  -0.68693244 -1.812846    2.205074    3.255425    1.0463287
  1.4344546  -0.12507983  0.09863514  3.122248   -1.0183789  -1.6110741
  2.0402436  -0.2760119   2.0748363  -0.84881026  0.27457434  1.6650326
 -0.7523171  -0.71506804  1.4569962   3.4606004  -0.08815765  1.7500066
  3.950676    1.4748651   2.8572576   2.6455152   4.744443   -3.354194
  2.1705937  -2.9115553  -2.9871023   0.89958733 -1.0711738   3.0827706
  1.8265618   3.1305425  -2.5459406  -0.502866    1.7525264   0.58584493
 -1.4412364  -0.24747092  1.7170135   1.5159978   3.5333445  -0.40977845
  0.37035015 -0.71257204 -2.1066566  -1.9206173   0.9679985  -0.6458595
 -3.1274252   0.85532665  0.19122182  0.06846409 -1.3375468   0.16227752
 -0.20098454  0.42042786  2.0671573  -1.1599432   2.0633118   1.1976726
  0.71644056 -1.7480453  -1.755339    0.12139568  0

In [50]:
model_wrapper.similarity("night", "nights")


0.9151657960448711

In [51]:
model_wrapper.most_similar("cheesburger")

[('cheeseburger', 0.9603231549263),
 ('cheeseburger,', 0.9294769763946533),
 ('Cheeseburger', 0.9247754812240601),
 ('burger"', 0.914174497127533),
 ('burger)', 0.9139341711997986),
 ('Cheeseburger,', 0.9134862422943115),
 ('burger?', 0.9113364219665527),
 ('burger:', 0.9105531573295593),
 ('cheeseburgers', 0.9074652194976807),
 ('burger!', 0.8963555097579956)]

In [35]:
def word_algebra(add=[], subtract=[], topn=1, model=model_wrapper):
    """
    combine the vectors associated with the words provided
    in add= and subtract=, look up the topn most similar
    terms to the combined vector, and print the result(s)
    """
    answers = model.wv.most_similar(positive=add, negative=subtract, topn=topn)
    
    for term, similarity in answers:
        print( term)

In [52]:
word_algebra(['cheese', 'burger'])

cheeseburger


In [41]:
word_algebra(['lunch', 'breakfast'], topn=7)

breakfast/brunch
lunch!
lunch,
Smunch
Lunch
lunch/dinner
brunch


In [43]:
word_algebra(['lunch', 'night'], subtract=['day'], topn=7)

lunch!
lunch,
brunch
Lunch
lunch/dinner
brunch,
lunch.


In [44]:
word_algebra(['lunch', 'night'], subtract=['day'], topn=7, model=model_gensim)

lunch!
brunch
brunch,
lunch,
Lunch
lunch/dinner
Brunch


In [54]:
from gensim.models import Word2Vec

word2vec_filepath = os.path.join(intermediate_directory, 'word2vec_model_all')

In [56]:
%%time


# initiate the model and perform the first epoch of training
word2vec = Word2Vec(data, size=100, window=5,
                    min_count=20, sg=1, workers=4)

word2vec.save(word2vec_filepath)



In [58]:

# perform another 11 epochs of training
for i in range(1,12):
    word2vec.train(data, total_examples=word2vec.corpus_count, epochs=word2vec.epochs)
    word2vec.save(word2vec_filepath)
        

print(u'{} training epochs so far.'.format(word2vec.train_count))

12 training epochs so far.


In [None]:
# load the finished model from disk
word2vec = Word2Vec.load(word2vec_filepath)
word2vec.init_sims()

In [59]:
word_algebra(['lunch', 'breakfast'], topn=7, model=word2vec)

brunch
breakfast,
breakfast.
lunch,
lunch.
dinner
weekday


In [60]:
word_algebra(['lunch', 'night'], subtract=['day'], topn=7, model=word2vec)

dinner
lunch,
Wednesday
lunch.
Friday
weeknight
evening
