# Load the reviews

In [3]:
from utils.preprocess import JSONLoader
fields = ['business_id']
city = ['Toronto']
categories = ['Burgers','Seafood','Italian','Chinese','Japanese']

business = 'business.json'
review = 'review.json'
data_dir = 'data/raw'

jl = JSONLoader(business, data_dir, fields = fields, encoding = 'utf-8')
jl.set_condition(city=city, categories=categories)
f_b, business_id = jl.sample(10000000)
business_id = set([i[0] for i in business_id])
print('The number of businesses: %i' % len(business_id))

The number of businesses: 2051


In [4]:
fields = ['business_id','text']
jl = JSONLoader(review, data_dir, fields = fields)
jl.set_condition(business_id = business_id)
f_, rv = jl.sample(10000000)
print('Total number of reviews: %i' % len(rv)) # The total number of review texts

Total number of reviews: 96936


# Build the TF-IDF Model

In [7]:
from nltk.corpus import stopwords
from collections import defaultdict
from gensim import corpora, models, similarities
from nltk.stem import WordNetLemmatizer
from nltk import wordpunct_tokenize
import pickle

stops = set(stopwords.words('english'))
# remove common words and tokenize
documents = [r[-1] for r in rv]
l = WordNetLemmatizer()
texts = [[l.lemmatize(word).lower() for word in wordpunct_tokenize(document)
          if word not in stops and word.lower() not in stops] for document in documents]

# use the words that appears at least 5 times
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 5] for text in texts]

dictionary = corpora.Dictionary(texts)
print(dictionary)
corpus = [dictionary.doc2bow(doc) for doc in texts]
tfidf = models.TfidfModel(corpus)

# Save
with open('./data/tfidf_dictionary', 'wb') as f:
    pickle.dump(dictionary, f)
with open('./data/tfidf_model', 'wb') as f:
    pickle.dump(tfidf, f)

Dictionary(19171 unique tokens: ['smashed', 'burger', 'done', 'properly', 'heart']...)


In [9]:
import pickle

with open('./data/tfidf_dictionary', 'rb') as f:
    dictionary = pickle.load(f)
with open('./data/tfidf_model', 'rb') as f:
    tfidf = pickle.load(f)
    
tfidf[corpus[-1]]

[(7, 0.009567939006763143),
 (15, 0.17553062693704843),
 (67, 0.07845205361880085),
 (71, 0.04842777718301017),
 (79, 0.03400475771086676),
 (83, 0.09400398480219363),
 (103, 0.18256978114525738),
 (154, 0.12641186621990233),
 (173, 0.11263122890774996),
 (190, 0.06943477493147623),
 (194, 0.08580686193168809),
 (200, 0.06859922188742085),
 (219, 0.17502463441017685),
 (261, 0.1190168271379765),
 (273, 0.0831025697114241),
 (298, 0.20023219261938524),
 (299, 0.20479590417924645),
 (306, 0.18898107831723246),
 (324, 0.14749251085289328),
 (351, 0.12394813451953766),
 (364, 0.09319522474716169),
 (377, 0.1414989059679592),
 (597, 0.10658468032178077),
 (601, 0.10895111746218249),
 (626, 0.12069712125800734),
 (661, 0.12440289616146395),
 (689, 0.12967026164077458),
 (696, 0.1354249917052643),
 (700, 0.144770859493165),
 (855, 0.17278932296947688),
 (967, 0.14418150900767712),
 (1017, 0.12520564316441013),
 (1041, 0.24073491659736185),
 (1122, 0.181230637297361),
 (1127, 0.205505284165475

In [10]:
new_doc = 'this chinese restaurant is awesome'
new_vec = dictionary.doc2bow(new_doc.lower().split())
index = similarities.SparseMatrixSimilarity(tfidf[corpus[:100]], len(dictionary))
sims = index[tfidf[new_vec]] # similarity between new_vec and all other corpus
print(list(enumerate(sims)))

[(0, 0.045629315), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0), (10, 0.0), (11, 0.0), (12, 0.0), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.0), (17, 0.042376243), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (23, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.016292443), (28, 0.0), (29, 0.0), (30, 0.0), (31, 0.0), (32, 0.0), (33, 0.0), (34, 0.0), (35, 0.0), (36, 0.0), (37, 0.0), (38, 0.0), (39, 0.0), (40, 0.0), (41, 0.0), (42, 0.019063195), (43, 0.0), (44, 0.0), (45, 0.045981325), (46, 0.14004759), (47, 0.0), (48, 0.0), (49, 0.0), (50, 0.0), (51, 0.0), (52, 0.0), (53, 0.0), (54, 0.078764737), (55, 0.0), (56, 0.0), (57, 0.0068733585), (58, 0.015946809), (59, 0.028438997), (60, 0.0), (61, 0.0), (62, 0.1467528), (63, 0.0), (64, 0.0), (65, 0.0), (66, 0.010729043), (67, 0.0), (68, 0.016546333), (69, 0.011385167), (70, 0.0), (71, 0.0), (72, 0.0), (73, 0.0), (74, 0.0), (75, 0.05114264), (76, 0.0), (77, 0.0), (78, 0.0), (79, 0.16243352), (80, 

# Build TF-IDF vectors for a specific category

https://radimrehurek.com/gensim/tutorial.html

In [11]:
# Takes quite long

import pickle
categories =['Burgers','Seafood','Italian','Chinese','Japanese']
data_dir = 'data/raw'
with open('./data/tfidf_dictionary', 'rb') as f:
    dictionary = pickle.load(f)
with open('./data/tfidf_model', 'rb') as f:
    tfidf = pickle.load(f)    

def get_category_doc(category, data_dir, dictionary):
    from utils.preprocess import JSONLoader
    fields = ['business_id']
    city = ['Toronto']
    categories = [category]
    business = 'business.json'
    review = 'review.json'

    jl = JSONLoader(business, data_dir, fields = fields, encoding = 'utf-8')
    jl.set_condition(city=city, categories=categories)
    f_b, business_id = jl.sample(10000000)
    business_id = set([i[0] for i in business_id])
    fields = ['business_id','text']
    jl = JSONLoader(review, data_dir, fields = fields)
    jl.set_condition(business_id = business_id)
    f_, rv = jl.sample(10000000)
    
    from itertools import chain
    rv_flat = list(chain(*[r[-1].lower().split() for r in rv]))
    doc = dictionary.doc2bow(rv_flat)
    
    return doc

doc_cat = [get_category_doc(cat, data_dir, dictionary) for cat in categories]

with open('./data/category_doc', 'wb') as f:
    pickle.dump(doc_cat, f)

In [12]:
import pickle
with open('./data/category_doc', 'rb') as f:
    doc_cat = pickle.load(f)
new_doc = 'this chinese restaurant is awesome'
new_vec = dictionary.doc2bow(new_doc.lower().split())

def cosine_sim(v, doc_cat, tfidf, dictionary):
    from gensim import similarities
    index = similarities.SparseMatrixSimilarity(tfidf[doc_cat], len(dictionary))
    sims = index[tfidf[v]] # similarity between v and all other corpus
    
    return sims

categories = ['Burgers','Seafood','Italian','Chinese','Japanese']
for i, s in enumerate(cosine_sim(new_vec, doc_cat, tfidf, dictionary)):
    print(categories[i], s)

Burgers 0.0103107
Seafood 0.0192852
Italian 0.0131561
Chinese 0.0581643
Japanese 0.0134848


# Build TF-IDF vectors for each businesses

In [10]:
import pickle

def get_business_doc(data_dir, dictionary):
    '''
    Returns:
        business_doc (dict): Dictionary of (business_id, tf-idf vector).
        The tf-idf vector is calculated for the reviews for the businesses in Toronto,
        and whose categories are one of the 'Burgers','Seafood','Italian','Chinese','Japanese'.
    '''
    from utils.preprocess import JSONLoader
    fields = ['business_id']
    city = ['Toronto']
    categories = ['Burgers','Seafood','Italian','Chinese','Japanese']
    business = 'business.json'
    review = 'review.json'

    jl = JSONLoader(business, data_dir, fields = fields, encoding = 'utf-8')
    jl.set_condition(city=city, categories=categories)
    f_b, business_id = jl.sample(10000000)
    business_id = set([i[0] for i in business_id])
    
    # Get the reviews for the businesses in Toronto, and whose
    # categories are 'Burgers','Seafood','Italian','Chinese','Japanese'
    fields = ['business_id','text']
    jl = JSONLoader(review, data_dir, fields = fields)
    jl.set_condition(business_id = business_id)
    f_, rv = jl.sample(10000000)
    
    # dicionary of business, docs pair
    from collections import defaultdict
    business_doc = defaultdict(lambda: [])
    for b_id, b_rv in rv:
        doc = dictionary.doc2bow(b_rv.lower().split())
        business_doc[b_id].extend(doc)
    
    return business_doc

doc_business = get_business_doc(data_dir, dictionary)
doc_business = dict(doc_business)

with open('./data/business_doc', 'wb') as f:
    pickle.dump(doc_business, f)

# Calculate the tf-idf between the categories and businesses

In [16]:
import pickle
from gensim import similarities

categories =['Burgers','Seafood','Italian','Chinese','Japanese']
data_dir = 'data/raw'
with open('./data/tfidf_dictionary', 'rb') as f:
    dictionary = pickle.load(f)
with open('./data/tfidf_model', 'rb') as f:
    tfidf = pickle.load(f)    
with open('./data/category_doc', 'rb') as f:
    doc_cat = pickle.load(f)
with open('./data/business_doc', 'rb') as f:
    doc_business = pickle.load(f)

def cosine_sim(v, doc_cat, tfidf, dictionary):
    index = similarities.SparseMatrixSimilarity(tfidf[doc_cat], len(dictionary))
    sims = index[tfidf[v]] # similarity between v and all other corpus
    
    return sims

# Test
b_id_test = list(doc_business.keys())[0]

for i, s in enumerate(cosine_sim(doc_business[b_id_test], doc_cat, tfidf, dictionary)):
    print(categories[i], s)

from utils.preprocess import JSONLoader
fields = ['business_id', 'categories']
city = ['Toronto']
business_id = [b_id_test]

business = 'business.json'
data_dir = 'data/raw'

jl = JSONLoader(business, data_dir, fields = fields, encoding = 'utf-8')
jl.set_condition(city=city, categories=categories, business_id = business_id)
_, b = jl.sample(10000000)
print(b)

Burgers 0.197789
Seafood 0.147769
Italian 0.139211
Chinese 0.137562
Japanese 0.139398
[['JB8-8TtNYX-vLqN7cz-zHA', ['Burgers', 'Restaurants']]]


In [17]:
import numpy as np

def sim_by_business(doc_business, cat):
    column = categories.index(cat)
    sim = dict()
    for b_id in doc_business:
        sim[b_id] = cosine_sim(doc_business[b_id], [doc_cat[column]], tfidf, dictionary)[0]
    
    return sorted(sim.items(), key=lambda x: -x[1])

import pickle
sims = dict()
for cat in categories:
    sims = sim_by_business(doc_business, cat)
    with open('./data/cos_sim_%s' % cat, 'wb') as f:
        pickle.dump(sims, f)

# Load the final tf-idf similarity between the businesses / categories

In [17]:
import pickle
categories =['Burgers','Seafood','Italian','Chinese','Japanese']
sims = dict()
for cat in categories:
    with open('./data/cos_sim_%s' % cat, 'rb') as f:
        sims[cat] = pickle.load(f)

In [18]:
sims['Chinese'][:10]

[('zgQHtqX0gqMw1nlBZl2VnQ', 3.5928981),
 ('RtUvSWO_UZ8V3Wpj0n077w', 3.4973285),
 ('O1TvPrgkK2bUo5O5aSZ7lw', 3.467016),
 ('f5O7v_X_jCg2itqacRfxhg', 2.9844866),
 ('fGurvC5BdOfd5MIuLUQYVA', 2.9722822),
 ('BUcTdN-rNE8urCCQuxSOQA', 2.9080408),
 ('DE89UdHFMCN6DtYWZuer5A', 2.8275206),
 ('_xAJZOKBMPOe47p1MphB2w', 2.7320778),
 ('RwRNR4z3kY-4OsFqigY5sw', 2.4913545),
 ('OllK5_S-7svgSwbUfx1xYA', 2.4909453)]