# Load the reviews

In [1]:
from utils.preprocess import JSONLoader
fields = ['business_id']
city = ['Toronto']
categories = ['Burgers','Seafood','Italian','Chinese','Japanese']

business = 'business.json'
review = 'review.json'
data_dir = 'data/dataset'

jl = JSONLoader(business, data_dir, fields = fields, encoding = 'utf-8')
jl.set_condition(city=city, categories=categories)
f_b, business_id = jl.sample(10000000)
business_id = set([i[0] for i in business_id])
print(len(business_id))
print(jl.condition)
print(jl.fields)

2051
{'city': {'Toronto'}, 'categories': {'Seafood', 'Japanese', 'Chinese', 'Italian', 'Burgers'}}
['business_id']


In [2]:
fields = ['business_id','text']
jl = JSONLoader(review, data_dir, fields = fields)
jl.set_condition(business_id = business_id)
f_, rv = jl.sample(10000000)
print(len(rv)) # The total number of review texts

96936


# Build the TF-IDF Model

In [3]:
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))
# remove common words and tokenize
documents = [r[-1] for r in rv]
texts = [[word for word in document.lower().split() if word not in stops]
         for document in documents]

# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 5] for text in texts]

In [4]:
from gensim import corpora
dictionary = corpora.Dictionary(texts)
print(dictionary)
corpus = [dictionary.doc2bow(doc) for doc in texts]

import pickle
with open('./data/tfidf_dictionary', 'wb') as f:
    pickle.dump(dictionary, f)
    
from gensim import corpora, models, similarities
import pickle
with open('./data/tfidf_dictionary', 'rb') as f:
    dictionary = pickle.load(f)
tfidf = models.TfidfModel(corpus)
with open('./data/tfidf_model', 'wb') as f:
    pickle.dump(tfidf, f)

Dictionary(38627 unique tokens: ['smashed', 'burgers', 'done', 'properly', 'heart']...)


In [5]:
import pickle
with open('./data/tfidf_model', 'rb') as f:
    tfidf = pickle.load(f)
tfidf[corpus[-1]]

[(14, 0.1659690338338441),
 (65, 0.08172837263365922),
 (70, 0.04824371869141254),
 (78, 0.03512631239538303),
 (160, 0.13690132609190717),
 (183, 0.13225873142384892),
 (199, 0.06682204117194206),
 (212, 0.055173113407135835),
 (237, 0.1503173389266708),
 (339, 0.16043657281149443),
 (340, 0.16239727437512821),
 (352, 0.15166070532564258),
 (372, 0.11676911834871256),
 (406, 0.09922037988337243),
 (490, 0.1470240025479285),
 (505, 0.19235607503939242),
 (728, 0.0978379601618346),
 (768, 0.0966187101253497),
 (805, 0.09890238801833307),
 (827, 0.07452444317271396),
 (830, 0.12241572481030523),
 (843, 0.11111357457224348),
 (851, 0.11659097105824347),
 (852, 0.18718103515580228),
 (922, 0.1690244502630934),
 (1015, 0.11831252053158219),
 (1081, 0.15220678330509943),
 (1247, 0.12581503630716068),
 (1323, 0.1168079520720305),
 (1356, 0.19455387001170182),
 (1487, 0.1573194100126931),
 (1492, 0.16649951677839323),
 (1539, 0.16518951251215008),
 (1549, 0.14092805828084956),
 (1573, 0.140150

In [6]:
new_doc = 'this chinese restaurant is awesome'
new_vec = dictionary.doc2bow(new_doc.lower().split())
index = similarities.SparseMatrixSimilarity(tfidf[corpus[:100]], len(dictionary))
sims = index[tfidf[new_vec]] # similarity between new_vec and all other corpus
print(list(enumerate(sims)))

[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0), (10, 0.0), (11, 0.0), (12, 0.0), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.0), (17, 0.040174279), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (23, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.021136817), (28, 0.0), (29, 0.0), (30, 0.0), (31, 0.0), (32, 0.0), (33, 0.0), (34, 0.0), (35, 0.0), (36, 0.0), (37, 0.0), (38, 0.0), (39, 0.0), (40, 0.0), (41, 0.0), (42, 0.023891667), (43, 0.0), (44, 0.0), (45, 0.037597351), (46, 0.15014948), (47, 0.0), (48, 0.0), (49, 0.0), (50, 0.0), (51, 0.0), (52, 0.0), (53, 0.0), (54, 0.048511535), (55, 0.0), (56, 0.0), (57, 0.0), (58, 0.023922624), (59, 0.034806591), (60, 0.0), (61, 0.0), (62, 0.13383804), (63, 0.0), (64, 0.0), (65, 0.0), (66, 0.016036035), (67, 0.0), (68, 0.02094641), (69, 0.015173888), (70, 0.0), (71, 0.0), (72, 0.0), (73, 0.0), (74, 0.0), (75, 0.038370732), (76, 0.0), (77, 0.0), (78, 0.0), (79, 0.15158795), (80, 0.014049986), (8

# Build TF-IDF vectors for a specific category

https://radimrehurek.com/gensim/tutorial.html

In [8]:
# Takes quite long

categories =['Burgers','Seafood','Italian','Chinese','Japanese']
data_dir = 'data/dataset'
with open('./data/tfidf_dictionary', 'rb') as f:
    dictionary = pickle.load(f)
with open('./data/tfidf_model', 'rb') as f:
    tfidf = pickle.load(f)    

def get_category_doc(category, data_dir, dictionary):
    from utils.preprocess import JSONLoader
    fields = ['business_id']
    city = ['Toronto']
    categories = [category]
    business = 'business.json'
    review = 'review.json'

    jl = JSONLoader(business, data_dir, fields = fields, encoding = 'utf-8')
    jl.set_condition(city=city, categories=categories)
    f_b, business_id = jl.sample(10000000)
    business_id = set([i[0] for i in business_id])
    fields = ['business_id','text']
    jl = JSONLoader(review, data_dir, fields = fields)
    jl.set_condition(business_id = business_id)
    f_, rv = jl.sample(10000000)
    
    from itertools import chain
    rv_flat = list(chain(*[r[-1].lower().split() for r in rv]))
    doc = dictionary.doc2bow(rv_flat)
    
    return doc

doc_cat = [get_category_doc(cat, data_dir, dictionary) for cat in categories]

import pickle
with open('./data/category_doc', 'wb') as f:
    pickle.dump(doc_cat, f)

In [9]:
import pickle
with open('./data/category_doc', 'rb') as f:
    doc_cat = pickle.load(f)
new_doc = 'this chinese restaurant is awesome'
new_vec = dictionary.doc2bow(new_doc.lower().split())

def cosine_sim(v, doc_cat, tfidf, dictionary):
    from gensim import similarities
    index = similarities.SparseMatrixSimilarity(tfidf[doc_cat], len(dictionary))
    sims = index[tfidf[v]] # similarity between v and all other corpus
    
    return sims

categories = ['Burgers','Seafood','Italian','Chinese','Japanese']
for i, s in enumerate(cosine_sim(new_vec, doc_cat, tfidf, dictionary)):
    print(categories[i], s)

Burgers 0.0305375
Seafood 0.0575153
Italian 0.0448352
Chinese 0.145104
Japanese 0.0421291


# Build TF-IDF vectors for each businesses

In [10]:
def get_business_doc(data_dir, dictionary):
    '''
    Returns:
        business_doc (dict): Dictionary of (business_id, tf-idf vector).
        The tf-idf vector is calculated for the reviews for the businesses in Toronto,
        and whose categories are one of the 'Burgers','Seafood','Italian','Chinese','Japanese'.
    '''
    from utils.preprocess import JSONLoader
    fields = ['business_id']
    city = ['Toronto']
    categories = ['Burgers','Seafood','Italian','Chinese','Japanese']
    business = 'business.json'
    review = 'review.json'

    jl = JSONLoader(business, data_dir, fields = fields, encoding = 'utf-8')
    jl.set_condition(city=city, categories=categories)
    f_b, business_id = jl.sample(10000000)
    business_id = set([i[0] for i in business_id])
    
    # Get the reviews for the businesses in Toronto, and whose
    # categories are 'Burgers','Seafood','Italian','Chinese','Japanese'
    fields = ['business_id','text']
    jl = JSONLoader(review, data_dir, fields = fields)
    jl.set_condition(business_id = business_id)
    f_, rv = jl.sample(10000000)
    
    # dicionary of business, docs pair
    from collections import defaultdict
    business_doc = defaultdict(lambda: [])
    for b_id, b_rv in rv:
        doc = dictionary.doc2bow(b_rv.lower().split())
        business_doc[b_id].extend(doc)
    
    return business_doc

doc_business = get_business_doc(data_dir, dictionary)
doc_business = dict(doc_business)

In [11]:
import pickle
with open('./data/business_doc', 'wb') as f:
    pickle.dump(doc_business, f)

In [12]:
import pickle
with open('./data/business_doc', 'rb') as f:
    doc_business = pickle.load(f)

In [13]:
cosine_sim(doc_business[list(doc_business.keys())[0]], doc_cat, tfidf, dictionary)

array([ 0.77067816,  0.44591644,  0.41924542,  0.38793871,  0.3955414 ], dtype=float32)

# Calculate the tf-idf between the categories and businesses

In [14]:
import pickle

categories =['Burgers','Seafood','Italian','Chinese','Japanese']
data_dir = 'data/dataset'
with open('./data/tfidf_dictionary', 'rb') as f:
    dictionary = pickle.load(f)
with open('./data/tfidf_model', 'rb') as f:
    tfidf = pickle.load(f)    
with open('./data/category_doc', 'rb') as f:
    doc_cat = pickle.load(f)
with open('./data/business_doc', 'rb') as f:
    doc_business = pickle.load(f)

def cosine_sim(v, doc_cat, tfidf, dictionary):
    from gensim import similarities
    index = similarities.SparseMatrixSimilarity(tfidf[doc_cat], len(dictionary))
    sims = index[tfidf[v]] # similarity between v and all other corpus
    
    return sims

b_id_test = list(doc_business.keys())[0]

for i, s in enumerate(cosine_sim(doc_business[b_id_test], doc_cat, tfidf, dictionary)):
    print(categories[i], s)

Burgers 0.770678
Seafood 0.445916
Italian 0.419245
Chinese 0.387939
Japanese 0.395541


In [15]:
cosine_sim(doc_business[b_id_test], [doc_cat[0]], tfidf, dictionary)[0]

0.77067816

In [16]:
import numpy as np

def sim_by_business(doc_business, cat):
    column = categories.index(cat)
    sim = dict()
    for b_id in doc_business:
        sim[b_id] = cosine_sim(doc_business[b_id], [doc_cat[column]], tfidf, dictionary)[0]
    
    return sorted(sim.items(), key=lambda x: -x[1])

import pickle
sims = dict()
for cat in categories:
    sims = sim_by_business(doc_business, cat)
    with open('./data/cos_sim_%s' % cat, 'wb') as f:
        pickle.dump(sims, f)

# Load the final tf-idf similarity between the businesses / categories

In [17]:
import pickle
categories =['Burgers','Seafood','Italian','Chinese','Japanese']
sims = dict()
for cat in categories:
    with open('./data/cos_sim_%s' % cat, 'rb') as f:
        sims[cat] = pickle.load(f)

In [18]:
sims['Chinese'][:10]

[('zgQHtqX0gqMw1nlBZl2VnQ', 3.5928981),
 ('RtUvSWO_UZ8V3Wpj0n077w', 3.4973285),
 ('O1TvPrgkK2bUo5O5aSZ7lw', 3.467016),
 ('f5O7v_X_jCg2itqacRfxhg', 2.9844866),
 ('fGurvC5BdOfd5MIuLUQYVA', 2.9722822),
 ('BUcTdN-rNE8urCCQuxSOQA', 2.9080408),
 ('DE89UdHFMCN6DtYWZuer5A', 2.8275206),
 ('_xAJZOKBMPOe47p1MphB2w', 2.7320778),
 ('RwRNR4z3kY-4OsFqigY5sw', 2.4913545),
 ('OllK5_S-7svgSwbUfx1xYA', 2.4909453)]