# Contents
1. Find influential users
2. Find users who have influential friends
3. Find the restaurants in Toronto
4. Find the reviews for the restaurants where influential users gave high ratings to.

## 1. Find Influential Users

### Build the Graph

In [1]:
from utils.friendship import GraphBuilder
import pickle
import os
from os.path import join

### Test
user = 'user.json'
data_dir = 'data/raw'

# Sample 10 entries from the json file
fields = ['user_id','friends']
n_samples = 120000
G = GraphBuilder(user, data_dir).build_graph(n_samples, calculate_influence = True)

home = os.path.expanduser('~')
pkl = join(home, 'data/pkl/friendship.pkl')
with open(pkl, 'wb') as f:
    pickle.dump(G, f)

[0] N(nodes): 0
[0] N(edges): 0

[10000] N(nodes): 572098
[10000] N(edges): 1016194
[10000] N(nodes)/N(edges): 0.5630

[20000] N(nodes): 850060
[20000] N(edges): 1682199
[20000] N(nodes)/N(edges): 0.5053

[30000] N(nodes): 1094474
[30000] N(edges): 2318126
[30000] N(nodes)/N(edges): 0.4721

[40000] N(nodes): 1298589
[40000] N(edges): 2885366
[40000] N(nodes)/N(edges): 0.4501

[50000] N(nodes): 1551141
[50000] N(edges): 3527089
[50000] N(nodes)/N(edges): 0.4398

[60000] N(nodes): 1773430
[60000] N(edges): 4131718
[60000] N(nodes)/N(edges): 0.4292

[70000] N(nodes): 1875605
[70000] N(edges): 4405655
[70000] N(nodes)/N(edges): 0.4257

[80000] N(nodes): 2065953
[80000] N(edges): 4963206
[80000] N(nodes)/N(edges): 0.4163

[90000] N(nodes): 2280149
[90000] N(edges): 5642561
[90000] N(nodes)/N(edges): 0.4041

[100000] N(nodes): 2455635
[100000] N(edges): 6230531
[100000] N(nodes)/N(edges): 0.3941

[110000] N(nodes): 2582623
[110000] N(edges): 6638750
[110000] N(nodes)/N(edges): 0.3890

[12000

### Load the Graph

In [2]:
import pickle
import os
from os.path import join

home = os.path.expanduser('~')
pkl = join(home, 'data/pkl/friendship.pkl')
with open(pkl, 'rb') as f:
    G = pickle.load(f)

Degree Inspection

In [3]:
import numpy as np
G_degrees = np.array(sorted(map(lambda x: x[1], G.degree()))[::-1])
print('Total number of nodes: %i' % G_degrees.shape[0])
print('maximum node degree: %i' % max(G_degrees))
for d in range(11):
    n_nodes = (G_degrees == d).sum()
    s = 'The number of nodes with degree %i: %i ' % (d, n_nodes)
    print(s)

Total number of nodes: 2745604
maximum node degree: 8350
The number of nodes with degree 0: 0 
The number of nodes with degree 1: 1803117 
The number of nodes with degree 2: 421422 
The number of nodes with degree 3: 166610 
The number of nodes with degree 4: 85169 
The number of nodes with degree 5: 50730 
The number of nodes with degree 6: 33417 
The number of nodes with degree 7: 23312 
The number of nodes with degree 8: 17319 
The number of nodes with degree 9: 13231 
The number of nodes with degree 10: 10536 


Get the top-K nodes with the highest degree centrality

In [4]:
import networkx as nx
dc = nx.degree_centrality(G)
K = 100000 # top_K
top_K = sorted(dc.items(), key = lambda x: -x[1])[:K]

import pickle
from os.path import join

home = os.path.expanduser('~')
pkl = join(home, './data/friends_top100000.pkl')
with open(pkl, 'wb') as f:
    pickle.dump(set([n for n,c in top_K]), f)

## 2. Find the restaurants in Toronto

In [6]:
from utils.preprocess import JSONLoader
fields = ['business_id','categories','city']
# fields = None
city = ['Toronto']
categories =['Restaurants', 'Food', 'Sandwiches', 'Fast Food', 'American (Traditional)', 'Pizza',
                 'Italian', 'Burgers', 'Mexican','Breakfast & Brunch', 'American (New)', 'Chinese', 'Specialty Food', 'Bakeries', 'Desserts',
                 'Japanese', 'Ice Cream & Frozen Yogurt', 'Chicken Wings', 'Seafood', 'Salad', 'Sushi Bars',
                 'Asian Fusion', 'Thai', 'Indian']

business = 'business.json'
data_dir = 'data/raw'
jl = JSONLoader(business, data_dir, fields = fields)
jl.set_condition(city=city, categories=categories)
f_b, restaurants = jl.sample(10000000)
print(len(restaurants))
rest_id = set([restaurant[0] for restaurant in restaurants])

8627


## 4. Find the restaurants where influential users gave high ratings

In [7]:
from utils.preprocess import JSONLoader
import pickle
import os
from os.path import join

# Load the influential users
home = os.path.expanduser('~')
pkl = join(home, './data/friends_top100000.pkl')
with open(pkl, 'rb') as f:
    influential = pickle.load(f)

# Load the reviews
review = 'review.json'
data_dir = 'data/raw'
fields = ['user_id','business_id','stars']
jl = JSONLoader(review, data_dir, fields = fields)
jl.set_condition(stars = [5], business_id = rest_id, user_id = influential)

f_, popular_reviews = jl.sample(1000000)
len(popular_reviews)

5745

In [8]:
import pickle
from os.path import join

best_restaurants = set([r[1] for r in popular_reviews])
home = os.path.expanduser('~')
pkl = join(home, './data/best_restaurants.pkl')
with open(pkl, 'wb') as f:
    pickle.dump(best_restaurants, f)

## 5. Get the location for the best restaurants

In [19]:
from utils.preprocess import JSONLoader
fields = ['business_id','latitude', 'longitude']
# fields = None
city = ['Toronto']
# categories =['Restaurants', 'Food', 'Sandwiches', 'Fast Food', 'American (Traditional)', 'Pizza',
#                  'Italian', 'Burgers', 'Mexican','Breakfast & Brunch', 'American (New)', 'Chinese', 'Specialty Food', 'Bakeries', 'Desserts',
#                  'Japanese', 'Ice Cream & Frozen Yogurt', 'Chicken Wings', 'Seafood', 'Salad', 'Sushi Bars',
#                  'Asian Fusion', 'Thai', 'Indian']
categories =['Burgers','Seafood','Italian','Chinese','Japanese']
business = 'business.json'
data_dir = 'data/raw'
jl = JSONLoader(business, data_dir, fields = fields)
jl.set_condition(city=city, categories=categories)
f_, rest = jl.sample(10000000)
print(len(rest))

2051


In [20]:
f_

['business_id', 'latitude', 'longitude']

In [21]:
rest

[['oMcKvfNGX2HleFxYg1VGqg', 43.6456871, -79.3904932],
 ['kLw_FmSiEqYH-MtFhDIUFQ', 43.647499, -79.3864714],
 ['0e-j5VcEn54EZT-FKCUZdw', 43.6452327, -79.5324224],
 ['-xEGQKqXZzvkdyVQilQtrg', 43.7751786, -79.4140328],
 ['YHXczxm4W3BkGT-z7vZBBw', 43.6610723, -79.3827591],
 ['TG-5UF3tHSP9xSq5L6b1Zg', 43.777442469, -79.3438917026],
 ['iBH1TeUrJn1VuL59J1StUg', 43.6519944, -79.4070239],
 ['YFEwCxc-9WRRVGgRv_-p7Q', 43.6562329, -79.3923184],
 ['0B3W6KxkD3o4W4l6cq735w', 43.65459, -79.38049],
 ['Q4orLuzDAB9TnucYXksgLQ', 43.650806931, -79.4507830963],
 ['NOz8W_cUV3Dw5yLgFkKLGw', 43.6514389, -79.475629],
 ['gTUaz3Wlh8eyQ3s7T-6txw', 43.8017281162, -79.2948411405],
 ['iByQmTmTdO7hP4n1grSSWQ', 43.6624171, -79.4228715],
 ['kqW_BKO3XCOx8ifbzQsnGA', 43.6837856, -79.3206366],
 ['TBOKIAMxv0OHKJbarNvSeg', 43.7983457, -79.1376521],
 ['1nTMWMa6v-eBKkPYA3gxkQ', 43.661442, -79.3789741],
 ['J4er8Ieb_vHaH4C9wC47qA', 43.6739595, -79.3961214],
 ['NOKaVqvi4hI8RgGjFCFIAg', 43.6537655, -79.4133199],
 ['hDy-uY7Vy_TZdGBz

In [17]:
home = os.path.expanduser('~')
pkl = join(home, './data/best_restaurants_loc.pkl')
with open(pkl, 'wb') as f:
    pickle.dump(rest, f)

In [13]:
import os
from os.path import join
import pickle

home = os.path.expanduser('~')
pkl = join(home, './data/best_restaurants_loc.pkl')
with open(pkl, 'rb') as f:
    rest = pickle.load(f)

from geopy.distance import vincenty
your_xy = (43.6744092, -79.4580342)

def sort_by_distance_from(your_xy):
    dist = []
    for r in rest:
        latitude = r[2]
        longitude = r[3]
        xy = (latitude, longitude)
        dist.append(vincenty(your_xy, xy).meters)

    rest_dist = sorted([rest[i] + [dist[i]] for i in range(len(rest))], key = lambda x: x[-1])
    fields = [0, -2, -1]
    rest_dist = [{r[0]:r[-1]} for r in rest_dist]
    
    return rest_dist

sort_by_distance_from(your_xy)

[{'Er8lBoKdYofhheu0Di93fw': 0.0},
 {'TxJmy66PFBM7w5DdpOXrNg': 475.2093234675301},
 {'un1QSG18sZtO3lSuuHILrg': 639.5515224232188},
 {'02Ycww2etUAxorZb5gIM8Q': 786.4574341142808},
 {'CPIZ2HxNOiBqpsL3VTFnjQ': 820.6275194110254},
 {'YmTj2VUS_csmL2cMnQ1qmw': 840.1644012945336},
 {'POkNk-y-sS2Od39kP5NN5Q': 866.8310297879779},
 {'LYuY6kXhZzFtxR51eUiRvQ': 960.2926605976562},
 {'X3cYjMpEMOntxl9gDe9N2Q': 967.4796740512478},
 {'2rLj1eok5cXmHr5BrGqbDw': 1034.241796315784},
 {'aY2NXhrFompJLNTefo9kEw': 1054.3678239086537},
 {'92JuUiprX3bYHa_3bIAOsg': 1076.4595723061075},
 {'I_wfQkB3ODm0RgcAPDg4Xg': 1082.534640178778},
 {'z0Q8-1RyCftpD4h2nIJwjw': 1097.4332110168693},
 {'2cYTy7YlmlS9uuuQI1Trzg': 1124.7419845449538},
 {'yx8FIqunok3XqfC9OWFxhA': 1158.0056640994667},
 {'_mtAXGkVLLB6I9NddO-whQ': 1170.5339017818794},
 {'fK1oj0dk9Bc6KsBk5mMDxg': 1177.541373768278},
 {'ZYTTggT0haYooWAX_T9Ufg': 1181.7893141036093},
 {'PzEhfwOztHKYYv_NPok2WQ': 1192.114781945605},
 {'6mUidMn4-TEj3_-LD3lsSQ': 1235.1818779688122}