# Contents
1. Find influential users
2. Find users who have influential friends
3. Find the restaurants in Toronto
4. Find the reviews of restaurants that influential users rated highly
5. Get the locations of the best restaurants

## 1. Find Influential Users

### Build the Graph

In [2]:
from utils.friendship import GraphBuilder

# Input file and directory handed to GraphBuilder.
user = 'user.json'
data_dir = 'data/dataset'

# Sample size passed to build_graph (presumably the number of user records
# read from the JSON file -- confirm against GraphBuilder.build_graph).
n_samples = 120000
G = GraphBuilder(user, data_dir).build_graph(n_samples)

[0] N(nodes): 0
[0] N(edges): 0

[10000] N(nodes): 572098
[10000] N(edges): 1016194
[10000] N(nodes)/N(edges): 0.5630

[20000] N(nodes): 850060
[20000] N(edges): 1682199
[20000] N(nodes)/N(edges): 0.5053

[30000] N(nodes): 1094474
[30000] N(edges): 2318126
[30000] N(nodes)/N(edges): 0.4721

[40000] N(nodes): 1298589
[40000] N(edges): 2885366
[40000] N(nodes)/N(edges): 0.4501

[50000] N(nodes): 1551141
[50000] N(edges): 3527089
[50000] N(nodes)/N(edges): 0.4398

[60000] N(nodes): 1773430
[60000] N(edges): 4131718
[60000] N(nodes)/N(edges): 0.4292

[70000] N(nodes): 1875605
[70000] N(edges): 4405655
[70000] N(nodes)/N(edges): 0.4257

[80000] N(nodes): 2065953
[80000] N(edges): 4963206
[80000] N(nodes)/N(edges): 0.4163

[90000] N(nodes): 2280149
[90000] N(edges): 5642561
[90000] N(nodes)/N(edges): 0.4041

[100000] N(nodes): 2455635
[100000] N(edges): 6230531
[100000] N(nodes)/N(edges): 0.3941

[110000] N(nodes): 2582623
[110000] N(edges): 6638750
[110000] N(nodes)/N(edges): 0.3890

[12000

In [4]:
import os
import pickle
from os.path import join

# Cache the graph under ~/data so it can be reloaded later without rebuilding it.
# `os` is imported here because this cell calls os.path.expanduser itself.
home = os.path.expanduser('~')
pkl = join(home, 'data/friends_graph.pkl')
with open(pkl, 'wb') as f:
    pickle.dump(G, f)

### Load the Graph

In [1]:
import pickle
import os
from os.path import join

# Reload the graph from ~/data/friends_graph.pkl, the path the build section pickles it to.
# NOTE(review): pickle.load can execute code embedded in the file; only load
# pickles that this notebook produced.
home = os.path.expanduser('~')
pkl = join(home, 'data/friends_graph.pkl')
with open(pkl, 'rb') as f:
    G = pickle.load(f)

Degree Inspection

In [2]:
import numpy as np

# Degree of every node in G, ordered from largest to smallest.
G_degrees = np.array(
    sorted((degree for _node, degree in G.degree()), reverse=True)
)

print('Total number of nodes: %i' % G_degrees.shape[0])
print('maximum node degree: %i' % max(G_degrees))

# Report how many nodes have each degree from 0 through 10.
for d in range(11):
    n_nodes = (G_degrees == d).sum()
    s = 'The number of nodes with degree %i: %i ' % (d, n_nodes)
    print(s)

Total number of nodes: 2745604
maximum node degree: 8350
The number of nodes with degree 0: 0 
The number of nodes with degree 1: 1803117 
The number of nodes with degree 2: 421422 
The number of nodes with degree 3: 166610 
The number of nodes with degree 4: 85169 
The number of nodes with degree 5: 50730 
The number of nodes with degree 6: 33417 
The number of nodes with degree 7: 23312 
The number of nodes with degree 8: 17319 
The number of nodes with degree 9: 13231 
The number of nodes with degree 10: 10536 


Get the top-K nodes with the highest degree centrality

In [5]:
import networkx as nx

K = 100000 # top_K

# Degree centrality for every node, then the K highest-scoring (node, score) pairs.
# sorted() is stable with reverse=True, so ties keep their original relative order.
dc = nx.degree_centrality(G)
top_K = sorted(dc.items(), key=lambda item: item[1], reverse=True)[:K]

In [6]:
# Preview only the head of the ranking: top_K holds up to K (node, centrality)
# pairs, and echoing the whole list floods the notebook output.
top_K[:20]

[('ZIOCmdFaMIF56FR-nWr_2A', 0.0030412262807113775),
 ('F_5_UNX-wrAFCXuAkBZRDw', 0.002923583635361704),
 ('djxnI8Ux8ZYQJhiOQkrRhA', 0.002922855197929198),
 ('fgwI3rYHOv1ipfVfCSx7pg', 0.0028175959889321215),
 ('MeDuKsZcnI3IU2g7OlV-hQ', 0.0027356467777752283),
 ('5MCBLBxr10NLUKZ4AboAMg', 0.0025954225720178775),
 ('xsT4KZTu_KnOVavtuXn4RA', 0.002454105710111768),
 ('nkN_do3fJ9xekchVC-v68A', 0.002453377272679262),
 ('peuxbSQwXed-81cSqL7Ykw', 0.002443179148624182),
 ('YttDgOC9AlM4HcAlDsbB2A', 0.002422782900514022),
 ('1vXJWH7L0IMEz5-8aU3SOA', 0.002373249155103633),
 ('VHdY6oG2JPVNjihWhOooAQ', 0.002341926345505887),
 ('AHRrG3T1gJpHvtpZ-K0G_g', 0.0022785522888778896),
 ('9HGR8sU_zm15sI109H-SGQ', 0.0022734532268503496),
 ('dIIKEfOgo0KqUfGQvGikPg', 0.0022261047937374777),
 ('3zxy3LVBV3ttxoYbY4rQ8A', 0.002217363544547409),
 ('6tbXpUIU6upoeqWNDo9k_A', 0.0021933251092747202),
 ('w-w-k-QXosIKQ8HQVwU6IQ', 0.0021769352670433415),
 ('ACUVZ4SiN0gni7dzVDm9EQ', 0.002163094955825733),
 ('8DEyKVyplnOcSKx39va

In [7]:
import pickle
from os.path import join

# Persist only the node ids of the top-K ranking (scores dropped) as a set.
home = os.path.expanduser('~')
pkl = join(home, 'data/friends_top100000.pkl')
with open(pkl, 'wb') as f:
    pickle.dump({node_id for node_id, _score in top_K}, f)

## 2. Find the users with influential friends

In [1]:
import pickle
import os
from os.path import join

# Reload the set of top-K node ids that section 1 pickled to
# ~/data/friends_top100000.pkl.
# NOTE(review): pickle.load can execute code embedded in the file; only load
# pickles that this notebook produced.
home = os.path.expanduser('~')
pkl = join(home, 'data/friends_top100000.pkl')
with open(pkl, 'rb') as f:
    influential = pickle.load(f)

In [2]:
from utils.preprocess import JSONLoader
user = 'user.json'
data_dir = 'data/dataset'
# Fields requested from each user record.
fields = ['user_id','friends']
jl = JSONLoader(user, data_dir, fields)
# Condition on the `friends` field using the influential-id set
# (presumably keeps users with at least one influential friend -- confirm
# against JSONLoader.set_condition).
jl.set_condition(friends=influential)
# sample() returns a pair; the second item holds the sampled records.
f_b, json_b = jl.sample(1000000)

In [3]:
def popular_friends(node, popular):
    '''
    Return the friends of a user that belong to the popular set.

    Args:
        node (list): the list that stores the id for the source node and the ids for all of its friends,
            i.e. [user_id, [friend_id, ...]]
        popular (set): set of ids for popular users

    Returns:
        list: the ids from node[1] that are also in popular, in their original order
    '''
    # node[0] (the source user's id) is not needed here; only the friend list is filtered.
    return [friend for friend in node[1] if friend in popular]

In [4]:
# Map each sampled user's id (node[0]) to the list of its influential friends.
popular_friend = {
    node[0]: popular_friends(node, influential)
    for node in json_b
}
# The raw sample is no longer needed once the mapping exists.
del json_b
print(len(popular_friend))

366725


In [5]:
import pickle
from os.path import join

# Persist the user -> influential-friends mapping to ~/data/popular_friend.pkl.
# NOTE(review): `os` is not imported in this cell; it relies on the `import os`
# executed by the load cell at the start of this section.
home = os.path.expanduser('~')
pkl = join(home, 'data/popular_friend.pkl')
with open(pkl, 'wb') as f:
    pickle.dump(popular_friend, f)

## 3. Find the restaurants in Toronto

In [1]:
from utils.preprocess import JSONLoader

fields = ['business_id','categories','city']
city = ['Toronto']
# Category labels used to select food businesses.
# 'Chicken Wings' and 'Seafood' are separate labels; they were previously fused
# into the single string 'Chicken Wings, Seafood', which cannot equal either
# label. Re-running may therefore change the printed count.
categories = [
    'Restaurants', 'Food', 'Sandwiches', 'Fast Food', 'American (Traditional)',
    'Pizza', 'Italian', 'Burgers', 'Mexican', 'Breakfast & Brunch',
    'American (New)', 'Chinese', 'Specialty Food', 'Bakeries', 'Desserts',
    'Japanese', 'Ice Cream & Frozen Yogurt', 'Chicken Wings', 'Seafood',
    'Salad', 'Sushi Bars', 'Asian Fusion', 'Thai', 'Indian',
]

business = 'business.json'
data_dir = 'data/dataset'
jl = JSONLoader(business, data_dir, fields = fields)
# Condition on both city and category (exact matching rules live in
# JSONLoader.set_condition -- not visible here).
jl.set_condition(city=city, categories=categories)
f_b, restaurants = jl.sample(10000000)
print(len(restaurants))
# The first column of each returned row is used as the business id.
rest_id = {restaurant[0] for restaurant in restaurants}

8627


## 4. Find the restaurants where influential users gave high ratings

In [2]:
from utils.preprocess import JSONLoader
import pickle
import os
from os.path import join

# Load the influential users (the set of top-K node ids pickled in section 1).
# NOTE(review): pickle.load can execute code embedded in the file; only load
# pickles that this notebook produced.
home = os.path.expanduser('~')
pkl = join(home, 'data/friends_top100000.pkl')
with open(pkl, 'rb') as f:
    influential = pickle.load(f)

# Load the reviews, conditioned on 5-star rating, on the Toronto restaurant ids
# (`rest_id`, built in section 3) and on the influential user ids.
# Presumably all three conditions must hold -- confirm against JSONLoader.set_condition.
review = 'review.json'
data_dir = 'data/dataset'
fields = ['user_id','business_id','stars']
jl = JSONLoader(review, data_dir, fields = fields)
jl.set_condition(stars = [5], business_id = rest_id, user_id = influential)

# sample() returns a pair; the second item holds the matching review rows.
f_, popular_reviews = jl.sample(1000000)
len(popular_reviews)

5745

In [3]:
import pickle
from os.path import join

# Distinct values of column 1 across the selected reviews.
# NOTE(review): column 1 is taken to be business_id -- confirm against the
# field order returned by jl.sample (`f_`).
best_restaurants = {r[1] for r in popular_reviews}

home = os.path.expanduser('~')
pkl = join(home, 'data/best_restaurants.pkl')
with open(pkl, 'wb') as f:
    pickle.dump(best_restaurants, f)

## 5. Get the location for the best restaurants

In [4]:
from utils.preprocess import JSONLoader

fields = ['business_id','categories','city', 'latitude', 'longitude']
city = ['Toronto']
# Category labels used to select food businesses.
# 'Chicken Wings' and 'Seafood' are separate labels; they were previously fused
# into the single string 'Chicken Wings, Seafood', which cannot equal either
# label. Re-running may therefore change the printed count.
categories = [
    'Restaurants', 'Food', 'Sandwiches', 'Fast Food', 'American (Traditional)',
    'Pizza', 'Italian', 'Burgers', 'Mexican', 'Breakfast & Brunch',
    'American (New)', 'Chinese', 'Specialty Food', 'Bakeries', 'Desserts',
    'Japanese', 'Ice Cream & Frozen Yogurt', 'Chicken Wings', 'Seafood',
    'Salad', 'Sushi Bars', 'Asian Fusion', 'Thai', 'Indian',
]

business = 'business.json'
data_dir = 'data/dataset'
jl = JSONLoader(business, data_dir, fields = fields)
# Same city/category conditions as the restaurant search, additionally
# conditioned on the ids in `best_restaurants`.
jl.set_condition(city=city, categories=categories, business_id=best_restaurants)
# f_ receives the column names and rest the matching rows.
f_, rest = jl.sample(10000000)
print(len(rest))

1440


In [5]:
# Column order of the rows in `rest`; note it differs from the order requested in `fields`.
f_

['business_id', 'city', 'latitude', 'longitude', 'categories']

In [6]:
# Preview the first few rows only; echoing every row of `rest` floods the notebook output.
rest[:10]

[['Er8lBoKdYofhheu0Di93fw',
  'Toronto',
  43.6744092,
  -79.4580342,
  ['Restaurants', 'Canadian (New)']],
 ['zcWit_aSGR5wiunYB_Zapw',
  'Toronto',
  43.7036411,
  -79.4136884,
  ['Juice Bars & Smoothies', 'Desserts', 'Coffee & Tea', 'Food']],
 ['f8dJLMQ7UxV1mri5tbaJGA',
  'Toronto',
  43.6506671963,
  -79.3586906249,
  ['Desserts', 'Coffee & Tea', 'Bakeries', 'Food']],
 ['Fv621YzHSas5eYM8ppllhQ',
  'Toronto',
  43.6552653,
  -79.4141317,
  ['Vietnamese', 'Restaurants']],
 ['uHGXcZ7Tydcg3HZ74S7Efg',
  'Toronto',
  43.6466263,
  -79.3815001,
  ['Restaurants',
   'Event Planning & Services',
   'Venues & Event Spaces',
   'Canadian (New)']],
 ['BVxen382pORVZNQEDdQlKQ',
  'Toronto',
  43.7690889,
  -79.3849352,
  ['Restaurants', 'Middle Eastern']],
 ['kukAC7aIGvS-7JiS-_pSiw',
  'Toronto',
  43.779327,
  -79.138246,
  ['Desserts', 'Food', 'Bakeries']],
 ['37kk0IW6jL7ZlxZF6k2QBg',
  'Toronto',
  43.641948,
  -79.4065795,
  ['Spanish', 'French', 'Restaurants']],
 ['IV9xmEWafjYmAGnM_HUZGQ',


In [7]:
# Persist the restaurant rows (with coordinates) to ~/data/best_restaurants_loc.pkl.
# NOTE(review): this cell has no imports of its own; `os`, `join` and `pickle`
# come from earlier cells of the same session.
home = os.path.expanduser('~')
pkl = join(home, 'data/best_restaurants_loc.pkl')
with open(pkl, 'wb') as f:
    pickle.dump(rest, f)

In [2]:
import os
from os.path import join
import pickle

# Reload the restaurant rows that were pickled to ~/data/best_restaurants_loc.pkl.
# NOTE(review): pickle.load can execute code embedded in the file; only load
# pickles that this notebook produced.
home = os.path.expanduser('~')
pkl = join(home, 'data/best_restaurants_loc.pkl')
with open(pkl, 'rb') as f:
    rest = pickle.load(f)

# geopy deprecated `vincenty` in 1.13 and removed it in 2.0 in favour of
# `geodesic`. Prefer `geodesic`, but keep it bound to the name `vincenty`
# because the distance helper in this cell looks the function up by that name.
try:
    from geopy.distance import geodesic as vincenty
except ImportError:  # older geopy without geodesic
    from geopy.distance import vincenty

# Reference point as (latitude, longitude).
your_xy = (43.6744092, -79.4580342)

def sort_by_distance_from(your_xy, restaurants=None, distance=None):
    '''
    Rank restaurants by their distance from a reference point, nearest first.

    Args:
        your_xy (tuple): (latitude, longitude) of the reference point
        restaurants (list, optional): rows whose item 0 is the business id and
            whose items 2 and 3 are latitude and longitude; defaults to the
            notebook-level `rest`
        distance (callable, optional): called as distance(your_xy, (lat, lon))
            and must return an object exposing `.meters`; defaults to the
            notebook-level `vincenty`

    Returns:
        list: one single-entry dict {business_id: meters} per restaurant,
        ordered by ascending distance (ties keep their input order)
    '''
    # Resolve the notebook-level defaults at call time so the original
    # one-argument call keeps working.
    if restaurants is None:
        restaurants = rest
    if distance is None:
        distance = vincenty

    ranked = sorted(
        ((row[0], distance(your_xy, (row[2], row[3])).meters) for row in restaurants),
        key=lambda pair: pair[1],
    )
    return [{business_id: meters} for business_id, meters in ranked]

# Distances in meters from `your_xy` to every restaurant in `rest`, nearest first.
sort_by_distance_from(your_xy)