# Search Final Project
#### Patrick Quyle, David Thompson, Kenneth Zhang

## Task 1

Recommend business to users


• Find a group of users who rated at least k businesses (e.g., k = 20)


• For each user, split the user rated businesses into two collections (training
and testing)


• For training set, you can use it for recommendation model generation,
e.g., generating query (for each user) or use it for collaborative filtering.


• For testing set, you will need to use it to evaluate the performance of the
proposed algorithms.


• You will need to report: 1. algorithm design (you need to use at least 2
algorithms to address this problem). 2. experiment design. 3. evaluation
result. 4. conclusion.


• Try to use some Text Feature (keyword, review, and tip), Proximity
Information, and/or Social Network features.

### Setup

In [2]:
# Import libraries
import random
import json
import sys

### Import data

In [2]:
f = open('./yelp/yelp_academic_dataset_business.json','r')

In [3]:
dataset = []
for i in range(1000):
    dataset.append(json.loads(f.readline()))

### Filter non-restaurants from businesses

In [4]:
fr = open('./yelp/academic_dataset_restaurants.json','w',encoding='utf-8')
f = open('./yelp/yelp_academic_dataset_business.json','r',encoding='utf-8')

i = 0

record = 'Z534 is cool'

while record:
    record = f.readline()
    if i % 20000 == 0:
        print(str(i)+" done")
#     print(record)
    if not record:
        break
    record_object = json.loads(record)
#     print(record_object)
    if not record_object['categories']:
        continue
    if 'Restaurants' in record_object['categories']:
        fr.write(record)
    
    i += 1
fr.close()


0 done
20000 done
40000 done
60000 done
80000 done
100000 done
120000 done
140000 done
160000 done
180000 done
200000 done


### Filter non-restaurants from reviews

In [5]:
# Make dictionary of restaurants

restaurants = {}
f = open('./yelp/academic_dataset_restaurants.json','r',encoding='utf-8')

record = 'Z534 is cool'

record = f.readline()
while record:
    record_object = json.loads(record)
    restaurants[record_object['business_id']] = None
    record = f.readline()
    
    




In [None]:
# Filter out non-restaurant reviews
    
fb = open('./yelp/academic_dataset_restaurant_reviews.json','w',encoding='utf-8')
f = open('./yelp/yelp_academic_dataset_review.json','r',encoding='utf-8')

i = 0

record = 'Z534 is cool'

while record:
    record = f.readline()
    if i % 20000 == 0:
        print(str(i)+" done")
#     print(record)
    if not record:
        break
    record_object = json.loads(record)
#     print(record_object)
    if record_object['business_id'] in restaurants:
        fb.write(record)
    i += 1
fb.close()

## Find group of users who rated at least k businesses, k = 10

### Find users for training/test data

In [3]:
# Find group of users who rated at least k businesses

f = open('./yelp/yelp_academic_dataset_user.json','r',encoding='utf-8')

# Minimum number of reviews
k = 30

# Number of users to find
n = 100000

users = 0

user_ids = {}

text = 'Z534 is awesome'
while users < n and text:
    text = f.readline()
    if not text:
        break
    u = json.loads(text)
    if u["review_count"] >= k:
        user_ids[u["user_id"]] = []
#         user_ids.append(u["user_id"])
        users += 1
        



In [4]:
# Find reviews made by these users

f = open('./yelp/academic_dataset_restaurant_reviews.json','r',encoding='utf-8')

# Number of reviews to look through
r = 3000000

i = 0

# user_list = []

while i < r:
    
    review = json.loads(f.readline())
    if review["user_id"] in user_ids:
        user_ids[review["user_id"]].append((review["business_id"],review["stars"]))
    i += 1

#### Filter out users who have fewer than 10 reviews

In [5]:
user_ids = {key: value for (key,value) in user_ids.items() if len(value) > 9}

#### Process by users
Convert user dictionary into matrix

In [6]:
user_list = list(user_ids.items())

In [7]:
len(user_list)

36220

#### Create test and training split

In [8]:
random.shuffle(user_list)

In [9]:
split = int(len(user_list) * 0.8)
train_users = user_list[:split]
test_users = user_list[split:]

#### Extract attributes from restaurants
    'RestaurantsAttire',
    'RestaurantsPriceRange2',
    'NoiseLevel',
    'Alcohol',
    'RestaurantsGoodForGroups',
    'RestaurantsDelivery',
    'RestaurantsTakeOut',
    'GoodForKids'

In [10]:
f = open('./yelp/academic_dataset_restaurants.json','r',encoding='utf-8')

rest = f.readline()
restaurants = {}


attributes = [
    'RestaurantsAttire',
    'RestaurantsPriceRange2',
    'NoiseLevel',
    'Alcohol',
    'RestaurantsGoodForGroups',
    'RestaurantsDelivery',
    'RestaurantsTakeOut',
    'GoodForKids'
]


default_value = {
    'RestaurantsAttire':"casual",
    'RestaurantsPriceRange2':'1',
    'NoiseLevel':"quiet",
    'Alcohol':"none",
    'RestaurantsGoodForGroups':'False',
    'RestaurantsDelivery':'False',
    'RestaurantsTakeOut':'False',
    'GoodForKids':'False'
}


numerical_value = {
    "": 0.0,
    "casual":0.0,
    "dressy":0.5,
    "formal":1.0,
    "quiet": 0.0,
    "average": 0.333,
    "loud": 0.666,
    "very_loud": 1.0,
    'none': 0.0,
    'beer_and_wine': 0.5,
    'full_bar': 1.0,
    'False': 0.0,
    'True': 1.0,
    '1': 0.0,
    '2': 0.333,
    '3': 0.666,
    '4': 1.0
}

while rest:
    rest_object = json.loads(rest)
    rest_id = rest_object['business_id']
    rest_attr = rest_object['attributes']
    
    if rest_attr:
        rest_vector = []
        rest_vector.append(rest_id)
        for i in attributes:
            if i in rest_attr:
                if rest_attr[i] != "None":
                    a = rest_attr[i]
                    if i in ['RestaurantsAttire','NoiseLevel','Alcohol']:
                        if a[0] == "u":
                            a = a[1:]
                        a = a[1:-1]
                    rest_vector.append(numerical_value[a])
                else:
                    rest_vector.append(numerical_value[default_value[i]])
            else:
                rest_vector.append(numerical_value[default_value[i]])
        restaurants[rest_id] = rest_vector
    rest = f.readline()
    
    
    


### Preview of vector representation of restaurant attributes

In [11]:
for i in list(restaurants.items())[:20]:
    print(i[1])

['pQeaRpvuhoEqudo3uymHIQ', 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
['CsLQLiRoafpJPJSkNX2h5Q', 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0]
['eBEfgOPG7pvFhb2wcG9I7w', 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
['lu7vtrp_bE9PnxWfA8g4Pg', 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0]
['9sRGfSVEfLhN_km60YruTA', 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0]
['vjTVxnsQEZ34XjYNS-XUpA', 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0]
['fnZrZlqW1Z8iWgTVDfv_MA', 0.0, 0.0, 0.333, 0.0, 1.0, 0.0, 1.0, 1.0]
['rVBPQdeayMYht4Uv_FOLHg', 0.0, 0.333, 0.333, 0.0, 1.0, 0.0, 1.0, 1.0]
['98hyK2QEUeI8v2y0AghfZA', 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
['fhNf_sg-XzZ3e7HEVGuOZg', 0.0, 0.333, 0.666, 0.0, 0.0, 0.0, 1.0, 0.0]
['LoRef3ChgZKbxUio-sHgQg', 0.0, 0.333, 0.333, 0.0, 1.0, 1.0, 1.0, 1.0]
['Ga2Bt7xfqoggTypWD5VpoQ', 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0]
['xFc50drSPxXkcLvX5ygqrg', 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0]
['tLpkSwdtqqoXwU0JAGnApw', 0.0, 0.0, 0.333, 0.0, 1.0, 0.0, 1.0, 1.0]
['Sd75ucXKoZUM2BEfBHFUOg', 0.0, 0.0, 0.0, 0.0,

### Algorithms: collaborative filtering and k-nearest neighbors

For each user, use their rating as a weight for how much a restaurant influences their preferences

In [12]:
# For each user, use their rating as a weight for how much a restaurant influences their preferences
user_preferences = {}
for i in range(len(user_list)):
    total_rating = 0
    for j in range(len(user_list[i][1])):
        total_rating += user_list[i][1][j][1]
    for j in range(len(user_list[i][1])):
        user_list[i][1][j] = (user_list[i][1][j][0],user_list[i][1][j][1]/total_rating)

In [13]:
for i in user_list:
    vec = [0,0,0,0,0,0,0,0]
    # For each review of this user
    for j in i[1]:
        for k in range(len(vec)):
            if j[0] in restaurants:
                vec[k] += restaurants[j[0]][k+1]
        for k in range(len(vec)):
            vec[k] *= j[1]
    
    
    user_preferences[i[0]] = vec

### Preview of vector representation of user preferences

In [25]:
for i in [(j[0],[round(k,4) for k in j[1]]) for j in list(user_preferences.items())[:10]]:
    print(i[0],i[1])

HAZfQgE3prIBgPDyilKK3A [0.0, 0.0212, 0.0403, 0.0604, 0.0639, 0.0, 0.0639, 0.0639]
m1IVpXClMox1VGw5hO2LhQ [0.0, 0.014, 0.014, 0.0415, 0.042, 0.042, 0.042, 0.042]
oCZyM99xoI3Xr8-fvqTS6A [0.0, 0.0, 0.0064, 0.0, 0.0197, 0.0006, 0.0197, 0.0197]
pUhVTOKAj74NPspaC4MGnA [0.0, 0.0033, 0.0035, 0.0053, 0.01, 0.0005, 0.0105, 0.0105]
YTuksHb6YsMI62BqEG_eRA [0.0049, 0.0065, 0.0034, 0.0099, 0.0099, 0.0, 0.0099, 0.0002]
HJVlmn40GRHaxfiqhuHEBg [0.0, 0.0297, 0.0277, 0.0833, 0.0774, 0.0064, 0.0833, 0.0774]
41XG-tkYXv7uHubdPgetJQ [0.0, 0.0001, 0.0331, 0.0455, 0.0044, 0.0912, 0.0953, 0.0044]
XTMm0nAf5Ds2W4tB0s0g-Q [0.0, 0.0064, 0.0132, 0.0198, 0.0198, 0.0, 0.0198, 0.0005]
0ojPj3T7Kc2bF6N5q-whXA [0.0, 0.0037, 0.0037, 0.0111, 0.0111, 0.0, 0.0111, 0.0111]
0jh6FM1kd6ecYs1ESjFZKA [0.0, 0.0457, 0.0458, 0.0154, 0.1374, 0.1361, 0.1374, 0.1374]


### Use Euclidean distance to determine "distance" between restaurant features

In [15]:
def euc_dist(vec1, vec2):
    vsum = 0
    for i in range(len(vec1)):
        vsum += (vec1[i] - vec2[i])**2
    return vsum**(0.5)

## Task 2

### Find restaurants that are open during your time

In [36]:
"stuff".replace('u','a')

'staff'

In [40]:
def parse_time(time_str):
    return [float(i) for i in time_str.replace(':','.').split('-')]

In [41]:
# Make dictionary of restaurants

restaurants = {}
f = open('./yelp/academic_dataset_restaurants.json','r',encoding='utf-8')

record = 'Z534 is cool'

record = f.readline()
while record:
    record_object = json.loads(record)
    if record_object['hours']:
        restaurants[record_object['business_id']] = [(i[0],parse_time(i[1])) for i in list(record_object['hours'].items())]
    record = f.readline()

In [42]:
restaurants

{'pQeaRpvuhoEqudo3uymHIQ': [('Monday', [11.3, 14.3]),
  ('Tuesday', [11.3, 14.3]),
  ('Wednesday', [11.3, 14.3]),
  ('Thursday', [11.3, 14.3]),
  ('Friday', [11.3, 14.3])],
 'eBEfgOPG7pvFhb2wcG9I7w': [('Monday', [11.0, 22.0]),
  ('Tuesday', [11.0, 22.0]),
  ('Wednesday', [11.0, 22.0]),
  ('Thursday', [11.0, 22.0]),
  ('Friday', [11.0, 22.0]),
  ('Saturday', [11.0, 22.0]),
  ('Sunday', [11.0, 21.0])],
 '9sRGfSVEfLhN_km60YruTA': [('Tuesday', [12.0, 21.0]),
  ('Wednesday', [12.0, 21.0]),
  ('Thursday', [12.0, 21.0]),
  ('Friday', [12.0, 0.0]),
  ('Saturday', [12.0, 0.0]),
  ('Sunday', [11.0, 21.0])],
 'vjTVxnsQEZ34XjYNS-XUpA': [('Monday', [10.0, 21.0]),
  ('Tuesday', [10.0, 21.0]),
  ('Wednesday', [10.0, 21.0]),
  ('Thursday', [10.0, 21.0]),
  ('Friday', [10.0, 21.0]),
  ('Saturday', [10.0, 21.0]),
  ('Sunday', [11.0, 18.0])],
 '98hyK2QEUeI8v2y0AghfZA': [('Monday', [11.0, 20.0]),
  ('Wednesday', [11.0, 20.0]),
  ('Thursday', [11.0, 20.0]),
  ('Friday', [11.0, 20.0]),
  ('Saturday', [11.0,