# Search Final Project
#### Patrick Quyle, David Thompson, Kenneth Zhang

## Task 1

Recommend businesses to users


• Find a group of users who rated at least k businesses (e.g., k = 20)


• For each user, split the user's rated businesses into two collections (training
and testing)


• Use the training set to build the recommendation model,
e.g., by generating a query for each user or by applying collaborative filtering.


• Use the testing set to evaluate the performance of the
proposed algorithms.


• You will need to report: 1. algorithm design (you need to use at least 2
algorithms to address this problem). 2. experiment design. 3. evaluation
result. 4. conclusion.


• Try to use some text features (keywords, reviews, and tips), proximity
information, and/or social network features.

### Setup

In [None]:
# Import libraries
import random
import json
import sys

### Import data

In [None]:
# Open the raw business file for the sampling cell that follows.
# The encoding is given explicitly, as in every other open() call in this
# notebook, so reading does not depend on the platform's default encoding.
# The handle is intentionally left open: the next cell reads from `f`.
f = open('./yelp/yelp_academic_dataset_business.json','r',encoding='utf-8')

In [None]:
# Load up to the first 1000 records of the file opened as `f` into `dataset`.
dataset = []
for _line_number in range(1000):
    line = f.readline()
    # readline() returns '' at end of file; stop instead of passing the empty
    # string to json.loads, which would raise.
    if not line:
        break
    dataset.append(json.loads(line))

### Filter non-restaurants from businesses

In [None]:
# Copy every business record whose 'categories' value contains 'Restaurants'
# into a separate JSON-lines file.
# The context manager closes both files, even if a record fails to parse.
with open('./yelp/academic_dataset_restaurants.json','w',encoding='utf-8') as fr, \
        open('./yelp/yelp_academic_dataset_business.json','r',encoding='utf-8') as f:
    # enumerate() counts every record read, including the ones skipped below,
    # so each progress message is printed exactly once.
    for i, record in enumerate(f):
        if i % 20000 == 0:
            print(str(i)+" done")
        record_object = json.loads(record)
        # Records with an empty or null 'categories' value are skipped.
        if not record_object['categories']:
            continue
        if 'Restaurants' in record_object['categories']:
            fr.write(record)


### Filter non-restaurants from reviews

In [None]:
# Make dictionary of restaurants

# Every business_id in the filtered restaurant file becomes a key; the values
# are all None.
restaurants = {}

# Iterating the file yields one JSON record per line; the context manager
# closes the handle when the loop finishes.
with open('./yelp/academic_dataset_restaurants.json','r',encoding='utf-8') as f:
    for record in f:
        record_object = json.loads(record)
        restaurants[record_object['business_id']] = None
    
    




In [None]:
# Filter out non-restaurant reviews

# Copy each review whose business_id is a key of `restaurants` into a
# separate JSON-lines file. Both files are closed by the context manager,
# even if a record fails to parse.
with open('./yelp/academic_dataset_restaurant_reviews.json','w',encoding='utf-8') as fb, \
        open('./yelp/yelp_academic_dataset_review.json','r',encoding='utf-8') as f:
    for i, record in enumerate(f):
        # Progress message every 20000 reviews.
        if i % 20000 == 0:
            print(str(i)+" done")
        record_object = json.loads(record)
        if record_object['business_id'] in restaurants:
            fb.write(record)

## Find group of users who rated at least k businesses, k = 10

### Find users for training/test data

In [None]:
# Find group of users who rated at least k businesses

# Minimum number of reviews
k = 30

# Number of users to find
n = 100000

# Number of qualifying users found so far.
users = 0

# Maps each qualifying user_id to an (initially empty) list that a later
# cell fills with that user's reviews.
user_ids = {}

# The context manager closes the user file once enough users were found or
# the file is exhausted.
with open('./yelp/yelp_academic_dataset_user.json','r',encoding='utf-8') as f:
    for text in f:
        if users >= n:
            break
        u = json.loads(text)
        if u["review_count"] >= k:
            user_ids[u["user_id"]] = []
            users += 1
        



In [None]:
# Find reviews made by these users

# Number of reviews to look through
r = 3000000

# Number of reviews examined so far.
i = 0

with open('./yelp/academic_dataset_restaurant_reviews.json','r',encoding='utf-8') as f:
    while i < r:
        line = f.readline()
        # Stop at end of file: the file may hold fewer than r reviews, and
        # json.loads('') would raise.
        if not line:
            break
        review = json.loads(line)
        # Record a (business_id, stars) pair for each review written by one
        # of the selected users.
        if review["user_id"] in user_ids:
            user_ids[review["user_id"]].append((review["business_id"],review["stars"]))
        i += 1

#### Filter out users who have fewer than 10 reviews

In [None]:
# Keep only the users that collected at least 10 reviews.
user_ids = {
    user_id: reviews
    for user_id, reviews in user_ids.items()
    if len(reviews) >= 10
}

#### Process by users
Convert the user dictionary into a list of (user_id, reviews) pairs

In [None]:
# One (user_id, reviews) tuple per user, in the dictionary's iteration order.
user_list = [(user_id, reviews) for user_id, reviews in user_ids.items()]

In [None]:
# Display the number of entries in user_list.
len(user_list)

#### Create test and training split

In [None]:
# Shuffle the users in place.
# NOTE(review): no seed is set in the visible code, so the resulting order
# presumably differs between runs — confirm whether reproducibility matters.
random.shuffle(user_list)

In [None]:
# Index separating the first 80% of user_list from the remaining 20%.
split = int(len(user_list) * 0.8)
# Users before the index form the training set, the rest the test set.
train_users, test_users = user_list[:split], user_list[split:]

#### Extract attributes from restaurants
    'RestaurantsAttire',
    'RestaurantsPriceRange2',
    'NoiseLevel',
    'Alcohol',
    'RestaurantsGoodForGroups',
    'RestaurantsDelivery',
    'RestaurantsTakeOut',
    'GoodForKids'

In [None]:
# Build a numeric feature vector for every restaurant that lists attributes.
# Each vector is [business_id, v1, ..., v8], with one value per entry of
# `attributes`, in that order.

# Attributes used as features, in vector order.
attributes = [
    'RestaurantsAttire',
    'RestaurantsPriceRange2',
    'NoiseLevel',
    'Alcohol',
    'RestaurantsGoodForGroups',
    'RestaurantsDelivery',
    'RestaurantsTakeOut',
    'GoodForKids'
]


# Value substituted when an attribute is absent, recorded as "None", or holds
# a value that `numerical_value` does not contain.
default_value = {
    'RestaurantsAttire':"casual",
    'RestaurantsPriceRange2':'1',
    'NoiseLevel':"quiet",
    'Alcohol':"none",
    'RestaurantsGoodForGroups':'False',
    'RestaurantsDelivery':'False',
    'RestaurantsTakeOut':'False',
    'GoodForKids':'False'
}


# Numeric encoding of each recognised attribute value.
numerical_value = {
    "": 0.0,
    "casual":0.0,
    "dressy":0.5,
    "formal":1.0,
    "quiet": 0.0,
    "average": 0.333,
    "loud": 0.666,
    "very_loud": 1.0,
    'none': 0.0,
    'beer_and_wine': 0.5,
    'full_bar': 1.0,
    'False': 0.0,
    'True': 1.0,
    '1': 0.0,
    '2': 0.333,
    '3': 0.666,
    '4': 1.0
}

restaurants = {}

# The context manager closes the file when the loop finishes.
with open('./yelp/academic_dataset_restaurants.json','r',encoding='utf-8') as f:
    for rest in f:
        rest_object = json.loads(rest)
        rest_id = rest_object['business_id']
        rest_attr = rest_object['attributes']

        # Restaurants without any attributes get no vector at all.
        if not rest_attr:
            continue

        rest_vector = [rest_id]
        for attr_name in attributes:
            fallback = numerical_value[default_value[attr_name]]
            # str() sends non-string JSON values (true, 2, null) through the
            # same lookup as their string forms; a missing attribute is
            # treated like the string "None".
            value = str(rest_attr.get(attr_name, "None"))
            # Values may be wrapped as u'casual' or 'casual'; drop the u
            # prefix and the surrounding quotes, and leave plain values alone.
            if value[:2] in ("u'", 'u"'):
                value = value[1:]
            value = value.strip("'\"")
            # "None" and any unrecognised value fall back to the default
            # instead of raising KeyError.
            rest_vector.append(numerical_value.get(value, fallback))
        restaurants[rest_id] = rest_vector
    
    
    


### Preview of vector representation of restaurant attributes

In [None]:
# Print the vectors of the first 20 restaurants in the dictionary.
for rest_vector in list(restaurants.values())[:20]:
    print(rest_vector)

### Algorithms: collaborative filtering and k-nearest neighbors

For each user, use their rating as a weight for how much a restaurant influences their preferences

In [None]:
# For each user, use their rating as a weight for how much a restaurant influences their preferences
user_preferences = {}
for user_id, reviews in user_list:
    # Sum of all star ratings this user gave.
    total_rating = sum(stars for business_id, stars in reviews)
    # Replace each (business_id, stars) pair with (business_id, stars / total);
    # slice assignment updates the existing list in place.
    reviews[:] = [
        (business_id, stars / total_rating)
        for business_id, stars in reviews
    ]

In [None]:
# Build one preference vector per user: the weighted sum of the attribute
# vectors of the restaurants the user reviewed, using the per-review weight
# stored alongside each business_id.
for user_id, reviews in user_list:
    vec = [0,0,0,0,0,0,0,0]
    # For each review of this user
    for business_id, weight in reviews:
        rest_vector = restaurants.get(business_id)
        # Restaurants without an attribute vector contribute nothing.
        if rest_vector is None:
            continue
        # Add this restaurant's weighted contribution. Scaling the whole
        # accumulator by the weight instead would shrink earlier
        # contributions and make the result depend on review order.
        for idx in range(len(vec)):
            # Index 0 of a restaurant vector is the business_id, hence idx + 1.
            vec[idx] += weight * rest_vector[idx + 1]

    user_preferences[user_id] = vec

### Preview of vector representation of user preferences

In [None]:
# Print the first 10 users with their preference values rounded to 4 decimals.
for user_id, preference in list(user_preferences.items())[:10]:
    rounded = [round(component, 4) for component in preference]
    print(user_id, rounded)

### Use Euclidean distance to determine "distance" between restaurant features

In [None]:
def euc_dist(vec1, vec2):
    """Return the Euclidean distance between two numeric vectors.

    The comparison runs over the indices of ``vec1``: any elements of
    ``vec2`` beyond ``len(vec1)`` are ignored, and a shorter ``vec2``
    raises ``IndexError``.
    """
    squared_diffs = (
        (vec1[idx] - vec2[idx]) ** 2
        for idx in range(len(vec1))
    )
    return sum(squared_diffs) ** 0.5

## Task 2

### Find restaurants that are open at a given time

In [None]:
# Scratch check of str.replace: evaluates to 'staff'; the result is not stored.
"stuff".replace('u','a')

In [None]:
def parse_time(time_str):
    """Convert a '-'-separated string of clock times into a list of floats.

    Every ':' is replaced by '.', so the minutes end up as the decimal
    digits of the hour: '11:30-22:0' becomes [11.3, 22.0]. The result is
    therefore an HH.MM encoding, not fractional hours.
    """
    pieces = time_str.replace(':', '.').split('-')
    return list(map(float, pieces))

In [None]:
# Make dictionary of restaurants

# Maps business_id to a list of (key, parsed times) pairs taken from the
# record's 'hours' mapping; records with an empty or null 'hours' value are
# left out.
restaurants = {}

# The context manager closes the file when the loop finishes.
with open('./yelp/academic_dataset_restaurants.json','r',encoding='utf-8') as f:
    for record in f:
        record_object = json.loads(record)
        hours = record_object['hours']
        if hours:
            # presumably the keys of 'hours' are day names — verify against the data
            restaurants[record_object['business_id']] = [
                (day, parse_time(span)) for day, span in hours.items()
            ]

In [None]:
# Display the whole business_id -> hours mapping.
restaurants