In [27]:
import pandas as pd
import random
import numpy as np
import implicit
import os
from sklearn.model_selection import train_test_split
from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix, save_npz
import tarfile

In [28]:
raw_data_directory = "../data/raw"
filter_category = 'restaurant'
filter_city = 'philadelphia'
tar_filename = 'yelp_dataset.tar'

In [29]:
file = tarfile.open(os.path.join(raw_data_directory,tar_filename))
file.extractall(path=raw_data_directory)

In [None]:
business = pd.read_json(os.path.join(raw_data_directory, 'yelp_academic_dataset_business.json'), lines=True)
business = business[['business_id', 'name', 'city', 'categories']]
business.rename({'name': 'business_name'}, axis=1, inplace=True)

# drop na
business.dropna(axis=0, how='any', inplace=True)
str_cols = ['categories', 'city']

# lowercase string
business[str_cols] = business[str_cols].apply(func=lambda x: x.str.strip().str.lower(), axis=1)

restaurant = business[(business['categories'].str.contains(filter_category))]
restaurant.drop(axis=1, columns=['categories', 'city'], inplace=True)

# restaurant.to_csv(os.path.join(clean_data_directory, 'restaurant.csv'), index=False)
num_restaurant = restaurant.business_id.unique().shape[0]
print(f"Number of Restaurant {num_restaurant}")

review = pd.read_json(os.path.join(raw_data_directory, 'yelp_academic_dataset_review.json'), lines=True)
review = review[['user_id', 'business_id', 'stars']]
num_review = review.review_id.unique().shape[0]
print(f"Number of Review {num_review}")

# data = pd.merge(left=restaurant, right=review, how='inner', on='business_id')
# data.to_csv(os.path.join(clean_data_directory, 'review.csv'), index=False)

user = pd.read_json(os.path.join(raw_data_directory, 'yelp_academic_dataset_user.json'), lines=True)
user = user[['user_id', 'name']]
user.rename({'name': 'user_name'}, axis=1, inplace=True)

user_review = pd.merge(left=user, right=review, how='inner', on='user_id')

user_review_business = pd.merge(left=restaurant, right=user_review, how='inner', on='business_id')
#user_review_business.to_csv(os.path.join(clean_data_directory, 'user_business_review.csv'), index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Number of Restaurant 50793


In [None]:
user_review_business.groupby(['city']).count().max(level=0)

In [2]:
restaurant_df = pd.read_csv("../data/clean/restaurant.csv")

In [3]:
reviews_df = pd.read_csv("../data/clean/review.csv")

In [4]:
reviews_df.shape
reviews_df.head()

Unnamed: 0,business_id,name,user_id,stars
0,MTSW4McQd7CbVtyjqoe9mw,st honore pastries,6_SpY41LIHZuIaiDs5FMKA,4
1,MTSW4McQd7CbVtyjqoe9mw,st honore pastries,tCXElwhzekJEH6QJe3xs7Q,4
2,MTSW4McQd7CbVtyjqoe9mw,st honore pastries,WqfKtI-aGMmvbA9pPUxNQQ,5
3,MTSW4McQd7CbVtyjqoe9mw,st honore pastries,3-1va0IQfK-9tUMzfHWfTA,5
4,MTSW4McQd7CbVtyjqoe9mw,st honore pastries,EouCKoDfzaVG0klEgdDvCQ,4


In [5]:
unique_users = list(reviews_df['user_id'].unique())
unique_restaurant = list(reviews_df['business_id'].unique())
rating = reviews_df['stars'].tolist()

# converting users and restaurants into numerical ids
rows = reviews_df.user_id.astype(CategoricalDtype(categories=unique_users)).cat.codes
cols = reviews_df.business_id.astype(CategoricalDtype(categories=unique_restaurant)).cat.codes

reviews_df['users_id_code'] = rows
reviews_df['business_id_code'] = cols
sparse_restaurant_user = csr_matrix((rating, (cols, rows)), shape=(len(unique_restaurant), len(unique_users)))
sparse_user_restaurant = csr_matrix((rating, (rows, cols)), shape=(len(unique_users), len(unique_restaurant)))

In [6]:
ratings = sparse_user_restaurant
test_set = ratings.copy()
test_set[test_set != 0] = 1
training_set = ratings.copy()
user_restaurant_interaction = training_set.nonzero()
interaction_index_pair = list(zip(user_restaurant_interaction[0], user_restaurant_interaction[1]))
random.seed(0)
test_set_size = int(np.ceil(0.2 * len(interaction_index_pair)))
test_samples = random.sample(interaction_index_pair, test_set_size)
user_index = [index[0] for index in test_samples]
restaurant_index = [index[1] for index in test_samples]
training_set[user_index, restaurant_index] = 0
training_set.eliminate_zeros()

In [7]:
als_model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)
als_model.fit(training_set)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [15]:
user_id = 5858
restaurant = []
restaurant.append(reviews_df.name.loc[reviews_df.users_id_code == user_id].iloc[0])
print("Rated By User", user_id)
pd.DataFrame(restaurant)

Rated By User 5858


Unnamed: 0,0
0,mood cafe


In [9]:
user_id = 5858
ids, scores = als_model.recommend(user_id, sparse_user_restaurant[user_id])
restaurant = []
for id in ids:
    restaurant.append(reviews_df.name.loc[reviews_df.business_id_code == id].iloc[0])
print("Recommendation For User", )
pd.DataFrame(restaurant)

Recommendation For User


Unnamed: 0,0
0,amada
1,talula's garden
2,vedge
3,sabrina's café
4,alma de cuba
5,bistrot la minette
6,farmicia
7,tria cafe wash west
8,indeblue modern indian food & spirits
9,moshulu


In [None]:
train_data.shape

In [10]:
business_id = 12
ids, scores = als_model.similar_items(business_id)
restaurant = []
for id in ids:
    restaurant.append(reviews_df.name.loc[reviews_df.business_id_code == id].iloc[0])
print("Restaurants Similar to", reviews_df.name.loc[reviews_df.business_id_code == id].iloc[0])
pd.DataFrame(restaurant)

Restaurants Similar to chestnut hill brewing company


Unnamed: 0,0
0,baltic bakery
1,wawa
2,mercer cafe
3,express breakfast & lunch
4,eddie's pizza
5,papps pizza
6,little man's juice bar and grill
7,panda express
8,cosmic café and ciderhouse
9,chestnut hill brewing company
