## Obtain data

In [1]:
!wget https://d396qusza40orc.cloudfront.net/dataminingcapstone/YelpDataset/yelp_dataset.tar.gz
!tar -xvf /kaggle/working/yelp_dataset.tar.gz
!wget -O student_dn_annotations.txt https://d28rh4a8wq0iu5.cloudfront.net/dataminingcapstone/Tasks4and5/student_dn_annotations.txt


--2024-04-09 16:14:02--  https://d396qusza40orc.cloudfront.net/dataminingcapstone/YelpDataset/yelp_dataset.tar.gz
Resolving d396qusza40orc.cloudfront.net (d396qusza40orc.cloudfront.net)... 52.84.160.90, 52.84.160.182, 52.84.160.76, ...
Connecting to d396qusza40orc.cloudfront.net (d396qusza40orc.cloudfront.net)|52.84.160.90|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 443445047 (423M) [application/x-gzip]
Saving to: 'yelp_dataset.tar.gz'


2024-04-09 16:14:08 (84.4 MB/s) - 'yelp_dataset.tar.gz' saved [443445047/443445047]

yelp_dataset_challenge_academic_dataset/
yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_user.json
yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json
yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_checkin.json
yelp_dataset_challenge_academic_dataset/Dataset_Challenge_Academic_Dataset_Agreement.pdf
yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json
yelp_dataset_cha

## Dependencies

In [2]:
!pip install gensim
# !pip install



In [3]:
from gensim import corpora, matutils
from gensim.models import LdaModel
import json
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


### preprocessing

In [4]:
# transforming to df
REVIEW_SOURCE = "/kaggle/working/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json"
reviews_df = pd.read_json(REVIEW_SOURCE, lines=True)
# cuisines list
cuisine_url = '/kaggle/working/student_dn_annotations.txt'
with open(cuisine_url) as f:
    cuisine_lst = f.readlines()
    
cuisine_lst[:10]

['bisi bele bath\n',
 'margherita pizza\n',
 'penne\n',
 'carbonara spaghetti\n',
 'olive green\n',
 'grilled octopus\n',
 'red chili\n',
 'meat tender\n',
 'asti\n',
 'pani puri\n']

### Task4 dishes recommendations

In [5]:
cuisine_lst = [cuisine.replace('\n', '') for cuisine in cuisine_lst]

In [6]:
index_lst = [i for i in range(len(cuisine_lst))]

In [7]:
dish_dct = dict(zip(cuisine_lst, index_lst))
# for k, v in dish_dct.items():
#     print(k, v)
#     break

## Makingn the Counts of dish names in the reviews as a criterion

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
cnt_vectorizer = CountVectorizer(stop_words="english", vocabulary=dish_dct)

In [10]:
dish_freqs = cnt_vectorizer.fit_transform(reviews_df['text'])

In [11]:
binary_matrix = (dish_freqs > 0).astype(int)


In [12]:
dish_freqs_lst = binary_matrix.sum(axis=0).tolist()
dish_freqs_lst = [freq for freq_sub in dish_freqs_lst for freq in freq_sub]


In [13]:
d = np.array([index_lst, cuisine_lst, dish_freqs_lst]).T
dish_df = pd.DataFrame(data=d, columns=['index', 'dish_name', 'count_in_reviews'])
dish_df.count_in_reviews = pd.to_numeric(dish_df['count_in_reviews'])
dish_df.set_index('index')
dish_df.head()

Unnamed: 0,index,dish_name,count_in_reviews
0,0,bisi bele bath,0
1,1,margherita pizza,0
2,2,penne,1488
3,3,carbonara spaghetti,0
4,4,olive green,0


In [14]:
top_10dish_df = dish_df.nlargest(n=10, columns='count_in_reviews')

In [15]:
binary_arr = binary_matrix.T.toarray()
dish_stars = binary_arr * reviews_df.stars.values


In [16]:
total_dish_stars = dish_stars.sum(axis=1)


In [17]:
dish_freqs_mx = np.array(dish_freqs_lst).astype(np.float32)
avg_dish_stars = total_dish_stars.astype(np.float32) 
avg_dish_stars.dtype

dtype('float32')

In [18]:
dish_stars = np.divide(avg_dish_stars, dish_freqs_mx, where=(dish_freqs_mx > 0), out=np.full_like(avg_dish_stars, -1))
penalized_dish_stars = np.where(dish_freqs_mx < 20000, -1, np.divide(avg_dish_stars, dish_freqs_mx, where=(dish_freqs_mx > 0)))

In [19]:
dish_df['avg_stars'] = penalized_dish_stars
dish_df['avg_stars'] = pd.to_numeric(dish_df['avg_stars'])


In [20]:
dish_df

Unnamed: 0,index,dish_name,count_in_reviews,avg_stars
0,0,bisi bele bath,0,-1.0
1,1,margherita pizza,0,-1.0
2,2,penne,1488,-1.0
3,3,carbonara spaghetti,0,-1.0
4,4,olive green,0,-1.0
...,...,...,...,...
2080,2080,dhokla,5,-1.0
2081,2081,litti,0,-1.0
2082,2082,murazzano,0,-1.0
2083,2083,peperoncino,16,-1.0


In [21]:
pop_dish_star = dish_df.nlargest(n=10, columns='avg_stars')

In [22]:
pop_dish = dish_df.nlargest(n=10, columns='count_in_reviews')

### Task5: Recommand Restaurants

In [23]:
business_df = pd.read_json('/kaggle/working/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json', lines=True)

In [24]:
rev_bus_df = pd.merge(reviews_df, business_df, on='business_id', how='left')
rev_bus_df = rev_bus_df[['text', 'name', 'stars_x', 'stars_y']]

rev_bus_df.rename(columns={'name': 'business_name', 'text': 'review'}, inplace=True)

rev_bus_df.head()

Unnamed: 0,review,business_name,stars_x,stars_y
0,dr. goldberg offers everything i look for in a...,"Eric Goldberg, MD",5,3.5
1,"Unfortunately, the frustration of being Dr. Go...","Eric Goldberg, MD",2,3.5
2,Dr. Goldberg has been my doctor for years and ...,"Eric Goldberg, MD",4,3.5
3,Been going to Dr. Goldberg for over 10 years. ...,"Eric Goldberg, MD",4,3.5
4,Got a letter in the mail last week that said D...,"Eric Goldberg, MD",4,3.5


In [25]:
pop_dish_star

Unnamed: 0,index,dish_name,count_in_reviews,avg_stars
544,544,garlic,23098,3.918564
553,553,pork,36794,3.863619
1814,1814,bacon,28543,3.845496
587,587,beef,48307,3.783675
1308,1308,shrimp,36834,3.776674
1922,1922,pizza,55577,3.770696
114,114,cheese,85044,3.761888
1804,1804,steak,44297,3.760119
779,779,potatoes,23882,3.758856
752,752,chicken,106199,3.716655


In [26]:
pop_indices = pop_dish_star.index.tolist()
pop_indices

[544, 553, 1814, 587, 1308, 1922, 114, 1804, 779, 752]

In [27]:
rev_res_stars_lst = []

for i in pop_indices:
    bool_arr = [bool(x) for x in binary_arr[i]]
    reviews_part_df = rev_bus_df[bool_arr]
    rev_groups = reviews_part_df.groupby(by='business_name')
    part_rev_groups = rev_groups.filter(lambda x: x['stars_x'].count() > 100)
    # count criterion
    rev_part_group = part_rev_groups.groupby(by='business_name')
    # mean stars
    rev_group_stars = rev_part_group.stars_x.mean().nlargest()
    
    rev_res_stars_lst.append(rev_group_stars)

In [30]:
dish_df.head(20)

Unnamed: 0,index,dish_name,count_in_reviews,avg_stars
0,0,bisi bele bath,0,-1.0
1,1,margherita pizza,0,-1.0
2,2,penne,1488,-1.0
3,3,carbonara spaghetti,0,-1.0
4,4,olive green,0,-1.0
5,5,grilled octopus,0,-1.0
6,6,red chili,0,-1.0
7,7,meat tender,0,-1.0
8,8,asti,63,-1.0
9,9,pani puri,0,-1.0


In [32]:
for i in range(10):
    dish_name = dish_df.iloc[pop_indices[i]].dish_name
    print(f'dish_name:{dish_name}\n')
    print(rev_res_stars_lst[i], '\n-----------------------------------------')

dish_name:garlic

business_name
Lotus of Siam           4.429787
Secret Pizza            4.307692
Bachi Burger            4.277778
Hot N Juicy Crawfish    4.224490
Mesa Grill              4.213115
Name: stars_x, dtype: float64 
-----------------------------------------
dish_name:pork

business_name
Raku                     4.649425
Tacos El Gordo           4.472772
Sushi House Goyemon      4.431373
Viva Las Arepas          4.418182
Rollin Smoke Barbeque    4.409357
Name: stars_x, dtype: float64 
-----------------------------------------
dish_name:bacon

business_name
Raku                    4.523810
Double Down Saloon      4.387931
Firefly                 4.358839
Sage                    4.355140
Citizen Public House    4.354545
Name: stars_x, dtype: float64 
-----------------------------------------
dish_name:beef

business_name
Raku                     4.631336
Rollin Smoke Barbeque    4.453782
KoMex Fusion             4.447761
Gordon Ramsay Steak      4.408680
Texas de Brazil       