In [2]:
import numpy as np 
import pandas as pd 
import json 
from tqdm import tqdm

### Load Datasets

Read review.json

In [3]:
# Collect the review fields of interest as column-oriented lists; any other
# keys present in a review record are ignored.
data = {
    column: []
    for column in ('review_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text')
}

# The file is parsed as one JSON object per line, streamed line by line so the
# raw text is never held in memory all at once.
with open('yelp_dataset/review.json', 'r', encoding='utf8') as f:
    for line in tqdm(f):
        review = json.loads(line)
        # dicts keep insertion order, so fields are read in the order declared above
        for column, values in data.items():
            values.append(review[column])

6685900it [00:57, 115746.67it/s]


Put review data into a dataframe

In [5]:
# Build the review DataFrame from the dict of lists: each key becomes a column.
review_df = pd.DataFrame(data)
# Preview the first rows as a sanity check.
review_df.head()

Unnamed: 0,review_id,business_id,stars,useful,funny,cool,text
0,Q1sbwvVQXV2734tPgoKj4Q,ujmEBvifdJM6h6RLv4wQIg,1.0,6,1,0,Total bill for this horrible service? Over $8G...
1,GJXCdrto3ASJOqKeVWPi6Q,NZnhc2sEQy3RmzKTZnqtwQ,5.0,0,0,0,I *adore* Travis at the Hard Rock's new Kelly ...
2,2TzJjDVDEuAW6MR5Vuc1ug,WTqjgwHlXbSFevF32_DJVw,5.0,3,0,0,I have to say that this office really has it t...
3,yi0R0Ugj_xUx_Nek0-_Qig,ikCg8xy5JIg_NGPx-MSIDA,5.0,0,0,0,Went in for a lunch. Steak sandwich was delici...
4,11a8sVPMUFtaC7_ABRkmtw,b1b1eb3uo-w561D0ZfCEiQ,1.0,7,0,0,Today was my second out of three sessions I ha...


In [6]:
# Summarize the column dtypes, row count and memory footprint of the review frame.
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6685900 entries, 0 to 6685899
Data columns (total 7 columns):
review_id      object
business_id    object
stars          float64
useful         int64
funny          int64
cool           int64
text           object
dtypes: float64(1), int64(3), object(3)
memory usage: 357.1+ MB


Read business.json and put it into a dataframe

In [7]:
# lines=True makes pandas read the file as line-delimited JSON (one record per line).
business_df = pd.read_json(r'yelp_dataset/business.json', lines = True)

In [8]:
# Preview the first rows of the business table.
business_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,2818 E Camino Acequia Drive,Phoenix,AZ,85016,33.522143,-112.018481,3.0,5,0,{'GoodForKids': 'False'},"Golf, Active Life",
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,128,1,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...","{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W..."
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,170,1,"{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...","Sushi Bars, Restaurants, Japanese","{'Monday': '17:30-21:30', 'Wednesday': '17:30-..."
3,xvX2CttrVhyG2z1dFg_0xw,Farmers Insurance - Paul Lorenz,"15655 W Roosevelt St, Ste 237",Goodyear,AZ,85338,33.455613,-112.395596,5.0,3,1,,"Insurance, Financial Services","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ..."
4,HhyxOkGAM07SRYtlQ4wMFQ,Queen City Plumbing,"4209 Stuart Andrew Blvd, Ste F",Charlotte,NC,28217,35.190012,-80.887223,4.0,4,1,"{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...","Plumbing, Shopping, Local Services, Home Servi...","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ..."


In [9]:
# Summarize dtypes and non-null counts to see which business columns have missing values.
business_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192609 entries, 0 to 192608
Data columns (total 14 columns):
business_id     192609 non-null object
name            192609 non-null object
address         192609 non-null object
city            192609 non-null object
state           192609 non-null object
postal_code     192609 non-null object
latitude        192609 non-null float64
longitude       192609 non-null float64
stars           192609 non-null float64
review_count    192609 non-null int64
is_open         192609 non-null int64
attributes      163773 non-null object
categories      192127 non-null object
hours           147779 non-null object
dtypes: float64(3), int64(2), object(9)
memory usage: 20.6+ MB


### Data Cleaning

#### Clean the business dataset and keep only the restaurants

In [10]:
# Keep only businesses that are still open (is_open == 1); closed ones are removed.
# Note: this applies to every business type - the restaurant filter comes later.
business_df = business_df[business_df['is_open']==1]

In [11]:
# Drop columns that are irrelevant to the review analysis.
# errors='ignore' keeps this cell idempotent: business_df is overwritten here, so
# re-running the cell would otherwise raise KeyError on the already-dropped columns.
drop_columns = ['hours', 'is_open', 'review_count']
business_df = business_df.drop(columns=drop_columns, errors='ignore')

In [12]:
# Split the comma-separated `categories` string into a list, then explode it so
# that every (business, category) pair gets its own row.
business_category = (
    business_df
    .assign(categories=lambda frame: frame['categories'].str.split(', '))
    .explode('categories')
)

In [13]:
# Count distinct category labels (missing values are excluded).
print('Total number of categories: ', business_category['categories'].nunique())

Total number of categories:  1290


In [14]:
# Show the 10 most frequent categories (value_counts sorts in descending order).
business_category['categories'].value_counts().head(10)

Restaurants                  42237
Shopping                     26734
Food                         23208
Home Services                18455
Beauty & Spas                16545
Health & Medical             15875
Local Services               12830
Automotive                   11955
Nightlife                     9396
Event Planning & Services     8960
Name: categories, dtype: int64

In [15]:
# Extract only the businesses whose category string contains "Restaurants".
# The match is case-sensitive; rows with no categories (NaN) count as non-matches.
is_restaurant = business_df['categories'].str.contains('Restaurants', case=True, na=False)
business_res = business_df.loc[is_restaurant]
business_res.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,attributes,categories
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported..."
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,"{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...","Sushi Bars, Restaurants, Japanese"
11,1Dfx3zM-rW4n-31KeC8sJg,Taco Bell,2450 E Indian School Rd,Phoenix,AZ,85016,33.495194,-112.028588,3.0,"{'RestaurantsTakeOut': 'True', 'BusinessParkin...","Restaurants, Breakfast & Brunch, Mexican, Taco..."
13,fweCYi8FmbJXHCqLnwuk8w,Marco's Pizza,5981 Andrews Rd,Mentor-on-the-Lake,OH,44060,41.70852,-81.359556,4.0,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...","Italian, Restaurants, Pizza, Chicken Wings"
23,1RHY4K3BD22FK7Cfftn8Mg,Marathon Diner,"Center Core - Food Court, Fl 3, Pittsburgh Int...",Pittsburgh,PA,15231,40.496177,-80.246011,4.0,"{'RestaurantsTakeOut': 'True', 'BusinessParkin...","Sandwiches, Salad, Restaurants, Burgers, Comfo..."


#### Merge the relevant businesses (restaurants) to the reviews

In [16]:
# The business table also has a `stars` column (the business's overall rating),
# so give the per-review rating a distinct name before the two tables are merged.
review_df = review_df.rename(columns={'stars': 'review_stars'})
review_df.head()

Unnamed: 0,review_id,business_id,review_stars,useful,funny,cool,text
0,Q1sbwvVQXV2734tPgoKj4Q,ujmEBvifdJM6h6RLv4wQIg,1.0,6,1,0,Total bill for this horrible service? Over $8G...
1,GJXCdrto3ASJOqKeVWPi6Q,NZnhc2sEQy3RmzKTZnqtwQ,5.0,0,0,0,I *adore* Travis at the Hard Rock's new Kelly ...
2,2TzJjDVDEuAW6MR5Vuc1ug,WTqjgwHlXbSFevF32_DJVw,5.0,3,0,0,I have to say that this office really has it t...
3,yi0R0Ugj_xUx_Nek0-_Qig,ikCg8xy5JIg_NGPx-MSIDA,5.0,0,0,0,Went in for a lunch. Steak sandwich was delici...
4,11a8sVPMUFtaC7_ABRkmtw,b1b1eb3uo-w561D0ZfCEiQ,1.0,7,0,0,Today was my second out of three sessions I ha...


In [17]:
# Inner join on business_id: keeps only reviews whose business is in the
# restaurant subset, and repeats the business columns on every review row.
merged = business_res.merge(review_df, how='inner', on='business_id')
merged.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,attributes,categories,review_id,review_stars,useful,funny,cool,text
0,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",6W0MQHmasK0IsaoDo4bmkw,3.0,3,2,0,My girlfriend and I went for dinner at Emerald...
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",BeeBfUxvzD4qNX4HxrgA5g,3.0,0,0,0,We've always been there on a Sunday so we were...
2,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",A1D2kUnZ0HTroFreAheNSg,3.0,0,0,0,"***No automatic doors, not baby friendly!*** I..."
3,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",2pf45Stf-pNew-xgTababQ,1.0,1,0,0,"Horrible service,\nI went there tonight with m..."
4,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",RHhlmL07evgAdPaXQV8Omg,4.0,2,1,2,One of the gauges of a good Chinese restaurant...


#### Convert the merged dataframe into a CSV file

In [18]:
# Persist the merged restaurant reviews; index=False leaves the row index out of the file.
merged.to_csv("yelp_reviews_restaurants.csv", index=False)