#### The raw dataset includes 6, 990, 280 reviews from 131, 930 business. For model building stage, I will focus only on reviews of Vietnamese restaurants. The goal of this notebook is to subset review data of Vietnamese restaurants. 

In [1]:
import pandas as pd
import numpy as np
import joblib
import json

In [2]:
# import data
df_review = joblib.load('review_no_text.py') 
df_business = pd.read_json('yelp_academic_dataset_business.json', lines=True)

In [3]:
# NaN is easier to deal with. Covert None to NaN.
df_business = df_business.fillna(value=np.nan)
df_business.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   name          150346 non-null  object 
 2   address       150346 non-null  object 
 3   city          150346 non-null  object 
 4   state         150346 non-null  object 
 5   postal_code   150346 non-null  object 
 6   latitude      150346 non-null  float64
 7   longitude     150346 non-null  float64
 8   stars         150346 non-null  float64
 9   review_count  150346 non-null  int64  
 10  is_open       150346 non-null  int64  
 11  attributes    136602 non-null  object 
 12  categories    150243 non-null  object 
 13  hours         127123 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 16.1+ MB


In [4]:
# Drop NaN cases
df_business_clean = df_business.dropna(axis = 0).reset_index(drop = True)
df_business_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117618 entries, 0 to 117617
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   117618 non-null  object 
 1   name          117618 non-null  object 
 2   address       117618 non-null  object 
 3   city          117618 non-null  object 
 4   state         117618 non-null  object 
 5   postal_code   117618 non-null  object 
 6   latitude      117618 non-null  float64
 7   longitude     117618 non-null  float64
 8   stars         117618 non-null  float64
 9   review_count  117618 non-null  int64  
 10  is_open       117618 non-null  int64  
 11  attributes    117618 non-null  object 
 12  categories    117618 non-null  object 
 13  hours         117618 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 12.6+ MB


In [5]:
# sanity check
df_business_clean.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
1,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
3,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
4,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '..."


In [6]:
# Create a binary indicator of Vietnamese restaurant (Vietnamese = 1, Others = 0)
vietnamese_yn = []
for i in range(len(df_business_clean)):
    cat = df_business_clean['categories'][i]
    if 'Vietnamese' in cat:
        vietnamese_yn.append(1)
    else:
        vietnamese_yn.append(0)

In [7]:
# Adding the binary indicator to the cleaned business dataset
df_business_clean['vietnamese_yn'] = vietnamese_yn
df_viet = df_business_clean[df_business_clean['vietnamese_yn'] == 1]

In [8]:
df_viet.shape

(743, 15)

#### There are X Vietnamese restaurants in total. If including all of these business, corresponding review data will be very large. For now, at model building stage, I will randomly choose 30 Vietnamese restaurants and only analyze corresponding reviews.

In [9]:
# Obtained business id of randomly selected 30 Vietnamese restaurants
business_id_list = df_viet['business_id'].sample(30, random_state = 1).values
business_id_list

array(['8kUh6TROemLfbVR_ewVVLg', 'h-xgk0e_WisVzWLUudKgDg',
       'NpBIOzibkzTHG56bLdeiAA', 'BGD95aQNLkNcWbmsUFh_Dg',
       '6Md7LjLUqf0scKfl78CQDg', 'j5IsoB-xT1Kqp8yQjQnaXA',
       'kbeSI3swiGHymmlK2Cznfw', 'q9JGf4DBh2mN5Qo6XPXBeQ',
       'p9Dd6AjOawHGGzcUntYr-A', 'VdCIcyn_s63Pxo916vkGvg',
       'tqCDj8b0lcRfhLMP9iZ3FA', '41RbEZa99W2d_kTnYTp_mw',
       'G5sMILBHK-4Qh9gam6GcAg', 'zM2FiARffKtAW7cO1jer-w',
       'ZUPnWlLgqnU1PVtF6Q9-HQ', 'rD5cJ74ZX4UiBo1geRljPw',
       '4iB7FxO5-8EVLKPOm579Nw', '3d-CfM2eT_Rn3N5FbExHHA',
       'yYGCoJfHANvtOZ7e51FU5g', 'txHjMRDgOEnqsvJi0qCKHA',
       'c8kCy8ZwXaOhOgPbFLa4ow', 'e_9qxd4k6z58I9-b3vBxBA',
       '1xF6JoMExE6-7RowriRazA', 'PzhPMkaNYiKDTHoTG0r8rw',
       'NUVPGowRmfN1NkecJB6R2Q', 'CQD7-sfUiADa6KyTYJT2NQ',
       'qtlDqYE5Dnn-lztBw4qXrg', 'Po3aZ9CLemP2E9PeovySPQ',
       'dTDBXOnGK0w96L79P92AbA', 'cw2QlnunGxprse--SmlPuA'], dtype=object)

In [10]:
# Create a binary indicator for the sampled reviews (Included in the sample = 1, Not included = 0)
sample_yn = []
for i in range(len(df_review)):
    if df_review['business_id'][i] in business_id_list:
        sample_yn.append(1)
    else:
        sample_yn.append(0)

In [11]:
# Adding the binary indicator to the review dataset
df_review['sample_yn'] = sample_yn
# Extracting reviews of the sampled business based on the binary indicator
sample_review = df_review[df_review['sample_yn'] == 1]

In [12]:
# sanity check
sample_review.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,date,sample_yn
68,fGYcFOHfQL4stYPdD3J47g,CgyCtH9CbLO7J_uO3cL7OA,PzhPMkaNYiKDTHoTG0r8rw,4.0,0,0,0,2016-09-18 22:43:18,1
3913,EwkeL8nm4vug3htgi0ZbsQ,_exJkgTPirjUjEUbQju2ow,PzhPMkaNYiKDTHoTG0r8rw,5.0,0,0,0,2017-04-19 12:34:20,1
4148,KRowK5RRz8IF2auM-8ROSw,zEuTnMbf5IEQ1TiEGWHVQg,PzhPMkaNYiKDTHoTG0r8rw,3.0,0,0,0,2018-09-03 02:20:28,1
5487,66KUpAcnKRA3BB_BNSgjpw,X3APf4FSiOYg4BB6sdhJPw,PzhPMkaNYiKDTHoTG0r8rw,5.0,1,0,0,2018-07-07 22:45:43,1
6288,fFFHTE0tY1OFqV50Iiy92Q,CQW0mI3IXtOeVv5TNDuuhw,PzhPMkaNYiKDTHoTG0r8rw,5.0,0,0,0,2015-11-06 00:36:28,1


In [13]:
# Check the number of reviews included
sample_review.shape[0]

3696

#### The sample_review dataset does not include text of review. Review text of selected business should be extracted from the raw review data. The raw review data is extremely large, hence cannot be imported together. Instead, data will be read line by line and reviews from the selected business will be appended to the review test list.

In [14]:
import time
start_time = time.time()

review_txt = []
with open("yelp_academic_dataset_review.json", "r") as f:
    for line in f.readlines():
        review = json.loads(line) # current line of the original data that has been read
        if review['business_id'] in business_id_list:
            review_txt.append(review['text'])
        #r += 1
        #print(r)

end_time = time.time()
end_time - start_time 

36.54101920127869

In [15]:
# Append the text of review to the review data
sample_review['text'] = review_txt

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_review['text'] = review_txt


In [16]:
# sanity check
sample_review.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,date,sample_yn,text
68,fGYcFOHfQL4stYPdD3J47g,CgyCtH9CbLO7J_uO3cL7OA,PzhPMkaNYiKDTHoTG0r8rw,4.0,0,0,0,2016-09-18 22:43:18,1,After 3 weeks of working in the area I finally...
3913,EwkeL8nm4vug3htgi0ZbsQ,_exJkgTPirjUjEUbQju2ow,PzhPMkaNYiKDTHoTG0r8rw,5.0,0,0,0,2017-04-19 12:34:20,1,Soup was fresh spicy and delicious! Wish it wa...
4148,KRowK5RRz8IF2auM-8ROSw,zEuTnMbf5IEQ1TiEGWHVQg,PzhPMkaNYiKDTHoTG0r8rw,3.0,0,0,0,2018-09-03 02:20:28,1,A pho place that checks all the boxes but does...
5487,66KUpAcnKRA3BB_BNSgjpw,X3APf4FSiOYg4BB6sdhJPw,PzhPMkaNYiKDTHoTG0r8rw,5.0,1,0,0,2018-07-07 22:45:43,1,Steamed buns for the adults. Beef & Broccoli f...
6288,fFFHTE0tY1OFqV50Iiy92Q,CQW0mI3IXtOeVv5TNDuuhw,PzhPMkaNYiKDTHoTG0r8rw,5.0,0,0,0,2015-11-06 00:36:28,1,The spicy beef pho is wonderful. The broth is ...


In [17]:
# Store the sampled review as joblib object
joblib.dump(sample_review, 'sample_review.py') 

['sample_review.py']