# Filtering Reviews By Business

In [1]:
import pandas as pd
import progressbar

In [4]:
# Load the business table (one JSON record per line) and discard the
# 'attributes' and 'hours' columns, which are not used further down.
business = (
    pd.read_json('data/business.json', lines=True, orient='columns')
    .drop(columns=['attributes', 'hours'])
)

In [5]:
# Preview the first rows to check the load and the remaining columns.
business.head()

Unnamed: 0,address,business_id,categories,city,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,2818 E Camino Acequia Drive,1SWheh84yJXfytovILXOAQ,"Golf, Active Life",Phoenix,0,33.522143,-112.018481,Arizona Biltmore Golf Club,85016,5,3.0,AZ
1,30 Eglinton Avenue W,QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,1,43.605499,-79.652289,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON
2,"10110 Johnston Rd, Ste 15",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC
3,"15655 W Roosevelt St, Ste 237",xvX2CttrVhyG2z1dFg_0xw,"Insurance, Financial Services",Goodyear,1,33.455613,-112.395596,Farmers Insurance - Paul Lorenz,85338,3,5.0,AZ
4,"4209 Stuart Andrew Blvd, Ste F",HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,1,35.190012,-80.887223,Queen City Plumbing,28217,4,4.0,NC


In [79]:
# Name of the business to extract. It is compared for exact equality against the
# `name` column below and is also embedded in the output file name.
business_name = "The Cheesecake Factory"

In [80]:
# Keep only the locations whose name matches exactly, drop the columns that are
# not needed for the review join, and key the result by business_id so it can
# be joined against the reviews on that id.
is_target_business = business['name'] == business_name
businesses = (
    business.loc[is_target_business]
    .drop(columns=['categories', 'postal_code', 'is_open', 'name'])
    .set_index('business_id')
)
businesses

Unnamed: 0_level_0,address,city,latitude,longitude,review_count,stars,state
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
eSlmVbicgkhtD9niSuReJw,24265 Cedar Rd,Lyndhurst,41.5023,-81.502145,223,3.5,OH
L8rJht4Jw_BuR8gsckEWYg,"4400 Sharon Rd, Space A",Charlotte,35.151587,-80.830559,447,3.0,NC
vXEZ-r6fah-5Fjt3a6c-Gw,415 S 27th St,Pittsburgh,40.428411,-79.965348,245,3.5,PA
htVvtLIFftBLqzRISjReDw,3401 Dufferin Street,Toronto,43.724684,-79.454173,450,3.0,ON
I-5qHCVwT7k_KH67-YOx3A,160 S Green Valley Pkwy,Henderson,36.022082,-115.083185,654,3.5,NV
NoA6bD6W7z_Aztk_cOU5cg,"505 S Grand Central Pkwy, Ste 3201",Las Vegas,36.166142,-115.1564,308,3.5,NV
3oTVApC-eUzpGjrOVxIr5g,"2402 E Camelback, Space 101",Phoenix,33.510305,-112.029567,414,3.5,AZ
1NNBpiQ3rUT-7T8ch45SVA,148 Crocker Park Blvd,Westlake,41.460132,-81.95088,160,3.0,OH
7ADjU1ZmRfUeMebjbYU3kw,1000 Ross Park Mall Dr,Ross Township,40.544559,-80.008739,152,3.5,PA
-Ylpy3VyRWwubf9dysuwjQ,3500 S Las Vegas Blvd,Las Vegas,36.117453,-115.176688,663,3.5,NV


Select every review that belongs to any of these businesses.

In [81]:
# Number of chunks to split the review file into; also used as the maximum
# value of the progress bar in the loading loop.
num_chunks = 50

# Presumably the number of lines in data/review.json -- TODO confirm against the
# actual file before relying on it.
total_reviews = 6685900

# Rows per chunk, as an integer. Ceiling division is used so that the file is
# covered in at most `num_chunks` chunks even when the total is not an exact
# multiple of `num_chunks` (true division would yield a non-integral float,
# which is not a valid chunk size).
chunk_size = -(-total_reviews // num_chunks)

In [82]:
# Stream the review file chunk by chunk and keep only the reviews whose
# business_id appears in the index of `businesses`.
bar = progressbar.ProgressBar(max_value=num_chunks)

# With `chunksize` set, read_json returns an iterator of DataFrames instead of
# loading the whole file. The chunk size is coerced to int because it may have
# been computed with true division.
df = pd.read_json('data/review.json', lines=True, orient='columns', chunksize=int(chunk_size))

# Collect the per-chunk matches and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop copies all previously accumulated rows on every iteration.
matched_chunks = []
i = 0
for chunk in df:
    # Index the chunk by business_id so the inner join lines up with the index
    # of `businesses`. Overlapping column names coming from `businesses`
    # receive the '_biz' suffix.
    chunk = chunk.set_index('business_id')
    joined = businesses.join(chunk, lsuffix='_biz', how='inner')
    matched_chunks.append(joined)
    i += 1
    # Clamp so the counter never exceeds the bar's max_value if the file
    # yields more chunks than `num_chunks`.
    bar.update(min(i, num_chunks))

# pd.concat raises on an empty list, so fall back to an empty frame when the
# file produced no chunks at all.
reviews = pd.concat(matched_chunks) if matched_chunks else pd.DataFrame()

reviews.shape[0]

100% (50 of 50) |########################| Elapsed Time: 0:01:32 ETA:  00:00:00

6226

In [83]:
# Count the collected reviews per business_id (the current index of `reviews`)
# and show the counts from largest to smallest.
reviews_per_business = reviews.groupby(reviews.index).size()
pd.DataFrame(reviews_per_business.sort_values(ascending=False))

Unnamed: 0_level_0,0
business_id,Unnamed: 1_level_1
eZcCFV-8X91ZSnmB9807bw,721
I-5qHCVwT7k_KH67-YOx3A,679
-Ylpy3VyRWwubf9dysuwjQ,677
L8rJht4Jw_BuR8gsckEWYg,465
htVvtLIFftBLqzRISjReDw,463
HwEDsb1xtRBTuFwATTRjpA,454
3oTVApC-eUzpGjrOVxIr5g,432
dPGs5b0N9MarZjVgQVelGQ,390
5diHqpk_JSC4p8tIo4APow,365
NoA6bD6W7z_Aztk_cOU5cg,317


In [84]:
# Move business_id out of the index into an ordinary column, then key the
# frame by review_id instead.
reviews_flat = reviews.reset_index()
reviews = reviews_flat.set_index('review_id')

In [85]:
# Persist the filtered reviews; the file name embeds the business name.
reviews_path = 'data/' + business_name + '_reviews.parquet'
reviews.to_parquet(reviews_path)

In [86]:
# Preview the joined frame: business columns followed by the review columns.
reviews.head()

Unnamed: 0_level_0,business_id,address,city,latitude,longitude,review_count,stars_biz,state,cool,date,funny,stars,text,useful,user_id
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
rsVqYKtEZykBZqwur6QyVw,L8rJht4Jw_BuR8gsckEWYg,"4400 Sharon Rd, Space A",Charlotte,35.151587,-80.830559,447,3.0,NC,0,2017-02-09 04:06:45,0,5,Famous for its cheesecake and infamous for its...,0,HFFiM63x9asevqVOZG5GBQ
T5dE90hplvP7UWQqYxYMTg,L8rJht4Jw_BuR8gsckEWYg,"4400 Sharon Rd, Space A",Charlotte,35.151587,-80.830559,447,3.0,NC,0,2013-06-24 15:00:55,0,2,I want to preface this review by admitting I d...,0,K1osyO04_hXoDKyGqe_V2Q
j5MrhwuMcvNkHO4q5ocdWg,L8rJht4Jw_BuR8gsckEWYg,"4400 Sharon Rd, Space A",Charlotte,35.151587,-80.830559,447,3.0,NC,0,2016-12-19 07:12:54,0,5,Went here for me and my boyfriends anniversary...,0,GOk6jLtou4Gufxo-rtX5ZA
yHTyWcbGR30Y9tXB8nPpSQ,L8rJht4Jw_BuR8gsckEWYg,"4400 Sharon Rd, Space A",Charlotte,35.151587,-80.830559,447,3.0,NC,0,2014-04-19 14:11:00,0,1,I'm not sure if I came in at a bad time... I c...,0,BCHZd09vfR1ztLlFeTrzbQ
F813bToytGklZok1WVxl6w,L8rJht4Jw_BuR8gsckEWYg,"4400 Sharon Rd, Space A",Charlotte,35.151587,-80.830559,447,3.0,NC,0,2016-05-31 01:54:07,0,2,Ive eaten at Cheesecake Factory at other locat...,3,X3h37GG6Za3qhVZiOnzIcw


# Applying NLP to the reviews

In [8]:
import spacy
from tqdm import tqdm_notebook
# Register tqdm's pandas integration so that `progress_apply` (used below on
# the review text) is available.
# NOTE(review): calling tqdm_notebook() first instantiates a progress bar,
# which looks like the source of the empty widget rendered under this cell --
# confirm against the installed tqdm version before changing the call form.
tqdm_notebook().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [6]:
# Load the spaCy pipeline named "en_core_web_lg"; it is applied to every review
# text further down. The model package must already be installed.
nlp = spacy.load("en_core_web_lg")

In [14]:
# Reload the reviews written by the filtering section. The business name is
# repeated here so that this section can be run on its own.
business_name = 'The Cheesecake Factory'
reviews_path = 'data/' + business_name + '_reviews.parquet'
df = pd.read_parquet(reviews_path)

In [15]:
# Run the spaCy pipeline over every review text, showing a progress bar, and
# store the result of each call in a new 'spacy' column.
review_texts = df['text']
df['spacy'] = review_texts.progress_apply(nlp)

HBox(children=(IntProgress(value=0, max=6226), HTML(value='')))




In [16]:
# Save the frame, including the 'spacy' column, as an uncompressed pickle.
spacy_path = "data/" + business_name + "_spacy"
df.to_pickle(spacy_path, compression=None)