# Cleaned Yelp Dataset

This notebook extracts the businesses in the Philadelphia area from the business.json and then extracts all of their reviews. It then does the same for only the established gay businesses in Philly.

In [1]:
import pandas as pd
import geopandas as gpd
import requests
import json
from bs4 import BeautifulSoup
import os
import matplotlib.pyplot as plt
import contextily as ctx

# All Businesses in Philadelphia

## Filter Business JSON for Philadelphia Area

In [None]:
# Read the Yelp business file (JSON Lines: one record per line).
biz = pd.read_json(
    '/Users/harper/Documents/GitHub/yelp_dataset/yelp_academic_dataset_business.json',
    lines=True,
)

# Turn each longitude/latitude pair into a point geometry in EPSG:4326.
biz_points = gpd.points_from_xy(biz['longitude'], biz['latitude'])
bizGdf = gpd.GeoDataFrame(biz, geometry=biz_points, crs='EPSG:4326')

# Keep businesses located in PA, NJ or DE -- the states used here to
# approximate the Philadelphia metro area.
philly_states = ['PA', 'NJ', 'DE']
bizPhil = bizGdf[bizGdf['state'].isin(philly_states)]

In [None]:
# Map every business in the PA/NJ/DE subset on top of a web-tile basemap.
fig, ax = plt.subplots(figsize=(20, 20))

# Reproject to Web Mercator (EPSG:3857) so the points line up with the basemap tiles.
bizPhil.to_crs('EPSG:3857').plot(ax=ax, color='red')
ctx.add_basemap(ax)
ax.set_title('Yelp businesses in PA, NJ and DE')

# Create the output folder if it is missing so savefig does not fail on a fresh checkout.
os.makedirs('images', exist_ok=True)
fig.savefig('images/yelp_philly.png', dpi=300)

In [None]:
# Map every business in the full Yelp dataset on top of a web-tile basemap.
fig, ax = plt.subplots(figsize=(20, 20))

# Reproject to Web Mercator (EPSG:3857) so the points line up with the basemap tiles.
bizGdf.to_crs('EPSG:3857').plot(ax=ax, color='red')
ctx.add_basemap(ax)
ax.set_title('Yelp businesses: full dataset')

# Create the output folder if it is missing so savefig does not fail on a fresh checkout.
os.makedirs('images', exist_ok=True)
fig.savefig('images/yelp_country.png', dpi=300)

## Bring in reviews for Philadelphia area

<div class="alert alert-block alert-danger">
Reset kernel before running loop! Takes 30+ mins to run
</div>

In [None]:
# Stream the review file chunk by chunk instead of loading it in one call.
reader = pd.read_json('/Users/harper/Documents/GitHub/yelp_dataset/yelp_academic_dataset_review.json',
                      orient='records', lines=True, chunksize=1000)

# Expected review columns; they lead the output and define the empty result.
colnames = ['review_id','user_id','business_id','stars','useful','funny','cool','text','date']

# Business ids to keep, built once rather than on every chunk.
phil_ids = set(bizPhil.business_id)

# Keep only reviews whose business_id belongs to the PA/NJ/DE business subset.
# Matching pieces are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies every previously kept row on
# each iteration.
matches = []
for chunk in reader:
    subset = chunk[chunk['business_id'].isin(phil_ids)]
    if not subset.empty:
        matches.append(subset)

if matches:
    reviews = pd.concat(matches)
    # Put the expected columns first and keep any unexpected ones after them.
    extra_cols = [c for c in reviews.columns if c not in colnames]
    reviews = reviews.reindex(columns=colnames + extra_cols)
else:
    # Nothing matched (or the file was empty): fall back to an empty frame.
    reviews = pd.DataFrame(columns=colnames)



In [None]:
# Persist the filtered reviews as a pickle.
reviews.to_pickle(path='data/reviewsPhil.pandas')

In [None]:
# Also write the filtered reviews as CSV (the index is written as the first column).
reviews.to_csv(path_or_buf='data/reviewsPhil.csv')

In [None]:
# Smoke test only -- do not run as part of the pipeline.
# Takes at most `limit` chunks from the current `reader` and appends the rows
# whose business_id is in bizPhil to `reviews`.
limit = 10
for _chunk_no, chunk in zip(range(limit), reader):
    keep = chunk['business_id'].isin(bizPhil.business_id)
    reviews = pd.concat([reviews, chunk[keep]])

## Gay Businesses in Philadelphia

First, bring in Michael's list of the names of gay businesses in Philadelphia. Then subset the business DataFrame for business names in the gay business list. Then use that subset to bring in reviews only for those businesses. Then pickle those reviews, bring them over to the NLP notebook, and run them through that script.

## Filter business JSON for gay businesses in Philadelphia

In [2]:
# Load the gay-business list; its business_id column drives the review filter below.
gaybiz = pd.read_csv(filepath_or_buffer='data/philly_gay_businesses.csv')

In [3]:
# Stream the review file chunk by chunk instead of loading it in one call.
reader = pd.read_json('/Users/harper/Documents/GitHub/yelp_dataset/yelp_academic_dataset_review.json',
                      orient='records', lines=True, chunksize=10000)

# Expected review columns; they lead the output and define the empty result.
colnames = ['review_id','user_id','business_id','stars','useful','funny','cool','text','date']

# Business ids to keep, built once rather than on every chunk.
gay_ids = set(gaybiz.business_id)

# Keep only reviews whose business_id appears in the gay-business list.
# Matching pieces are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies every previously kept row on
# each iteration.
matches = []
for chunk in reader:
    subset = chunk[chunk['business_id'].isin(gay_ids)]
    if not subset.empty:
        matches.append(subset)

if matches:
    reviews = pd.concat(matches)
    # Put the expected columns first and keep any unexpected ones after them.
    extra_cols = [c for c in reviews.columns if c not in colnames]
    reviews = reviews.reindex(columns=colnames + extra_cols)
else:
    # Nothing matched (or the file was empty): fall back to an empty frame.
    reviews = pd.DataFrame(columns=colnames)

In [4]:
# Persist the gay-business reviews as a pickle.
reviews.to_pickle(path='data/gay_reviews_phil.pandas')

In [5]:
# Also write the gay-business reviews as CSV (the index is written as the first column).
reviews.to_csv(path_or_buf='data/gay_reviews_phil.csv')

In [None]:
# Smoke test only -- do not run as part of the pipeline.
# Takes at most `limit` chunks from the current `reader` and appends the rows
# for the listed gay businesses to `reviews`.
# NOTE: `reader` is a one-pass iterator. If the full loop above has already
# consumed it, re-create the reader first or this loop body never executes.
limit = 10
for _chunk_no, chunk in zip(range(limit), reader):
    # Filter on the gay-business ids; this section never defines bizPhil.
    subset = chunk[chunk['business_id'].isin(gaybiz.business_id)]
    reviews = pd.concat([reviews, subset])