[Back](https://keqideng.github.io/data_analysis_portfolio_project/)
# Chinese Eateries Analysis
Date: Sept 27, 2021
Prepared by ***Keqi Deng***

>This dataset is made available by the [Yelp Fusion API](https://www.yelp.com/developers); use of the dataset is subject to Yelp's terms and conditions.

In [191]:
import gc # garbage collector
import numpy as np # linear algebra
from collections import Counter # for counting commong words
import requests
# request data from yelp
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # visualization
plt.style.use('fivethirtyeight') # use ggplot ploting style
import seaborn as sns # visualization
from wordcloud import WordCloud, STOPWORDS # this module is for making wordcloud in python
import re # regular expression
import string # for finding punctuation in text
import nltk # preprocessing text
from textblob import TextBlob
# import ploty for visualization
import plotly
import plotly.offline as py # make offline
py.init_notebook_mode(connected=True)
import plotly.tools as tls
import plotly.graph_objs as go
from plotly.graph_objs import *
import plotly.tools as tls
import plotly.figure_factory as fig_fact
# remove warnings
import warnings
warnings.filterwarnings('ignore')
# this will allow ploting inside the notebook
%matplotlib inline

## Data Source
First, use the Yelp Fusion API to pull data from Yelp for further analysis.

The selection criterion is Chinese restaurants in New York City.

In [192]:
def yelp_business_search(offset=0, location = 'NYC', lim = 50, term = 50, endpoint = 'https://api.yelp.com/v3/businesses/search'):
    """Fetch one page of Yelp business-search results as a DataFrame.

    Sends a single authenticated GET request to ``endpoint`` and converts the
    ``businesses`` list of the JSON reply into a ``pandas.DataFrame`` (one row
    per business).

    Parameters
    ----------
    offset : int
        Sent as the ``offset`` query parameter (used by callers to page
        through results).
    location : str
        Sent as the ``location`` query parameter.
    lim : int
        Sent as the ``limit`` query parameter (page size).
    term : int or str
        Sent as the ``term`` query parameter (the search keyword).
        NOTE(review): the default is the integer 50 rather than a cuisine
        keyword such as 'Chinese' -- this looks unintended; confirm and pass
        ``term`` explicitly if so. The default is left untouched here so
        existing callers keep their current behaviour.
    endpoint : str
        URL of the business-search endpoint.

    Returns
    -------
    pandas.DataFrame
        The contents of the ``businesses`` key of the response.

    Raises
    ------
    requests.exceptions.HTTPError
        If the API answers with a 4xx/5xx status (bad key, rate limit, ...).
    requests.exceptions.Timeout
        If the server does not answer within the timeout.
    KeyError
        If a successful response carries no ``businesses`` key.
    """
    import os  # local import: only needed for the credential lookup below

    # Prefer a key supplied through the YELP_API_KEY environment variable so
    # the credential does not have to live in the notebook. The embedded key
    # is kept only as a fallback so existing runs keep working.
    # This is a private api key, please get your own api key by access yelp
    # fusion api website.
    # NOTE(review): a key committed to source should be rotated and removed.
    api_key = os.environ.get(
        'YELP_API_KEY',
        'hC90EyLZwRlSqQGxIzbg_t8CAfz0aFyIovi270VIjCqEGHBx8xXyPGW4HUQoIh2ELaArE7Lw3zIM9aLkJQkuREJs5S4G1HtBx43eFCpvnP8IebL-uZk2CBKfFqxQYXYx',
    )

    headers = {'Authorization': f'bearer {api_key}'}

    # Query-string parameters; the search radius is fixed at 40000
    # (metres according to the Yelp API documentation).
    params = {
        'term': term,
        'limit': lim,
        'offset': offset,
        'location': location,
        'radius': 40000,
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url=endpoint, params=params, headers=headers, timeout=30)

    # Fail loudly on HTTP errors instead of hitting a confusing
    # KeyError('businesses') on the error payload further down.
    response.raise_for_status()

    # Convert json string to dictionary
    business_data = response.json()
    return pd.DataFrame(business_data['businesses'])

Here we select 200 top-rated Chinese restaurants near New York City (four pages of 50 results each).

In [193]:
# Fetch the first page, then the pages starting at offsets 50, 100 and 150,
# and stack all of them into a single frame in one concatenation.
pages = [yelp_business_search()]
pages.extend(yelp_business_search(start) for start in range(50, 200, 50))
df = pd.concat(pages)

## Dataset Modification
> Modify and prepare the datasets for further analysis and mapping.

In [194]:
# Flatten the nested "coordinates" dicts into two plain columns.
coordinates = df['coordinates']
lat, lon = [], []
for point in coordinates:
    lat.append(point['latitude'])
    lon.append(point['longitude'])
df['latitude'], df['longitude'] = lat, lon

# The nested column is no longer needed once it has been unpacked.
del df['coordinates']

# Pull the address parts of interest out of the nested "location" dicts;
# each pair is (new column name, key inside the location dict).
for column, key in (('city', 'city'), ('state', 'state'), ('postal', 'zip_code')):
    df[column] = [address[key] for address in df['location']]

In [195]:
# Work on an independent copy of the prepared frame; resetting the index
# moves the old row labels into an "index" column and renumbers the rows.
yelp_df = df.copy()
yelp_df = yelp_df.reset_index()

# Preview the first record.
print(yelp_df.iloc[:1])

   index                      id                         alias  \
0      0  GYO8lvStXGZAfFDRlOxQ1Q  hotel-50-bowery-nyc-new-york   

                  name                                          image_url  \
0  Hotel 50 Bowery NYC  https://s3-media3.fl.yelpcdn.com/bphoto/opMnr9...   

   is_closed                                                url  review_count  \
0      False  https://www.yelp.com/biz/hotel-50-bowery-nyc-n...           200   

                                          categories  rating  ... price  \
0  [{'alias': 'hotels', 'title': 'Hotels'}, {'ali...     4.5  ...    $$   

                                            location         phone  \
0  {'address1': '50 Bowery', 'address2': None, 'a...  +12125088000   

    display_phone     distance  latitude  longitude      city state postal  
0  (212) 508-8000  1202.705247  40.71599  -73.99683  New York    NY  10013  

[1 rows x 21 columns]


In [196]:
# Number of rows (restaurants) in the dataset:
yelp_df.shape[0]

200

In [197]:
# Centre the map on the average latitude/longitude of all restaurants.
lat_mean = yelp_df['latitude'].mean()
lon_mean = yelp_df['longitude'].mean()

# Restaurant names are used as the hover text of the markers.
hov_label = yelp_df['name']

# Mapbox access token.
# This is a private token key; please obtain your own token from Mapbox.
tkn = 'pk.eyJ1IjoicGF0cmlja2RkZCIsImEiOiJja3R5dGluOWEzNzE3MzFvMzR0MjRlZWVtIn0.ezHYtubmTTHs1z2n11c7yQ'

# One marker per restaurant.
markers = Scattermapbox(
    lat=yelp_df['latitude'].tolist(),
    lon=yelp_df['longitude'].tolist(),
    mode='markers',
    text=hov_label,
)
data = Data([markers])

# Map view settings: token, camera orientation, centre point and base style.
map_view = {
    'accesstoken': tkn,
    'bearing': 0,
    'center': {'lat': lat_mean, 'lon': lon_mean},
    'style': 'light',
    'pitch': 0,
    'zoom': 10,
}
layout = Layout(
    title='Chinese Restaurants on Yelp',
    autosize=True,
    hovermode='closest',
    mapbox=map_view,
)

# Render the interactive figure inline in the notebook.
fig = {'data': data, 'layout': layout}
plotly.offline.iplot(fig, filename='mapbox')