In [1]:
%matplotlib inline
import numpy as np
import seaborn as sns
import pandas as pd
import csv
import matplotlib.pyplot as plt
from scipy import stats
import plotly.express as px
import datetime
import gmaps
from config import g_key
gmaps.configure(api_key = g_key)


# Load the combined inspection / Yelp / Google export.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; `on_bad_lines="warn"`
# keeps the previous behaviour (malformed rows are skipped and reported as warnings).
data = pd.read_csv("Resources/yelp_output.csv", on_bad_lines="warn", encoding="UTF-16LE")

# Columns carried over from the raw export
yelp_ratings = data[[
    "Restaurant Name", "Street Address", "Lat", "Long", "Inspection Score",
    "yelp_rating", "Rating", "Rating Count", "yelp_review_count",
    "Inspection Date", "high_match_score_flag",
]]

# Give the columns that need it a readable, source-specific name
df = yelp_ratings.rename(columns={
    "Inspection Score": "Health Inspection Score",
    "yelp_rating": "Yelp Rating",
    "Rating": "Google Rating",
    "Rating Count": "Google Review Count",
    "yelp_review_count": "Yelp Review Count",
    "high_match_score_flag": "Flag",
})

# Rating / review-count columns that must hold a finite number (drops NaN and +/-inf rows).
things_to_fix = ["Yelp Rating", "Google Rating", "Google Review Count", "Yelp Review Count"]

# One combined mask instead of re-filtering per column; `.copy()` makes `df` an independent
# frame so adding columns below does not trigger SettingWithCopyWarning.
df = df[np.isfinite(df[things_to_fix]).all(axis=1)].copy()

# Parse the inspection date once and derive the year / month columns from it
inspection_dates = pd.DatetimeIndex(df["Inspection Date"])
df["Inspection Year"] = inspection_dates.year
df["Inspection Month"] = inspection_dates.month

# Keep only rows whose flag equals 1
# NOTE(review): presumably these are the high-confidence name/address matches — confirm.
ideal_df = df["Flag"] == 1
df = df[ideal_df]
df.head()

Unnamed: 0,Restaurant Name,Street Address,Lat,Long,Health Inspection Score,Yelp Rating,Google Rating,Google Review Count,Yelp Review Count,Inspection Date,Flag,Inspection Year,Inspection Month
1,A & J BAKERY,3515 OAK LAWN AVE,32.81238,-96.805424,92,3.0,4.0,60.0,55.0,4/30/19,1.0,2019,4
5,A SANDWICH SHOP,8333 DOUGLAS AVE #C140,32.864038,-96.809478,92,3.0,5.0,1.0,2.0,1/28/19,1.0,2019,1
6,A TASTE OF WAYS CATERING,2949 E KIEST BLVD #A,32.726334,-96.786419,97,4.5,4.6,97.0,6.0,12/11/19,1.0,2019,12
7,A TO Z FOOD MART,2120 N ST AUGUSTINE RD #130,32.749386,-96.65617,97,2.0,4.4,96.0,1.0,6/17/19,1.0,2019,6
10,A&A PIZZA,900 S CORINTH ST,32.73888,-96.79832,92,4.5,4.5,95.0,15.0,9/30/19,1.0,2019,9


In [11]:
# Question: is there a relationship between Yelp rating and health inspection score?
# According to our data, no (see the R, R2 and p values printed by this cell).

x = df["Yelp Rating"]
y = df["Health Inspection Score"]

# Scatter with an OLS trendline in red and a histogram of the x values along the margin
fig = px.scatter(
    df,
    x="Yelp Rating",
    y="Health Inspection Score",
    trendline="ols",
    marginal_x="histogram",
    trendline_color_override="red",
)

# Linear regression of inspection score on Yelp rating; only r and p are reported
gradient, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print(f"R Value is: {r_value}, R2 value is: {r_value**2}, and P Value is: {p_value}")

fig.write_image("output_data/fig1.png")

# Use the default renderer, like every other figure cell in this notebook, so the chart is
# displayed with the cell instead of being sent to a separate browser window.
fig.show()


R Value is: -0.028775749484799804, R2 value is: 0.0008280437584119562, and P Value is: 0.2352770068118278


In [3]:
# Question: is there a relationship between Google rating and health inspection score?
# According to our data, no.
# Google ratings tend to sit at the high end of the scale, whereas Yelp ratings are
# spread more evenly.
x, y = df["Google Rating"], df["Health Inspection Score"]

# Scatter with a red OLS trendline and a histogram of the Google ratings on the margin
fig = px.scatter(
    df,
    x="Google Rating",
    y="Health Inspection Score",
    trendline="ols",
    marginal_x="histogram",
    trendline_color_override="red",
)

# Linear regression of inspection score on Google rating
gradient, intercept, r_value, p_value, std_err = stats.linregress(x, y)
r_squared = r_value**2
print(f"R Value is: {r_value}, R2 value is: {r_squared}, and P Value is: {p_value}")

fig.write_image("output_data/fig2.png")
fig.show()

R Value is: -0.014917638777729827, R2 value is: 0.00022253594670282865, and P Value is: 0.5384261531862314


In [4]:
# Question: are ratings consistent across platforms, i.e. do Yelp and Google ratings agree?
# Answer: partly. In the recorded run R is about 0.49, so R2 is about 0.24: a straight-line
# fit on the Yelp rating accounts for roughly 24% of the variation in the Google rating.
# This describes an association between the two ratings, not one rating driving the other.
# The p-value is far below 0.05, so the correlation is statistically significant even
# though it is only moderate in strength.
x, y = df["Yelp Rating"], df["Google Rating"]

# Scatter of the two ratings with a red OLS trendline
fig = px.scatter(
    df,
    x="Yelp Rating",
    y="Google Rating",
    trendline="ols",
    trendline_color_override="red",
)

# Linear regression of Google rating on Yelp rating
gradient, intercept, r_value, p_value, std_err = stats.linregress(x, y)
r_squared = r_value**2
print(f"R Value is: {r_value}, R2 value is: {r_squared}, and P Value is: {p_value}")

fig.write_image("output_data/fig3.png")
fig.show()

R Value is: 0.4910335352272902, R2 value is: 0.24111393271781045, and P Value is: 4.8394366640952495e-104


In [5]:
# Average health inspection score per inspection month.
# Passing `df` straight to px.bar draws one stacked segment per row, so each bar showed the
# SUM of the scores for that month (mostly a count of inspections). Aggregate to the
# monthly mean first so the bar heights are comparable between months.
monthly_mean_score = (
    df.groupby("Inspection Month", as_index=False)["Health Inspection Score"].mean()
)

fig = px.bar(
    monthly_mean_score,
    x="Inspection Month",
    y="Health Inspection Score",
    title="Average Health Inspection Score by Inspection Month",
)
fig.show()

In [6]:
# Bubble chart of health inspection score against Yelp rating.
# Marker size follows the Yelp review count (capped at size_max) and the hover title is
# the restaurant name. The x axis is logarithmic.
# NOTE(review): a log x axis on a rating scale may be a template leftover (the Google
# version of this chart uses log_x=False) — confirm it is intended.
fig = px.scatter(
    df,
    x="Yelp Rating",
    y="Health Inspection Score",
    size="Yelp Review Count",
    hover_name="Restaurant Name",
    log_x=True,
    size_max=60,
)

fig.write_image("output_data/fig4.png")
fig.show()

In [7]:
# Bubble chart of health inspection score against Google rating.
# Marker size follows the Google review count (capped at size_max) and the hover title is
# the restaurant name. The x axis is linear.
fig = px.scatter(
    df,
    x="Google Rating",
    y="Health Inspection Score",
    size="Google Review Count",
    hover_name="Restaurant Name",
    log_x=False,
    size_max=60,
)

fig.write_image("output_data/fig5.png")
fig.show()

In [8]:
def _ask_number(prompt, cast):
    """Ask the user for a number until the reply can be parsed.

    prompt: text shown by ``input``.
    cast:   callable (``int`` or ``float`` here) that converts the reply and raises
            ``ValueError`` when the text is not a valid number.

    Returns the converted value. A bad reply no longer aborts the cell with a traceback;
    the question is simply asked again.
    """
    while True:
        reply = input(prompt)
        try:
            return cast(reply)
        except ValueError:
            print(f"'{reply}' is not a valid number, please try again.")


# User chosen conditions
inspectionInput = _ask_number("What is the minimum health inspection score that you consider to be acceptable? ", int)
googleRatingInput = _ask_number("From 1-5, what do you consider to be a good rating on Google?", float)
yelpRatingInput = _ask_number("From 1-5, what do you consider to be a good rating on Yelp? ", float)

# Apply filters: each mask marks the rows that meet one of the user's minimums
inspectionScore = df['Health Inspection Score'] >= inspectionInput
googleRating = df['Google Rating'] >= googleRatingInput
yelpRating = df['Yelp Rating'] >= yelpRatingInput

# A restaurant is kept only when it satisfies all three conditions
narrowed_restaurant = df[inspectionScore & googleRating & yelpRating]

narrowed_restaurant

What is the minimum health inspection score that you consider to be acceptable? 90
From 1-5, what do you consider to be a good rating on Google?4.5
From 1-5, what do you consider to be a good rating on Yelp? 4.5


Unnamed: 0,Restaurant Name,Street Address,Lat,Long,Health Inspection Score,Yelp Rating,Google Rating,Google Review Count,Yelp Review Count,Inspection Date,Flag,Inspection Year,Inspection Month
6,A TASTE OF WAYS CATERING,2949 E KIEST BLVD #A,32.726334,-96.786419,97,4.5,4.6,97.0,6.0,12/11/19,1.0,2019,12
10,A&A PIZZA,900 S CORINTH ST,32.738880,-96.798320,92,4.5,4.5,95.0,15.0,9/30/19,1.0,2019,9
15,ABUGEDA GROCERY,12101 GREENVILLE AVE #107,32.913151,-96.746146,94,5.0,4.7,1096.0,1.0,11/14/19,1.0,2019,11
83,ARIF CAFE,8367 PARK LN,32.871317,-96.760545,92,4.5,4.9,20.0,94.0,12/4/19,1.0,2019,12
115,BAILEY'S CAFE,2525 INWOOD RD #123,32.822093,-96.836398,94,4.5,5.0,5.0,69.0,11/28/18,1.0,2018,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2723,WESTLAKE BREWING COMPANY,2816 COMMERCE ST,32.783065,-96.782355,100,4.5,5.0,60.0,20.0,12/3/19,1.0,2019,12
2809,YENAT GUADA ETHIOPIAN CUISINE,7015 GREENVILLE AVE #300,32.873853,-96.763575,95,4.5,4.5,306.0,140.0,12/19/19,1.0,2019,12
2813,YOGURTLAND,13350 DALLAS PKWY #925,32.929022,-96.820039,100,4.5,4.5,651.0,6.0,2/5/19,1.0,2019,2
2816,YOKOHAMA JAPANESE RESTAURANT,19009 PRESTON RD 115,33.006425,-96.796399,94,4.5,4.6,476.0,444.0,1/9/20,1.0,2020,1


In [9]:
# Keep only the restaurants whose ratings are backed by at least this many reviews
# on both Yelp and Google.
min_reviews = 200

better_yelp = narrowed_restaurant["Yelp Review Count"].ge(min_reviews)
better_google = narrowed_restaurant["Google Review Count"].ge(min_reviews)

recommendations = narrowed_restaurant.loc[better_yelp & better_google]
recommendations.head()

Unnamed: 0,Restaurant Name,Street Address,Lat,Long,Health Inspection Score,Yelp Rating,Google Rating,Google Review Count,Yelp Review Count,Inspection Date,Flag,Inspection Year,Inspection Month
195,BLUES BURGERS,1820 W MOCKINGBIRD LN #44,32.821696,-96.85411,92,4.5,4.6,711.0,436.0,9/13/19,1.0,2019,9
389,CHEESESTEAK HOUSE,2015 W DAVIS ST,32.749946,-96.851552,92,4.5,4.6,934.0,330.0,10/10/18,1.0,2018,10
614,DEEP ELLUM BREWING COMPANY,2821 ST LOUIS ST,32.780772,-96.781694,98,4.5,4.6,680.0,484.0,8/22/19,1.0,2019,8
615,DEEP ELLUM BREWING COMPANY (IN-HOUSE CATERING),2823 ST. LOUIS ST,32.780751,-96.781738,100,4.5,4.6,680.0,484.0,10/13/16,1.0,2016,10
703,E-BAR TEX-MEX,1901 HASKELL AVE #100,32.801355,-96.785775,95,4.5,4.5,1264.0,643.0,12/16/19,1.0,2019,12


In [10]:
# Coordinates of every recommended restaurant, plus the heat-layer weight
# (the health inspection score cast to int).
locations = recommendations[["Lat", "Long"]]
rating = recommendations["Health Inspection Score"].astype(int)

# Base map and a heat layer weighted by health inspection score
fig = gmaps.figure()

heat_layer = gmaps.heatmap_layer(
    locations,
    weights=rating,
    dissipating=True,
    max_intensity=100,
    point_radius=40,
)

# HTML template for each marker's info box; placeholders are DataFrame column names
info_box_template = """
<dl>
<dt>Restaurant</dt><dd>{Restaurant Name}</dd>
<dt>Address</dt><dd>{Street Address}</dd>
<dt>Google Rating</dt><dd>{Google Rating}</dd>
<dt>Yelp Rating</dt><dd>{Yelp Rating}</dd>
<dt>Health Inspection Score</dt><dd>{Health Inspection Score}</dd>
</dl>
"""

# Plain-text template used as each marker's hover text
info_box_template2 = '''
Restaurant: {Restaurant Name}
Address: {Street Address}
Google Rating: {Google Rating}
Yelp Rating : {Yelp Rating}
Health Inspection Score: {Health Inspection Score}
'''

# Walk the recommendations once and fill both templates from each row
marker_rows = [row for _, row in recommendations.iterrows()]
restaurant_info = [info_box_template.format(**row) for row in marker_rows]
restaurant_info2 = [info_box_template2.format(**row) for row in marker_rows]

# One marker per restaurant, carrying the info box content and the hover text
markers = gmaps.marker_layer(
    locations,
    info_box_content=restaurant_info,
    hover_text=restaurant_info2,
)

# Display Map: heat layer first, then the markers
fig.add_layer(heat_layer)
fig.add_layer(markers)

print("Here are restaurants in Dallas that satisfy your conditions.")
fig

Here are restaurants in Dallas that satisfy your conditions.


Figure(layout=FigureLayout(height='420px'))