In [None]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st

In [None]:
# Relative location of the Yelp CSV that this notebook cleans.
path = 'yelp_csv/yelp_csv_v2.csv'

# Read the whole file, then preview the first five rows as a sanity check.
restaurants0_df = pd.read_csv(filepath_or_buffer=path)
restaurants0_df.head(n=5)

In [None]:
# Discard the 'Unnamed: 0' column and display the resulting frame.
# NOTE(review): presumably a row index that was saved along with the CSV — confirm.
restaurants1_df = restaurants0_df.drop('Unnamed: 0', axis='columns')
restaurants1_df

In [None]:

# Number of distinct restaurant ids before de-duplication
# (dropna=False keeps a missing id counted as one value, as unique() does).
restaurants1_df['restaurant_id'].nunique(dropna=False)

In [None]:
# Keep a single row per restaurant_id; when an id repeats, the last occurrence wins.
restaurants2_df = restaurants1_df.drop_duplicates(subset=['restaurant_id'], keep='last')

In [None]:
# Non-null value count for every column after de-duplication.
restaurants2_df.count()

In [None]:
# Check how many restaurants fall in each zip code of the de-duplicated dataframe
restaurants2_df['zip code'].value_counts()

In [None]:
# List of Chicago zip codes comes from https://zipcode.org/city/IL/CHICAGO
# Every entry is stored as a string (not an integer).
chi_zips = ['60701','60696','60693','60691','60690','60685','60684','60699',
            '60697','60695','60694','60689','60688','60687','60686','60682',
            '60681','60680','60678','60675','60674','60670','60669','60677',
            '60673','60668','60666','60659','60657','60655','60654','60651',
            '60649','60645','60644','60640','60639','60664','60661','60660',
            '60656','60653','60652','60647','60646','60643','60642','60641',
            '60638','60634','60633','60631','60637','60636','60632','60630',
            '60629','60626','60625','60621','60620','60617','60616','60613',
            '60612','60609','60628','60624','60623','60622','60619','60618',
            '60615','60614','60611','60610','60608','60607','60606','60605',
            '60604','60603','60602','60601','60290']

In [None]:
# Remove restaurants not in a Chicago zip code.
# chi_zips holds strings, so the comparison is done on the string form of the
# column: a zip code that pandas parsed as an integer (e.g. 60601) would
# otherwise never match and its row would be silently dropped. The values
# stored in the frame are left untouched.
is_chicago_zip = restaurants2_df['zip code'].astype(str).isin(chi_zips)

# .copy() yields an independent frame, so adding columns to it later does not
# trigger a SettingWithCopyWarning.
restaurants3_df = restaurants2_df.loc[is_chicago_zip, :].copy()

In [None]:
# Check the count of non-missing values per column after the Chicago zip code filter
restaurants3_df.count()

In [None]:
# Number of distinct restaurant ids left after the Chicago filter
# (dropna=False keeps a missing id counted as one value, as unique() does).
restaurants3_df['restaurant_id'].nunique(dropna=False)

In [None]:
# Renumber the rows from 0 and discard the old index labels, then preview the frame.
restaurants3_df = restaurants3_df.reset_index(drop=True)
restaurants3_df.head()

In [None]:
# Inspect the dtype pandas assigned to each column.
restaurants3_df.dtypes

In [None]:
# Translate the "$"-style price strings into a numeric level from 1 to 4.
# Any other value, including a missing price, maps to NaN, which is why the
# column is kept as float rather than int.
price_levels = {"$": 1, "$$": 2, "$$$": 3, "$$$$": 4}
restaurants3_df['price_integer'] = restaurants3_df['price'].map(price_levels).astype(float)

In [None]:
# Preview the frame; it should now include the price_integer column.
restaurants3_df.head()

In [None]:
# How many restaurants have each rating value.
restaurants3_df['rating'].value_counts()

In [None]:
# Save the cleaned frame to disk.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if readers of this file do not expect that column.
restaurants3_df.to_csv('yelp_csv/yelp_csv_CLEAN.csv')

In [None]:
# Summary statistics of the rating column for each zip code, rounded to one decimal.
ratings_df = restaurants3_df.groupby("zip code")

mean_rating = ratings_df['rating'].mean().round(1)
median_rating = ratings_df['rating'].median().round(1)
std_rating = ratings_df['rating'].std().round(1)
var_rating = ratings_df['rating'].var().round(1)

# The column order of the summary table follows the insertion order of this dict.
summary_stats = {
    "Mean": mean_rating,
    "Median": median_rating,
    "Variance": var_rating,
    "Std Deviation": std_rating,
}

# Assemble the table and nest all four columns under a single top-level header.
summ_stats_df = pd.DataFrame(summary_stats)
summ_stats_df.columns = pd.MultiIndex.from_product(
    [['Ratings Summary Stats'], list(summary_stats)]
)
summ_stats_df.head()

In [None]:
# Row count per zip code (rows whose zip code is non-null), as a one-column frame.
# Reuses the ratings_df groupby object built in the summary-statistics cell.
zip_codes = ratings_df['zip code'].count().to_frame()
zip_codes.head()

In [None]:
# How many restaurants share each review_count value.
restaurants3_df['review_count'].value_counts()

In [None]:
# Frequency of each rating value among the Chicago restaurants.

# NOTE(review): this repeats the rating value_counts shown in an earlier cell.
restaurants3_df["rating"].value_counts()

In [None]:
# Average of every numeric column for each price level.
# numeric_only=True is needed because the frame still holds text columns
# (e.g. 'price'); on pandas >= 2.0 mean() raises a TypeError for them instead
# of silently skipping them.
# The empty 15x10 figure this cell used to create was never drawn on, so it is
# no longer created here.
place_hold = restaurants3_df.groupby(['price_integer']).mean(numeric_only=True)
place_hold

In [None]:
# NOTE(review): marker_size is not referenced below; the scatter call hard-codes s=175.
marker_size = 15

# Work on the current axes (created on demand), which is what the pyplot
# helper functions do implicitly.
price_rating_ax = plt.gca()
price_rating_ax.scatter(
    restaurants3_df["price_integer"],
    restaurants3_df['rating'],
    s=175,
    color="blue",
)
price_rating_ax.set_title('Price Versus Rating', fontsize=25)
price_rating_ax.set_xlabel('Price', fontsize=14)
price_rating_ax.set_ylabel('Rating', fontsize=14)

plt.show()

In [None]:
# avg_zip is not created by any earlier cell of this notebook, so a fresh
# "Restart & Run All" stopped here with a NameError. Build it explicitly:
# one row per zip code holding the mean rating of its restaurants.
# NOTE(review): assumed reconstruction of the missing cell — confirm it matches
# the original intent.
# Zip codes are converted to numbers first because pearsonr needs numeric input
# and the column may have been read as text.
zip_numeric = pd.to_numeric(restaurants3_df['zip code']).rename('zip code')
avg_zip = restaurants3_df['rating'].groupby(zip_numeric).mean().reset_index()

# Pearson correlation coefficient (first element of the pearsonr result), 2 decimals.
corr = round(st.pearsonr(avg_zip['zip code'], avg_zip['rating'])[0], 2)
print(f"The correlation between zip code and rating is {corr}")

In [None]:
# Least-squares line of mean rating (y) against zip code (x).
x_values = avg_zip['zip code']
y_values = avg_zip['rating']

# linregress is reached through the `st` alias: the notebook only imports
# `scipy.stats as st`, so the bare name `linregress` raised a NameError.
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_values, y_values)

# Fitted y value for every x, used to draw the regression line.
regress_values = x_values * slope + intercept

print(f"slope:{slope}")
print(f"intercept:{intercept}")
print(f"rvalue (Correlation coefficient):{rvalue}")
# corr was computed with st.pearsonr in the previous cell.
print(f"pandas (Correlation coefficient):{corr}")
print(f"stderr:{stderr}")

# Human-readable equation of the fitted line, coefficients rounded to 2 decimals.
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

print(line_eq)

In [None]:
# Scatter of mean rating against zip code, with the fitted regression line
# and its equation drawn on the same axes.
fig1, ax1 = plt.subplots(figsize=(15, 10))
ax1.scatter(x_values, y_values, s=175, color="blue")
ax1.plot(x_values, regress_values, "r-")
ax1.set_title('Regression Plot of Zip Codes Vs. Rating', fontsize=20)
# The x axis carries zip codes (x_values), not prices; the label used to read 'Price'.
ax1.set_xlabel('Zip Code', fontsize=14)
ax1.set_ylabel('Average Rating', fontsize=14)
# No arrow is requested, so the text is placed by xytext (axes fraction) alone.
ax1.annotate(line_eq, xy=(20, 40), xycoords='data', xytext=(0.8, 0.95),
             textcoords='axes fraction', horizontalalignment='right',
             verticalalignment='top', fontsize=30, color="red")

print(f"The r-squared is: {rvalue**2}")

plt.show()

In [None]:
# Group the restaurants by zip code and show the mean rating of each group.
# Note: this rebinds mean_rating to the unrounded per-zip means.
zip_rating = restaurants3_df.groupby("zip code")

mean_rating = zip_rating['rating'].mean()
mean_rating

In [None]:
# Bar chart of the mean rating in each zip code.
mean_rating = zip_rating['rating'].mean()

# Series.plot.bar returns the matplotlib Axes it drew on; label that Axes directly.
plot_pandas = mean_rating.plot.bar(figsize=(15, 10), color='b', fontsize=14)

# The bars show the mean of the 'rating' column, so the labels say "Rating"
# (they used to read "Number of Ratings", with "Average" misspelled in the title).
plot_pandas.set_xlabel("Zip Codes", fontsize=14)
plot_pandas.set_ylabel("Average Rating", fontsize=14)
plot_pandas.set_title("Average Rating per Zip Code", fontsize=20)

plt.tight_layout()
plt.show()

# Show the plotted values as a table beneath the chart.
mean_rating