In [None]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
import numpy as np

In [None]:
# Load the cleaned restaurant CSV from the project's data folder
yelp_csv_path = 'data/yelp_csv/yelp_csv_CLEAN.csv'
restaurants3_df = pd.read_csv(yelp_csv_path)

In [None]:
# Preview the first five rows to check that the columns loaded as expected
restaurants3_df.head(5)

In [None]:
# Group the restaurants by zip code, keeping only the two columns analysed below
zip_metric_columns = ['rating', 'price_integer']
by_zip = restaurants3_df.groupby(['zip code'])[zip_metric_columns]

In [None]:
# Summary statistics (pandas `describe`) of rating and price_integer for each zip code;
# the result is kept in `zip_stats` and displayed as the cell output
zip_stats = by_zip.describe()
zip_stats

In [None]:
# Mean rating and mean price_integer per zip code.
# Zip codes where either mean is missing are discarded so the regression below gets complete pairs.
zip_means = by_zip.mean().dropna()
zip_means

In [None]:
# Define x and y axis: per-zip-code mean price (x) against per-zip-code mean rating (y)
x_axis = zip_means['price_integer']
y_axis = zip_means['rating']

# Perform linear regression of average rating on average price.
# rvalue and reg_line are reused by the plotting cell that follows.
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_axis,y_axis)
reg_line = slope * x_axis + intercept

# Build a readable equation string. The sign is chosen explicitly so a negative
# intercept reads "y = 0.3x - 1.2" rather than "y = 0.3x + -1.2".
intercept_sign = '+' if intercept >= 0 else '-'
line_eq = f'y = {round(slope,2)}x {intercept_sign} {round(abs(intercept),2)}'
line_eq

In [None]:
# Pearson correlation between per-zip-code average price and average rating
corr = st.pearsonr(x_axis,y_axis)
print(f'Correlation coefficient: {round(corr[0],2)}')

# Scatter of the zip-code averages with the fitted regression line drawn on top
plt.scatter(x_axis,y_axis,color='steelblue');
plt.plot(x_axis,reg_line, color = 'darkred');
plt.ylabel('Average Rating');
plt.xlabel('Average Dollar Signs');
# Axis limits are fixed; any zip code whose averages fall outside them is not shown
plt.ylim(2.2,4)
plt.xlim(.95,2.2)
# Descriptive title so the figure can be read on its own (previously an empty string)
plt.title('Average Rating vs. Average Price by Zip Code')

# Annotate with R^2 value (rvalue is produced by the linear-regression cell above)
plt.annotate(f'R-squared: {round((rvalue)**2,2)}',(1.1,2.5),color='darkred');

# plt.savefig('images/avg_price_rating_scatter.png')

In [None]:
# Rating multiplied by review count for every restaurant; summing this per group
# later gives the numerator of a review-weighted average rating
rating_x_reviews = restaurants3_df['rating'] * restaurants3_df['review_count']
restaurants3_df['rating_x_reviews'] = rating_x_reviews
restaurants3_df

In [None]:
# Group restaurants by (zip code, category) pair
category_group_keys = ['zip code', 'category_title']
by_category = restaurants3_df.groupby(category_group_keys)

In [None]:
# Keys shared by every per-group table built in this cell
merge_keys = ['zip code', 'category_title']
rating_and_price = by_category[['rating', 'price_integer']]

# Mean and non-null count of rating and price for each (zip code, category) group,
# with the group keys moved back into ordinary columns
restaurants_means = rating_and_price.mean().reset_index()
restaurants_counts = rating_and_price.count().reset_index()

# Group totals of 'rating_x_reviews' and 'review_count' (for the weighted average).
# The group keys stay in the index here; the merge below matches them by level name.
restaurants_sums = by_category[['rating_x_reviews', 'review_count']].sum()

# Merge the three tables. The overlapping 'rating' / 'price_integer' columns receive
# pandas' default suffixes: _x for the means, _y for the counts.
restaurants4_df = restaurants_means.merge(restaurants_counts, on=merge_keys)
restaurants5_df = restaurants4_df.merge(restaurants_sums, on=merge_keys)

In [None]:
# Replace the merge suffixes with descriptive names: _x columns hold means, _y columns hold counts
suffix_to_label = {
    'rating_x': 'rating_mean',
    'price_integer_x': 'price_mean',
    'rating_y': 'rating_count',
    'price_integer_y': 'price_count',
}

# Rename, drop groups with any missing value, then append the review-weighted average rating
# (sum of rating * review_count divided by the sum of review_count) as the last column
restaurants5_df = (
    restaurants5_df
    .rename(columns=suffix_to_label)
    .dropna()
    .assign(wgt_avg_rating=lambda grp: grp['rating_x_reviews'] / grp['review_count'])
)

In [None]:
# Keep the 'Pizza' rows whose zip code has more than 5 rated pizza restaurants
is_pizza = restaurants5_df['category_title'] == 'Pizza'
has_enough_ratings = restaurants5_df['rating_count'] > 5
best_pizza = restaurants5_df.loc[is_pizza & has_enough_ratings, :]

# Rank those zip codes by plain (unweighted) mean rating, best first
best_pizza_simple = best_pizza.sort_values('rating_mean', ascending=False)
best_pizza_simple.head()

In [None]:
# Same pizza zip codes, ranked by the review-weighted average rating instead (best first)
best_pizza_wgt = best_pizza.sort_values('wgt_avg_rating', ascending=False)
best_pizza_wgt

In [None]:
# Retain the top 10 zip codes for pizza (ranked by simple mean rating) and the columns to plot.
# Columns are selected by name rather than by position, so the chart keeps working if the
# column order of the upstream merged frame ever changes.
pizza_chart_columns = ['zip code', 'rating_mean', 'wgt_avg_rating']
pizza_chart = best_pizza_simple.head(10)[pizza_chart_columns]

# Plot grouped bar chart: simple vs. review-weighted average rating per zip code
pizza_chart.plot(kind='bar',x='zip code',y=['rating_mean','wgt_avg_rating'],
                 color=['steelblue','goldenrod'],width=.6);
plt.title('Top 10 Best Zip Codes for Pizza');
# Legend labels follow the order of the y columns passed to plot()
labels = ['Average Rating', 'Weighted Average Rating']
plt.legend(loc='lower right',labels=labels);
plt.ylim(0,4.5)
plt.xlabel('Zip Code');
plt.ylabel('Rating');
# plt.savefig('images/pizza_rating_zips.png')

In [None]:
# Preview the first five rows again; the frame now also carries the 'rating_x_reviews' column
restaurants3_df.head(5)

In [None]:
# Aggregations computed for every restaurant category
category_aggregations = {
    'rating': ['count', 'mean'],
    'review_count': 'sum',
    'rating_x_reviews': 'sum',
    'price_integer': 'mean',
}

# One row per category, ordered from the most to the fewest rated restaurants
by_type = (
    restaurants3_df
    .groupby(['category_title'])
    .agg(category_aggregations)
    .sort_values(by=[('rating', 'count')], ascending=False)
)

# Review-weighted average rating per category
total_weighted_rating = by_type[('rating_x_reviews', 'sum')]
total_reviews = by_type[('review_count', 'sum')]
by_type['wgt_avg_rating'] = total_weighted_rating / total_reviews
by_type.head()

In [None]:
# Keep the first ten rows of by_type (already sorted by restaurant count) and only its
# first column, ('rating', 'count'); the category name moves from the index into a column
categories_df = pd.DataFrame(by_type.iloc[:10, 0]).reset_index()

# Bar positions, tick labels and bar heights
x_labels = categories_df['category_title']
y_axis = categories_df[('rating', 'count')]
x_axis = np.arange(len(x_labels))

# First bar is highlighted in goldenrod; the remaining nine are steelblue
bar_colors = ['goldenrod'] + ['steelblue'] * 9

# Make bar chart
plt.xticks(x_axis, x_labels, rotation=90)
plt.bar(x=x_axis, height=y_axis, color=bar_colors)
plt.title('Top 10 Categories by Number of Restaurants')
plt.xlim(-.6, 9.6);
# plt.savefig('images/number_restaurants_by_type.png')