In [None]:
import os
from dotenv import load_dotenv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from matplotlib.colors import ListedColormap
from matplotlib.animation import FuncAnimation
import geocoder
import geopandas
import random

In [None]:
# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# MapQuest key used by the geocoding cell below. Indexing os.environ directly
# raises KeyError right here when the variable is not set.
MAPQUEST_API_KEY = os.environ['MAPQUEST_API_KEY']

In [None]:
# Raw ratings table. Later cells read its 'timestamp', 'city', 'state-prov',
# 'user' and 'rating' columns.
df = pd.read_csv('onebite_ratings.csv')

In [None]:
# Order the reviews by their 'timestamp' column, ascending.
df = df.sort_values('timestamp')

In [None]:
# Discard rows that repeat an earlier row in every column, keeping the first.
df = df.drop_duplicates(keep='first')

In [None]:
# Geocode every distinct (city, state/province) pair once and cache the
# (lat, lng) tuple under the string "city, state".
#
# Retries are bounded: an unbounded retry loop never terminates when a lookup
# fails permanently (bad key, unknown place, service returning no coordinates).
MAX_GEOCODE_ATTEMPTS = 5

citystate_to_latlng = {}
unique_locations = df[['city', 'state-prov']].drop_duplicates()
for city, state in unique_locations.itertuples(index=False, name=None):
    location_string = f'{city}, {state}'
    for attempt in range(1, MAX_GEOCODE_ATTEMPTS + 1):
        try:
            latlng = geocoder.mapquest(location_string, key=MAPQUEST_API_KEY).latlng
        except Exception as exc:
            # Request-level failure: report it and try again.
            print(f'{location_string}: attempt {attempt} failed ({exc})')
            continue
        if latlng:
            citystate_to_latlng[location_string] = tuple(latlng)
            break
        # The call returned, but without coordinates.
        print(f'{location_string}: attempt {attempt} returned no coordinates')
    else:
        # Leave the key out of the cache; rows for this location will have a
        # missing 'latlng' after the lookup and must be handled downstream.
        print(f'{location_string}: giving up after {MAX_GEOCODE_ATTEMPTS} attempts')

In [None]:
# "city, state" label for every review. Built with a plain zip over the two
# columns instead of a row-wise DataFrame.apply, which constructs a Series per
# row and is far slower for the same strings.
df['location'] = [
    f'{city}, {state}' for city, state in zip(df['city'], df['state-prov'])
]

In [None]:
# Attach the cached coordinates by looking each 'location' label up in the
# geocoding dictionary.
df = df.assign(latlng=df['location'].map(citystate_to_latlng))

In [None]:
# Split the reviews by author: the 'stoolpresidente' account versus everyone
# else. Only the fan subset is additionally restricted to ratings <= 10.0.
pres_df = df.loc[df['user'].eq('stoolpresidente')].reset_index(drop=True)
fans_df = df.loc[
    df['user'].ne('stoolpresidente') & df['rating'].le(10.0)
].reset_index(drop=True)

In [None]:
# Half-point-wide rating buckets with edges 0.0, 0.5, ..., 10.0.
bins = np.arange(0.0, 10.5, 0.5)


def _relative_frequencies(ratings):
    """Return the share of ``ratings`` falling in each bucket of ``bins``, in bucket order."""
    bucketed = pd.cut(ratings, bins=bins, include_lowest=True)
    return bucketed.value_counts(sort=False, normalize=True)


pres_rating_counts = _relative_frequencies(pres_df.rating)
fans_rating_counts = _relative_frequencies(fans_df.rating)

In [None]:
# Default size (inches) for every figure created after this cell.
plt.rc('figure', figsize=(15, 10))

In [None]:
# Overlaid bar charts of the per-bucket relative frequencies: fan ratings in
# solid red, pres ratings drawn on top in half-transparent blue.
plt.title('Distributions of Relative Frequencies of Pres Ratings & Fan Ratings')

plt.xticks(rotation = 45)

plt.bar(
    [str(x) for x in fans_rating_counts.index],
    fans_rating_counts.values,
    label='Relative Frequencies of Fan Ratings',
    color='red'
)

plt.bar(
    [str(x) for x in pres_rating_counts.index],
    pres_rating_counts.values,
    label='Relative Frequencies of Pres Ratings',
    color='blue',
    alpha = 0.5
)

# Small summary-statistics tables in the top-left corner, one per group.
# Each entry: (header, ratings, header x-position, table x-position) in
# figure coordinates.
summary_stats = ['mean', 'std', '25%', '50%', '75%']
summary_boxes = (
    ('fans', fans_df.rating, 0.18, 0.15),
    ('pres', pres_df.rating, 0.3, 0.27),
)
for header, ratings, header_x, table_x in summary_boxes:
    plt.figtext(header_x, 0.84, header)
    plt.figtext(table_x, 0.75, ratings.describe()[summary_stats].to_string())

plt.legend()
# Save before show(): the figure is still the current one at this point.
plt.savefig('ratings_distribution.png')
plt.show()

In [None]:
# Per-bucket ratio of fan relative frequency to pres relative frequency, drawn
# on a log axis around the reference line y = 1.
plt.title('Ratios of Relative Frequency of Fan Ratings to Pres Ratings')

plt.xticks(rotation = 45)

# Two-entry colormap addressed by integer index: 0 -> 'red', 1 -> '#807cfc'.
cmp = ListedColormap(['red', '#807cfc'])

fan_freq = np.asarray(fans_rating_counts.values, dtype=float)
pres_freq = np.asarray(pres_rating_counts.values, dtype=float)

# A bucket without any pres rating has no defined ratio. Leave it as NaN (no
# bar is drawn) instead of dividing by zero and producing inf/NaN warnings.
ratios = np.full(fan_freq.shape, np.nan)
np.divide(fan_freq, pres_freq, out=ratios, where=pres_freq > 0)

plt.axhline(y=1, c='black')

plt.bar(
    [str(x) for x in fans_rating_counts.index],
    ratios,
    label='Fan / Pres Rating Frequency Ratio',
    # Index 1 ('#807cfc') where the ratio is below 1, index 0 ('red') otherwise.
    color=cmp(np.where(ratios < 1, 1, 0))
)

plt.yscale('log')

plt.savefig('ratings_distribution_ratio.png')

In [None]:
# State geometries used as the base layer of the maps. A later cell joins the
# per-state mean ratings onto its 'STUSPS' column.
usa = geopandas.read_file('usa-states-census-2014.shx')

In [None]:
cmap = 'plasma'

# Mean rating per 'state-prov', left-joined onto the state shapes so that
# states without any review keep their geometry (with a NaN rating).
fans_merged = usa.merge(fans_df.groupby('state-prov').rating.mean(), left_on='STUSPS', right_on='state-prov', how='left')
pres_merged = usa.merge(pres_df.groupby('state-prov').rating.mean(), left_on='STUSPS', right_on='state-prov', how='left')

# One colour range shared by both maps. It has to span BOTH series; taking the
# minimum from one frame and the maximum from the other can clip values.
# Series.min()/max() skip NaN.
all_state_means = pd.concat([fans_merged.rating, pres_merged.rating])
vmin, vmax = all_state_means.min(), all_state_means.max()


def _plot_state_ratings(merged):
    """Draw a choropleth of ``merged['rating']`` with a colorbar.

    Uses the module-level ``cmap``, ``vmin`` and ``vmax`` so that every map
    shares one colour scale. States with a missing rating are light grey.

    Returns:
        The ``(fig, ax)`` pair, so callers can add further layers before
        showing the figure.
    """
    fig, ax = plt.subplots()
    ax.axis('off')
    fig.set_size_inches(15, 10)

    merged.plot(
        column='rating',
        cmap=cmap,
        linewidth=0.8,
        edgecolor='0.8',
        vmin=vmin,
        vmax=vmax,
        ax=ax,
        missing_kwds=dict(color='lightgrey',)
    )

    # The ScalarMappable is not attached to any Axes, so the colorbar must be
    # told explicitly which Axes to take space from.
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=vmin, vmax=vmax))
    fig.colorbar(sm, ax=ax, shrink=0.5, aspect=15)
    return fig, ax


# Map 1: mean fan rating per state.
fig, ax = _plot_state_ratings(fans_merged)
plt.show()

# Map 2: mean pres rating per state, plus one dot per pres review.
fig, ax = _plot_state_ratings(pres_merged)

# Rows whose 'state-prov' is 'ON' are left out of the dot layer.
pres_points = pres_df.loc[pres_df['state-prov'] != 'ON', 'latlng']
ax.scatter(
    [lng for _, lng in pres_points],
    [lat for lat, _ in pres_points],
    s=5,
    c='white',
    edgecolor='k',
    linewidth=0.2
)

plt.show()

In [None]:
plt.rcParams['animation.embed_limit'] = 2**128

fig, ax = plt.subplots(figsize=(20, 10))
plt.axis('equal')

# Static base layer: mean pres rating per state.
pres_merged.plot(
    column='rating',
    cmap=cmap,
    linewidth=0.8,
    edgecolor='0.8',
    vmin=vmin,
    vmax=vmax,
    ax=ax,
    missing_kwds=dict(color='lightgrey',)
)

# One frame per pres review, starting from the LAST row of pres_df and walking
# backwards. Row k holds the (lng, lat) of the review revealed in frame k.
n_frames = len(pres_df)
frame_points = np.array(
    [(lng, lat) for lat, lng in pres_df.latlng.iloc[::-1].iloc[:n_frames]],
    dtype=float,
).reshape(-1, 2)

# A single scatter artist whose points are replaced on every frame. Adding a
# new scatter of all points per frame would pile up one artist per frame.
review_dots = ax.scatter([], [], c='green', edgecolor='k')


def animation_func(i):
    """Show the first ``i + 1`` points of ``frame_points`` on the map.

    The function keeps no state between calls, so being invoked more than once
    for the same frame (FuncAnimation does this for its initial draw when no
    init_func is given) cannot duplicate points.
    """
    # Re-apply the fixed viewport every frame: with axis('equal') the limits
    # are adjusted at draw time.
    ax.set_xlim(-130, -60)
    ax.set_ylim(25, 50)
    review_dots.set_offsets(frame_points[:i + 1])
    return (review_dots,)


animation = FuncAnimation(fig, animation_func, frames=n_frames)

animation.save('anim.gif')

In [None]:
plt.rcParams['animation.embed_limit'] = 2**128

fig, ax = plt.subplots(figsize=(20, 10))
plt.axis('equal')

# Static base layer: mean pres rating per state.
pres_merged.plot(
    column='rating',
    cmap=cmap,
    linewidth=0.8,
    edgecolor='0.8',
    vmin=vmin,
    vmax=vmax,
    ax=ax,
    missing_kwds=dict(color='lightgrey',)
)

# Short 50-frame version (fewer if pres_df has fewer rows): one frame per pres
# review, starting from the LAST row of pres_df and walking backwards.
n_frames = min(50, len(pres_df))
frame_points = np.array(
    [(lng, lat) for lat, lng in pres_df.latlng.iloc[::-1].iloc[:n_frames]],
    dtype=float,
).reshape(-1, 2)

# A single scatter artist whose points are replaced on every frame. Adding a
# new scatter of all points per frame would pile up one artist per frame.
review_dots = ax.scatter([], [], c='white', edgecolor='k')


def animation_func(i):
    """Show the first ``i + 1`` points of ``frame_points`` on the map.

    The function keeps no state between calls, so being invoked more than once
    for the same frame (FuncAnimation does this for its initial draw when no
    init_func is given) cannot duplicate points.
    """
    # Re-apply the fixed viewport every frame: with axis('equal') the limits
    # are adjusted at draw time.
    ax.set_xlim(-130, -60)
    ax.set_ylim(25, 50)
    review_dots.set_offsets(frame_points[:i + 1])
    return (review_dots,)


animation = FuncAnimation(fig, animation_func, frames=n_frames)

# NOTE(review): this writes to the same 'anim.gif' as the full-length animation
# cell and therefore replaces its output -- confirm that is intended.
animation.save('anim.gif')

In [None]:
# Per-author review counters, computed separately for each frame.
#   user_cum_reviews_count   - 0-based position of the review among that
#                              user's rows, in frame order
#   user_total_reviews_count - total number of rows by that user
fans_df['user_cum_reviews_count'] = fans_df.groupby('user').user.cumcount(ascending=True)
fans_df['user_total_reviews_count'] = fans_df.user.map(fans_df.groupby('user', sort=False).user.count())

pres_df['user_cum_reviews_count'] = pres_df.groupby('user').user.cumcount(ascending=True)
# The totals must come from pres_df itself: fans_df contains no rows for the
# pres user, so looking the counts up there yields NaN for every row.
pres_df['user_total_reviews_count'] = pres_df.user.map(pres_df.groupby('user', sort=False).user.count())

In [None]:
# Expanding (running) mean of each user's ratings, in row order.
# transform() returns a result aligned with the frame's own index. On recent
# pandas, groupby.apply() prepends the group key to the index instead, which
# makes the column assignment fail.
fans_df['user_rating_rolling_average'] = (
    fans_df.groupby('user')['rating'].transform(lambda ratings: ratings.expanding().mean())
)

pres_df['user_rating_rolling_average'] = (
    pres_df.groupby('user')['rating'].transform(lambda ratings: ratings.expanding().mean())
)

fig, ax1 = plt.subplots()

# Second y-axis sharing the same x-axis, used for the user counts.
ax2 = ax1.twinx()

# For every review number: the mean, over all fans who reached it, of their
# running average rating at that point.
cum_reviews_count_data = fans_df.groupby('user_cum_reviews_count').user_rating_rolling_average.mean()

ax1.set_ylim(6, 8)

ax1.plot(cum_reviews_count_data.index,
         cum_reviews_count_data.values
)

# Number of fan rows at each review number, i.e. how many users reached it.
cum_reviews_counts = fans_df.groupby('user_cum_reviews_count').size()

ax2.set_yscale('log')

ax2.plot(
    cum_reviews_counts.index,
    cum_reviews_counts.values,
    c='orange',
    zorder=0
)
ax1.set_xlabel('Review number')
ax1.set_ylabel('Expanding average rating (blue)')
ax2.set_ylabel('Number of users who have\n published x number of reviews (orange)')

# Size and save first, then show, so the saved file matches what is displayed.
fig.set_size_inches(15, 10)
fig.savefig('maturity_effect.png')
plt.show()

In [None]:
# Least-squares fit of  y = a * ln(x) + b  over the first 100 review numbers.
# The +1 shifts the review numbers so the first one is not passed to log() as 0
# (assumes the index starts at 0, as produced by cumcount()).
x = np.asarray(cum_reviews_count_data.index)[:100] + 1
y = np.asarray(cum_reviews_count_data.values)[:100]

coeffs = np.polyfit(np.log(x), y, 1)

fig, ax = plt.subplots()

plotted_x = np.linspace(np.min(x), np.max(x), len(y))
ax.plot(x, y)
# Evaluate the fitted curve at the same abscissae it is drawn against.
# Evaluating at `x` while plotting against `plotted_x` is only correct when
# the two arrays happen to coincide.
ax.plot(plotted_x, np.polyval(coeffs, np.log(plotted_x)), color='green')

fig.set_size_inches(15, 10)
fig.savefig('curve_fit.png')

In [None]:
# First figure: one line per fan with more than 300 reviews, showing the
# running average rating against the review number.
for _, group in fans_df[fans_df['user_total_reviews_count'] > 300].groupby('user'):
    plt.plot(
        group.user_cum_reviews_count,
        group.user_rating_rolling_average
    )
    
# Overlay the first 400 pres rows as a black dash-dot line for comparison.
plt.plot(pres_df.user_cum_reviews_count[:400], pres_df.user_rating_rolling_average[:400], c='k', linestyle='dashdot')

plt.show()

# Second figure: pres running average plotted against the 'timestamp' column.
fig, ax = plt.subplots()

# Tick positions are fixed at 0, 100, ..., 1300 BEFORE plotting; keep this
# order. NOTE(review): 1340 is presumably about the number of pres rows --
# confirm, or derive it from len(pres_df).
plt.xticks(list(range(0, 1340, 100)), rotation=45)
plt.plot(pres_df.timestamp, pres_df.user_rating_rolling_average, c='k', linestyle='dashdot')
fig.set_size_inches(15, 10)
fig.savefig('pres_trend.png')

In [None]:
# Mean fan rating per distinct 'timestamp' value, smoothed with a 100-row
# rolling mean. NOTE(review): the name suggests one timestamp per day -- confirm
# against the CSV.
daily_average_rating = fans_df.groupby('timestamp').rating.mean().rolling(100).mean()

plt.ylim(5, 9)

# One tick every 100 positions; set before plotting, keep this order.
plt.xticks(list(range(0, len(daily_average_rating), 100)), rotation=45)

plt.plot(daily_average_rating.index, daily_average_rating.values)

In [None]:
# Persist both frames with IPython's %store magic so that another notebook or
# kernel can restore them with `%store -r`.
%store fans_df
%store pres_df