In [None]:
import os
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

In [None]:
# Endpoint used for business searches against the Yelp API.
YELP_SEARCH_API_URL = 'https://api.yelp.com/v3/businesses/search'

# load_dotenv() must run before the key is read so that values it loads
# (python-dotenv: typically from a local .env file) are visible in os.environ.
load_dotenv()

# Indexing os.environ directly raises KeyError right here when the key is not
# configured, instead of failing later on the first API request.
YELP_API_KEY = os.environ['YELP_API_KEY']

In [None]:
# Restore `pres_df` from IPython's %store database. The variable is not built in
# this notebook, so it must have been saved beforehand with `%store pres_df`
# (presumably by a companion notebook -- confirm before a fresh Run All).
%store -r pres_df

In [None]:
def get_business_id(title, location, verbose=False):
    """Return the Yelp business id of the first search hit for a title.

    Args:
        title: Business name to search for. A trailing '...' (presumably left
            by a truncated title) is stripped before searching.
        location: Location string forwarded as the search API's `location`
            parameter.
        verbose: When True, print the original title and the raw API payload
            whenever the search yields no business.

    Returns:
        The `id` of the first business in the search results, or None when the
        response contains no businesses.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    short_title = title[:-3] if title.endswith('...') else title

    headers = {'Authorization': f'Bearer {YELP_API_KEY}'}
    params = {
        'term': short_title,
        'location': location,
    }

    response = requests.get(YELP_SEARCH_API_URL, headers=headers, params=params, timeout=5)
    # Fail with a descriptive HTTPError (bad key, rate limit, ...) rather than
    # an opaque KeyError on a payload that has no 'businesses' entry.
    response.raise_for_status()

    response_json = response.json()

    # .get() so that a successful response without the key counts as "no match".
    businesses = response_json.get('businesses')
    if businesses:
        return businesses[0]['id']

    if verbose:
        print(title)
        print(response_json)
    return None

In [None]:
# Resolve every row to a Yelp business id, one search request per row.
# Rows without a search hit end up with a missing business_id.
pres_df['business_id'] = pres_df.apply(
    lambda row: get_business_id(row['title'], row['location']),
    axis=1,
)

In [None]:
# Only rows that were matched to a Yelp business and are dated after 2017-01-01
# take part in the ranking.
has_business_id = pres_df['business_id'].notnull()
is_recent = pres_df['timestamp'] > '2017-01-01'

# Business ids of the 20 highest-rated eligible rows.
top_rated = pres_df[has_business_id & is_recent].nlargest(20, 'rating')
top_20_ids = list(top_rated['business_id'])

In [None]:
def _max_review_start_index(soup):
    """Return the `start` offset of the last review page, or 0 if no pagination is found."""
    page_nav_element = soup.select_one("div[class='pagination__09f24__VRjN4 border-color--default__09f24__NPAKY']")
    if page_nav_element is None:
        # No pagination block on the page: only the first page is scraped.
        return 0

    page_count_element = page_nav_element.select_one("div[class='border-color--default__09f24__NPAKY text-align--center__09f24__fYBGO']")
    if page_count_element is None:
        return 0

    # The page counter text is expected to end in "of <number of pages>".
    num_pages = int(page_count_element.text.split('of ')[1])
    return 10 * (num_pages - 1)


def _parse_review_page(soup):
    """Extract (rating, date) tuples from one parsed review page, skipping incomplete entries."""
    page_reviews = []

    for review_li in soup.select("li[class='margin-b5__09f24__pTvws border-color--default__09f24__NPAKY']"):
        review_div = review_li.select_one("div[class='margin-t1__09f24__w96jn margin-b1-5__09f24__NHcQi border-color--default__09f24__NPAKY']")
        if review_div is None:
            continue

        rating_div = review_div.select_one("div[class='five-stars__09f24__mBKym five-stars--regular__09f24__DgBNj display--inline-block__09f24__fEDiJ border-color--default__09f24__NPAKY']")
        timestamp_span = review_div.select_one("span[class='css-chan6m']")
        if rating_div is None or timestamp_span is None:
            # Skip entries missing either piece so ratings and dates stay paired.
            continue

        rating = int(rating_div['aria-label'].replace(' star rating', ''))
        timestamp = pd.Timestamp(timestamp_span.text).date()
        page_reviews.append((rating, timestamp))

    return page_reviews


def get_reviews(business_id):
    """Scrape the rating and date of every review on a business's Yelp page.

    Pages are requested through the `start` query parameter in steps of 10.
    The number of pages is read from the pagination element of the first page.

    NOTE(review): the CSS selectors look like build-generated class names, so
    they will likely stop matching whenever Yelp changes its markup -- confirm
    they are still current if this returns an empty list.

    Args:
        business_id: Yelp business id used to build the page URL.

    Returns:
        A list of (rating, date) tuples, where rating is an int and date is a
        datetime.date, in the order the reviews appear on the pages.

    Raises:
        requests.HTTPError: If a page request answers with an error status code.
        requests.Timeout: If a page does not respond within the timeout.
    """
    url = f'https://www.yelp.com/biz/{business_id}'

    reviews = []
    start_index = 0
    max_start_index = 0  # replaced by the real value once the first page is parsed

    while start_index <= max_start_index:
        # A timeout keeps one stalled connection from hanging the run forever.
        response = requests.get(url, params={'start': str(start_index)}, timeout=10)
        # Surface blocked or failed requests instead of parsing an error page.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        if start_index == 0:
            max_start_index = _max_review_start_index(soup)

        reviews.extend(_parse_review_page(soup))
        start_index += 10

    return reviews

In [None]:
def plot_pres_effect(business_id, pres_reviews=None, rolling_window=30, plot_ylim=(1, 5)):
    """Plot a business's Yelp rating and review-frequency trends with its pres_df review dates marked.

    Two stacked panels are drawn and shown:
      * top: rolling mean of the per-day average Yelp rating; the window spans
        `rolling_window` days that have at least one review,
      * bottom: rolling mean of the number of Yelp reviews per calendar day,
        over `rolling_window` consecutive days (days without reviews count as 0).
    An orange vertical line marks each timestamp in `pres_reviews`.

    Args:
        business_id: Yelp business id whose reviews are scraped via get_reviews.
        pres_reviews: Object exposing a `timestamp` attribute (a DataFrame with
            a 'timestamp' column, or a single row). Defaults to the rows of
            the global `pres_df` whose business_id matches.
        rolling_window: Window length for both rolling means.
        plot_ylim: y-axis limits of the rating panel.

    Returns:
        None. The figure is displayed with plt.show().

    Raises:
        ValueError: If no Yelp reviews were found for the business.
    """
    yelp_reviews = get_reviews(business_id)
    if not yelp_reviews:
        # Without this check pd.date_range fails below on NaT bounds with a
        # much less helpful message.
        raise ValueError(f'No Yelp reviews found for business {business_id!r}')

    raw_df = pd.DataFrame(yelp_reviews, columns=['rating', 'timestamp'])
    raw_df['timestamp'] = pd.to_datetime(raw_df['timestamp'])

    # Mean rating per day; groupby returns the days in ascending order.
    daily_rating = raw_df.groupby('timestamp')['rating'].mean()
    rating_rolling_avg = daily_rating.rolling(rolling_window).mean()

    # Review count for every calendar day in the observed range (0 when none).
    all_days = pd.date_range(start=raw_df['timestamp'].min(), end=raw_df['timestamp'].max(), freq='D')
    reviews_per_day = raw_df['timestamp'].value_counts().reindex(all_days, fill_value=0)
    freq_rolling_avg = reviews_per_day.rolling(rolling_window).mean()

    if pres_reviews is None:
        pres_reviews = pres_df[pres_df['business_id'] == business_id][['timestamp', 'rating']]

    # axvline takes one x position per call, so the timestamps are flattened
    # into a list of scalars. A business can have several matching rows.
    marker_times = pres_reviews.timestamp
    if isinstance(marker_times, pd.Series):
        marker_times = marker_times.tolist()
    else:
        marker_times = [marker_times]
    marker_times = [pd.Timestamp(t) for t in marker_times if not pd.isna(t)]

    fig, (rating_ax, freq_ax) = plt.subplots(2, 1)
    fig.subplots_adjust(hspace=1)

    rating_ax.set_title(f'{rolling_window}-review rolling average review')
    rating_ax.set_ylim(plot_ylim)
    rating_ax.plot(rating_rolling_avg.index, rating_rolling_avg)

    freq_ax.set_title(f'{rolling_window}-day rolling average per-day review frequency')
    freq_ax.plot(freq_rolling_avg.index, freq_rolling_avg)

    for panel in (rating_ax, freq_ax):
        for marker_time in marker_times:
            panel.axvline(marker_time, c='orange')

    plt.show()

In [None]:
# Plot one business at a time on the main thread. pyplot keeps global state and
# is not thread-safe, and the results of Executor.map were never consumed, so
# any exception raised inside a worker was silently discarded.
for business_id in top_20_ids:
    try:
        plot_pres_effect(business_id)
    except Exception as exc:
        # Businesses are independent: report the failure and continue with the rest.
        print(f'Could not plot {business_id}: {exc!r}')