In [None]:
import numpy as np
import pandas as pd
import requests
import time

from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

In [None]:
def parse_timestamp(timestamp):
    """Convert a review timestamp string into a ``datetime.date``.

    Parameters
    ----------
    timestamp : str
        Either a relative phrase containing ``'ago'`` (for example
        ``'2 days ago'``) or any date string ``pd.to_datetime`` can parse.

    Returns
    -------
    datetime.date
        For relative phrases, the current local time minus the stated
        duration; otherwise the parsed date.

    Raises
    ------
    ValueError
        If the remaining text cannot be parsed by ``pd.Timedelta`` /
        ``pd.to_datetime``.
    """
    # Absolute timestamps need no arithmetic, so handle them first.
    if 'ago' not in timestamp:
        return pd.to_datetime(timestamp).date()

    # Relative timestamp: strip the " ago" suffix and subtract the duration from now.
    now = pd.Timestamp.today()
    elapsed = pd.Timedelta(timestamp.replace(' ago', ''))
    return (now - elapsed).date()

In [None]:
# Lookup tables of region codes; only the 'Code' column of each CSV is kept.
# (Presumably US state and Canadian province abbreviations -- confirm against the files.)
states = pd.read_csv('state.csv').loc[:, 'Code']
can_provinces = pd.read_csv('can_province.csv').loc[:, 'Code']

In [None]:
# Walk the paginated review listing and collect one entry per review into five
# parallel lists.  The lists must stay index-aligned: position i in every list
# has to describe the same review.
review_scores = []
review_titles = []
review_locations = []
review_users = []
review_timestamps = []

REQUEST_TIMEOUT = 30         # seconds; requests waits indefinitely when no timeout is given
MAX_CONSECUTIVE_ERRORS = 25  # give up rather than loop forever if only error pages come back

page = 1
consecutive_errors = 0
start = time.time()

# A Session reuses the underlying connection across the many page requests.
with requests.Session() as session:
    while True:

        soup = None
        try:
            response = session.get(f'https://onebite.app/reviews?page={page}', timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f'request failed on page {page}: {exc}')
        else:
            # Only parse successful responses: an HTTP error body has no "next"
            # button and would otherwise end the crawl silently.
            if response.ok:
                soup = BeautifulSoup(response.text, 'html.parser')

        # Failed request, HTTP error status, or the site's own error page: skip it.
        if soup is None or soup.select("div[class*='errorContainer']"):
            print(f'error found on page {page}')
            consecutive_errors += 1
            if consecutive_errors >= MAX_CONSECUTIVE_ERRORS:
                print(f'stopping: {consecutive_errors} consecutive pages failed')
                break
            page += 1
            continue

        consecutive_errors = 0

        # Extract each field for this page into temporaries first, so a parsing
        # error or a count mismatch cannot leave the global lists half-updated.
        page_scores = [float(tag.text) for tag in soup.select("p[class*='rating__score']")]
        page_titles = [tag.text for tag in soup.select("h2[class*='reviewCard__title']")]
        page_locations = [tag.text for tag in soup.select("p[class*='reviewCard__location']")]
        page_users = [tag.text.replace('Verified', '') for tag in soup.select("p[class*='userMeta__username']")]
        page_timestamps = [parse_timestamp(tag.text) for tag in soup.select("p[class*='userMeta__timestamp']")]

        field_counts = [
            len(page_scores),
            len(page_titles),
            len(page_locations),
            len(page_users),
            len(page_timestamps),
        ]
        if len(set(field_counts)) == 1:
            review_scores.extend(page_scores)
            review_titles.extend(page_titles)
            review_locations.extend(page_locations)
            review_users.extend(page_users)
            review_timestamps.extend(page_timestamps)
        else:
            # A review missing one element would shift every later entry of that
            # list and pair fields from different reviews; drop the page instead.
            print(f'skipping page {page}: field counts differ {field_counts}')

        # The last page has no "next" button.
        if not soup.select("a[class*='btn--next']"):
            break

        # Periodic progress marker.
        if page % 1000 == 0:
            print(page, response.text[:5])

        page += 1

end = time.time()
# Elapsed wall-clock seconds, displayed as the cell output.
end - start

In [None]:
# Assemble one row per review from the parallel lists filled while scraping.
review_fields = {
    'title': review_titles,
    'location': review_locations,
    'user': review_users,
    'timestamp': review_timestamps,
    'rating': review_scores,
}

# zip() silently truncates to the shortest input, which would hide the fact that
# the lists have drifted out of alignment; fail loudly instead.
field_lengths = {name: len(values) for name, values in review_fields.items()}
if len(set(field_lengths.values())) > 1:
    raise ValueError(f'review lists are not the same length: {field_lengths}')

all_reviews_info = list(zip(*review_fields.values()))

df = pd.DataFrame(all_reviews_info, columns=list(review_fields))

In [None]:
# Split each location on ", ": the first piece is the city, the second the
# state/province code.  The .str accessor yields NaN when a piece is missing
# (e.g. a location without ", "), where indexing a plain Python list would
# raise IndexError and abort the whole cell.
location_parts = df['location'].str.split(', ')

df = df.drop(columns=['location']).assign(**{
    'city': location_parts.str[0],
    'state-prov': location_parts.str[1],
})

# Keep only rows whose code appears in one of the two lookup tables; rows with
# no parsable code are excluded explicitly.
in_known_region = df['state-prov'].notna() & (
    df['state-prov'].isin(states) | df['state-prov'].isin(can_provinces)
)

# .copy() makes the result an independent frame so later in-place operations
# do not act on a view of the unfiltered data.
df = df.loc[in_known_region, ['title', 'city', 'state-prov', 'user', 'timestamp', 'rating']].copy()

In [None]:
# Renumber the rows 0..n-1.  Because `drop` is left at its default, the previous
# index labels are kept as a new leading column rather than discarded.
df = df.reset_index()

In [None]:
# Persist the cleaned reviews.  index=True is the pandas default, spelled out
# here: the DataFrame index is written as the first column of the file.
df.to_csv('onebite_ratings.csv', index=True)