# In [1]:
from bs4 import BeautifulSoup
from lxml import html
import re
from re import finditer
import os
import pandas as pd

def parse_yelp(soup):
    """Extract (stars, date, review) tuples from a parsed Yelp page.

    Args:
        soup: A BeautifulSoup object (anything offering ``find_all``).

    Returns:
        A tuple of ``(stars, date, review)`` tuples, where ``stars`` is the
        integer taken from the first character of a star ``aria-label``,
        ``date`` is the text matched by the module-level pattern ``rx`` and
        ``review`` is the text that follows the date. If no dated review is
        found, the result of ``parse_yelp_2(soup)`` is returned instead.

    Note:
        The three lists are combined with ``zip``, so the result is as long
        as the shortest of them; surplus entries are dropped silently.
    """
    dates, reviews = [], []
    # The review text is read from the element two steps after an
    # "Embed review" paragraph.
    for para in soup.find_all('p'):
        if para.decode_contents() != 'Embed review':
            continue
        text = para.next_element.next_element.text
        # Only a date at the very start of the text counts; re.match anchors
        # at position 0, which is what the old finditer/span check selected.
        match = re.match(rx, text)
        if match is None:
            continue
        dates.append(match.group())
        # The previous code called rstrip('\ax0'), which is BEL + 'x' + '0'
        # rather than the non-breaking space '\xa0', and therefore chopped
        # trailing 'x'/'0' characters off reviews. str.strip() already
        # removes non-breaking spaces along with other whitespace.
        reviews.append(text[match.end():].strip())

    # Try other parse function if initial loop yielded nothing
    if not dates:
        return parse_yelp_2(soup)

    labels = []
    for div in soup.find_all("div", {"aria-label": True}):
        label = div.attrs["aria-label"]
        if "star" not in label:
            continue
        follower = div.next_element.next_element
        # Skip star widgets followed by an element carrying a 'content'
        # attribute, and ratings that belong to a "Previous review".
        if 'content' in follower.attrs or "Previous review" in follower.text:
            continue
        labels.append(label)

    # The first star label is discarded (presumably the business's overall
    # rating rather than a review -- TODO confirm); the rating is the
    # leading digit of each remaining label.
    stars = [int(label[0]) for label in labels[1:]]
    return tuple(zip(stars, dates, reviews))

def parse_yelp_2(soup):
    """Alternative extraction of (stars, date, review) tuples from a page.

    Reviews are the elements that directly follow every ``<span lang="en">``.
    Star labels and dates are read from ``div`` elements that have an
    ``aria-label`` containing "star": the label supplies the rating and the
    text of the element two steps further on supplies the date.

    Args:
        soup: A BeautifulSoup object (anything offering ``find_all``).

    Returns:
        A tuple of ``(stars, date, review)`` tuples, truncated by ``zip`` to
        the shortest of the three collected sequences.
    """
    suffix = "Updated review"
    reviews = [span.next_element
               for span in soup.find_all('span', {'lang': 'en'})]

    labels, dates = [], []
    for div in soup.find_all("div", {"aria-label": True}):
        label = div.attrs["aria-label"]
        if "star" not in label:
            continue
        follower = div.next_element.next_element
        # Star widgets followed by an element with a 'content' attribute
        # are skipped.
        if 'content' in follower.attrs:
            continue
        caption = follower.text
        # Ratings attached to a superseded review are skipped as well.
        if "Previous review" in caption:
            continue
        # When the caption mentions "Updated review", cut that many
        # characters off its end to leave the date.
        when = caption[:-len(suffix)] if suffix in caption else caption
        labels.append(label)
        dates.append(when)

    # The first star entry and its date are dropped before pairing.
    labels = labels[1:]
    dates = dates[1:]
    stars = [int(label[0]) for label in labels]
    return tuple(zip(stars, dates, reviews))

def parse_year(string):
    """Return at most the first four characters of ``string``.

    Inputs longer than four characters are cut down to their first four;
    shorter inputs are returned unchanged.
    """
    return string[:4] if len(string) > 4 else string

def munge_dates(dframe):
    """Split slash-separated date strings and convert them to datetimes.

    Each value in the ``dates`` column is split on "/"; the first piece is
    read as the month, the second as the day and the last as the year (only
    its first four characters are used, so trailing text after the year is
    ignored). Rows are then filtered and ``dates`` is replaced by a real
    datetime column.

    Args:
        dframe: DataFrame with a ``dates`` column of strings such as
            ``"4/15/2020"``. It is not modified.

    Returns:
        A new DataFrame with added integer ``year``, ``month`` and ``day``
        columns, restricted to rows with ``year > 2019`` and ``month >= 4``,
        whose ``dates`` column holds datetimes.
    """
    df = dframe.copy()
    parts = df["dates"].apply(lambda text: text.split("/"))

    # Separate the m/d/y pieces. The explicit cast keeps an integer dtype
    # even when the frame has no rows.
    df["year"] = parts.apply(lambda p: int(p[-1][:4])).astype("int64")
    df["month"] = parts.apply(lambda p: int(p[0])).astype("int64")
    df["day"] = parts.apply(lambda p: int(p[1])).astype("int64")

    # NOTE(review): besides everything up to 2019, this also removes
    # January-March of every later year -- confirm that is intended.
    keep = (df["year"] > 2019) & (df["month"] >= 4)
    # Copy so the assignment below works on an independent frame instead of
    # a slice of the unfiltered one.
    df = df[keep].copy()

    # Assemble the datetime from the numeric columns. The earlier approach
    # re-joined the pieces as day/month/year text and let pandas guess the
    # order, which swapped day and month for ambiguous dates.
    df["dates"] = pd.to_datetime(df[["year", "month", "day"]])
    return df

# Date pattern (digits/digits/digits); parse_yelp reads this module-level name.
rx = r"\b\d+/\d+/\d+"
# Compiled form of the same pattern; not used by the code in this cell.
p = re.compile(rx)
data = []
directory = "C:\\Users\\Kyle Beloin\\OneDrive\\Documents\\yelp_api\\test_data\\2"

# Walk every sub-folder of `directory` and parse each file found inside it.
for folder in os.listdir(directory):
    print(folder)
    folder_path = os.path.join(directory, folder)
    try:
        filenames = os.listdir(folder_path)
    except OSError as err:
        # Raised e.g. when the entry is a plain file rather than a folder.
        print(f"Skipping {folder_path}: {err}")
        continue

    for filename in filenames:
        file_path = os.path.join(folder_path, filename)
        try:
            # The context manager closes the handle even if reading fails.
            with open(file_path, encoding="UTF-8") as handle:
                markup = handle.read()
            # Name the parser explicitly: lxml is what BeautifulSoup picks
            # by default when it is installed (this file imports it), and
            # spelling it out avoids the "no parser specified" warning.
            soup = BeautifulSoup(markup, "lxml")
            data.extend(parse_yelp(soup))
        except Exception as err:
            # Best effort: one unreadable or unparsable file must not stop
            # the run, but report what actually went wrong.
            print(f"Skipping {file_path}: {err!r}")

df = pd.DataFrame(data, columns=["stars", "dates", "reviews"])
df = munge_dates(df)
# Presumably matches the trailing "2" of `directory` -- verify when pointing
# the script at another folder.
df['pricerange'] = 2
df.set_index("dates", inplace=True)
df.to_csv("yelp_pricerange2_reviews.csv")
# Number of review tuples collected before the date filter was applied.
print(len(data))