### Get Text Reviews from the Yelp Website Using Web Scraping

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import seaborn as sns
import warnings

from scipy import stats


from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
# from lightgbm import LGBMClassifier
# from sklearn.ensemble import HistGradientBoostingClassifier
# from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
# from numpy.random import RandomState
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score



# Ignore notebook warnings to keep it clearer
# warnings.filterwarnings('ignore')

# This is a magic function that renders the plot in the notebook itself.
%matplotlib inline 
# To see all the content in the columns

In [3]:
#!pip install xgboost
#!pip install beautifulsoup4

In [4]:
from bs4 import BeautifulSoup   # Python package for parsing HTML and XML documents used for web scraping.
import requests                 # Requests is a simple HTTP library.
import re                       # Regular Expressions library

In [5]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None
# The matching row option is intentionally left disabled; uncomment to show all rows too.
# pd.options.display.max_rows = None

In [6]:
# Yelp business page whose reviews are scraped below (no stray whitespace in the URL).
URL = "https://www.yelp.ca/biz/pai-northern-thai-kitchen-toronto-5?osq=Restaurants"

# A timeout keeps this cell from hanging forever on a stalled connection.
page = requests.get(URL, timeout=30)
# Stop here on a 4xx/5xx reply instead of parsing an error page further down.
page.raise_for_status()

# Dump the raw HTML so the tags/attributes that hold the reviews can be inspected.
print(page.text)

<!DOCTYPE html><html lang="en-CA" prefix="og: http://ogp.me/ns#" style="margin: 0;padding: 0; border: 0; font-size: 100%; font: inherit; vertical-align: baseline;"><head><script>document.documentElement.className=document.documentElement.className.replace(no-j/,"js");</script><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><meta http-equiv="Content-Language" content="en-CA" /><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link rel="mask-icon" sizes="any" href="https://s3-media0.fl.yelpcdn.com/assets/srv0/yelp_large_assets/b2bb2fb0ec9c/assets/img/logos/yelp_burst.svg" content="#FF1A1A"><link rel="shortcut icon" href="https://s3-media0.fl.yelpcdn.com/assets/srv0/yelp_large_assets/dcfe403147fc/assets/img/logos/favicon.ico"><script> window.ga=window.ga||function(){(ga.q=ga.q||[]).push(arguments)};ga.l=+new Date;window.ygaPageStartTime=new Date().getTime();</script><script>
            window.yelp = window.yelp || {};
            

In [7]:
# Parse the downloaded bytes into a navigable tree using the "html.parser" backend.
soup = BeautifulSoup(page.content, "html.parser")
# Look up the element whose id attribute is "ResultsContainer".
# NOTE(review): `results` is not referenced by any later cell visible here — confirm it is still needed.
results = soup.find(id="ResultsContainer")


In [8]:
def get_text(soup):
    """Collect the reviews found on a parsed Yelp business page into a DataFrame.

    The restaurant name is the text of the first ``<h1>``. The total review
    count is the leading number of the first ``<span>`` whose text matches
    ``"<digits> reviews"``. Reviewer names, star ratings and review texts are
    gathered from every ``<ul>`` whose class is ``"undefined"`` and paired by
    position.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the business page.

    Returns
    -------
    pandas.DataFrame
        Columns ``restaurant_name``, ``total_reviews``, ``reviewer``,
        ``rating`` and ``review``; rows containing a missing value are
        dropped. ``total_reviews`` is kept as the string scraped from the page.

    Raises
    ------
    ValueError
        If the page has no ``<h1>`` or no review-count ``<span>`` (for example
        when an error or block page was downloaded instead of the business page).

    Side effects
    ------------
    Writes the result to ``"<restaurant name>.csv"`` in the current working
    directory. Characters that are not valid in file names are replaced by
    ``_`` in the file name only; the ``restaurant_name`` column is untouched.
    """
    heading = soup.find('h1')
    if heading is None:
        raise ValueError(
            "No <h1> element found: the document does not look like a Yelp business page."
        )
    restaurant_name = heading.text

    # First span of the form "<digits> reviews" carries the overall review count.
    count_span = next(
        (span for span in soup.find_all("span") if re.match(r'\d+ reviews$', span.text)),
        None,
    )
    if count_span is None:
        raise ValueError(
            "No '<number> reviews' <span> found: cannot determine the total review count."
        )
    total_reviews = count_span.text.split(" ")[0]

    reviewers = []
    ratings = []
    reviews = []

    for ul in soup.find_all('ul', {"class": "undefined"}):
        reviewers.extend(name.text for name in ul.select('span[class*="fs-block"]'))
        # The star rating is exposed as text in the "aria-label" attribute
        # (e.g. "5 star rating"); .get() yields None when the attribute is absent.
        ratings.extend(
            rating.get("aria-label") for rating in ul.select('div[class*="five-stars__"]')
        )
        reviews.extend(comment.text for comment in ul.select('p[class*="comment__"]'))

    # Fields are paired purely by position, so unequal counts mean that some
    # rows may combine data from different reviews. Make that visible.
    if not (len(reviewers) == len(ratings) == len(reviews)):
        warnings.warn(
            "Reviewer, rating and review counts differ "
            f"({len(reviewers)}/{len(ratings)}/{len(reviews)}); rows are paired by "
            "position and may be misaligned.",
            stacklevel=2,
        )

    # orient='index' tolerates lists of different lengths (short ones are
    # padded with None); after transposing, dropna() removes the padded rows.
    df = pd.DataFrame.from_dict(
        {
            "restaurant_name": [restaurant_name] * len(reviewers),
            "total_reviews": [total_reviews] * len(reviewers),
            "reviewer": reviewers,
            "rating": ratings,
            "review": reviews,
        },
        orient='index',
    )
    df = df.transpose()
    df = df.dropna()

    # The name comes from untrusted page text: neutralise path separators and
    # other characters that are illegal in file names before using it as one.
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', '_', restaurant_name) or "restaurant"
    df.to_csv(f"{safe_name}.csv", index=False)
    return df

In [9]:

# Build the review table from the parsed page; get_text also saves it as a CSV
# named after the restaurant.
df = get_text(soup)
df


Unnamed: 0,restaurant_name,total_reviews,reviewer,rating,review
0,Pai Northern Thai Kitchen,3338,Nadine M.,5 star rating,I've heard great things about Pai so I've been...
1,Pai Northern Thai Kitchen,3338,Erika N.,4 star rating,I came with a few friends for dinner on a Mond...
2,Pai Northern Thai Kitchen,3338,Mary M.,4 star rating,On my hunt to find the best Thai food in the G...
3,Pai Northern Thai Kitchen,3338,Cathy C.,4 star rating,"Yelp Review 2023 #26:When in Toronto, you MUST..."
4,Pai Northern Thai Kitchen,3338,Chad J.,4 star rating,Visiting Toronto and was told this place is on...
5,Pai Northern Thai Kitchen,3338,Michelle D.,4 star rating,Back again at Pai!Got their pad gra prow with ...
6,Pai Northern Thai Kitchen,3338,Winnie H.,4 star rating,Finally got to try Pai after so long! I made r...
7,Pai Northern Thai Kitchen,3338,Angela T.,4 star rating,Come back again for dinner. Very popular place...
8,Pai Northern Thai Kitchen,3338,Christine T.,5 star rating,Pai might have been the most recommended resta...
9,Pai Northern Thai Kitchen,3338,Jeung P.,4 star rating,Portions are small but flavors are good!We ord...


In [10]:
# (rows, columns) of the scraped review table.
df.shape

(10, 5)

In [11]:
# Column dtypes and non-null counts of the scraped review table.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   restaurant_name  10 non-null     object
 1   total_reviews    10 non-null     object
 2   reviewer         10 non-null     object
 3   rating           10 non-null     object
 4   review           10 non-null     object
dtypes: object(5)
memory usage: 480.0+ bytes
