# Restaurant Review Classification with Web Scraping

In [None]:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from IPython.display import Image 
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')


## Data Collection
We will use BeautifulSoup to collect restaurant reviews and their overall ratings from OpenTable pages, then store them in a pandas DataFrame.

### Customer Reviews Scraping
First, we locate the review section of the web page.

In [14]:
# Display the image that illustrates the review section of the page.
Image(url="./image/review.png") 

Then we inspect the HTML of this section.

In [15]:
# Display the image that illustrates the HTML behind the review section.
Image(url="./image/reviewfeature.png") 

By inspecting the HTML of the page, we found that each review is stored in a 'span' element with the attributes 'data-testid':'wrapper-tag' and 'class':'t9JcvSL3Bsj1lxMSi3pz h_kb2PFOoyZe1skyGiz9 DUkDy8G7CgNvYcWgJYPN'. We also need the overall rating for each review, so this function collects both pieces of information.

In [16]:
def get_parser(soup, review_list, overall_rating_list):
    """Collect review texts and overall ratings from one parsed page.

    Both output lists are extended in place; nothing is returned.

    Args:
        soup: Parsed page exposing ``find_all(name, attrs)``.
        review_list: List that receives the text of every review span.
        overall_rating_list: List that receives the text of every fourth
            rating span, starting with the first one.
    """
    review_attrs = {
        'data-testid': 'wrapper-tag',
        'class': 't9JcvSL3Bsj1lxMSi3pz h_kb2PFOoyZe1skyGiz9 DUkDy8G7CgNvYcWgJYPN',
    }
    rating_attrs = {'class': 'Q2DbumELlxH4s85dk8Mj'}

    review_tags = soup.find_all('span', review_attrs)
    rating_tags = soup.find_all('span', rating_attrs)

    # find_all returns a list of elements; keep only the text of each one.
    review_list.extend(tag.get_text(separator=' ') for tag in review_tags)

    # Only every fourth rating span is kept -- presumably each review shows
    # four rating spans with the overall rating first (confirm on the page).
    overall_rating_list.extend(
        tag.get_text(separator=' ') for tag in rating_tags[::4]
    )

Next, we tried to find the pagination of the page. However, the developers embedded the page information in a script section, so we cannot simply use BeautifulSoup to read the total number of pages. Instead, we print all script sections and look for words related to "page".

In [17]:
# Display the image that illustrates where the total page count is found.
Image(url="./image/totalpage.png") 

Here we first use BeautifulSoup to get the script content, then use a rule-based method to extract the total number of pages.

In [18]:
def get_total_pages(url):
    """Return the number of review pages advertised by a restaurant page.

    The page is fetched, the third-from-last ``<script>`` element is
    tokenized, and the token two positions after the first ``totalPages``
    token is parsed as the page count (with any ``:`` removed).

    Args:
        url: Address of the restaurant page to fetch.

    Returns:
        The total page count as an ``int``.

    Raises:
        ValueError: If no ``totalPages`` token followed by a value is found,
            or if that value is not an integer.
        IndexError: If the page has fewer than three ``<script>`` elements.
    """
    # The context manager closes the session when the request is done;
    # previously every call left its session open.
    with HTMLSession() as session:
        page_html = session.get(url).text

    scripts = BeautifulSoup(page_html, 'html.parser').find_all('script')

    # NOTE(review): the third-from-last script is assumed to carry the
    # pagination data -- this depends on the site's current layout.
    tokens = word_tokenize(str(scripts[-3]))

    try:
        start = tokens.index('totalPages')
        # The count sits two tokens after the key in the tokenized script.
        value_token = tokens[start + 2]
    except (ValueError, IndexError) as err:
        raise ValueError(
            "could not find a 'totalPages' value in the page scripts of " + str(url)
        ) from err

    return int(value_token.replace(":", ""))

Here we create a function that visits every page and scrapes the reviews from each one.

In [19]:
def get_reviews_from_all_pages(first_page_url, max_page):
    """Scrape the reviews and overall ratings from pages 1..max_page.

    Args:
        first_page_url: Review-page URL containing exactly one ``page=``
            query parameter; its value is replaced by each page number.
        max_page: Number of the last page to fetch (inclusive). No page is
            fetched when it is smaller than 1.

    Returns:
        A DataFrame with a ``review`` column (text) and an ``overall rating``
        column holding the scraped rating minus one, i.e. a 0-based class id.

    Raises:
        ValueError: If ``first_page_url`` does not contain exactly one
            ``page=``, if the numbers of scraped reviews and ratings differ,
            or if a scraped rating is not an integer.
    """
    review_list = []
    overall_rating_list = []

    prefix, suffix = first_page_url.split('page=')
    # Drop the whole page number that followed 'page=' -- it may have more
    # than one digit, so removing a single character is not enough.
    suffix = suffix.lstrip('0123456789')

    # One session serves all pages and is closed when the loop is done,
    # instead of opening a new, never-closed session for every page.
    with HTMLSession() as session:
        # range() also terminates when max_page is zero or negative.
        for page in range(1, max_page + 1):
            url = prefix + 'page=' + str(page) + suffix
            response = session.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            get_parser(soup, review_list, overall_rating_list)

    df = pd.DataFrame({'review': np.asarray(review_list), 'overall rating': np.asarray(overall_rating_list)})
    # label to ID, {'5 stars':4, '4 stars':3, ... , '1 stars': 0}
    df['overall rating'] = df['overall rating'].astype(int) - 1
    return df

We would like to keep our target restaurants in a list of URLs, so we also need a function that processes every URL.  
It concatenates all reviews and returns them as a single DataFrame.

In [20]:
def get_data_from_page_list(url_list):
    """Scrape every restaurant in ``url_list`` into one DataFrame.

    For each URL the total page count is looked up first, then all of its
    review pages are scraped.

    Args:
        url_list: Iterable of first-page review URLs.

    Returns:
        A single DataFrame with the rows of all restaurants, in the order of
        ``url_list``, indexed 0..n-1.

    Raises:
        ValueError: If ``url_list`` is empty (``pd.concat`` has nothing to
            concatenate).
    """
    frames = [
        get_reviews_from_all_pages(url, get_total_pages(url))
        for url in url_list
    ]
    # Each per-restaurant frame is indexed from 0; without ignore_index the
    # combined frame would repeat index labels, which makes label-based
    # access such as df.loc[idx, ...] hit several rows at once.
    return pd.concat(frames, ignore_index=True)

Finally, we can test the pipeline with a list containing one URL.

In [22]:
# Smoke-test the scraping pipeline on a single restaurant. The URL must
# contain a 'page=' parameter, which get_reviews_from_all_pages rewrites
# for every page.
url = ['https://www.opentable.ca/r/the-keg-steakhouse-and-bar-north-york?originId=bcc0b7a5-d42e-468c-8a2d-985968665f45&corrid=bcc0b7a5-d42e-468c-8a2d-985968665f45&avt=eyJ2IjoyLCJtIjoxLCJwIjowLCJzIjowLCJuIjowfQ&page=1&sortBy=newestReview']
df = get_data_from_page_list(url)
print(df)


                                                 review  overall rating
0     The service was impeccable and the food was de...               4
1     Another grand slam for The Keg. Our group of 1...               4
2     My favourite steak house.\nIf anyone wants a d...               4
3     Always a good meal and great value at the Keg ...               3
4     Always reliable. Good food, service and ambian...               4
...                                                 ...             ...
1726  Hostess was very welcoming, server was great a...               4
1727  Great decor and the ambiance was perfect. The ...               4
1728  Great experience!! Will definitely come back a...               4
1729  Excellent food and excellent service make for ...               4
1730  This is a great addition to the neighborhood. ...               4

[1731 rows x 2 columns]


## Review Classification with BERT
Here we will fine-tune a pretrained model on our own data to perform the task.

### Pretrained model from Amazon product reviews on Kaggle dataset
This is a pretrained model that predicts a review's rating from 1 star to 5 stars, with class IDs running from 0 to 4. Its output matches our case, where the ratings also range from 1 to 5 stars. We also believe that product reviews and restaurant reviews are similar in content, so the model should perform well in transfer learning.  
We use 3 sentences to test the model below and see its expected output.

In [25]:
# Print DataFrames in full instead of truncating the rows.
pd.set_option('display.max_rows', None)
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer and classifier are loaded from the same pretrained checkpoint;
# forward_pass reads tokenizer, model and device as module-level globals.
tokenizer = AutoTokenizer.from_pretrained("LiYuan/amazon-review-sentiment-analysis")
model = AutoModelForSequenceClassification.from_pretrained("LiYuan/amazon-review-sentiment-analysis")
model = model.to(device)

def forward_pass(sentence):
    """Predict the rating class of a single review text.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        sentence: One review text.

    Returns:
        A ``(predicted_class_id, pred)`` tuple: the index of the largest
        logit and the matching entry of ``model.config.id2label``.
    """
    # truncation=True clips over-long reviews to the tokenizer's maximum
    # length, matching what preprocess_function does for training; without
    # it a long review is passed to the model unclipped.
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
    inputs = inputs.to(device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class_id = logits.argmax().item()
    pred = model.config.id2label[predicted_class_id]
    return predicted_class_id, pred

def get_review_classification(df):
    """Add a ``prediction`` column with the predicted label of every review.

    Args:
        df: DataFrame with a ``review`` column of texts. Modified in place.

    Returns:
        The same DataFrame, with ``prediction`` holding the label string
        that ``forward_pass`` returns for each row.
    """
    # forward_pass returns (class_id, label); only the label is stored, so
    # each cell holds one value instead of the whole tuple.
    # NOTE(review): use index 0 instead if the numeric class id is wanted.
    # Assigning the column in row order avoids Series.iteritems (removed in
    # pandas 2.0) and label-based writes, which hit several rows when the
    # index has duplicates.
    df['prediction'] = [forward_pass(review)[1] for review in df['review']]
    return df

# Try the pretrained model on three sample sentences and print the
# predicted class id and label for each.
sentence = ['I love this product', 'I hate this product', 'It is not bad']
for i in sentence:
    predicted_class_id ,pred = forward_pass(i)
    print(predicted_class_id, pred)

4 5 stars
0 1 star
3 4 stars


Finally, we selected a list of restaurants covering high to low ratings. We also included another branch of our target restaurant, in the hope that it provides the model with similar keywords.

In [26]:
# Training restaurants, ordered from high to low overall rating. Every URL
# contains exactly one 'page=' parameter, which get_reviews_from_all_pages
# splits on to build the URL of each page.
url_list = [
                # rating above 4
                'https://www.opentable.ca/miller-tavern?originId=9ebce773-3b2e-48fc-93bc-8b79a06e95bc&corrid=9ebce773-3b2e-48fc-93bc-8b79a06e95bc&avt=eyJ2IjoyLCJtIjoxLCJwIjowLCJzIjowLCJuIjowfQ&page=1&sortBy=newestReview',
                'https://www.opentable.ca/r/blue-blood-steakhouse-toronto?originId=1937513f-6bbc-4f51-b20e-444a14fea337&corrid=1937513f-6bbc-4f51-b20e-444a14fea337&avt=eyJ2IjoyLCJtIjoxLCJwIjowLCJzIjowLCJuIjowfQ&page=1&sortBy=newestReview',
                # same restaurant but different location
                'https://www.opentable.ca/the-keg-steakhouse-and-bar-york-street?corrid=b977b24e-4643-4356-9441-763d4bebd7cf&avt=eyJ2IjoyLCJtIjoxLCJwIjowLCJzIjoxLCJuIjowfQ&p=2&sd=2023-10-10T19%3A00%3A00&page=1&sortBy=newestReview',
                # rating 3.1
                'https://www.opentable.ca/r/chez-mal-manchester?page=1&sortBy=newestReview',
                # rating 2.9
                'https://www.opentable.ca/r/lookout-rooftop-boston?page=1&sortBy=newestReview',
                'https://www.opentable.ca/r/bar-31-shangri-la-the-shard-london?page=1&sortBy=newestReview',
                # rating 2.3
                'https://www.opentable.ca/pizza-rustica-restaurant-and-bar?originId=d084e009-f0b5-4a6f-8ba0-477c01aea935&corrid=d084e009-f0b5-4a6f-8ba0-477c01aea935&avt=eyJ2IjoyLCJtIjoxLCJwIjowLCJzIjoxLCJuIjowfQ&p=2&sd=2023-10-10T19%3A00%3A00&page=1&sortBy=newestReview',
                # rating 1.9
                'https://www.opentable.ca/r/lime-an-american-cantina-denver?page=1&sortBy=newestReview',
                # rating 1.6
                'https://www.opentable.ca/r/bourgee-lakeside-grays?page=1&sortBy=newestReview',
                # rating 1.3
                'https://www.opentable.ca/r/chophouse-363-chino?page=1&sortBy=newestReview'
                ]

# Evaluation restaurant; it is not part of url_list, so its reviews are
# held out from the training data.
eval_url = [
            'https://www.opentable.ca/r/the-keg-steakhouse-and-bar-north-york?originId=bcc0b7a5-d42e-468c-8a2d-985968665f45&corrid=bcc0b7a5-d42e-468c-8a2d-985968665f45&avt=eyJ2IjoyLCJtIjoxLCJwIjowLCJzIjowLCJuIjowfQ&page=1&sortBy=newestReview'
            ]


We saved the train and eval DataFrames as .csv files to save processing time in later training.

In [None]:
import os

# to_csv does not create missing directories, so make sure the output
# directory exists before the long scraping run instead of failing after it.
os.makedirs('./data', exist_ok=True)

# Scrape the training restaurants and the evaluation restaurant.
df = get_data_from_page_list(url_list)
# Named eval_df so that the builtin eval() is not shadowed.
eval_df = get_data_from_page_list(eval_url)

# The index is dropped: only the review text and rating are needed later.
df.to_csv('./data/train.csv', index=False)
eval_df.to_csv('./data/eval.csv', index=False)

### Model Training

In [27]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import evaluate
import numpy as np

def load_data(path, name):
    """Read a scraped-review CSV and wrap it as a ``Dataset``.

    The ``review`` and ``overall rating`` columns are renamed to ``text``
    and ``label``.

    Args:
        path: Location of the CSV file.
        name: Split name passed on to ``Dataset.from_pandas``.

    Returns:
        The resulting ``Dataset``.
    """
    column_names = {'review': 'text', 'overall rating': 'label'}
    frame = pd.read_csv(path).rename(columns=column_names)
    return Dataset.from_pandas(frame, split=name)

def forward_pass(sentence):
    """Predict the rating class of a single review text.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        sentence: One review text.

    Returns:
        A ``(predicted_class_id, pred)`` tuple: the index of the largest
        logit and the matching entry of ``model.config.id2label``.
    """
    # truncation=True clips over-long reviews to the tokenizer's maximum
    # length, matching what preprocess_function does for training; without
    # it a long review is passed to the model unclipped.
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
    inputs = inputs.to(device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class_id = logits.argmax().item()
    pred = model.config.id2label[predicted_class_id]
    return predicted_class_id, pred

def get_review_classification(df):
    """Add a ``prediction`` column with the predicted label of every review.

    Args:
        df: DataFrame with a ``review`` column of texts. Modified in place.

    Returns:
        The same DataFrame, with ``prediction`` holding the label string
        that ``forward_pass`` returns for each row.
    """
    # forward_pass returns (class_id, label); only the label is stored, so
    # each cell holds one value instead of the whole tuple.
    # NOTE(review): use index 0 instead if the numeric class id is wanted.
    # Assigning the column in row order avoids Series.iteritems (removed in
    # pandas 2.0) and label-based writes, which hit several rows when the
    # index has duplicates.
    df['prediction'] = [forward_pass(review)[1] for review in df['review']]
    return df

def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Uses the module-level ``tokenizer``; every text is padded to the maximum
    length and truncated when longer.

    Args:
        examples: Mapping with a ``"text"`` entry.

    Returns:
        The tokenizer output for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair of ``(logits, labels)``.

    Returns:
        The result of the ``accuracy`` metric, comparing the arg-max class
        of each row of logits with the reference labels.
    """
    accuracy = evaluate.load('accuracy')
    scores, references = eval_pred
    # The predicted class is the position of the largest logit on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=predicted_ids, references=references)

if __name__ == '__main__':
    # Fine-tune the pretrained review classifier on the scraped reviews and
    # evaluate it on the held-out restaurant.

    # pd.set_option('display.max_rows', None)
    # Run on the GPU when one is available, otherwise on the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Tokenizer and classifier come from the same pretrained checkpoint; the
    # helper functions above read them as module-level globals.
    tokenizer = AutoTokenizer.from_pretrained("LiYuan/amazon-review-sentiment-analysis")
    model = AutoModelForSequenceClassification.from_pretrained("LiYuan/amazon-review-sentiment-analysis")
    model = model.to(device)
    
    # Placeholder locations -- replace them with the real paths of the CSV
    # files and of the checkpoint directory before running.
    train_path = '/path to/data/train.csv'
    eval_path = '/path to/data/eval.csv'
    checkpoints_path = '/path to/checkpoints'

    # Load both CSV files as datasets, then tokenize them in batches.
    train_dataset = load_data(train_path, name='train')
    eval_dataset = load_data(eval_path, name='eval')
    train_dataset = train_dataset.map(preprocess_function, batched=True)
    eval_dataset = eval_dataset.map(preprocess_function, batched=True)

    # Evaluate and save a checkpoint at the end of every epoch; when training
    # ends, reload the checkpoint with the best accuracy.
    training_args = TrainingArguments(
        output_dir=checkpoints_path,
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=2,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy'
    )

    # compute_metrics supplies the 'accuracy' value named in
    # metric_for_best_model.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

Map:   0%|          | 0/13443 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7648,0.819088,0.685797
2,0.6385,0.889767,0.693913
