# KT Yelp Review Auto-Responder

In [136]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from bs4 import BeautifulSoup
import re
import html
import json
from datetime import datetime, timedelta

import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

In [84]:
locations_df_temp = pd.read_excel("KT Locations Data.xlsx")
locations_df = locations_df_temp.copy()
business_ids = locations_df['Yelp_Bus_Id'].tolist()
reviews_data = []

current_date = datetime.now()
three_months_ago = current_date - timedelta(days=90)

## Web Scraping

In [3]:
# given a 'list', list_search searches for the given string 'term' and will output whatever is in the position 
# 'num' off of the position of 'term'
def list_search(list, term, num):
    
    indices = [index for index, item in enumerate(list) if item == term]
    index = indices[0]
    target_val = list[index + num]
    
    return target_val

In [4]:
# pulls the 10 most recent reviews from yelp page given business_id and outputs them as a df
def yelp_review_scraper(business_id):   
    search_url = f"https://www.yelp.com/biz/{business_id}?sort_by=date_desc"
    search_response = requests.get(search_url)

    # Check if the request was successful (status code 200)
    if search_response.status_code == 200:
        soup = BeautifulSoup(search_response.text, 'html.parser')

    # find matches for set pattern
    soup_string = str(soup)
    soup_string = soup_string.replace('null', '"%"')
    pattern = r'"reviews":\[(.*?)\](.*?)\](.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\[(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\}(.*?)\['
    matches = re.findall(pattern, soup_string)

    # convert tuple matches in match to list
    matches_new = [list(t) for t in matches]

    # cut lists at stop list to get only the data we need
    stop_phrase = '"tags":'

    new_list = []
    current_sublist = []

    for item in matches_new[0]:
        if stop_phrase in item:
            if current_sublist:
                current_sublist.append(item)
                new_list.append(current_sublist)
            current_sublist = []
        else:
            current_sublist.append(item)

    if current_sublist:
        new_list.append(current_sublist)

    # handle the issue of nested lists
    too_many_lists = [["".join(sublist)] for sublist in new_list]
    flattened_list = [item for sublist in too_many_lists for item in sublist]
    filtered_list = [item for item in flattened_list if stop_phrase in item]

    # extract the data we actually want from individual review JSON data
    names = []
    review_texts = []
    dates = []
    ratings = []
    review_ids = []
    already_replieds = []

    for match in filtered_list:
                filtered_list = match.split('"')
                filtered_list = [item for item in filtered_list if item != ',']
                name = list_search(filtered_list, 'markupDisplayName', 2)
                review_text = list_search(filtered_list, 'text', 2)
                date = datetime.strptime(list_search(filtered_list, 'localizedDate', 2), '%m/%d/%Y')
                rating = int(list_search(filtered_list, 'rating', 1).strip(':,'))
                review_id = list_search(filtered_list, 'id', 2)
            
                businessOwnerReplies = list_search(filtered_list, 'businessOwnerReplies', 2)
            
                if businessOwnerReplies == '%':
                    already_replied = False
                else:
                    already_replied = True
                names.append(name)
                review_texts.append(review_text)
                dates.append(date)
                ratings.append(rating)
                review_ids.append(review_id)
                already_replieds.append(already_replied)
            
    data = {
        "names": names,
        "review_text": review_texts,
        "date": dates,
        "rating": ratings,
        "review_id": review_ids,
        "already_replied": already_replieds
    }

    df = pd.DataFrame(data)

    return df           

In [7]:
dfs = []

# Loop through each business_id
for business_id in business_ids:
    df = yelp_review_scraper(business_id)
    if df is not None:
        df['business_id'] = business_id
        dfs.append(df)

In [116]:
# Concatenate all the DataFrames in the list
yelp_reviews = pd.concat(dfs, ignore_index=True)

# Replace business_id with Clinic in df
merged_df = yelp_reviews.merge(locations_df[['Yelp_Bus_Id', 'Clinic']], left_on='business_id', right_on='Yelp_Bus_Id', how='left')

# Drop the 'business_id' column and rename the 'Clinic' column
merged_df.drop(columns=['business_id'], inplace=True)
merged_df.rename(columns={'Yelp_Bus_Id': 'business_id'}, inplace=True)
yelp_reviews = merged_df

yelp_reviews['review_text'] = yelp_reviews['review_text'].apply(lambda x: x.replace('\xa0', ''))
yelp_reviews['review_text'] = yelp_reviews['review_text'].apply(lambda x: x.replace('&amp;#39;', "'"))
yelp_reviews['review_text'] = yelp_reviews['review_text'].apply(lambda x: x.replace('<br&gt;', ''))


  yelp_reviews = pd.concat(dfs, ignore_index=True)


In [117]:
yelp_reviews

Unnamed: 0,names,review_text,date,rating,review_id,already_replied,business_id,Clinic
0,Elizabeth F.,This office new staff is horrible. I used to l...,2023-09-22,1.0,YF6jJ3EZtimxRcpWDgRM0Q,0.0,kids-and-teens-medical-group-northridge-northr...,Northridge
1,Martha G.,Dr Benjamin is the best she so nice and patien...,2023-03-19,5.0,4zb5jC0Y8vium32lovjqpQ,1.0,kids-and-teens-medical-group-northridge-northr...,Northridge
2,Ehis I.,"This place should be closed honestly , the sta...",2023-03-07,1.0,uxNYDRtjCawALeK64CkE6w,1.0,kids-and-teens-medical-group-northridge-northr...,Northridge
3,Richard C.,"They don't deserve 1 star, yelp should allow 0...",2022-12-28,1.0,xkyt-QoHzSvsvbWYsJnWDw,1.0,kids-and-teens-medical-group-northridge-northr...,Northridge
4,Diana D.,We had to do a blood test for our daughter and...,2022-11-18,5.0,OhwsOar1NOrXrwwJu4cCxw,1.0,kids-and-teens-medical-group-northridge-northr...,Northridge
...,...,...,...,...,...,...,...,...
98,Maria B.,I scheduled an appointment and specified and a...,2023-08-18,1.0,WBckyR48WhpjM5jVnPr75A,1.0,kids-and-teens-medical-group-pico-rivera-pico-...,Pico Rivera
99,Emily R.,I was in yesterday to see Dr Man and let me te...,2023-06-28,5.0,z5xb3-6aBisThXgNhARyCA,0.0,kids-and-teens-medical-group-pico-rivera-pico-...,Pico Rivera
100,Monica R.,This 3rd party shit is the stupidest thing I'v...,2023-06-09,1.0,mCXcwAb-gICj3maVPJEGjw,0.0,kids-and-teens-medical-group-pico-rivera-pico-...,Pico Rivera
101,Liza S.,I recently have gone to this doctor. I have an...,2021-09-08,1.0,Rk6XLnfoCqdnrYrjGYYI6g,0.0,kids-and-teens-medical-group-arcadia-arcadia,Arcadia


## Sentiment Analysis

In [137]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x2053975b450>

In [151]:
yelp_reviews['review_text'][5]
yelp_reviews['rating'][5]

2.0

In [165]:
filtered_reviews = yelp_reviews[(yelp_reviews['rating'] != 1) & (yelp_reviews['rating'] != 5)]
filtered_review_texts = filtered_reviews['review_text'].tolist()
filtered_review_texts[3]

"We were a patient of Dr. Saade for 11 years. When the practice was turned over to &amp;#34;Kids and Teens&amp;#34; earlier this year we started being seen by Dr. Mann. Dr. Mann is an incredibly kind, patient and lovely physician. We were thrilled. Unfortunately, not two months later, Dr. Mann was transferred out of the Kids and Teens network. The office couldn't even tell us who was going to be taking over the practice at this location. Kids and Teens patient services calls are super generic and impersonal. It seems like a large network but there has been zero effort to inform patients of what is happening at this location. We will be looking for a new physician that makes us feel like people - to Kids and Teens we clearly are just numbers."

In [166]:
doc = nlp(filtered_review_texts[3])

print(doc._.blob.polarity)
print(doc._.blob.subjectivity)

0.2485438803620622
0.5257378984651712


5 - Generic thank you, parse through 5 different ones with rng

4 - Generic Thank you, if theres any issue: email

1 - if theres any issue email

Hi F_name,
Thank you....

## Respond to Reviews

In [167]:
responses = []

for index, row in yelp_reviews.iterrows():
    review = pd.DataFrame(row).T
    already_replied = review['already_replied'].values[0]
    date = pd.to_datetime(review['date'].values[0])

    if (already_replied == 0) and (date > three_months_ago):
        response = 
        responses.append(response)

SyntaxError: invalid syntax (96363527.py, line 9)