In [5]:
!pip install beautifulsoup4
!pip install requests



# Scraping Healthwatch Suffolk reviews for Ipswich Hospital

In [6]:
# the relevant libraries are imported
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [7]:
customer_review = []

# define the functions outside the loop
def comm_head(review):
    """Return the comment header of a review.

    Looks up the first <h3> element inside ``review`` and returns its text
    with surrounding whitespace removed, or None when no <h3> is found.
    """
    heading = review.find('h3')
    return heading.text.strip() if heading else None
    

def dept(review):
    """Return the department named in a review.

    The department is read from the first <h5> element inside ``review``;
    its text is returned stripped of surrounding whitespace. Returns None
    when the review has no <h5> element.
    """
    dept_tag = review.find('h5')
    if not dept_tag:
        return None
    return dept_tag.text.strip()


def comment_date(review):
    """Return the date text of a review.

    Reads the first <div class="service-review-date"> inside ``review`` and
    returns its text without leading/trailing whitespace. The value is the
    text as found, not a parsed date. Returns None if the div is absent.
    """
    date_div = review.find('div', class_="service-review-date")
    if date_div:
        raw_date = date_div.text
        return raw_date.strip()
    return None


# ORIGINAL ratings() method

# def ratings(review):
#    rating_header = review.find('div', class_="service-review-rating star-rating-compact")
#    if rating_header:
#        score = rating_header.find("input", {"name": "score"})
#        score_rating = score["value"] if score else None
#        return score_rating
#    else:
#        return None

# PROPOSED ratings() method that can extract ratings from HealthWatchSuffolk 
#    website

# Actually, this was a challenging problem for various reasons, that I will 
#    explain below:

# Firstly, it seemed impossible to get the scraper to read the div with
#    class_="service-review-rating star-rating-compact". When inspecting the
#    element in the browser we could see the div, but the scraper always
#    reported back a null for it. After several attempts, I had a look at the
#    HTML in more detail: it seems that the div is populated dynamically using
#    jQuery and JavaScript. So instead, I aimed for the "script" tag, which
#    the scraper seemed to have access to and which seemed to return something.
#

# See #1 in the code. The div could not be read because it is a component
#   created dynamically by JavaScript when the page loads, and BeautifulSoup
#   only parses the HTML it is given; it does not run scripts.
# Solution: we read the script tag instead, as it holds the score
#   information needed.

# At this stage, I ended up with a long jQuery snippet in the Ratings field,
#   and this is where the trouble of converting it to a string and extracting
#   the number started.
#
#   This looked like this:
#
#   
#   jQuery(window).load(function () {
#        jQuery('#service-review-rating-214576').raty({
#            readOnly: true,
#            score: 5
#        });
#    });
#
# The next step was to try and somehow strip the text of everything before
#    and after 5 (the rating) and store this as a string value in the ratings()
#    method.
# Eventually and after trying different ways to tackle this, I
#    ended up with the solution below:

# See #2 in the code. The contents of the <script> tag need to be rendered as
#    a plain Python string, which is what the BeautifulSoup decode_contents()
#    method does. Its formatter parameter (formatter="minimal" or
#    formatter="html") only controls how special characters are escaped in
#    that string. Once we have a plain string, we can do the further
#    splitting and replacing described below.
#    
# Next, we need to split the long JQuery text. One way to do this is to use 
#    the partition() method.
#
# see in code #3. This will look for the string "score: " in the score_rating
#    string and split it in three parts as follows:

# [part 1] with index [0]

# jQuery(window).load(function () {
#        jQuery('#service-review-rating-214576').raty({
#            readOnly: true,

# [part 2] with index [1]

# score: 

# [part 3] with index [2]

# 5
#        });
#    });

# Of these three, we only need to keep [part 3] with index [2] and this is why
#    [2] is included in 

#    score_rating=score_rating.partition("score: ")[2]

# The last problem to solve was the code failing after reading the first entry
#    of each page, which holds the contents from the form (see my comment below
#    on including not all <h3> tags but only those under <div class="mt-5">).
# See #4 in the code. This ensures that the code does not fail if one of the
#    ratings fields is empty, which happens every time the crawler reads the
#    start of each page. Only if the text that follows "score: " begins with a
#    number is that number returned - see #5 in the code. The whole number is
#    kept (e.g. "10" or "4.5"), not just its first character.
#
#  Otherwise it returns "No score rating available" - see #6 in the code.

def ratings(review):
    """Extract a review's rating from the jQuery snippet in its <script> tag.

    Parameters:
        review: an element exposing ``find`` (a BeautifulSoup tag in this
            notebook) that is searched for its first <script> tag.

    Returns:
        str: the number that follows "score: " in the script text, as a
        string (e.g. "5"); "No score rating available" when the script holds
        no numeric score; "No ratings class found" when ``review`` contains
        no <script> tag.
    """
    rating_header = review.find("script")  #1
    if not rating_header:
        return "No ratings class found"

    script_text = rating_header.decode_contents(formatter="minimal")  #2
    after_label = script_text.partition("score: ")[2]  #3
    # re.match anchors at the start of the remainder, so only a number placed
    # directly after "score: " counts; taking the whole match (rather than the
    # first character) keeps multi-digit and decimal scores intact and rejects
    # non-numeric values such as "null".
    score_match = re.match(r"\s*(\d+(?:\.\d+)?)", after_label)  #4
    if score_match:
        return score_match.group(1)  #5
    return "No score rating available"  #6




def comment_body(review):
    """Return the main text of a review.

    The text comes from the first <div class="service-review-body"> inside
    ``review``, with surrounding whitespace stripped. Returns None when the
    review has no such div.
    """
    body_div = review.find('div', class_="service-review-body")
    return None if not body_div else body_div.text.strip()


In [8]:
    
# Each page is looped through to extract data to be appended to the
# customer_review list. The list is reset here so that re-running this cell
# does not append the same reviews a second time.
customer_review = []

# Request settings do not change between pages, so they are defined once.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
REQUEST_TIMEOUT_SECONDS = 10  # without a timeout, requests.get can wait forever on a stalled server
PAGE_DELAY_SECONDS = 2        # pause between page requests to avoid overwhelming the server

for i in range(1, 3):  # pages 1 and 2
    url = f'https://healthwatchsuffolk.co.uk/services/ipswich-hospital-ipswich-1?page={i}'

    try:
        hw_reviews = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        if hw_reviews.status_code != 200:
            print(f"Failed to retrieve data from page {i}. Status code: {hw_reviews.status_code}")
            continue

        response = hw_reviews.content
        soup = BeautifulSoup(response, 'html.parser')

        # NOTE(review): the first 'service-review' div on each page appears to
        # be the review submission form rather than a review (it yields a row
        # with only a header and no date/body) - confirm and filter downstream.
        review_articles = soup.find_all('div', class_='service-review')
        for review in review_articles:
            customer_review.append([comm_head(review), dept(review), comment_date(review), ratings(review), comment_body(review)])

        time.sleep(PAGE_DELAY_SECONDS)
    except Exception as e:
        # Best effort: report the failing page (including timeouts) and carry
        # on with the next one instead of aborting the whole scrape.
        print(f"Error while processing page {i}: {e}")
        continue

# Create a DataFrame from the customer_review list
df = pd.DataFrame(customer_review, columns=['Comment Header', 'Department', 'Comment Date', 'Ratings', 'Comment Body'])

# Write the DataFrame to a CSV file named 'customer_reviews.csv'
df.to_csv('../data/healthwatchsuffolk/customer_reviews.csv', index=False)


In [9]:

# Preview the first 40 rows of the scraped reviews. As the last expression in
# the cell, the frame is rendered as a rich table, so print(df) is not needed.
df.head(n=40)

Unnamed: 0,Comment Header,Department,Comment Date,Ratings,Comment Body
0,Your ratings (select if applicable),,,No score rating available,
1,Brilliant care,Diabetic Medicine,"July 19, 2023",5,The care provided by the hospital for my child...
2,I was treated very well,Endoscopy,"July 18, 2023",5,I was booked in for a colonoscopy. Before I at...
3,Appreciation of care at breast screening,Imaging,"July 15, 2023",5,I have an unusual disability of severe light s...
4,My medication wasn't locked away,wards,"July 12, 2023",4,I went in to have an operation on my parathyro...
5,Please look at people with hearing loss,Ear Nose and Throat,"July 12, 2023",3,I have found that consultatnts in the ENT depa...
6,A very good treatment for colonoscopy.,unknown,"July 10, 2023",5,I went to Ipswich Hospital for colonoscopy thi...
7,Cancer treatment great,Cancer services,"July 10, 2023",5,I recently finished my chemo and ratdiotherapy...
8,Hip replacement criteria is strict,Orthopaedics,"July 5, 2023",1,I needed a hip replacement but was told that I...
9,Amazing cancer care,Cancer services,"July 5, 2023",5,Cancer treatment and care from the first time ...
