# Scraping UKL Reviews from  [klinikbewertungen.de](https://www.klinikbewertungen.de/klinik-forum/erfahrung-mit-uniklinik-leipzig/bewertungen?allbew) 

In [2]:
# import libaries

import requests
from bs4 import BeautifulSoup
import csv
import json
import time
import random
from datetime import datetime
import pandas as pd
import re  # regular expression library, for year of stay

In [None]:
# Scrape function with handling if a item is not in HTML 
# change: get date comment UKL, better year of stay, (auhor email protected), get stars per aspect, get if review is used for overall score

def scrape_reviews(url):
    """
    Scrape hospital reviews from the given URL, including ratings, comments, and metadata.
    Handles missing HTML elements gracefully and includes anti-scraping measures.

    Requires:
        requests, BeautifulSoup (from bs4), datetime, re, time, random

    Parameters:
        url (str): URL of the hospital reviews page to scrape.

    Returns:
        list: A list of dictionaries, where each dictionary contains:
            - Review metadata (date, author, role, etc.)
            - Star ratings for various aspects (treatment, advice, etc.)
            - Review text, pros/cons, and comments (if any).
            Returns an empty list if the request fails.

    Notes:
        - Uses random delays (2-5 seconds) between requests to avoid detection.
        - Parses dates in 'dd.mm.yyyy' or 'yyyy-mm-dd' formats.
        - Converts 'ja'/'nein' for private insurance to True/False.
        - Extracts star ratings from image class names (e.g., 'star-5').
    """

    # Send a request to the website
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all review articles
        reviews = soup.find_all('article', class_='bewertung')

        # List to store each review's data
        review_data = []

        # Loop through each review
        for review in reviews:
            # Extract the various pieces of information
            review_info = {}
            
            # 1. datePublished
            date_published = review.find('meta', itemprop='datePublished')
            review_info['date_published'] = datetime.strptime(date_published['content'], '%Y-%m-%d').date() if date_published else None
            
            # 2. ratingValue
            rating_value = review.find('meta', itemprop='ratingValue')
            review_info['rating_value'] = float(rating_value['content']) if rating_value else None
            
            # 3. name (headline)
            headline = review.find('h2', itemprop='name')
            review_info['headline'] = headline.get_text(strip=True) if headline else None
            
            # Extract "for_hospital_rating" information
            for_hospital_rating_text = review.find('p', style='color:red')
            if for_hospital_rating_text and "Diese Bewertung geht nicht in die Gesamtbewertung ein" in for_hospital_rating_text.get_text():
                review_info['for_hospital_rating'] = False
            else:
                review_info['for_hospital_rating'] = True

            # 4. department
            department_link = review.find('a', class_='js-tooltip')
            review_info['department'] = department_link.get_text(strip=True) if department_link else None

            # 5. author
            author = review.find('span', itemprop='author')
            review_info['author'] = author.get_text(strip=True) if author else None
            
            # 6. role
            role_text = review.get_text()
            if 'berichtet als Angehöriger eines Patienten' in role_text:
                review_info['role'] = 'Angehöriger'
            elif 'berichtet als Klinikmitarbeiter' in role_text:
                review_info['role'] = 'Klinikmitarbeiter'
            elif 'berichtet als Arzt oder Einweiser' in role_text:
                review_info['role'] = 'Arzt oder Einweiser'
            elif 'berichtet als sonstig Betroffener' in role_text:
                review_info['role'] = 'sonstig Betroffener'
            elif 'berichtet als Patient' in role_text:
                review_info['role'] = 'Patient'
            else:
                review_info['role'] = 'Unknown'  # Fallback if no role is matched

            # 7. year_of_stay
            year_of_stay = review.find(text=lambda x: 'Jahr der Behandlung' in x)
            if year_of_stay:
                # Extract year using regex to match only the first set of digits 4 or 2
                match = re.search(r'(\d{4}|\d{2})', year_of_stay.split(":")[1])
                review_info['year_of_stay'] = int(match.group(0)) if match else None
            else:
                review_info['year_of_stay'] = None

            # Extract the recommendation status
            recommends_hospital = None
            recommendation_image = review.find('img', alt='Empfehlung')
            if recommendation_image:
                img_src = recommendation_image.get('src', '')
                if 'icon-recommend-me-fill.png' in img_src:
                    recommends_hospital = True
                elif 'icon-no-recommend-me-fill.png' in img_src:
                    recommends_hospital = False
            
            review_info['recommends_hospital'] = recommends_hospital

            # Extract information from the "rating" section
            rating_section = review.find('section', class_='rating')

            # Function to extract star ratings
            def extract_star_rating(dd_element):
                img_tag = dd_element.find('img', class_=lambda x: x and 'star-' in x)
                if img_tag:
                    # Extract the class containing "star-" and get the number after it
                    for class_name in img_tag['class']:
                        if 'star-' in class_name:
                            return int(class_name.split('star-')[-1])
                return None

            
            # 8. overall satisfaction
            overall_satisfaction = rating_section.find('dt', text='Gesamtzufriedenheit:')
            overall_satisfaction_dd = overall_satisfaction.find_next_sibling('dd') if overall_satisfaction else None
            review_info['overall_satisfaction_stars'] = extract_star_rating(overall_satisfaction_dd) if overall_satisfaction_dd else None
            review_info['overall_satisfaction'] = overall_satisfaction_dd.get_text(strip=True) if overall_satisfaction_dd else None

            # 9. quality of advice
            quality_of_advice = rating_section.find('dt', text='Qualität der Beratung:')
            quality_of_advice_dd = quality_of_advice.find_next_sibling('dd') if quality_of_advice else None
            review_info['quality_of_advice_stars'] = extract_star_rating(quality_of_advice_dd) if quality_of_advice_dd else None
            review_info['quality_of_advice'] = quality_of_advice_dd.get_text(strip=True) if quality_of_advice_dd else None
            
            # 10. medical treatment
            medical_treatment = rating_section.find('dt', text='Mediz. Behandlung:')
            medical_treatment_dd = medical_treatment.find_next_sibling('dd') if medical_treatment else None
            review_info['medic_treatment_stars'] = extract_star_rating(medical_treatment_dd) if medical_treatment_dd else None
            review_info['medic_treatment'] = medical_treatment_dd.get_text(strip=True) if medical_treatment_dd else None
            
            # 11. administration and processes
            administration = rating_section.find('dt', text='Verwaltung und Abläufe:')
            administration_dd = administration.find_next_sibling('dd') if administration else None
            review_info['administration_and_processes_stars'] = extract_star_rating(administration_dd) if administration_dd else None
            review_info['administration_and_processes'] = administration_dd.get_text(strip=True) if administration_dd else None
            
            # 12. equipment and design
            equipment_design = rating_section.find('dt', text='Ausstattung und Gestaltung:')
            equipment_design_dd = equipment_design.find_next_sibling('dd') if equipment_design else None
            review_info['equipment_and_design_stars'] = extract_star_rating(equipment_design_dd) if equipment_design_dd else None
            review_info['equipment_and_design'] = equipment_design_dd.get_text(strip=True) if equipment_design_dd else None

            # Extract information from the "report" section
            report_section = review.find('section', class_='report')

            # 13. pro of stay
            pro = report_section.find('dt', text='Pro:')
            review_info['pro'] = pro.find_next_sibling('dd').get_text(strip=True) if pro else None

            # 14. contra of stay
            contra = report_section.find('dt', text='Kontra:')
            review_info['contra'] = contra.find_next_sibling('dd').get_text(strip=True) if contra else None

            # 15. disease
            disease = report_section.find('dt', text='Krankheitsbild:')
            review_info['disease'] = disease.find_next_sibling('dd').get_text(strip=True) if disease else None

            # 16. private insurance
            private_insurance = report_section.find('dt', text='Privatpatient:')
            if private_insurance:
                insurance_status = private_insurance.find_next_sibling('dd').get_text(strip=True)
                if insurance_status.lower() == 'ja':
                    review_info['private_insurance'] = True
                elif insurance_status.lower() == 'nein':
                    review_info['private_insurance'] = False
                else:
                    review_info['private_insurance'] = None  # For any other text or if the value is not clear
            else:
                review_info['private_insurance'] = None  # If the 'Privatpatient' entry is not found

            # 17. patient review (reviewBody)
            review_body = report_section.find('p', itemprop='reviewBody')
            review_info['patient_review'] = review_body.get_text(strip=True) if review_body else None

            # Extract comments if present
            comments_section = review.find('section', class_='comments')
            comments = comments_section.find_all('dl') if comments_section else []

            # Extract each comment's details
            comment_data = []
            for comment in comments:
                comment_info = {}
                # 18. Commentator
                commentator = comment.find('a', class_='link')
                comment_info['commentator'] = commentator.get_text(strip=True) if commentator else None

                # 19. Date of Comment
                comment_date = comment.find('dt')
                if comment_date:
                    date_text = comment_date.get_text(strip=True)
                    # Extract the date part (should be after "am ")
                    if 'am ' in date_text:
                        date_text = date_text.split('am ')[-1]
                        # Convert the date from 'dd.mm.yyyy' to 'yyyy-mm-dd'
                        try:
                            # Parse the date in the format 'dd.mm.yyyy'
                            parsed_date = datetime.strptime(date_text, '%d.%m.%Y')
                            # Format it into 'yyyy-mm-dd'
                            formatted_date = parsed_date.strftime('%Y-%m-%d')
                            date_object = datetime.strptime(formatted_date, '%Y-%m-%d')
                            comment_info['date_of_comment'] = date_object.date()
                        except ValueError:
                            comment_info['date_of_comment'] = None
                    else:
                        comment_info['date_of_comment'] = None
                else:
                    comment_info['date_of_comment'] = None

                # 20. Comment Text
                comment_text = comment.find('dd')
                comment_info['comment'] = comment_text.get_text(strip=True) if comment_text else None

                # Add the comment info to the list of comments
                comment_data.append(comment_info)
            
            # Add the comments to the review info
            review_info['comments'] = comment_data

            # Append the extracted review data to the list
            review_data.append(review_info)
            
            # Add a pause to avoid being flagged by the server
            time.sleep(random.uniform(2, 5))  # Pause for 2 to 5 seconds randomly

    else:
        print(f"Failed to fetch the webpage. Status code: {response.status_code}")
    
    return review_data


In [10]:
# scraping the website

# URL of the website to scrape
url = "https://www.klinikbewertungen.de/klinik-forum/erfahrung-mit-uniklinik-leipzig/bewertungen?allbew"
review_data = scrape_reviews(url)

  year_of_stay = review.find(text=lambda x: 'Jahr der Behandlung' in x)
  overall_satisfaction = rating_section.find('dt', text='Gesamtzufriedenheit:')
  quality_of_advice = rating_section.find('dt', text='Qualität der Beratung:')
  medical_treatment = rating_section.find('dt', text='Mediz. Behandlung:')
  administration = rating_section.find('dt', text='Verwaltung und Abläufe:')
  equipment_design = rating_section.find('dt', text='Ausstattung und Gestaltung:')
  pro = report_section.find('dt', text='Pro:')
  contra = report_section.find('dt', text='Kontra:')
  disease = report_section.find('dt', text='Krankheitsbild:')
  private_insurance = report_section.find('dt', text='Privatpatient:')


In [11]:
# Save the Data

# Get today's date in the desired format (e.g., '2025-01-17')
today_date = datetime.now().strftime('%Y-%m-%d')

# Create the filename with today's date
filenamejson = f'reviews_{today_date}.json'

# Save as a JSON file with the formatted filename
with open(filenamejson, 'w', encoding='utf-8') as json_file:
    json.dump(review_data, json_file, default=str, ensure_ascii=False, indent=4)

In [12]:
# Safe the data

# Get today's date in the desired format (e.g., '2025-01-17')
today_date = datetime.now().strftime('%Y-%m-%d')

# Create the filename with today's date
filenamecsv = f'reviews_{today_date}.csv'

# Save as a CSV file
keys = review_data[0].keys() if review_data else []
with open(filenamecsv, 'w', newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=keys)
    dict_writer.writeheader()
    dict_writer.writerows(review_data)

#### Look at the scraped data

In [None]:
# Step 1: Create a DataFrame from the list of dictionaries
df = pd.DataFrame(review_data)

# Step 2: Drop the 'comments' column
df.drop(columns=['comments'], inplace=True)

r, c = df.shape
print(f"The data has {r} rows and {c} columns")

df.head()

The data has 485 rows and 24 columns


Unnamed: 0,datePublished,ratingValue,headline,for_hospital_rating,department,author,role,year_of_stay,recommends_hospital,overall_satisfaction_stars,...,medic_treatment,administration_and_processes_stars,administration_and_processes,equipment_and_design_stars,equipment_and_design,pro,contra,disease,private_insurance,patient_review
0,2024-11-22,1.0,Eine traumatisierende Erfahrung,True,Frauen,LauraLara25,Patient,2024.0,False,0,...,unzufrieden,2,weniger zufrieden (Die Operation wurde einmal ...,2.0,weniger zufrieden (Im Sommer müssen die Patien...,Viele Ärzte und Krankenschwestern waren sehr k...,Die Organisation und der Personalmangel machte...,Gebärmuttermyom mit Verdacht auf Endometriose ...,False,"Obwohl mir gesagt wurde, dass ich gesund und f..."
1,2024-11-15,4.0,endlich eine Diagnose,True,Neurologie,Alex847,Patient,2021.0,True,4,...,zufrieden,2,weniger zufrieden,4.0,zufrieden,Qualität der Behandlung bzw. Untersuchung,Essensangebot teilw. ungenügend,Heritäre Spastische Paraplegie,,Endlich hatte ich eine Diagnose...Der doch jun...
2,2024-10-11,2.0,Jeder artz sagt etwas anderes,True,Orthopädie,Kassenpatienten66,Patient,24.0,False,2,...,weniger zufrieden,2,weniger zufrieden,2.0,weniger zufrieden,Zum Teil freundliches Personal,Keine Aussage zum tages Ablauf für den Patienten,,False,Habe nach 14 tägigen schmerzterapie einen Term...
3,2024-10-02,5.0,gerne wieder,True,Gastrologie,hojome2,Patient,2024.0,True,6,...,sehr zufrieden,6,sehr zufrieden,6.0,sehr zufrieden,alles wurde sehr gut erklärt,,Stantwechsel,True,Ich hatte gestern einen Termin zum Entfernen e...
4,2024-09-09,4.0,Kein Bericht zur Behandlung,True,Neurologie,Kirsche02,Patient,2024.0,False,4,...,zufrieden,4,zufrieden,4.0,zufrieden,Hab mich wohl gefühlt,Kein Bericht keine weitere Behandlung für umso...,Long Covid,False,Ich war sehr glücklich einen Termin in der Lon...


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 483 entries, 0 to 482
Data columns (total 24 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   datePublished                       483 non-null    object 
 1   ratingValue                         483 non-null    float64
 2   headline                            483 non-null    object 
 3   for_hospital_rating                 483 non-null    bool   
 4   department                          384 non-null    object 
 5   author                              483 non-null    object 
 6   role                                483 non-null    object 
 7   year_of_stay                        482 non-null    float64
 8   recommends_hospital                 404 non-null    object 
 9   overall_satisfaction_stars          483 non-null    int64  
 10  overall_satisfaction                483 non-null    object 
 11  quality_of_advice_stars             483 non-n

In [16]:
df.recommends_hospital.unique()

array([False, True, None], dtype=object)

In [17]:
df.private_insurance.unique()

array([False, True, None], dtype=object)

In [82]:
df.year_of_stay.unique()

array([  24., 2024., 2023., 2034., 2022.,   22.,   23., 2021., 2020.,
         21., 2019., 2015.,   20.,    3., 2018.,   19.,   17., 2014.,
       2017.,   18., 2016.,   16., 1995., 1996.,   nan,   11.,   12.,
         10., 1998., 1980., 2013.,   15., 2009., 2008.,   14., 1966.,
       2011., 2010., 2012., 2007., 2005., 2006.,    9., 2003.,    8.,
       2002.])

In [65]:
df[df.year_of_stay == None]

Unnamed: 0,datePublished,ratingValue,headline,for_hospital_rating,department,author,role,year_of_stay,overall_satisfaction_stars,overall_satisfaction,...,medic_treatment,administration_and_processes_stars,administration_and_processes,equipment_and_design_stars,equipment_and_design,pro,contra,disease,private_insurance,patient_review


In [None]:
pd.options.display.max_colwidth = None # default value is 50, max would be "None"
pd.set_option('display.max_rows', 50) # default value is 10, max would be "None"

example = random.sample(list(df.rating_value[df.rating_value == 4.0]), k=1) 
df.query('rating_value==@example')