# Scraping Studycheck

## Setup

In [1]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

## Mittweida

In [11]:


def get_page_reviews(driver, page_url):
    """Fetches reviews from a single page using Selenium."""
    driver.get(page_url)
    time.sleep(5)  # Wait for the page to load and for any pop-ups to appear

    # Close the cookie consent pop-up if it appears
    try:
        consent_button = driver.find_element(By.XPATH, '//button[text()="Alles akzeptieren"]')
        consent_button.click()
    except Exception as e:
        print(f"No cookie consent pop-up found: {e}")

    # Extract the page source and parse with BeautifulSoup
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    review_elements = soup.select('.item-text')

    reviews = [review.get_text(strip=True) for review in review_elements]
    return reviews

def scrape_studycheck(base_url, total_pages):
    """Scrapes reviews from StudyCheck across multiple pages using Selenium."""
    all_reviews = []

    # Setup Selenium WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    for page_num in range(1, total_pages + 1):
        page_url = f"{base_url}/seite-{page_num}"
        print(f"Scraping page {page_num}...")
        reviews = get_page_reviews(driver, page_url)
        all_reviews.extend(reviews)
        time.sleep(2)  # Polite delay to avoid overloading the server

    driver.quit()
    return all_reviews

# Base URL for the reviews
base_url = "https://www.studycheck.de/studium/medienmanagement/hs-mittweida-429/bewertungen"

# Total number of pages to scrape
total_pages = 60

# Scrape the reviews
reviews = scrape_studycheck(base_url, total_pages)

# Save the reviews to a CSV file
df = pd.DataFrame(reviews, columns=['Review'])
df.to_csv('medienmanagement_mittweida_reviews.csv', index=False)

print("Scraping completed and saved to medienmanagement_mittweida_reviews.csv")


Scraping page 1...
No cookie consent pop-up found: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//button[text()="Alles akzeptieren"]"}
  (Session info: chrome=124.0.6367.209); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000104c1e940 chromedriver + 4368704
1   chromedriver                        0x0000000104c16dd4 chromedriver + 4337108
2   chromedriver                        0x000000010483ac04 chromedriver + 289796
3   chromedriver                        0x000000010487ce00 chromedriver + 560640
4   chromedriver                        0x00000001048b55ec chromedriver + 792044
5   chromedriver                        0x0000000104871ab4 chromedriver + 514740
6   chromedriver                        0x000000010487250c chromedriver + 517388
7   chromedriver                        0x0000000104be2e5c ch

In [14]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

def get_page_reviews(driver, page_url):
    """Fetches reviews from a single page using Selenium."""
    driver.get(page_url)
    time.sleep(5)  # Wait for the page to load and for any pop-ups to appear

    # Close the cookie consent pop-up if it appears
    try:
        consent_button = driver.find_element(By.XPATH, '//button[text()="Alles akzeptieren"]')
        consent_button.click()
    except Exception as e:
        print(f"No cookie consent pop-up found: {e}")

    # Extract the page source and parse with BeautifulSoup
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    review_elements = soup.select('.item-text')

    reviews = [review.get_text(strip=True) for review in review_elements]
    return reviews

def scrape_studycheck(base_url, total_pages):
    """Scrapes reviews from StudyCheck across multiple pages using Selenium."""
    all_reviews = []

    # Setup Selenium WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    for page_num in range(1, total_pages + 1):
        page_url = f"{base_url}/seite-{page_num}"
        print(f"Scraping page {page_num}...")
        reviews = get_page_reviews(driver, page_url)
        all_reviews.extend(reviews)
        time.sleep(2)  # Polite delay to avoid overloading the server

    driver.quit()
    return all_reviews

# Define the base URLs and total pages for each study program
study_programs = [
    {
        "name": "Medienmanagement Mittweida",
        "base_url": "https://www.studycheck.de/studium/medienmanagement/hs-mittweida-429/bewertungen",
        "total_pages": 60
    },
    {
        "name": "Online Medien Management HdM",
        "base_url": "https://www.studycheck.de/studium/medienmanagement/hdm-stuttgart-15774/bewertungen",
        "total_pages": 30
    },

    {
        "name": "Medien und Kommunikationsmanagement Macromedia Hochschule",
        "base_url": "https://www.studycheck.de/studium/medienmanagement/hs-macromedia-14035/bewertungen",
        "total_pages": 97
    }

    ,
    {
        "name": "Digital und Medienwirtschaft HdM",
        "base_url": "https://www.studycheck.de/studium/medienwirtschaft/hdm-stuttgart-16089/bewertungen",
        "total_pages": 60
    }
]

# Scrape reviews for each study program and save to separate CSV files
for program in study_programs:
    print(f"Scraping reviews for {program['name']}...")
    reviews = scrape_studycheck(program["base_url"], program["total_pages"])
    df = pd.DataFrame(reviews, columns=['Review'])
    csv_filename = f"{program['name'].replace(' ', '_').lower()}_reviews.csv"
    df.to_csv(csv_filename, index=False)
    print(f"Scraping completed and saved to {csv_filename}")


Scraping reviews for Digital und Medienwirtschaft HdM...
Scraping page 1...
No cookie consent pop-up found: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//button[text()="Alles akzeptieren"]"}
  (Session info: chrome=124.0.6367.209); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000101396940 chromedriver + 4368704
1   chromedriver                        0x000000010138edd4 chromedriver + 4337108
2   chromedriver                        0x0000000100fb2c04 chromedriver + 289796
3   chromedriver                        0x0000000100ff4e00 chromedriver + 560640
4   chromedriver                        0x000000010102d5ec chromedriver + 792044
5   chromedriver                        0x0000000100fe9ab4 chromedriver + 514740
6   chromedriver                        0x0000000100fea50c chromedriver + 517388
7   