# Scraping Studycheck

## Setup

Create virtual environment (once)

```sh
conda create -n sentiment python=3.11 pip
```

Activate environment

```sh
conda activate sentiment
```

Install modules (once)

```sh
pip install jupyter ipykernel pandas selenium webdriver-manager beautifulsoup4 nltk spacy wordcloud matplotlib altair
```

Import modules (always)

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

## Scraping

In [None]:
def get_page_reviews(driver, page_url, wait_seconds=5):
    """Fetch the review texts shown on a single page using Selenium.

    Args:
        driver: Selenium WebDriver instance used to load the page.
        page_url: URL of the page to load.
        wait_seconds: Seconds to pause after requesting the page so that the
            page and any pop-ups have time to appear. Defaults to 5.

    Returns:
        A list with the whitespace-stripped text of every element matching
        the ``.item-text`` CSS selector; empty if the page has none.
    """
    # Local import keeps this function self-contained; selenium is already
    # a dependency of this notebook.
    from selenium.common.exceptions import WebDriverException

    driver.get(page_url)
    time.sleep(wait_seconds)  # Wait for the page to load and for any pop-ups to appear

    # find_elements returns an empty list when nothing matches, so a missing
    # pop-up (the normal case once consent was given) is not treated as an error.
    consent_buttons = driver.find_elements(By.XPATH, '//button[text()="Alles akzeptieren"]')
    if consent_buttons:
        try:
            consent_buttons[0].click()
        except WebDriverException as e:
            # Best effort: a failed click must not abort the scrape, but only
            # Selenium errors are tolerated, not arbitrary programming errors.
            print(f"Could not close cookie consent pop-up: {e}")

    # Extract the page source and parse with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return [review.get_text(strip=True) for review in soup.select('.item-text')]


In [None]:
def scrape_studycheck(base_url, total_pages):
    """Scrape reviews from StudyCheck across multiple pages using Selenium.

    Args:
        base_url: URL of the review listing; ``/seite-<n>`` is appended to it
            for each page.
        total_pages: Number of pages to visit, starting at page 1.

    Returns:
        A list of the review texts of all visited pages, in page order.
    """
    all_reviews = []

    # Setup Selenium WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    # try/finally guarantees the browser is shut down even when a page fails
    # to load or parse; otherwise the Chrome process would be left running.
    try:
        for page_num in range(1, total_pages + 1):
            page_url = f"{base_url}/seite-{page_num}"
            print(f"Scraping page {page_num}...")
            all_reviews.extend(get_page_reviews(driver, page_url))
            time.sleep(2)  # Polite delay to avoid overloading the server
    finally:
        driver.quit()

    return all_reviews


In [None]:

# Study programs to scrape: each entry holds a display name, the URL of the
# program's review listing, and the number of listing pages to visit
study_programs = [
    {"name": name, "base_url": base_url, "total_pages": total_pages}
    for name, base_url, total_pages in (
        (
            "Medienmanagement Mittweida",
            "https://www.studycheck.de/studium/medienmanagement/hs-mittweida-429/bewertungen",
            60,
        ),
        (
            "Online Medien Management HdM",
            "https://www.studycheck.de/studium/medienmanagement/hdm-stuttgart-15774/bewertungen",
            30,
        ),
        (
            "Medien und Kommunikationsmanagement Macromedia Hochschule",
            "https://www.studycheck.de/studium/medienmanagement/hs-macromedia-14035/bewertungen",
            97,
        ),
        (
            "Digital und Medienwirtschaft HdM",
            "https://www.studycheck.de/studium/medienwirtschaft/hdm-stuttgart-16089/bewertungen",
            60,
        ),
    )
]


This script may take a while to finish.

In [None]:

# For every configured study program: collect the reviews from all of its
# pages, then write them to a CSV file named after the program
for program in study_programs:
    print(f"Scraping reviews for {program['name']}...")
    reviews = scrape_studycheck(
        base_url=program["base_url"],
        total_pages=program["total_pages"],
    )

    # File name: program name with spaces turned into underscores, lower-cased
    csv_filename = f"{program['name'].replace(' ', '_').lower()}_reviews.csv"

    # One row per review in a single 'Review' column; the index is not written
    df = pd.DataFrame(data=reviews, columns=['Review'])
    df.to_csv(csv_filename, index=False)
    print(f"Scraping completed and saved to {csv_filename}")
