In [1]:
import pandas as pd
import numpy as np

import time
import re

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from selenium.webdriver import remote

from bs4 import BeautifulSoup

In [2]:
def get_enrollment_stats(driver_):
    """Parse the enrollment snapshot currently rendered in the browser.

    The page source held by ``driver_`` is parsed with BeautifulSoup and the
    text of the ``enrollment-info`` and ``stat-section`` blocks is split with
    regular expressions.

    Parameters
    ----------
    driver_ : object exposing ``page_source``
        Typically a Selenium WebDriver positioned on an enrollment page.

    Returns
    -------
    dict
        Keys ``'Course'``, ``'Semester'``, ``'Instructor'``, ``'Days After'``,
        ``'Phase'``, ``'Enrolled Count'``, ``'Enrolled Max'``,
        ``'Waitlisted Count'`` and ``'Waitlisted Max'``. The day and count
        fields are ``int``; the remaining fields are ``str``.

    Raises
    ------
    AttributeError
        If an expected element is missing (``find`` returned ``None``).
    IndexError
        If one of the text patterns does not match.
    ValueError
        If a captured number cannot be converted to ``int``.
    """
    indicator_classes = ["bt-indicator-red", "bt-indicator-orange", "bt-indicator-green"]
    page = BeautifulSoup(driver_.page_source, 'html.parser')

    # Header block: course name plus a "<semester> • <instructor>" line.
    header = page.find('div', class_='enrollment-info')
    course = header.find('div', class_='course').get_text()
    info_text = header.find('div', class_='info').get_text()
    semester, instructor = re.findall(r'(.*) • (.*)', info_text)[0]

    stats = page.find('div', class_='stat-section')

    # "<N> Days After <phase>" -> (N, phase)
    date_text = stats.find('div', class_='date').get_text()
    days_after, phase = re.findall(r'(.*) Days After (.*)', date_text)[0]

    # "<enrolled>/<max>" read from the coloured indicator span.
    enrolled_text = stats.find('div', class_="enrolled-stat").find('span', class_=indicator_classes).get_text()
    enrolled, enrolled_max = re.findall(r'(\d*)/(\d*)', enrolled_text)[0]

    # "<waitlisted>/<max>", same layout as the enrolled figure.
    waitlisted_text = stats.find('div', class_="waitlisted-stat").find('span', class_=indicator_classes).get_text()
    waitlisted, waitlisted_max = (int(part) for part in re.findall(r'(\d*)/(\d*)', waitlisted_text)[0])

    return {
        'Course': course,
        'Semester': semester,
        'Instructor': instructor,
        'Days After': int(days_after),
        'Phase': phase,
        'Enrolled Count': int(enrolled),
        'Enrolled Max': int(enrolled_max),
        'Waitlisted Count': waitlisted,
        'Waitlisted Max': waitlisted_max,
    }

In [3]:
# Read the pixel width of the chart's plot area from its SVG clip path.
def get_chart_width(driver):
    """Return the width of the ``recharts1-clip`` clip-path rectangle as an int.

    Parameters
    ----------
    driver : object exposing ``page_source``
        Typically a Selenium WebDriver with the chart already rendered.

    Returns
    -------
    int
        The ``width`` attribute of the ``<rect>`` inside the clip path,
        truncated toward zero when the attribute is fractional.

    Raises
    ------
    ValueError
        If the clip path, its ``<rect>`` or the ``width`` attribute is
        missing, or the attribute is not numeric.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # html.parser lower-cases tag names, hence 'clippath' rather than 'clipPath'.
    clip_path = soup.find('clippath', id='recharts1-clip')
    if clip_path is None:
        raise ValueError("chart clip path 'recharts1-clip' not found in page source")

    rect = clip_path.find('rect')
    if rect is None:
        raise ValueError("clip path 'recharts1-clip' contains no <rect> element")

    width_attr = rect.get('width')
    if width_attr is None:
        raise ValueError("clip path <rect> has no 'width' attribute")

    # SVG widths may be fractional (e.g. "730.5"); int() alone rejects those.
    return int(float(width_attr))

In [4]:
def scraping_pipeline(url, px_per_step=3, load_wait_s=2):
    """Sweep the mouse across the enrollment chart and collect each distinct reading.

    A Chrome session is opened on ``url``. The pointer is moved to an offset of
    minus half the chart width from the clip-path rectangle, then stepped right
    ``px_per_step`` pixels at a time with a click on each step. After every
    step the page is parsed with ``get_enrollment_stats``; a reading is kept
    only when it differs from the one taken on the previous step. The browser
    is always closed before returning, including when an error occurs.

    Parameters
    ----------
    url : str
        Address of the enrollment page to scrape.
    px_per_step : int, optional
        Horizontal distance in pixels moved per step. Must be positive.
        Defaults to 3.
    load_wait_s : float, optional
        Seconds to pause after loading the page before the chart is read.
        Defaults to 2.

    Returns
    -------
    pandas.DataFrame
        One row per distinct consecutive reading, with columns ``Course``,
        ``Semester``, ``Instructor``, ``Days After``, ``Phase``,
        ``Enrolled Count``, ``Enrolled Max``, ``Waitlisted Count`` and
        ``Waitlisted Max``. Empty (columns only) when no step is taken.

    Raises
    ------
    ValueError
        If ``px_per_step`` is not positive.
    """
    if px_per_step <= 0:
        raise ValueError("px_per_step must be a positive number of pixels")

    columns = ['Course', 'Semester', 'Instructor',
               'Days After', 'Phase',
               'Enrolled Count', 'Enrolled Max',
               'Waitlisted Count', 'Waitlisted Max']

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Fixed pause so the chart has time to render before it is measured.
        time.sleep(load_wait_s)

        chart_width = get_chart_width(driver)

        # Park the pointer half a chart-width to the left of the rectangle's
        # reference point. NOTE(review): presumably this is the chart's left
        # edge, since recent Selenium releases measure the offset from the
        # element centre - confirm against the installed version.
        canvas = driver.find_element(By.ID, "recharts1-clip").find_element(By.TAG_NAME, 'rect')
        ActionChains(driver) \
            .move_to_element_with_offset(canvas, -chart_width / 2, 0) \
            .perform()

        # Collect plain records and build the frame once at the end rather
        # than growing a DataFrame one row at a time.
        records = []
        previous = None
        for _ in range(int(chart_width / px_per_step)):
            ActionChains(driver) \
                .move_by_offset(px_per_step, 0) \
                .click() \
                .perform()
            current = get_enrollment_stats(driver)
            # Adjacent pixels usually show the same data point; keep changes only.
            if current != previous:
                records.append(current)
            previous = current
    finally:
        # Always release the browser, even if scraping failed part-way.
        driver.quit()

    return pd.DataFrame(records, columns=columns)

In [5]:
# Enrollment page whose chart is scraped into `e1`.
ENROLLMENT_URL = "https://berkeleytime.com/enrollment/1-20674-fall-2024-512736"
e1 = scraping_pipeline(ENROLLMENT_URL)