In [None]:
import requests
import bs4
import loguru
import urllib.parse
import typing

In [None]:
# See: https://stackoverflow.com/a/38020041/408734
def uri_validator(x):
    """Tell whether `x` looks like a fetchable URL.

    A value is accepted when it parses with `urllib.parse.urlparse`, its
    scheme is one of ``file``, ``http`` or ``https``, and it has a non-empty
    network location.

    Args:
        x: Candidate URL (normally a string).

    Returns:
        bool: True when `x` passes the checks above; False otherwise,
        including when `x` cannot be parsed at all.
    """
    try:
        result = urllib.parse.urlparse(x)
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError on malformed input (e.g. a broken IPv6
        # literal) and AttributeError/TypeError on non-string values. Only
        # those are treated as "invalid"; anything else (KeyboardInterrupt,
        # SystemExit, ...) must propagate instead of being swallowed.
        return False

    return all([
        result.scheme,
        result.scheme in ["file", "http", "https"],
        result.netloc,
    ])

def fetch_page_as_soup(url, silent_fail=True):
    """Download `url` and return its body parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch; it must pass `uri_validator`.
        silent_fail: When true (the default), an exception raised while
            requesting or parsing is logged and `None` is returned. When
            false, that exception is re-raised to the caller.

    Returns:
        The parsed `bs4.BeautifulSoup` object, or `None` when the URL is
        missing or invalid, the response status is not OK, or an exception
        was swallowed because `silent_fail` is true. Validation failures and
        non-OK statuses return `None` regardless of `silent_fail`.

    Raises:
        Exception: Whatever `requests.get` or `bs4.BeautifulSoup` raised,
            only when `silent_fail` is false.
    """

    # some light validation

    if url is None:
        loguru.logger.error("no URL provided!")
        return

    if not uri_validator(url):
        loguru.logger.error("invalid URL provided!")
        return

    # make the request
    # NOTE(review): no timeout is passed, so a stalled server can block here.

    try:
        req = requests.get(url)
    except Exception as exc:
        # if not silent, re-raise with the original traceback intact
        if not silent_fail:
            raise

        # otherwise just log and return None; loguru formats messages with
        # str.format, so the URL needs an explicit "{}" placeholder to show up
        loguru.logger.error("when fetching URL: {}", url)
        loguru.logger.exception(exc)
        return

    # check request is valid

    if not req.ok:
        loguru.logger.warning("status code: {}", req.status_code)
        return

    # return beautiful soup

    content = req.content

    try:
        soup = bs4.BeautifulSoup(content, "html.parser")
    except Exception as exc:
        # if not silent, re-raise with the original traceback intact
        if not silent_fail:
            raise

        # otherwise just log and return None
        loguru.logger.error("when parsing content of: {}", url)
        loguru.logger.exception(exc)
        return

    return soup

In [None]:
# Try the helper on the live course-schedule page (this issues a network request).
fetch_page_as_soup("https://www.cs.princeton.edu/courses/schedule")

In [None]:
# Fetch the schedule page body directly; the HTTP status is not checked here.
raw = requests.get("https://www.cs.princeton.edu/courses/schedule").content

In [None]:
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns), so the resulting tree can differ between machines.
soup = bs4.BeautifulSoup(raw, "html.parser")

In [None]:
# Locate the <select> element carrying the "semester-select" class.
semester_select = soup.find("select", {"class": "semester-select"})

In [None]:
# One (value attribute, visible text) pair per <option> of the semester select.
semesters = [
    (option["value"], option.text)
    for option in semester_select.find_all("option")
]

In [None]:
semesters

In [None]:
# Fetch the body of the "fall18" schedule page; the HTTP status is not checked here.
raw2 = requests.get("https://www.cs.princeton.edu/courses/schedule/fall18").content

In [None]:
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns), so the resulting tree can differ between machines.
soup2 = bs4.BeautifulSoup(raw2, "html.parser")

In [None]:
# Locate the <table> whose id is "course-schedules".
schedule_table = soup2.find("table", {"id": "course-schedules"})

In [None]:
# Column heading as it appears in the schedule table -> key used in course records.
FIELDS = {
    "Num": "course",
    "Name": "title",
    "Professor(s)": "people",
    "Classes": "hours",
    "Room": "room",
}


class CosCourseInstance(typing.TypedDict, total=False):
    """Typed shape of one course record; every key is optional (total=False)."""

    course: str
    title: str
    term: str
    people: typing.List[str]
    hours: str
    room: str

# One entry per <th> of the schedule table header: its column position, its
# stripped heading text, and the record key FIELDS assigns to that heading
# (None when the heading is not listed in FIELDS).
course_fields_data = [
    dict(
        id=position,
        original=header.text.strip(),
        renamed=FIELDS.get(header.text.strip()),
    )
    for position, header in enumerate(schedule_table.find("thead").find_all("th"))
]

# Column position -> record key.
course_fields_id_to_caption = dict(
    (field["id"], field["renamed"]) for field in course_fields_data
)

# Original heading text -> record key.
course_fields_original_to_caption = dict(
    (field["original"], field["renamed"]) for field in course_fields_data
)

def process_course_tr(tr_tag):
    """Convert one schedule-table row into a course record.

    Each <td> is matched by position to a record key through
    `course_fields_id_to_caption`; its stripped text becomes the value.
    Cells whose column has no known key (heading not in FIELDS, or more
    cells than header columns) are ignored instead of raising or producing
    a `None` key.

    Args:
        tr_tag: The row element; only its `find_all("td")` result is used.

    Returns:
        dict: Record keyed by caption. When a "people" column is present its
        value is a list of names split on commas, with blanks dropped.
    """
    course = {}
    for i, td_tag in enumerate(tr_tag.find_all("td")):
        caption = course_fields_id_to_caption.get(i)
        if caption is None:
            continue
        course[caption] = td_tag.text.strip()

    # Split the professor list only when the column exists; an empty cell
    # yields [] rather than [""].
    if "people" in course:
        course["people"] = [
            name.strip()
            for name in course["people"].split(",")
            if name.strip()
        ]

    return course

In [None]:
CosCourseInstance({})

In [None]:
course_fields_original_to_caption

In [None]:
# Build one course record per <tr> found in the schedule table body.
courses = list(map(
    process_course_tr,
    schedule_table.find("tbody").find_all("tr"),
))

In [None]:
courses

In [2]:
import sys
sys.path.append("./src")

In [3]:

import urllib.parse

import princeton_scraper_cos_courses.parsing
import princeton_scraper_cos_courses.helpers

# Base URL onto which _build_schedule_url joins a term identifier.
PRINCETON_CS_SCHEDULE_BASE = "https://www.cs.princeton.edu/courses/schedule/"

# Hard-coded fields



def _build_schedule_url(term: str) -> str:
    """Return the URL obtained by joining `term` onto PRINCETON_CS_SCHEDULE_BASE."""
    return urllib.parse.urljoin(PRINCETON_CS_SCHEDULE_BASE, term)

In [5]:
# Scrape the schedule page of every known term and store the parsed courses
# under the term's "internal" identifier.
data = {}

for term in princeton_scraper_cos_courses.parsing.get_all_terms():
    url = _build_schedule_url(term["internal"])
    soup = princeton_scraper_cos_courses.helpers.fetch_page_as_soup(url=url)

    # NOTE(review): presumably the helper returns None when the fetch fails
    # (as the notebook-local version does) — confirm against helpers.py.
    schedule_table = None
    if soup is not None:
        schedule_table = soup.find(
            name="table",
            attrs={"id": "course-schedules"}
        )

    schedule_body = None
    if schedule_table is not None:
        schedule_body = schedule_table.find("tbody")

    # One bad page should not abort the whole multi-term scrape: report the
    # term and move on instead of failing with AttributeError on None.
    if schedule_body is None:
        print(f"skipping term {term['internal']!r}: no course table found at {url}")
        continue

    courses = [
        princeton_scraper_cos_courses.parsing.parse_cs_course(
            tag=tr_tag,
            term=term,
        )
        for tr_tag in schedule_body.find_all("tr")
    ]
    data[term["internal"]] = courses

In [6]:
len(data)

54

In [9]:
data

{'fall21': [{'course': 'COS 109',
   'title': 'Computers in Our World',
   'people': ['B. Kernighan'],
   'hours': 'MW 1:30-2:50',
   'room': '',
   'term': {'year': 2021,
    'period': 'Fall',
    'term': 'Fall 2021',
    'internal': 'fall21',
    'sortkey': '2021_1'}},
  {'course': 'COS 126',
   'title': 'Computer Science: An Interdisciplinary Approach',
   'people': ['R. Sedgewick', 'A. Kaplan', 'J. Lumbroso', 'S. Nam Liao'],
   'hours': 'MW 12:30-1:20',
   'room': '',
   'term': {'year': 2021,
    'period': 'Fall',
    'term': 'Fall 2021',
    'internal': 'fall21',
    'sortkey': '2021_1'}},
  {'course': 'COS 217',
   'title': 'Introduction to Programming Systems',
   'people': ['S. Rusinkiewicz'],
   'hours': 'TTh 10:00-10:50',
   'room': '',
   'term': {'year': 2021,
    'period': 'Fall',
    'term': 'Fall 2021',
    'internal': 'fall21',
    'sortkey': '2021_1'}},
  {'course': 'COS 226',
   'title': 'Algorithms and Data Structures',
   'people': ['K. Wayne', 'D. Leyzberg'],
   '