## 1 - Imports

In [1]:
import os
import errno
import requests
from bs4 import BeautifulSoup

## 2 - Setup

In [2]:
# Course listing to scrape. The value is used to look up the page container id
# in the div_ids dictionary and to name the output directory.
course_to_get = "part_A" # Needs to be a valid key for the div_ids dictionary

In [3]:
# Root of the course site; hrefs scraped from its pages are appended to this
root_url = "https://courses.maths.ox.ac.uk"

# Course page home url
url = f"{root_url}/overview/undergraduate"

# id of the page element that holds the course links, one entry per course listing
div_ids = {
    "part_A": "50879",
    "part_A_Maths_and_Phi": "50805",
    "part_B": "49210",
    "part_C": "49743",
    "MMath_phis": "44954",
}

# Directory where all the downloaded files are dumped.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
output_dir = f"P:/Desktop/maths/{course_to_get}"

## 3 - Scraping

In [4]:
# Download the course overview page. The timeout keeps the notebook from hanging
# on an unresponsive server, and raise_for_status() makes an HTTP error fail
# here instead of being parsed as if it were the overview page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# The container for all the course links, looked up by the id configured for the
# chosen course (the same lookup is used for part A, B and C).
course_page = soup.find(id=div_ids[course_to_get])
if course_page is None:
    # Fail with a clear message rather than an AttributeError in the next cell
    raise ValueError(
        f"No element with id {div_ids[course_to_get]!r} found on {url}; "
        "the page layout or the ids in div_ids may have changed"
    )

In [5]:
files = {} # Dict to store (module name,file list) pairs

for course_link in course_page.select("a"):
    href = course_link.get("href", "")

    # Course pages are linked with site-relative paths. Anything else (for
    # example an absolute link to a PDF guide) would produce a malformed URL
    # when appended to root_url, so skip it without making a request.
    if not href.startswith("/"):
        print("It's likely that ",course_link," isn't a link to a course page")
        continue

    try:
        materials_resp = requests.get(root_url + href + "/materials", timeout=30)
        materials_resp.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "not a course page";
        # programming errors and KeyboardInterrupt are no longer swallowed.
        print("It's likely that ",course_link," isn't a link to a course page")
        continue

    materials_soup = BeautifulSoup(materials_resp.text, "html.parser")

    # The first table on the materials page is taken to be the file listing
    public_files = materials_soup.find("table")
    if public_files is None:
        print("It's likely that ",course_link," isn't a link to a course page")
        continue

    # List of (link,name) tuples; href=True skips anchors without an href so a
    # single odd anchor does not discard the whole course.
    files[course_link.text] = [
        (root_url + file_link["href"], file_link.text)
        for file_link in public_files.find_all("a", href=True)
        if file_link["href"].startswith("/node/view_material")
    ]

It's likely that  <a href="https://courses.maths.ox.ac.uk/sites/default/files/Guide%20to%20Part%20A%20Courses_0.pdf">Guide to Part A courses</a>  isn't a link to a course page


## 4 - Save Locally

In [6]:
# We download the files only after we have finished scraping all content links.
# Each file is stored as <output_dir>/<course>/<file name>.pdf

# Characters that cannot safely appear inside a single path component (path
# separators plus the ones Windows rejects in file names) are replaced by "-".
unsafe_chars = str.maketrans({char: "-" for char in '<>:"/\\|?*'})

for course, course_files in files.items():
    course_dir = os.path.join(output_dir, course.translate(unsafe_chars))

    # file_url rather than url: reusing the name url would overwrite the
    # overview page url from the setup cell, so re-running the scraping cell
    # afterwards would fetch the wrong page.
    for file_url, name in course_files:
        file_path = os.path.join(course_dir, name.translate(unsafe_chars) + ".pdf")
        try:
            file_resp = requests.get(file_url, timeout=60)
            # Fail instead of saving an HTTP error page under a .pdf name
            file_resp.raise_for_status()

            # exist_ok makes this safe to repeat for every file and on re-runs
            os.makedirs(course_dir, exist_ok=True)
            with open(file_path, "wb") as f:
                f.write(file_resp.content)
        except (requests.RequestException, OSError) as exc:
            # Best effort: report the failure with its cause and keep going
            print(f'There was an issue getting {name} from {course}: {exc}')