# Downloading Notre Dame Alumnus PDFs

From the [University Archives](http://archives.nd.edu/digital/):
- "The Alumnus, published by the Alumni Association from January of 1923 until December of 1971, provided news and feature articles of interest to Notre Dame graduates. Notre Dame Magazine replaced it starting in 1972."
- [Alumnus Digital Collection](http://archives.nd.edu/Alumnus/)

This Jupyter Notebook includes code and comments that download all PDFs of *The Alumnus* and match volume and issue names to file names.

# Import Libraries, Load URL, and Create Beautiful Soup Object

In [None]:
# import libraries
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import csv

In [None]:
# load the index page that links to every volume
# (the timeout keeps a stalled connection from hanging the notebook)
page = requests.get('http://archives.nd.edu/Alumnus/Alumnus.htm', timeout=30)

# stop on an HTTP error (4xx/5xx) instead of parsing an error page
page.raise_for_status()

# create beautifulsoup object from the returned HTML
soup = BeautifulSoup(page.text, 'html.parser')

# isolate the first 'ol' element of the page
url_names = soup.find('ol')

# find all 'a' tags inside that 'ol'
items = url_names.find_all('a')

items

# Get List of Volume Links and Volume Names

In [None]:
# build two parallel lists from the 'a' tags in items:
#   href_list      -> the href attribute of each tag
#   file_name_list -> the first child node of each tag (presumably the link text)
href_list = [anchor.get('href') for anchor in items]

file_name_list = [anchor.contents[0] for anchor in items]

In [None]:
# display the volume hrefs collected from the 'a' tags of the index page
href_list

In [None]:
# display the volume names (first child node of each volume link)
file_name_list

# Get List of Issue Links and Names

In [None]:
# create empty lists for issue links and names
issue_href_list = []
issue_name_list = []

# address of the index page the volume hrefs were scraped from; used as the
# base when resolving them
index_url = 'http://archives.nd.edu/Alumnus/Alumnus.htm'

# visit every volume page and collect the href and title of each issue link
for volume_href in href_list:
    # urljoin returns an absolute href unchanged and resolves a relative one
    # against index_url, so both forms can be fetched
    volume_url = urljoin(index_url, volume_href)
    issue_page = requests.get(volume_url, timeout=30)
    # stop on an HTTP error (4xx/5xx) instead of parsing an error page
    issue_page.raise_for_status()

    issue_soup = BeautifulSoup(issue_page.text, 'html.parser')
    issue_url_names = issue_soup.find('ol')
    issue_items = issue_url_names.find_all('a')

    # the inner loop variable has its own name so that it does not overwrite
    # the outer loop variable
    for issue_link in issue_items:
        issue_href_list.append(issue_link.get('href'))
        # the title is the first child of the link's second child node
        # NOTE(review): this depends on the markup of the volume pages - confirm
        issue_title = issue_link.contents[1].contents[0]
        issue_name_list.append("Notre Dame Alumnus " + issue_title)

In [None]:
# display the issue hrefs scraped from the volume pages (used later as the end of each url)
issue_href_list

In [None]:
# display the issue names, each prefixed with "Notre Dame Alumnus "
issue_name_list

In [None]:
# build the full url of every issue from issue_href_list

# directory that holds one sub-directory per volume
base_url = "http://www.archives.nd.edu/Alumnus/"

# the volume prefixes that are handled: VOL_0001 through VOL_0049
volume_prefixes = tuple("VOL_{:04d}".format(number) for number in range(1, 50))

full_href_list = []

for issue_href in issue_href_list:
    # hrefs that do not start with one of the known prefixes are skipped
    if issue_href.startswith(volume_prefixes):
        # every prefix is 8 characters long ("VOL_00NN") and doubles as the
        # name of the volume's sub-directory
        volume_dir = issue_href[:8]
        full_href_list.append(base_url + volume_dir + "/" + issue_href)

In [None]:
# display the full issue urls that will be downloaded
full_href_list

# Download PDFs from List of Full URLs

In [None]:
# import libraries
import urllib3
import os

# configure urllib
http = urllib3.PoolManager()
print("downloading with urllib")

# for loop that downloads the file for each url in full_href_list
for url in full_href_list:
    r = http.request('GET', url)
    # skip failed requests so that an error page is never saved as an issue file
    if r.status != 200:
        print("skipping " + url + " (HTTP status " + str(r.status) + ")")
        continue
    # save into the current working directory under the last segment of the url
    filename = os.path.basename(url)
    with open(filename, 'wb') as fcont:
        fcont.write(r.data)

# Matching File Names and Volume/Issue Info

In [None]:
# import pandas
import pandas as pd

# build the lookup table in one step: one row per issue, pairing the
# file name (issue href) with its document title
df = pd.DataFrame({
    'file_names': issue_href_list,
    'doc_title': issue_name_list,
})

# output dataframe
df

In [None]:
# write dataframe to a csv file in the current working directory, without the row index
df.to_csv('alumnus_file_name_master.csv', index=False)