# Downloading Notre Dame Observer PDFs

From the [University Archives](http://archives.nd.edu/digital/):
- "The Observer started providing news for the University of Notre Dame and Saint Mary's College starting in the fall of 1966, first as a weekly, then bi-weekly, and soon as a daily newspaper. Starting with the issue of October 10, 2009, the Notre Dame / Saint Mary's Observer appeared online (http://ndsmcobserver.com/)."
- [The Observer (student newspaper), 1966 - 2015](http://archives.nd.edu/Observer/)

This Jupyter Notebook includes code and comments that download all *Observer* PDFs and match issue titles to file names.

# Import Libraries, Load URL, and Create Beautiful Soup Object

In [None]:
# import libraries
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import csv

In [None]:
# load the Observer index page and parse it with BeautifulSoup
page = requests.get('http://archives.nd.edu/Observer/Observer.htm')

# fail fast on an HTTP error instead of parsing an error page
page.raise_for_status()

soup = BeautifulSoup(page.text, 'html.parser')

# isolate the first 'ul' element (the 'a' tags inside it are treated as
# volume links by the cells below)
url_names = soup.find('ul')

# find() returns None when nothing matches; report that clearly instead of
# failing later with an AttributeError on None
if url_names is None:
    raise ValueError("no 'ul' element found on the Observer index page")

# find all instances of 'a' tag inside that list
items = url_names.find_all('a')

items

# Get List of Volume Links and Titles

In [None]:
# volume urls: the archive's base address followed by each anchor's href
vol_url_list = [
    "http://archives.nd.edu/Observer/" + anchor.get('href')
    for anchor in items
]

# volume titles: the first child node of each anchor
vol_title_list = [anchor.contents[0] for anchor in items]

In [None]:
# show sample volume url: the first entry of vol_url_list
# (raises IndexError if the list is empty)
vol_url_list[0]

In [None]:
# show sample volume title: the first entry of vol_title_list
# (raises IndexError if the list is empty)
vol_title_list[0]

# Get List of Issue Links and Titles

In [None]:
# one entry per volume page: the 'li' elements found in that page's 'ol'
issue_items = []

# the 'ol' element found on each volume page
a_tags = []


# load each volume page, locate its 'ol' element, and record that element's
# 'li' children exactly once per volume
for url in vol_url_list:
    try:
        single_page = requests.get(url)
        single_page.raise_for_status()
    except requests.RequestException as err:
        # best effort: report the volume that could not be fetched and move on
        print("skipping", url, "-", err)
        continue

    volume_soup = BeautifulSoup(single_page.text, 'html.parser')
    issue_list = volume_soup.find('ol')

    # find() returns None when the page has no 'ol'; skip the page rather
    # than storing None, which would break handling of every later volume
    if issue_list is None:
        print("skipping", url, "- no 'ol' element found")
        continue

    a_tags.append(issue_list)
    issue_items.append(issue_list.find_all('li'))

In [None]:
# show sample issue_items value: the first collected group of 'li' elements
# (raises IndexError if nothing was collected)
issue_items[0]

In [None]:
# collected issue titles, one string per 'li' element
issue_titles = []

# each element of issue_items is a group of 'li' nodes; a title is the li's
# first child followed by the stripped text of its second child
for volume_issues in issue_items:
    for li in volume_issues:
        title_part, date_node = li.contents[0], li.contents[1]
        issue_titles.append(title_part + date_node.string.strip())

In [None]:
# keep only unique titles, preserving first-seen order
# (dict keys are unique and keep insertion order, so the result is the same
# on every run, unlike a set round-trip)
issue_titles = list(dict.fromkeys(issue_titles))

In [None]:
# show sample issue title: the first entry of issue_titles
# (raises IndexError if the list is empty)
issue_titles[0]

NOTE: Beginning in Volume 45 (2010-2011 academic year), the University Archives digital collection does not include separate black/white and color editions of each issue. This modified structure does not affect the code that extracts issue names, but it does require modified code to extract issue URLs and file names. The `bw_color_issue_links` list holds the links for Volume 45 onward.

In [None]:
# empty list for black and white issue links
bw_issue_links = []

# empty list for color issue links
color_issue_links = []

# empty list for bw/color issue links
bw_color_issue_links = []

# import re
import re

# For each issue 'li', child 3 is read as the black/white link and child 5 as
# the color link. When either lookup fails, child 3 is recorded as a combined
# bw/color link instead.
#
# NOTE: a black/white link appended before the color lookup fails stays in
# bw_issue_links, so such an issue ends up in both bw_issue_links and
# bw_color_issue_links. This is kept as-is; the later v45-v50 filtering of
# bw_issue_links appears to rely on it -- confirm before changing.
#
# NOTE(review): in "^../" the dots are unescaped, so the pattern removes any
# two leading characters followed by '/'. Presumably it is meant to strip a
# literal "../" -- confirm against the real hrefs before tightening it.
for issue in issue_items:
    for i in issue:
        try:
            bw_url = i.contents[3].get('href')
            # applied twice to strip up to two leading prefixes
            bw_url = re.sub("^../", "", bw_url)
            bw_url = re.sub("^../", "", bw_url)
            bw_issue_links.append(bw_url)
            color_url = i.contents[5].get('href')
            color_issue_links.append(re.sub("^../", "", color_url))
        except (IndexError, AttributeError, TypeError):
            # IndexError: the 'li' has too few children;
            # AttributeError: the child has no .get();
            # TypeError: the child has no href, so re.sub() received None
            bw_color = i.contents[3].get('href')
            bw_color_issue_links.append(re.sub("^../", "", bw_color))

In [None]:
# keep only unique links in each list, preserving first-seen order
# (dict keys are unique and keep insertion order, so the result is the same
# on every run, unlike a set round-trip)
bw_issue_links = list(dict.fromkeys(bw_issue_links))

color_issue_links = list(dict.fromkeys(color_issue_links))

bw_color_issue_links = list(dict.fromkeys(bw_color_issue_links))

In [None]:
# sample black/white issue link: the first entry of bw_issue_links
# (raises IndexError if the list is empty)
bw_issue_links[0]

In [None]:
# sample color issue link: the first entry of color_issue_links
# (raises IndexError if the list is empty)
color_issue_links[0]

In [None]:
# sample bw/color issue link: the first entry of bw_color_issue_links
# (raises IndexError if the list is empty)
bw_color_issue_links[0]

## Concatenate Full URLs

In [None]:
# full black/white issue urls; only links that mention v45 through v50 get one
full_bw_issue_links = []

# (marker, url prefix) pairs, tried in this order; the first marker found
# anywhere in a link decides which prefix is used
volume_prefixes = (
    ('v45', "http://archives.nd.edu/Observer/v45/"),
    ('v46', "http://archives.nd.edu/Observer/v46/"),
    ('v47', "http://archives.nd.edu/Observer/v47/"),
    ('v48', "http://archives.nd.edu/Observer/v48/"),
    ('v49', "http://archives.nd.edu/Observer/v49/"),
    ('v50', "http://archives.nd.edu/Observer/v50/"),
)

for href in bw_issue_links:
    for marker, prefix in volume_prefixes:
        if marker in href:
            full_bw_issue_links.append(prefix + href)
            break

# show sample black/white full issue link
full_bw_issue_links[0]

In [None]:
# full bw/color issue urls; only links that mention v45 through v50 get one
full_bw_color_issue_links = []

# (marker, url prefix) pairs, tried in this order; the first marker found
# anywhere in a link decides which prefix is used
volume_prefixes = (
    ('v45', "http://archives.nd.edu/Observer/v45/"),
    ('v46', "http://archives.nd.edu/Observer/v46/"),
    ('v47', "http://archives.nd.edu/Observer/v47/"),
    ('v48', "http://archives.nd.edu/Observer/v48/"),
    ('v49', "http://archives.nd.edu/Observer/v49/"),
    ('v50', "http://archives.nd.edu/Observer/v50/"),
)

for href in bw_color_issue_links:
    for marker, prefix in volume_prefixes:
        if marker in href:
            full_bw_color_issue_links.append(prefix + href)
            break

# show sample full issue link
full_bw_color_issue_links[0]

# Download PDFs from List of Full URLs

## Download Black/White Issues

In [None]:
# download with requests (imported at the top of the notebook); urllib3 was
# never imported, so the previous PoolManager call raised NameError.
# A Session reuses connections across the many requests.
print("downloading with requests")

# for loop that downloads PDF for each url in full_bw_issue_links
with requests.Session() as session:
    for link in full_bw_issue_links:
        try:
            response = session.get(link, timeout=60)
            response.raise_for_status()
        except requests.RequestException as err:
            # best effort: report the failed url and keep going, rather than
            # saving an error page under a .pdf name or aborting the batch
            print("skipping", link, "-", err)
            continue

        # save under the url's last path component, in the working directory
        filename = os.path.basename(link)
        with open(filename, 'wb') as fcont:
            fcont.write(response.content)

## Download Color Issues

In [None]:
# download with requests (imported at the top of the notebook); urllib3 was
# never imported, so the previous PoolManager call raised NameError.
# A Session reuses connections across the many requests.
print("downloading with requests")

# NOTE(review): color_issue_links holds hrefs as scraped (minus one leading
# prefix), not urls joined to a base address. If they are relative, every
# request below is reported as skipped -- confirm they are absolute urls.

# for loop that downloads PDF for each url in color_issue_links
with requests.Session() as session:
    for link in color_issue_links:
        try:
            response = session.get(link, timeout=60)
            response.raise_for_status()
        except requests.RequestException as err:
            # best effort: report the failed url and keep going, rather than
            # saving an error page under a .pdf name or aborting the batch
            print("skipping", link, "-", err)
            continue

        # save under the url's last path component, in the working directory
        filename = os.path.basename(link)
        with open(filename, 'wb') as fcont:
            fcont.write(response.content)

## Download Issues for Volumes 45-50

In [None]:
# download with requests (imported at the top of the notebook); urllib3 was
# never imported, so the previous PoolManager call raised NameError.
# A Session reuses connections across the many requests.
print("downloading with requests")

# for loop that downloads PDF for each url in full_bw_color_issue_links
with requests.Session() as session:
    for link in full_bw_color_issue_links:
        try:
            response = session.get(link, timeout=60)
            response.raise_for_status()
        except requests.RequestException as err:
            # best effort: report the failed url and keep going, rather than
            # saving an error page under a .pdf name or aborting the batch
            print("skipping", link, "-", err)
            continue

        # save under the url's last path component, in the working directory
        filename = os.path.basename(link)
        with open(filename, 'wb') as fcont:
            fcont.write(response.content)

# Matching File Names and Volume/Issue Info

In [None]:
# empty list for black and white issue file names
bw_issue_names = []

# empty list for black and white issue titles
bw_issue_titles = []

# empty list for color issue file names
color_issue_names = []

# empty list for color issue titles
color_issue_titles = []

# empty list for bw/color issue file names
bw_color_issue_names = []

# empty list for bw/color issue titles
bw_color_issue_titles = []

# import re
import re

# For each issue 'li', pair the file name of every edition with the issue
# title. Child 3 is read as the black/white link and child 5 as the color
# link; when either lookup fails, child 3 is recorded as a combined bw/color
# entry instead.
#
# NOTE: a black/white entry appended before the color lookup fails is kept,
# so such an issue appears in both the bw and the bw/color lists. This is
# left as-is -- confirm before changing.
#
# NOTE(review): in "^../" the dots are unescaped, so the pattern removes any
# two leading characters followed by '/'. Presumably it is meant to strip a
# literal "../" -- confirm against the real hrefs before tightening it.
for issue in issue_items:
    for i in issue:
        # the title is the same for every edition of an issue, so build it
        # once: first child followed by the stripped text of the second child
        issue_title = i.contents[0] + i.contents[1].string.strip()
        try:
            bw_url = i.contents[3].get('href')
            # applied twice to strip up to two leading prefixes
            bw_url = re.sub("^../", "", bw_url)
            bw_url = re.sub("^../", "", bw_url)
            bw_issue_names.append(bw_url)
            bw_issue_titles.append(issue_title)
            color_url = i.contents[5].get('href')
            color_issue_names.append(re.sub("^../", "", color_url))
            color_issue_titles.append(issue_title)
        except (IndexError, AttributeError, TypeError):
            # IndexError: the 'li' has too few children;
            # AttributeError: the child has no .get();
            # TypeError: the child has no href, so re.sub() received None
            bw_color = i.contents[3].get('href')
            bw_color_issue_names.append(re.sub("^../", "", bw_color))
            bw_color_issue_titles.append(issue_title)

## Black/White Issues

In [None]:
# import pandas
import pandas as pd

# two-column table pairing each black/white file name with its issue title,
# with exact duplicate rows removed
bwdf = pd.DataFrame(
    {'file_name': bw_issue_names, 'title': bw_issue_titles}
).drop_duplicates()

# show updated dataframe
bwdf

In [None]:
# write bwdf dataframe to a csv file in the working directory,
# leaving out the index column
bwdf.to_csv("observer_bw_file_name_master.csv", index=False)

## Color Issues

In [None]:
# import pandas
import pandas as pd

# two-column table pairing each color file name with its issue title,
# with exact duplicate rows removed
cdf = pd.DataFrame(
    {'file_name': color_issue_names, 'title': color_issue_titles}
).drop_duplicates()

# show updated dataframe
cdf

In [None]:
# write cdf dataframe to a csv file in the working directory,
# leaving out the index column
cdf.to_csv("observer_color_file_name_master.csv", index=False)

## Volumes 45-50

In [None]:
# import pandas
import pandas as pd

# two-column table pairing each bw/color file name with its issue title,
# with exact duplicate rows removed
df = pd.DataFrame(
    {'file_name': bw_color_issue_names, 'title': bw_color_issue_titles}
).drop_duplicates()

# show updated dataframe
df

In [None]:
# write df dataframe to a csv file in the working directory,
# leaving out the index column
df.to_csv("observer_bw_color_file_name_master.csv", index=False)