### Scraping Chronicling America

In [None]:
import csv
import json
import os
import sys

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By


Selenium/Chromedriver/Chrome: https://googlechromelabs.github.io/chrome-for-testing/


We'll use the list view instead of the gallery view (so we can get 50 results
at a time and avoid unnecessary complications with the JavaScript used
for handling thumbnails). The URL can be obtained by doing a manual search, copying the resulting URL, and splitting it into a base:

In [None]:
# Search-results endpoint (list view); the query string built from the
# parameters dictionary is appended to it when a results URL is generated.
base_url = 'https://chroniclingamerica.loc.gov/search/pages/results/list'

and a dictionary of parameters:

In [None]:
# Query-string parameters for the search, listed in the order in which they
# are written to the URL.
parameters = dict(
    date1=1789,  # default: 1789
    rows=50,  # default: 20
    searchType='basic',  # default: basic
    state='',  # default: '' (empty)
    date2=1963,  # default: 1963
    proxtext='',  # filled in with the search terms before each search
    y=12,  # default: 12
    x=13,  # default: 13
    dateFilterType='yearRange',  # default: yearRange
    page=1,  # replaced with the requested page whenever a results URL is built
    sort='relevance',  # default: relevance
)

I also assumed that you'll want to do multiple searches, so
search_terms is a list of lists where each sublist is a collection
of terms to search for:

In [None]:
# One inner list per search; the terms of an inner list are combined into a
# single query. Add further lists (e.g. ['foo', 'bar']) to run more searches.
search_terms = [['thanksgiving', 'indian']]

Now we set up the Selenium driver **(adjust path as necessary)**:

In [None]:
# Service() is created without an explicit chromedriver path; supply one here
# if the driver executable is not found automatically.
service = Service()
# Default Chrome options; add arguments to this object as needed before the
# browser is started.
options = webdriver.ChromeOptions()
# Start the Chrome session that the scraping functions and the main loop share.
driver = webdriver.Chrome(service=service, options=options)


###  Ancillary Functions

We define a couple of functions to which we can pass raw data and get processed results when we loop over pages:

In [None]:
def get_results_page(page):
    """Build the results-list URL for the requested page number.

    The page number is stored in the module-level ``parameters`` dictionary
    (so the dictionary is modified by this call), then every entry is written
    as ``key=value`` and the pairs are joined with ``&`` after ``base_url/?``.
    Values are inserted exactly as stored; no URL-encoding is applied.
    """
    parameters['page'] = page  # record the requested page in the shared dict
    # one "key=value" chunk per parameter, in dictionary order
    query_string = '&'.join(f'{name}={setting}' for name, setting in parameters.items())
    return f'{base_url}/?{query_string}'

In [None]:
def scrape_text(url):
    """Scrape the OCR text page belonging to one search-result link.

    Args:
        url: Link to an individual result as collected from the results
            list. These links have the format
            ``https://chroniclingamerica.loc.gov/lccn/2017270500/1915-01-01/ed-1/seq-17/;words=[...]?date1=1789&rows[...]``.

    Returns:
        A dictionary with the keys ``title``, ``credit_line``, ``text``,
        ``purl`` (the part of ``url`` before the first ``;``) and
        ``raw_text_url`` (``purl`` + ``ocr.txt``).

    Raises:
        IndexError: If the loaded page lacks one of the expected elements
            (second ``h1`` / first ``h3`` inside ``div#head_nav``, first ``p``).
    """
    # Everything before the first ';' (right before "words=") is the page's
    # base URL. Splitting only once keeps this working when the remainder
    # contains further semicolons or when there is no ';' at all.
    url_base = url.split(';', 1)[0]
    driver.get(f'{url_base}ocr/')  # the text page is just the base url + ocr/

    # The find_elements_by_* helpers were removed in Selenium 4.3, so we use
    # find_elements(By.CSS_SELECTOR, ...) like the rest of this notebook.
    # These are a few basic fields; add more as necessary.
    return {
        'title': driver.find_elements(By.CSS_SELECTOR, 'div[id=head_nav] h1')[1].text,
        'credit_line': driver.find_elements(By.CSS_SELECTOR, 'div[id=head_nav] h3')[0].text,
        'text': driver.find_elements(By.CSS_SELECTOR, 'p')[0].text,
        'purl': url_base,
        'raw_text_url': f'{url_base}ocr.txt',
    }

### Main Loop

Runs once for each set of search_terms:

In [None]:
# Runs one complete search -> scrape -> save cycle per entry in search_terms.
for search_terms_set in search_terms:
    # lists for holding results links and content
    newspaper_links = []
    newspaper_text = []

    # update parameters dictionary with search terms
    parameters.update({
        'proxtext': '+'.join(search_terms_set)
    })

    # LOOP 1: Collect URLs from each results page
    # first we get the total number of result pages
    driver.get(get_results_page(1))  # we go to the first page
    # look at the pagination in the top-left corner, it is a bunch of <a> tags
    # nested inside a <span> with class = pagination
    # we get the text from the penultimate one in the list
    # (the last one is the [next] button)
    pagination_links = driver.find_elements(By.CSS_SELECTOR, 'span[class=pagination] a')
    # with fewer than two links there is no "last page" entry to read, so we
    # assume the results fit on a single page instead of failing on [-2]
    result_page_count = pagination_links[-2].text if len(pagination_links) >= 2 else '1'
    # now we iterate over the results pages and capture all the links
    # to individual results, 50 at a time
    # (range() excludes its upper bound, hence the + 1 to include the last page)
    for page in range(1, int(result_page_count) + 1):
        driver.get(get_results_page(page))
        links = [i.get_attribute('href') for i in driver.find_elements(By.CSS_SELECTOR, '.results_list>li>a')]
        newspaper_links += links

    # LOOP 2: Iterate through links and get content
    for url in newspaper_links:
        newspaper_text.append(scrape_text(url))  # for each link we call the scraping function

    # SAVE THE RESULTS:
    file_path = './'  # add a path as necessary
    file_name = f'ca_scraping_{"_".join(search_terms_set)}'  # this makes each results set unique based on search terms
    txt_folder = f'txt_{"_".join(search_terms_set)}'  # folder for individual text files

    # save results to JSON -> compact format that saves all the data to a single file
    with open(f'{file_path}{file_name}.json', 'w', encoding='utf-8') as file:
        file.write(json.dumps(newspaper_text))

    # save results to TXT and CSV -> more flexible format, sp. for text analysis
    # saves an index as a .csv file and the text as independent .txt files in a folder
    csv_list = []  # list to hold CSV data

    # try creating the folder for the txt files
    try:
        os.mkdir(f'{file_path}{txt_folder}')  # noqa: PTH102
    # if it already exists, offer to overwrite the existing files
    # and wait for confirmation just in case we don't want to do that!
    except FileExistsError:
        response = ''
        while response not in ['y', 'n']:
            response = input(f'The folder {txt_folder} already exists. Overwite contents [Y/N]?').lower()
        if response == 'n':
            sys.exit()  # if we don't want to overwrite the files, we exit the script

    for index, result in enumerate(newspaper_text, start=1):
        text = result.pop('text')  # remove the text from the result dictionary
        result['filename'] = f'{index}.txt'  # add the name of the text file for reference
        csv_list.append(result)  # add it to the list we'll turn into a csv

        # write the text file to disk; an explicit encoding keeps non-ASCII
        # characters in the OCR text from failing on platforms whose default
        # encoding cannot represent them
        with open(f'{file_path}{txt_folder}/{index}.txt', 'w', encoding='utf-8') as file:
            file.write(text)

    # finally, we write the csv
    # newline='' is what the csv module expects, so that it controls the line
    # endings itself (otherwise extra blank rows can appear on Windows)
    with open(f'{file_path}{file_name}.csv', 'w', newline='', encoding='utf-8') as file:
        if csv_list:  # a search without results has no first row to take headers from
            headers = csv_list[0].keys()  # we get the "headers" by extracting the keys to the first dictionary
            dict_writer = csv.DictWriter(file, headers)  # we create a csv writers with the headers
            dict_writer.writeheader()  # we write the headers first
            dict_writer.writerows(csv_list)  # and then we write the rows

In [None]:
# Lastly we clean up after ourselves: quit the driver started in the setup cell.
driver.quit()