# Chronicling America example

This notebook provides sample code for calling the Chronicling America APIs.

For reference, see the [Chronicling America API documentation](https://chroniclingamerica.loc.gov/about/api/). 

In [None]:
import requests

from collections import Counter

Below are two example functions for loading issue metadata and OCR text files from the Chronicling America API.

In [None]:
def get_text(url, timeout=30):
    """Fetch the OCR text for a page from its Chronicling America JSON record.

    Makes two HTTP GET requests: one for the JSON document at ``url`` and
    one for the address stored under that document's ``"text"`` key.

    Args:
        url: Address of a JSON document that contains a ``"text"`` entry.
        timeout: Seconds to wait on each request before giving up. Without
            an explicit timeout, ``requests`` waits indefinitely on a
            stalled server, which would hang the notebook.

    Returns:
        The body of the second response, as a string.

    Raises:
        Exception: If either response has a status code other than 200.
        KeyError: If the JSON document has no ``"text"`` entry.
    """
    rsp = requests.get(url, timeout=timeout)
    if rsp.status_code != 200:
        raise Exception(f"Unable to load {url}")
    text_url = rsp.json()["text"]

    txt_rsp = requests.get(text_url, timeout=timeout)
    if txt_rsp.status_code != 200:
        raise Exception(f"Unable to load {text_url}")
    return txt_rsp.text

In [None]:
def get_issue(_id, timeout=30):
    """Load the JSON metadata for a newspaper issue.

    Args:
        _id: Either a full ``http://`` / ``https://`` URL of the JSON
            document, or a path such as ``"sn87093407/1920-05-21/ed-1"``
            that is placed under ``https://chroniclingamerica.loc.gov/lccn/``
            with a ``.json`` suffix.
        timeout: Seconds to wait on the request before giving up. Without
            an explicit timeout, ``requests`` waits indefinitely on a
            stalled server.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the response status code is not 200.
    """
    # Accept both URL schemes; previously an "http://" URL fell through to
    # the else branch and was wrapped into a bogus lccn URL.
    if _id.startswith(("http://", "https://")):
        url = _id
    else:
        url = f"https://chroniclingamerica.loc.gov/lccn/{_id}.json"

    rsp = requests.get(url, timeout=timeout)
    if rsp.status_code != 200:
        raise Exception(f"Unable to load {url}")
    return rsp.json()
    

Load a single newspaper issue and run two assertion checks to see if we are getting the data we expect.

In [None]:
# A non-URL id is placed under .../lccn/ with a .json suffix by get_issue,
# so this performs one live HTTP request.
issue_1 = get_issue("sn87093407/1920-05-21/ed-1")

In [None]:
# `assert` is a statement, not a function: written without call-style
# parentheses and with a message so a failure shows the value received.
assert issue_1["date_issued"] == "1920-05-21", (
    f"unexpected date_issued: {issue_1['date_issued']!r}"
)

In [None]:
# `assert` is a statement, not a function: written without call-style
# parentheses and with a message so a failure shows the count received.
assert len(issue_1["pages"]) == 24, (
    f"expected 24 pages, got {len(issue_1['pages'])}"
)

Perform a very basic initial analysis - count the most frequent words in the first two pages of the issue. 

In [None]:
# Number of leading pages to analyse; get_text makes two HTTP requests
# per page, so keep this small.
PAGE_LIMIT = 2

word_count = Counter()

# Slicing bounds the loop directly, replacing a manual counter and break.
for pg in issue_1["pages"][:PAGE_LIMIT]:
    page_text = get_text(pg["url"])
    # str.split() with no argument splits on any run of whitespace,
    # newlines included, so the text need not be split into lines first.
    for token in page_text.split():
        clean_token = token.strip(" ,.").lower()
        # Tokens made up only of stripped punctuation become empty; skip them.
        if clean_token:
            word_count[clean_token] += 1

In [None]:
# Print the 25 most frequent tokens, left-aligned in a 25-character column
# with the count after a single space.
for word, count in word_count.most_common(25):
    print(f"{word:<25} {count}")