In [1]:
# imports
import requests
import json
import math
import pandas as pd
import spacy

In [2]:
# initial search
url = 'https://chroniclingamerica.loc.gov/search/pages/results/?state=New+York&date1=1770&date2=1865&proxtext=Ellis+Island&x=20&y=8&dateFilterType=yearRange&rows=20&searchType=basic&format=json'
response = requests.get(url)
raw = response.text
results = json.loads(raw)

In [4]:
results.keys()

dict_keys(['totalItems', 'endIndex', 'startIndex', 'itemsPerPage', 'items'])

In [5]:
print(results['items'][0])

{'sequence': 6, 'county': ['New York'], 'edition': None, 'frequency': 'Daily', 'id': '/lccn/sn83030313/1862-01-08/ed-1/seq-6/', 'subject': ['New York (N.Y.)--Newspapers.', 'New York (State)--New York County.--fast--(OCoLC)fst01234953', 'New York (State)--New York.--fast--(OCoLC)fst01204333', 'New York County (N.Y.)--Newspapers.'], 'city': ['New York'], 'date': '18620108', 'title': 'The New York herald. [volume]', 'end_year': 1920, 'note': ['Also issued on microfilm from the Library of Congress, Photoduplication Service.', 'Archived issues are available in digital format from the Library of Congress Chronicling America online collection.', 'Issues for Sept. 22, 1840-Jan. 31, 1920 called also whole no. 1566-30,476.', 'Merged with Sun (New York, N.Y. : 1916) to form Sun and the New York herald.', 'Steamer eds.: Herald for Europe, 1846-<1848>, and: California herald (New York, N.Y.), 1848-<1849>, and: New York herald (New York, N.Y. : California ed.), <1852-1858>, and: New York herald (New

In [6]:
data = []
start_date = '1888'
end_date = '1920'
search_term = 'Ellis+Island'
state = 'New+York'

In [7]:
for i in range(1, 11):  # for sake of time I'm doing only 10, you will want to put total_pages+1
    url = (f'https://chroniclingamerica.loc.gov/search/pages/results/?state={state}&date1={start_date}'
           f'&date2={end_date}&proxtext={search_term}&x=16&y=8&dateFilterType=yearRange&rows=20'
           f'&searchType=basic&format=json&page={i}')  # f-string
    response = requests.get(url)
    raw = response.text
    print(f'page {i} status code:', response.status_code)  # checking for errors
    results = json.loads(raw)
    items_ = results['items']
    for item_ in items_:
        row_data = {}
        try:
          row_data['title'] = item_['title_normal']
        except:
          row_data['title'] = "none"
        try:
          row_data['city'] = item_['city']
        except:
          row_data['city'] = "none"
        try:
          row_data['date'] = item_['date']
        except:
          row_data['date'] = "none"
        try:
          row_data['raw_text'] = item_['ocr_eng']
        except:
          row_data['raw_text'] = 'none'
    data.append(row_data)

page 1 status code: 200
page 2 status code: 200
page 3 status code: 200
page 4 status code: 200
page 5 status code: 200
page 6 status code: 200
page 7 status code: 200
page 8 status code: 200
page 9 status code: 200
page 10 status code: 200


In [8]:
df = pd.DataFrame.from_dict(data)

In [9]:
df.head()

Unnamed: 0,title,city,date,raw_text
0,new-york tribune.,[New York],19200815,A Wave of Barbarism Threatens to Inundate Euro...
1,new-york tribune.,[New York],19001014,KEW IMMIGRANT STATION.\nMAIN BUILDING ON ELLIS...
2,new-york tribune.,[New York],19191125,Occasional adjustment of\nyour eyeglasses is n...
3,new-york tribune.,[New York],19200721,Travis Will Not\nEnter the Race\nFor Governor\...
4,new-york tribune.,[New York],19010721,DEPENDS UPON ' CROKER.\nSPECULATION AS TO THE ...


In [10]:
df['date'] = pd.to_datetime(df['date'])

In [11]:
df.head()

Unnamed: 0,title,city,date,raw_text
0,new-york tribune.,[New York],1920-08-15,A Wave of Barbarism Threatens to Inundate Euro...
1,new-york tribune.,[New York],1900-10-14,KEW IMMIGRANT STATION.\nMAIN BUILDING ON ELLIS...
2,new-york tribune.,[New York],1919-11-25,Occasional adjustment of\nyour eyeglasses is n...
3,new-york tribune.,[New York],1920-07-21,Travis Will Not\nEnter the Race\nFor Governor\...
4,new-york tribune.,[New York],1901-07-21,DEPENDS UPON ' CROKER.\nSPECULATION AS TO THE ...


In [12]:
# cleaning up \n characters, stop words, and lemmatizing raw text
nlp = spacy.load("en_core_web_sm")
nlp.disable_pipes('ner', 'parser')

def process(text):
    text = text.replace('\n', ' ')
    doc = nlp(text)
    tokens = [token for token in doc]
    no_stop = [token for token in tokens if not token.is_stop]
    no_punct = [token for token in no_stop if token.is_alpha]
    lemmas = [token.lemma_ for token in no_punct]
    lemmas_lower = [lemma.lower() for lemma in lemmas]
    lemmas_string = ' '.join(lemmas_lower)
    return lemmas_string

In [13]:
df['lemmas'] = df['raw_text'].apply(process)

In [14]:
df.head()

Unnamed: 0,title,city,date,raw_text,lemmas
0,new-york tribune.,[New York],1920-08-15,A Wave of Barbarism Threatens to Inundate Euro...,wave barbarism threaten inundate europe advanc...
1,new-york tribune.,[New York],1900-10-14,KEW IMMIGRANT STATION.\nMAIN BUILDING ON ELLIS...,kew immigrant station main building ellis isla...
2,new-york tribune.,[New York],1919-11-25,Occasional adjustment of\nyour eyeglasses is n...,occasional adjustment eyeglass necessary treat...
3,new-york tribune.,[New York],1920-07-21,Travis Will Not\nEnter the Race\nFor Governor\...,travis enter race governor values honor highly...
4,new-york tribune.,[New York],1901-07-21,DEPENDS UPON ' CROKER.\nSPECULATION AS TO THE ...,depend croker speculation tammany nominee idle...


In [16]:
df.to_csv(f'../cls161/{search_term}{start_date}-{end_date}.csv')