### Gathering Data from NamUs

NamUs is the National Missing and Unidentified Persons System, which is funded by the United States Department of Justice. NamUs does not provide an API; however, it does provide a searchable interface.

In this notebook, we will use Selenium together with BeautifulSoup to retrieve data from NamUs and save it as a CSV file.

In [1]:
# Import libraries
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import getopt, re, sys, time, os

In [2]:
# Build the single, module-level Chrome session that every scraping helper
# in this notebook reads through the global name `driver`.
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--ignore-certificate-errors',
    '--incognito',
    '--headless',
):
    options.add_argument(chrome_flag)

# ChromeDriverManager().install() hands back the chromedriver location, which
# is passed to webdriver.Chrome as its first positional argument.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(chromedriver_path, options=options)




In [3]:
# Location of the CSV this notebook writes (and later reads back), relative to
# the notebook's working directory.
# It is deliberately a plain relative path: prefixing it with the parent of
# os.getcwd() and no separator produces a malformed location such as
# "/home/user/repo../web-scraping/..." whenever the working directory is
# nested more than one level deep.
path = "../web-scraping/namus/states_gathered.csv"
print(path)


../web-scraping/namus/states_gathered.csv


In [4]:
# Constants

# Name of the case-number column (also the first entry of INFO_COLUMNS).
CASE_NUMBER_KEY = 'Case Number'

# Column names given to the scraped frame; the order is the order in which
# the cell texts of each result row are read.
INFO_COLUMNS = [
    CASE_NUMBER_KEY,
    'DBF', 'Last Name', 'First Name',
    'Sex', 'Race/Ethnicity',
    'City', 'County', 'State',
    'Date Modified',
]

# Page size that main() asks the results dropdown for.
MAX_ROWS_PER_PAGE = 100

In [5]:
def add_filters(filters):
    """Apply each filter that is present in ``filters`` to the search form.

    A ``'states'`` key hands its value to ``location_filter``; a
    ``'date_elem'`` key hands the whole dict to ``filtering_by_date``.
    Keys that are absent are simply skipped.
    """
    if 'states' in filters:
        location_filter(filters['states'])

    if 'date_elem' in filters:
        filtering_by_date(filters)

In [6]:
###

def filtering_by_date(date):
    """Fill in the date filter inside the 'Circumstances' section.

    ``date`` is a mapping with the keys ``'date_elem'``, ``'month'``,
    ``'day'`` and ``'year'``; each value is the visible text of the option
    to pick in the corresponding dropdown (dropdowns 0-3 of the second
    ``date-range-input`` element).
    """
    print('Adding date filters...')

    circumstances = driver.find_element_by_id('Circumstances')

    def choose(dropdown_index, key):
        # The widget and its <select> are looked up afresh on every call
        # instead of being cached between selections.
        date_widget = circumstances.find_elements_by_tag_name('date-range-input')[1]
        dropdown = date_widget.find_elements_by_tag_name('select')[dropdown_index]
        Select(dropdown).select_by_visible_text(date[key])

    choose(0, 'date_elem')

    # Short pause after choosing the operand, before touching the date parts.
    time.sleep(.5)

    for dropdown_index, key in enumerate(('month', 'day', 'year'), start=1):
        choose(dropdown_index, key)

In [7]:
# find state filter
def location_filter(states):
    """Type every entry of ``states`` into the "State" input of the form.

    Looks at each ``<label>`` inside the 'Circumstances' section; for a label
    whose text is exactly "State", sends each state followed by ENTER to the
    ``<input>`` nested in that label. Nothing is typed if no such label exists.
    """
    print('Fetching selected states to filter...')
    circumstances = driver.find_element_by_id('Circumstances')

    for candidate in circumstances.find_elements_by_tag_name('label'):
        if candidate.text != "State":
            continue

        state_field = candidate.find_element_by_tag_name('input')
        # add state filter
        for state_name in states:
            state_field.send_keys(state_name)
            state_field.send_keys(Keys.ENTER)
                


In [8]:
def records():
    """Scrape the result rows currently shown and return them as a DataFrame.

    Clicks the list-view icon, parses the rendered page with BeautifulSoup and
    reads the stripped text of every ``ui-grid-cell-contents`` div of each
    child of the ``ui-grid-canvas`` div.

    Returns:
        pandas.DataFrame with the columns ``INFO_COLUMNS`` and one row per
        grid row (empty when the grid holds no rows).

    Raises:
        ValueError: if a grid row does not hold exactly one cell per entry of
            ``INFO_COLUMNS``.
        AttributeError: if the page has no ``ui-grid-canvas`` div.
    """
    print('Collecting data...')

    # navigate to list view
    driver.find_element_by_xpath("//i[@class=\"icon-list\"]").click()
    time.sleep(1.5)  # presumably gives the list view time to render

    soup = BeautifulSoup(driver.page_source, 'lxml')
    grid_nodes = soup.find('div', class_='ui-grid-canvas').contents

    scraped_rows = []
    for node in grid_nodes:
        # Text nodes between the row elements (BeautifulSoup strings are str
        # subclasses) carry no cells; skip every one of them, not only ' '.
        if isinstance(node, str):
            continue

        cells = node.find_all('div', class_='ui-grid-cell-contents')
        values = [cell.text.strip() for cell in cells]
        if len(values) != len(INFO_COLUMNS):
            raise ValueError(
                f'{len(INFO_COLUMNS)} columns expected, row had {len(values)} cells'
            )
        scraped_rows.append(values)

    # Build the frame once instead of concatenating one row at a time.
    return pd.DataFrame(scraped_rows, columns=INFO_COLUMNS)

In [9]:
def page_counts():
    """Return the total page count shown in the 'Page Selection' nav.

    Takes the text of the first ``<span>`` inside that nav and converts
    whatever follows its first "/" to an int.
    """
    print('Counting the amount of pages...')

    parsed_page = BeautifulSoup(driver.page_source, 'lxml')
    pager = parsed_page.find('nav', {'aria-label': 'Page Selection'})
    pager_text = pager.find('span').text

    slash = re.search('/', pager_text)
    total_pages = pager_text[slash.end():].strip()
    return int(total_pages)

In [10]:
def next_page():
    """Wait five seconds, then click the "next page" arrow.

    Any ordinary error while locating or clicking the arrow is treated as
    having reached the last page: a message is printed and the function
    returns normally. KeyboardInterrupt and SystemExit are not swallowed, so
    the scrape can still be interrupted.
    """
    print('navigating to the next page...')
    time.sleep(5)

    try:
        driver.find_element_by_xpath("//i[@class=\"icon-triangle-right\"]").click()
    except Exception:
        print('last page completed...')

In [11]:
def parse_args(argv):
    """Turn command-line style options into the filters dict used by add_filters.

    Args:
        argv: list of option strings, e.g. ``['--states=Oregon, California']``.

    Returns:
        dict that may contain
        ``'states'``: list of state names (surrounding blanks removed), and
        ``'date_elem'``, ``'month'``, ``'day'``, ``'year'``: the two-character
        operator and the three dash-separated parts of ``--date``.

    Exits:
        status 0 after printing the help text for ``-h``/``--help``;
        status 2 after printing it for an unknown option or a ``--date``
        value without three dash-separated parts.
    """
    help_message = """
    Usage:
        --states=STATE[,STATE...]
                    Filter by state; accepts a comma-separated list.
                    Example:    --states="New York"
                                --states="Oregon, California"
        --date=OPERATOR + Month-Day-Year
                    Filter by Date of Last Interaction, on or after (>=)
                    or on or before (<=) the given date.
                    Example:    --date=">=May-5-1995"
                                --date="<=February-12-1997"
        -h, --help  Displays this help screen.
    """

    filters = {}

    try:
        opts, _ = getopt.getopt(argv, 'h', ['help', 'states=', 'date='])
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(help_message)
            sys.exit()
        elif opt == '--states':
            # "Oregon, California" must not yield ' California': each entry is
            # later typed verbatim into the state input box.
            filters['states'] = [
                name.strip() for name in arg.split(',') if name.strip()
            ]
        elif opt == '--date':
            # The first two characters are the operator (">=" or "<=").
            date_parts = [part.strip() for part in arg[2:].split('-')]
            if len(date_parts) < 3:
                print(help_message)
                sys.exit(2)
            filters['date_elem'] = arg[:2]
            filters['month'], filters['day'], filters['year'] = date_parts[:3]

    return filters

In [12]:
# show `num_rows` results at a time
def rows_to_show(num_rows):
    """Choose ``num_rows`` in the "Results" per-page dropdown.

    Args:
        num_rows: page size to select; it is matched against the ``value``
            attribute of the dropdown options after conversion to ``str``.
    """
    # Report the value actually being selected, not the module constant.
    print(f'Setting {num_rows} rows per page...')
    dropdown_selection_results = driver.find_element_by_xpath("//label/span[contains(text(),'Results')]/following-sibling::select")
    Select(dropdown_selection_results).select_by_value(str(num_rows))
    time.sleep(1.5)  # presumably lets the grid reload with the new page size

In [13]:
def search():
    """Click the second ``<input>`` in the search-criteria action bar.

    The bar is the ``search-criteria-container-actions`` element inside the
    ``search-criteria-container`` element; a 1.5 second pause follows the click.
    """
    print('Searching...')

    criteria = driver.find_element_by_class_name('search-criteria-container')
    action_bar = criteria.find_element_by_class_name('search-criteria-container-actions')
    action_inputs = action_bar.find_elements_by_tag_name('input')

    action_inputs[1].click()
    time.sleep(1.5)

In [14]:
def main(argv):
    """Run the whole scrape and write the gathered rows to ``path`` as CSV.

    Opens the NamUs Unclaimed Persons search, applies the filters, sets the
    page size to ``MAX_ROWS_PER_PAGE``, collects every result page with
    ``records()`` and saves the combined frame.

    Args:
        argv: accepted for a command-line style entry point but not consulted:
            the filters are fixed to ``--states=New Jersey``.
            NOTE(review): presumably because ``sys.argv`` inside a notebook
            kernel holds Jupyter's own arguments; confirm before wiring
            ``argv`` through to ``parse_args``.

    If an exception is raised while paging, it is printed and the pages
    gathered up to that point are still saved.
    """
    filters = parse_args(argv=['--states=New Jersey'])

    print('Navigating to namus.gov...')
    driver.get("https://www.namus.gov/UnclaimedPersons/Search")

    add_filters(filters)
    search()
    print("Starting case processing")

    rows_to_show(MAX_ROWS_PER_PAGE)
    page_numbers = page_counts()

    # One frame per page; combined once at the end rather than re-concatenated
    # on every iteration.
    page_frames = []

    try:
        for page in range(page_numbers):
            print(f'Gathering page {page}...')
            page_frames.append(records())
            # There is nothing to navigate to after the final page, so skip
            # the wait-and-click there.
            if page < page_numbers - 1:
                next_page()
    except Exception as e:
        print(f'Exception thrown. Creating a csv file from existing data: {path}')
        print(e)

    if page_frames:
        df_info = pd.concat(page_frames, ignore_index=True)
    else:
        df_info = pd.DataFrame(columns=INFO_COLUMNS)

    # Output collected data to the "web-scraping" folder (written exactly once,
    # whether or not paging was cut short by an exception).
    print(f'Saving gathered data to csv: {path}')
    df_info.to_csv(path, index=False, encoding='utf-8')

    print('Scraping completed')
    

# Entry point: runs main() when this code is executed as the top-level module.
# Note that main() as written does not consult the arguments it is given; it
# parses a fixed '--states=New Jersey' option list instead.
if __name__ == '__main__':
    main(sys.argv[1:])

Navigating to namus.gov...
Fetching selected states to filter...
Searching...
Starting case processing
Setting 100 rows per page...
Counting the amount of pages...
Gathering page 0...
Collecting data...
navigating to the next page...
Saving gathered data to csv: ../web-scraping/namus/states_gathered.csv
Scraping completed


In [15]:
# Shut down the shared browser session now that scraping is finished; the
# scraping helpers cannot be used again until `driver` is re-created.
driver.quit()

In [16]:
# Read the scraped CSV back from disk to inspect what was saved.
df1 = pd.read_csv('../web-scraping/namus/states_gathered.csv')



In [17]:
# Display the loaded records.
df1

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race/Ethnicity,City,County,State,Date Modified
0,UCP83053,06/30/2021,Eames,Paul,Male,White / Caucasian,Elizabeth,Union,New Jersey,08/12/2021
1,UCP72130,06/01/2020,Newman,Andrew,Male,Black / African American,Hoboken,Hudson,New Jersey,07/27/2020
2,UCP70872,05/23/2020,Rutkowski,Walter,Male,White / Caucasian,Deptford,Gloucester,New Jersey,06/22/2020
3,UCP70870,05/21/2020,Maddocks,Thomas,Male,White / Caucasian,Carneys Point,Salem,New Jersey,06/22/2020
4,UCP70871,05/11/2020,Thatcher,Larry,Male,White / Caucasian,Pennsauken,Camden,New Jersey,06/22/2020
5,UCP69428,04/20/2020,Thompson,Diane,Female,White / Caucasian,Burlington,Burlington,New Jersey,06/22/2020
6,UCP70868,01/29/2020,Smith,Jodi,Female,White / Caucasian,Winslow Township,Camden,New Jersey,06/18/2020
7,UCP60795,09/24/2019,Musumeci,Peter,Male,White / Caucasian,Freehold,Monmouth,New Jersey,10/10/2019
8,UCP60530,09/13/2019,York,Sandra,Female,White / Caucasian,New Brunswick,Middlesex,New Jersey,10/02/2019
9,UCP59175,06/05/2019,King,Peter,Male,Multiple,Atlantic City,Atlantic,New Jersey,01/22/2020
