### Create DB from URL of PDFs of child fatalities or near fatalities for Clark, Washoe, & Rural Nevada
1. Download all PDFs linked from a given URL and save them to the local computer
1. List all PDFs in the local directory as a Python list (which will be looped through)
1. Function that scrapes each individual PDF
1. Function that scrapes all files via a loop
    a. Returns the per-file dataframes, which are then combined into the final dataframe

In [22]:
import os
import requests
from bs4 import BeautifulSoup

def download_all_pdfs(url, save_dir, base_url="https://dcfs.nv.gov/", timeout=60):
    """Download every matching PDF linked from a webpage into a directory.

    A link is downloaded when its ``href`` contains ``'ID'`` and ends with
    ``'.pdf'``. Each file is saved under the last path component of its URL.

    Args:
        url: Address of the webpage that lists the PDFs.
        save_dir: Directory the PDFs are written to; created if it is missing.
        base_url: Site root that relative hrefs are resolved against. Hrefs
            that are already absolute are used unchanged.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.HTTPError: If the listing page itself returns an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Imported locally so the function brings its own URL helpers with it.
    from urllib.parse import urljoin, urlparse

    os.makedirs(save_dir, exist_ok=True)

    # Get the webpage content; without it there is nothing to download
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Walk every link on the page and keep only the report PDFs
    for link in soup.find_all('a'):
        href = link.get('href')
        if not (href and 'ID' in href and href.endswith('.pdf')):
            continue

        # The hrefs do not carry the site prefix. urljoin adds it without
        # producing a double slash and leaves absolute hrefs alone.
        pdf_url = urljoin(base_url, href)

        pdf_response = requests.get(pdf_url, timeout=timeout)
        if not pdf_response.ok:
            # Do not save an error page under a .pdf name; skip this one and
            # carry on with the remaining links.
            print(f"Skipping {pdf_url}: HTTP {pdf_response.status_code}")
            continue

        # File name comes from the URL path only (no query string/fragment)
        pdf_name = os.path.basename(urlparse(pdf_url).path)

        with open(os.path.join(save_dir, pdf_name), 'wb') as f:
            f.write(pdf_response.content)
                
# Listing page whose PDF links are downloaded (this one is the Clark page)
url = 'https://dcfs.nv.gov/Programs/CWS/CPS/ChildFatalities/Clark/'

# Local directory the downloaded PDFs are written into
save_dir = r'C:\Users\kkurek\OneDrive - State of Nevada\Desktop\DCFS_Child\Clark_pdfs'

# The download takes a solid 1-2 min for all files from url (especially Clark
# County), so the call is left commented out; uncomment it to (re)download.
# download_all_pdfs(url, save_dir)

In [23]:
import os

def list_files(path):
    """Return the names of the regular files located directly inside ``path``.

    Sub-directories are left out and nothing is searched recursively. The
    names (not full paths) come back in the order ``os.listdir`` yields them.
    """
    return [
        entry
        for entry in os.listdir(path)
        if os.path.isfile(os.path.join(path, entry))
    ]

# Directory holding the PDFs (same folder the download step saves into)
path = r"C:\Users\kkurek\OneDrive - State of Nevada\Desktop\DCFS_Child\Clark_pdfs"

# Collect the file names in that directory and report how many were found
file_list = list_files(path)
print(len(file_list))
# file_list  # uncomment to display the individual file names

191


In [74]:
import pdfplumber
import pandas as pd
import numpy as np

# Field labels to extract from each PDF. Each label is compared against the
# text that precedes the first ': ' on a line of the extracted page text.
keys = [
    'Date',
    'Agency Name',
    'Agency Address',
    'Date of written notification to the Division of Child and Family Services and Legislative Auditor',
    # Two variants of the case-number label are listed; cleaning_df merges
    # them into a single column afterwards.
    'Internal reference UNITY Case Number',
    'Internal reference UNITY Case Number or Report Number',
    'Child Fatality Date of Death',
    'Near Fatality Date of Near Fatality'
]

def scrape_individual_pdf(pages_text, keys):
    """Extract the requested ``label: value`` fields from one PDF's page text.

    Every line that contains ``': '`` is split at its first occurrence into a
    label and a value. The first non-empty value found for a requested label
    is kept; later occurrences are ignored.

    Args:
        pages_text: Iterable with the text of each page, in page order.
            Entries that are ``None`` or empty (pages without extractable
            text) are skipped.
        keys: Labels to look for.

    Returns:
        A one-row DataFrame (index ``0``) with one column per key. Keys that
        were not found, or only found with an empty value, hold NaN.
    """
    # Empty string marks "not found yet" so a later page can still fill it in
    data = {key: '' for key in keys}

    for page_text in pages_text:
        if not page_text:
            # A page with no text would otherwise crash on .split()
            continue

        for line in page_text.split('\n'):
            label, separator, value = line.partition(': ')
            if not separator:
                continue

            # Tolerate indentation / trailing blanks (incl. a stray '\r')
            label = label.strip()
            value = value.strip()

            # Only keep requested labels, and only their first real value
            if label in data and not data[label]:
                data[label] = value

    # Missing values become NaN directly instead of '' replaced afterwards
    return pd.DataFrame(
        {key: (value if value else np.nan) for key, value in data.items()},
        index=[0],
    )

def loop_pdf_scrape(file_list, path):
    """Scrape every PDF in ``file_list`` and collect the per-file results.

    Args:
        file_list: File names (not full paths) of the PDFs to scrape.
        path: Directory that contains those files.

    Returns:
        A list with one single-row DataFrame per file, in ``file_list`` order.
        An empty ``file_list`` yields an empty list.

    Note:
        The labels to extract are read from the module-level ``keys`` list.
    """
    df_list = []
    for file in file_list:

        # Build the full path with the platform's own separator rather than
        # a hard-coded backslash
        full_path = os.path.join(path, file)

        with pdfplumber.open(full_path) as pdf:
            # Extract text from each page; a page without text is passed on
            # as '' so the line splitting downstream never sees None
            pages_text = [page.extract_text() or '' for page in pdf.pages]

        # The text is plain strings by now, so the PDF can already be closed
        df_list.append(scrape_individual_pdf(pages_text, keys))

    return df_list

# List the file names found in the local PDF directory
path = r"C:\Users\kkurek\OneDrive - State of Nevada\Desktop\DCFS_Child\Clark_pdfs"
file_list = list_files(path)

# Scrape each file in file_list; the result is a list of one-row DataFrames
df_list = loop_pdf_scrape(file_list, path)

# Shorter names for the two longest columns. The dictionary keys are the
# column names as they look after cleaning_df has lower-cased them and
# replaced spaces with underscores.
rename_cols = {'date_of_written_notification_to_the_division_of_child_and_family_services_and_legislative_auditor': 
                  'date_of_written_notification',
              'near_fatality_date_of_near_fatality': 
                   'near_fatality_date'}

# Columns (already using the renamed/cleaned names) that cleaning_df converts
# to pandas datetimes
time_cols = ['date', 
             'date_of_written_notification', 
             'child_fatality_date_of_death', 
             'near_fatality_date']

def cleaning_df(df_list, rename_cols, time_cols):
    """Combine the per-file frames into one tidy, date-sorted DataFrame.

    Steps, in order: stack the frames, normalise the column names (trimmed,
    lower-case, spaces to underscores, parentheses removed), apply
    ``rename_cols``, parse ``time_cols`` as ``%m/%d/%Y`` dates (unparseable
    values become NaT), merge the two case-number columns into one, and sort
    by ``date`` with the newest row first.

    Args:
        df_list: DataFrames to stack; they must provide both case-number
            columns and a ``Date`` column.
        rename_cols: Mapping of cleaned column name -> shorter name.
        time_cols: Column names (after renaming) to convert to datetimes.

    Returns:
        The cleaned DataFrame with a fresh 0..n-1 index.
    """
    combined = pd.concat(df_list, ignore_index=True)

    # Normalise the header: trim, lower-case, then swap/remove characters
    tidy_names = combined.columns.str.strip().str.lower()
    for old, new in ((' ', '_'), ('(', ''), (')', '')):
        tidy_names = tidy_names.str.replace(old, new, regex=False)
    combined.columns = tidy_names

    # Shorten the long column names
    combined = combined.rename(columns=rename_cols)

    # Parse each date column; anything not matching the format turns into NaT
    for column in time_cols:
        combined[column] = pd.to_datetime(
            combined[column], format="%m/%d/%Y", errors='coerce'
        )

    # The case number appears under one of two labels; keep a single column
    # and fill its gaps from the alternative one
    primary = 'internal_reference_unity_case_number'
    fallback = 'internal_reference_unity_case_number_or_report_number'
    combined[primary] = combined[primary].fillna(combined[fallback])
    combined = combined.drop(columns=[fallback])

    # Newest reports at the top
    return combined.sort_values(by='date', ascending=False).reset_index(drop=True)

# Build the cleaned, date-sorted table, then show its shape and first rows
final_df = cleaning_df(df_list, rename_cols, time_cols)
print(final_df.shape)
final_df.head()

(191, 7)


Unnamed: 0,date,agency_name,agency_address,date_of_written_notification,internal_reference_unity_case_number,child_fatality_date_of_death,near_fatality_date
0,2023-06-08,Clark County Department of Family Services,"500 S. Grand Central Pkwy, 5th Floor, Las Vega...",2023-06-08,1486611,NaT,2023-06-04
1,2023-06-06,Clark County Department of Family Services,"500 S. Grand Central Pkwy, 5th Floor, Las Vega...",2023-06-06,1504446,2023-06-04,2023-05-31
2,2023-06-06,Clark County Department of Family Services,"500 S. Grand Central Pkwy, 5th Floor, Las Vega...",2023-06-06,1511413,NaT,NaT
3,2023-05-31,Clark County Department of Family Services,"500 S. Grand Central Pkwy, 5th Floor, Las Vega...",2023-05-31,1511295,NaT,2023-05-25
4,2023-05-16,Clark County Department of Family Services,"500 S. Grand Central Pkwy, 5th Floor, Las Vega...",2023-05-03,1468484,NaT,NaT


In [75]:
# Last rows of the frame; cleaning_df sorted it by date with the newest first
final_df.tail()

Unnamed: 0,date,agency_name,agency_address,date_of_written_notification,internal_reference_unity_case_number,child_fatality_date_of_death,near_fatality_date
186,2021-01-26,Clark County Department of Family Services (CC...,121 S. Martin Luther King Blvd.,2021-01-26,1475771,NaT,2021-01-17
187,2021-01-25,Clark County Department of Family Services (CC...,121 S. Martin Luther King Blvd.,2021-01-25,1439777,2021-01-22,NaT
188,2021-01-19,Clark County Department of Family Services (CC...,121 S. Martin Luther King Blvd.,2021-01-19,1458420,2021-01-16,NaT
189,2021-01-12,Clark County Department of Family Services (CC...,121 S. Martin Luther King Blvd.,2021-01-12,1478970,NaT,2021-01-04
190,2021-01-04,Clark County Department of Family Services (CC...,121 S. Martin Luther King Blvd.,2021-01-04,1478934,2021-01-01,NaT


In [77]:
# Count the rows that have no case number in the merged case-number column
print(final_df['internal_reference_unity_case_number'].isna().sum())

1
