### Create DB from URL of PDF's of child fatality or near fatalities per Clark, Washoe, & Rural Nevada
1. Download all pdf's from a given URL and save to local computer
1. List all pdfs in directory, create python list (will be looped through)
1. Function that scrapes each individual pdf
1. Function that scrapes all files via a loop
    a. Returns final dataframe

In [8]:
import os
import time
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import pdfplumber

def download_all_pdfs(url):

    # Get the webpage content
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links in the webpage
    links = soup.find_all('a')
                
    # directory pdfs will be saved
    county = url.split('/')[-2]
    save_dir = f'C:/Users/kkurek/OneDrive - State of Nevada/Desktop/DCFS_Child/{county}_pdfs'
    
    # UNIQUE: this uploadedFiles path is required, why?
    # Due to the level of inconsistency in the pdf formats between counties
    # Clark has consistency: 2023-01-17_ID_1469166.pdf, 2023-01-22_ID_1506602.pdf, etc...
    # Washoe & Rural are random: 1482308_child_B.pdf, Disclosure_Form_Final_01_04_2019.pdf
    upload_href = f'/uploadedFiles/dcfsnvgov/content/Programs/CWS/CPS/ChildFatalities/{county}'
                    # 'uploadedFiles/dcfsnvgov/content/Programs/CWS/CPS/ChildFatalities/Washoe/2023/1474590-2.3.23.pdf
    
    # Download each PDF
    for link in links:
        href = link.get('href')
        if href and upload_href in href and href.endswith('.pdf'):

            # append needed dcfs link since pdf hrefs don't come naturally with it
            prefix_for_href = "https://dcfs.nv.gov/"
            href = prefix_for_href + href
            print(href)

            # Get the PDF content
            pdf_response = requests.get(href)

            # Get the PDF name from the URL
            pdf_name = os.path.basename(href)
            
            # Create the directory if it doesn't exist
            if not os.path.exists(save_dir):
                print(f"Making Directory: {save_dir.split('/')[-1]}")
                os.makedirs(save_dir)

            # Save the PDFs to the directory
            with open(os.path.join(save_dir, pdf_name), 'wb') as f:
                f.write(pdf_response.content)
                
    return save_dir

url = 'https://dcfs.nv.gov/Programs/CWS/CPS/ChildFatalities/Rural/'
url = 'https://dcfs.nv.gov/Programs/CWS/CPS/ChildFatalities/Washoe/'
download_all_pdfs(url)

https://dcfs.nv.gov//uploadedFiles/dcfsnvgov/content/Programs/CWS/CPS/ChildFatalities/Washoe/2023/1474590-2.3.23.pdf
https://dcfs.nv.gov//uploadedFiles/dcfsnvgov/content/Programs/CWS/CPS/ChildFatalities/Washoe/2023/PD_2.19.23.pdf
https://dcfs.nv.gov//uploadedFiles/dcfsnvgov/content/Programs/CWS/CPS/ChildFatalities/Washoe/2023/Public_Disclosure_Fatality_1308762.pdf
https://dcfs.nv.gov//uploadedFiles/dcfsnvgov/content/Programs/CWS/CPS/ChildFatalities/Washoe/2023/Public_Disclosure_1372951_NF_3.30.23.pdf
https://dcfs.nv.gov//uploadedFiles/dcfsnvgov/content/Programs/CWS/CPS/ChildFatalities/Washoe/2023/Public_Disclosure_1384342_NF_3.30.23.pdf
https://dcfs.nv.gov//uploadedFiles/dcfsnvgov/content/Programs/CWS/CPS/ChildFatalities/Washoe/2023/1509493-4.8.2023.pdf
https://dcfs.nv.gov//uploadedFiles/dcfsnvgov/content/Programs/CWS/CPS/ChildFatalities/Washoe/2023/Public_Disclosure_1510126_F_4.24.23_v2.pdf
https://dcfs.nv.gov//uploadedFiles/dcfsnvgov/content/Programs/CWS/CPS/ChildFatalities/Washoe/20

'C:/Users/kkurek/OneDrive - State of Nevada/Desktop/DCFS_Child/Washoe_pdfs'

In [9]:
                
def list_files(path):
    
    files = []
    for file in os.listdir(path):
        if os.path.isfile(os.path.join(path, file)):
            files.append(file)
            
    return files

def scrape_individual_pdf(pages_text, keys):

    # Initialize a dictionary with the keys and empty values
    data = {key: '' for key in keys}

    for page_text in pages_text:
        lines = page_text.split('\n')
        for line in lines:
            # Split each line into a key-value pair
            if ': ' in line:
                key, value = line.split(': ', 1)
                # Only add the key-value pair to the dictionary if the key is in the list of keys
                if key in keys and not data[key]:
                    data[key] = value

    # Now you can create a DataFrame from your data
    df = pd.DataFrame(data, index=[0])

    # Replace empty strings with NaN
    df.replace('', np.nan, inplace=True)
    
    return df

def loop_pdf_scrape(file_list, path, keys):
    
    df_list = []
    for file in file_list:
        
        # make sure full directory is appended before opening
        file = path + "\\" + file
        
        with pdfplumber.open(file) as pdf:
            # Extract text from each page
            pages_text = [page.extract_text() for page in pdf.pages]
            individual_df = scrape_individual_pdf(pages_text, keys)
            df_list.append(individual_df)
            
    return df_list

def cleaning_df(df_list, rename_cols, time_cols):

    # concatenate final list of dataframes
    df = pd.concat(df_list).reset_index(drop=True)
    
    # make column names pretty
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)\
                    .str.replace('(', '', regex=False).str.replace(')', '', regex=False)
    
    # rename long columns
    df = df.rename(columns=rename_cols)
    
    # convert to pandas datetime dtype
    df[time_cols] = df[time_cols].apply(pd.to_datetime, format="%m/%d/%Y", errors='coerce')
    
    # unify 'Internal reference UNITY Case Number' columns
    col1, col2 = 'internal_reference_unity_case_number_or_report_number', 'internal_reference_unity_case_number'
    df[col2] = df[col2].fillna(df[col1])
    
    # drop old 'report number' version of column
    df = df.drop([col1], axis=1)
    
    # sort values by date, newest at top
    df = df.sort_values(by='date', ascending=False).reset_index(drop=True)
    
    return df

def run(url, keys, rename_cols, time_cols):

    # Start the timer
    start_time = time.time()

    # takes about 2 min to download all files from url for Clark County
    save_dir = download_all_pdfs(url)
    print("Done downloading all pdfs")

    # Call the function to create a list of all pdfs from directory
    file_list = list_files(path=save_dir)
    
    # list all local pdfs
    file_list = list_files(path=save_dir)
    print(len(file_list))

    # scrape each individual pdf from file_list: takes about 35 seconds for Clark County
    df_list = loop_pdf_scrape(file_list, path=save_dir, keys=keys)
    print("Done scraping pdfs")

    # concatenate final list of dataframes, clean, and sort dataframe
    final_df = cleaning_df(df_list, rename_cols, time_cols)
    print(final_df.shape)
    
    # save final csv per county
    county = url.split('/')[-2]
    csv_filename = f"child_fatality_{county}.csv"
    final_df.to_csv(csv_filename, index=False)
    print(f"Saved {csv_filename}")
    
    # Calculate the elapsed time
    elapsed_time = time.time() - start_time
    print(f"Execution time for {county}: {round(elapsed_time,2)} seconds")

### Debug individual pdf

In [10]:
def scrape_individual_pdf(pages_text, keys):

    # Initialize a dictionary with the keys and empty values
    data = {key: '' for key in keys}

    for page_text in pages_text:
        lines = page_text.split('\n')
        for line in lines:
            # Split each line into a key-value pair
            if ': ' in line:
                key, value = line.split(': ', 1)
                # Only add the key-value pair to the dictionary if the key is in the list of keys
                if key in keys and not data[key]:
                    data[key] = value

    # Now you can create a DataFrame from your data
    df = pd.DataFrame(data, index=[0])

    # Replace empty strings with NaN
    df.replace('', np.nan, inplace=True)
    
    return df

keys = [
'Date',
'Agency Name',
'Agency Address',
'Date of written notification to the Division of Child and Family Services and Legislative Auditor',
'Internal reference UNITY Case Number',
'Internal reference UNITY Case Number or Report Number',
'Child Fatality Date of Death',
'Near Fatality Date of Near Fatality',
'A summary of the report of abuse or neglect and a factual description of the contents of the report',
'The cause of the fatality or near fatality, if such information has been determined']

file = r'C:\Users\kkurek\OneDrive - State of Nevada\Desktop\DCFS_Child\Clark_pdfs\2023_04_13_ID_1446073.pdf'
with pdfplumber.open(file) as pdf:
    # Extract text from each page
    pages_text = [page.extract_text() for page in pdf.pages]
    print(pages_text)
    individual_df = scrape_individual_pdf(pages_text, keys)
    
individual_df

["Division of Child and Family Services MTL# 0401-12032021\nFamily Programs Office: Statewide Policy Manual Section 0400\nCHILD WELFARE AGENCY PUBLIC DISCLOSURE FORM\nDate: 5/2/2023\nAgency Name: Clark County Department of Family Services\nAgency Address: 500 S. Grand Central Pkwy, 5th Floor, Las Vegas, NV 89155\nDate of written notification to the Division of Child and Family Services and Legislative Auditor: 4/20/2023\nInternal reference UNITY Case Number: 1446073\nChildFatality Date of Death:\nNear Fatality Date of Near Fatality: 4/13/2023\nPortions of information on this form have been withheld at the request of _________________________ law\nenforcement agency.\nINFORMATION FOR RELEASE\nA. Date of the notification to the child welfare agency of the fatality/near fatality of a child:\n4/13/2023\nB. Location of child at the time of death or near fatality (city/county):\nLas Vegas, Clark\nC. A summary of the report of abuse or neglect and a factual description of the contents of the 

Unnamed: 0,Date,Agency Name,Agency Address,Date of written notification to the Division of Child and Family Services and Legislative Auditor,Internal reference UNITY Case Number,Internal reference UNITY Case Number or Report Number,Child Fatality Date of Death,Near Fatality Date of Near Fatality,A summary of the report of abuse or neglect and a factual description of the contents of the report,"The cause of the fatality or near fatality, if such information has been determined"
0,5/2/2023,Clark County Department of Family Services,"500 S. Grand Central Pkwy, 5th Floor, Las Vega...",4/20/2023,1446073,,,4/13/2023,,


In [11]:
def merge_dicts(data, new_dict):
    for key in new_dict:
        data[key] = new_dict[key]
    return data

data = {"A": "hi", "B": ""}
new_dict = {"B": "hey", "C": "you"}
merge_dicts(data, new_dict)

{'A': 'hi', 'B': 'hey', 'C': 'you'}

In [12]:
import re
import pandas as pd
import numpy as np

def restructure_alphabetical_dict(alphabetical_dict):
    new_dict = {}
    for key, value in alphabetical_dict.items():
        # Split the value into a key-value pair
        split_value = value.split(':', 1)
        if len(split_value) > 1:
            # The key is everything before ":" and the value is everything after ":"
            new_key = split_value[0]
            # Replace all '\n' in the value with empty string
            new_value = split_value[1].replace('\n', '')
            new_dict[new_key] = new_value
    return new_dict

def merge_dicts(data, new_dict):
    for key in new_dict:
        data[key] = new_dict[key]
    return data

def scrape_individual_pdf(pages_text, keys):

    # Initialize a dictionary with the keys and empty values
    data = {key: '' for key in keys}
    alphabetical_dict = {}
    current_alphabetical_key = None
    information_for_release_index = None

    # Find the index of 'INFORMATION FOR RELEASE'
    for page_text in pages_text:
        lines = page_text.split('\n')
        for i, line in enumerate(lines):
            if 'INFORMATION FOR RELEASE' in line:
                information_for_release_index = i
                break

    for page_text in pages_text:
        lines = page_text.split('\n')
        
        for i, line in enumerate(lines):
            if i < information_for_release_index:
                # Split each line into a key-value pair
                if ': ' in line:
                    key, value = line.split(': ', 1)
                    # Only add the key-value pair to the dictionary if the key is in the list of keys
                    if key in keys and not data[key]:
                        data[key] = value
            else:
                # For lines after "INFORMATION FOR RELEASE"
                match = re.match(r"([A-Z])\.", line)
                if match:
                    # Alphabetical key detected
                    current_alphabetical_key = match.group(1)
                    # Initialize the new key in the dictionary
                    alphabetical_dict[current_alphabetical_key] = line[3:] + "\n"  # remove the redundant key by slicing from index 3
                elif current_alphabetical_key:
                    # If it's not a new key, append the line to the last key's value
                    alphabetical_dict[current_alphabetical_key] += line + "\n"
    
    # Needed to restructure alphabetical dictionary for messiness inside
    print(alphabetical_dict)
    new_dict = restructure_alphabetical_dict(alphabetical_dict)
    print(new_dict)
    
    # Put new_dict = {'A summary of the report of abuse or neglect and a factual description of the contents of the report': 'CCDFS received ....'}
    # into "data" dict which currently has blank values for the associated keys
    data = merge_dicts(data, new_dict)
    
    # Now you can create a DataFrame from your data
    df = pd.DataFrame(data, index=[0])

    # Replace empty strings with NaN
    df.replace('', np.nan, inplace=True)
    
    return df


keys = [
'Date',
'Agency Name',
'Agency Address',
'Date of written notification to the Division of Child and Family Services and Legislative Auditor',
'Internal reference UNITY Case Number',
'Internal reference UNITY Case Number or Report Number',
'Child Fatality Date of Death',
'Near Fatality Date of Near Fatality',
'A summary of the report of abuse or neglect and a factual description of the contents of the report',
'The cause of the fatality or near fatality, if such information has been determined']

file = r'C:\Users\kkurek\OneDrive - State of Nevada\Desktop\DCFS_Child\Clark_pdfs\2023_04_13_ID_1446073.pdf'
with pdfplumber.open(file) as pdf:
    # Extract text from each page
    pages_text = [page.extract_text() for page in pdf.pages]
    individual_df = scrape_individual_pdf(pages_text, keys)

# print(data)
# print(alpha_dict)
individual_df

{'A': 'Date of the notification to the child welfare agency of the fatality/near fatality of a child:\n4/13/2023\n', 'B': 'Location of child at the time of death or near fatality (city/county):\nLas Vegas, Clark\n', 'C': 'A summary of the report of abuse or neglect and a factual description of the contents of the report:\nCCDFS received a report that the child’s adult caretaker called 9-1-1 due to the child being in physical distress.\nThe child was transported to a local hospital emergency room by emergency medical services. Upon medical\nassessment, the child was determined to be critically ill. The child was admitted for ongoing care and treatment. A\nconcern was noted that no explanation was provided for the child’s condition. Local law enforcement is\ninvestigating the event.\n', 'D': 'The date of birth and gender of child:\n7/19/2021, Male\n', 'E': 'The date that the child suffered the fatality or near fatality:\n4/13/2023\n', 'F': 'The cause of the fatality or near fatality, if 

Unnamed: 0,Date,Agency Name,Agency Address,Date of written notification to the Division of Child and Family Services and Legislative Auditor,Internal reference UNITY Case Number,Internal reference UNITY Case Number or Report Number,Child Fatality Date of Death,Near Fatality Date of Near Fatality,A summary of the report of abuse or neglect and a factual description of the contents of the report,"The cause of the fatality or near fatality, if such information has been determined",Date of the notification to the child welfare agency of the fatality/near fatality of a child,Location of child at the time of death or near fatality (city/county),The date of birth and gender of child,The date that the child suffered the fatality or near fatality,"Whether the agency had any contact with the child or a member of the child’s family or household before\nthe fatality or near fatality and, if so…\n(1) The frequency of any contact or communication with the child or a member of the child’s family or household before the fatality or near\nfatality and the date on which the last contact or communication occurred before the fatality or near fatality;\n(2) Whether the agency which provides child welfare services provided any child welfare services to the child or to a member of the child’s\nfamily or household before or at the time of the fatality or near fatality;\n(3) Whether the agency which provides child welfare services made any referrals for child welfare services for the child or for a member of the\nchild’s family or household before or at the time of the fatality or near fatality;\n(4) Whether the agency which provides child welfare services took any other actions concerning the welfare of the child before or at the time\nof the fatality or near fatality; and\n(5) A summary of the status of the child’s case at the time of the fatality or near fatality, including, without limitation, whether the child’s case\nwas closed by the agency which provides child welfare services before the fatality or near fatality and, if so, the reasons that the case was\nclosed.\nThe information contained in this section is limited to contact(s) with the child who is the subject of this disclosure or\na member of that child's family or household that is related to the fatality or near fatality incident. This limitation is\nrequired to preserve the confidentiality of all child abuse and neglect reports and records in order to protect the\nrights of the child and family as mandated by the Child Abuse Prevention and Treatment Act (CAPTA), as amended\n(42 U.S.C. 5101 et seq.).\nDate","Whether the agency which provides child welfare services, in response to the fatality or near fatality…\n(1) Has provided or intends to provide child welfare services to the child or to a member of the child’s family or household; and\n(2) Has made or intends to make a referral for child welfare services for the child or for a member of the child’s family or household; and\n(3) Has taken or intends to take any other action concerning the welfare and safety of the child or any member of the child’s family or\nhousehold.\nCCDFS has opened a case for investigation and family assessment. CCDFS will, as deemed appropriate, provide\nchild welfare services, make referrals for child welfare services, and/or take action concerning the welfare and\nsafety of the child and/or relevant members of the child’s family or household.\nNON-DISCLOSURE NOTICE\nThe following information must not be released (see Policy on Public Disclosure of Child Fatality and Near Fatality Information, page 5)"
0,5/2/2023,Clark County Department of Family Services,"500 S. Grand Central Pkwy, 5th Floor, Las Vega...",4/20/2023,1446073,,,4/13/2023,CCDFS received a report that the child’s adult...,The cause of the near fatality is under invest...,4/13/2023,"Las Vegas, Clark","7/19/2021, Male",4/13/2023,12/03/2021 FPO 0401A – Child Welfare Agency P...,1) Information regarding the sibling(s) of a d...


In [13]:
# def restructure_alphabetical_dict(alphabetical_dict):
#     new_dict = {}
#     for key, value in alphabetical_dict.items():
#         # Split the value into a key-value pair
#         split_value = value.split(':', 1)
#         if len(split_value) > 1:
#             # The key is everything before ":" and the value is everything after ":"
#             new_key = split_value[0]
#             # Replace all '\n' in the value with empty string
#             new_value = split_value[1].replace('\n', '')
#             new_dict[new_key] = new_value
#     return new_dict

# new_dict = restructure_alphabetical_dict(alpha_dict)
# new_dict

In [None]:
def merge_dicts(data, new_dict):
    for key in new_dict:
        if key in data:
            data[key] = new_dict[key]
    return data
data =  {'Date': '5/2/2023', 'Agency Name': 'Clark County Department of Family Services', 
         'Agency Address': '500 S. Grand Central Pkwy, 5th Floor, Las Vegas, NV 89155', 
         'Date of written notification to the Division of Child and Family Services and Legislative Auditor': '4/20/2023', 
         'Internal reference UNITY Case Number': '1446073', 'Internal reference UNITY Case Number or Report Number': '',
         'Child Fatality Date of Death': '', 'Near Fatality Date of Near Fatality': '4/13/2023', 
         'A summary of the report of abuse or neglect and a factual description of the contents of the report': '', 
         'The cause of the fatality or near fatality, if such information has been determined': ''}

new_dict = {'Date of the notification to the child welfare agency of the fatality/near fatality of a child': '4/13/2023',
 'Location of child at the time of death or near fatality (city/county)': 'Las Vegas, Clark',
 'A summary of the report of abuse or neglect and a factual description of the contents of the report': 
            'CCDFS received a report that the child’s adult caretaker called 9-1-1 due to the child being in physical distress.The child was transported to a local hospital emergency room by emergency medical services. Upon medicalassessment, the child was determined to be critically ill. The child was admitted for ongoing care and treatment. Aconcern was noted that no explanation was provided for the child’s condition. Local law enforcement isinvestigating the event.'}
new = merge_dicts(data, new_dict)
new

{'Date': '5/2/2023',
 'Agency Name': 'Clark County Department of Family Services',
 'Agency Address': '500 S. Grand Central Pkwy, 5th Floor, Las Vegas, NV 89155',
 'Date of written notification to the Division of Child and Family Services and Legislative Auditor': '4/20/2023',
 'Internal reference UNITY Case Number': '1446073',
 'Internal reference UNITY Case Number or Report Number': '',
 'Child Fatality Date of Death': '',
 'Near Fatality Date of Near Fatality': '4/13/2023',
 'A summary of the report of abuse or neglect and a factual description of the contents of the report': 'CCDFS received a report that the child’s adult caretaker called 9-1-1 due to the child being in physical distress.The child was transported to a local hospital emergency room by emergency medical services. Upon medicalassessment, the child was determined to be critically ill. The child was admitted for ongoing care and treatment. Aconcern was noted that no explanation was provided for the child’s condition. Lo

In [46]:
!pip install zipcodes

Collecting zipcodes
  Downloading zipcodes-1.2.0-py2.py3-none-any.whl (719 kB)
     -------------------------------------- 719.6/719.6 kB 7.5 MB/s eta 0:00:00
Installing collected packages: zipcodes
Successfully installed zipcodes-1.2.0


In [78]:
def get_city_by_zip(zip_code):
    """
    This function returns the city corresponding to the given zip code.
    It uses the `matching` function from the `zipcodes` package to find the city.

    Parameters:
    zip_code (str): The zip code for which to find the corresponding city.

    Returns:
    str: The city that corresponds to the given zip code. If no city is found, it returns None.

    Example:
    >>> get_city_by_zip('89706')
    'Carson City'
    
    >>> get_city_by_zip('00000')
    np.nan

    >>> get_city_by_zip('abcd-123')
    np.nan
    """
    
    try:
        result = zipcodes.is_real(str(zip_code))
    except Exception:
        return np.nan

    result = zipcodes.matching(str(zip_code))
    # If a result was found, return the city from the first match
    if result:
        return result[0]['city']

    # If no result was found, return np.nan
    return np.nan

codes = [np.nan, '', '25', '89706', 'a2', 89706]

for zip_code in codes:
    print(get_city_by_zip(zip_code))

nan
nan
nan
Carson City
nan
Carson City


In [48]:
import pandas as pd
import re
import zipcodes

# Function to return city based on zip code
def get_city_by_zip(zip_code):
    result = zipcodes.matching(zip_code)
    if result:  # if a result was found
        return result[0]['city']  # return the city from the first match
    return None  # if no result was found

# Assuming df is your DataFrame
addresses = [
    "1010 Ruby Vista Drive, Suite 101 Elko, Nevada 89801",
    "2533 N. Carson Street #100 Carson City, Nevada 89706",
    "2533 N. Carson Street #100 Carson City, Nevada 89706",
    "740 Park Ave, Ely, NV 89301",
    "1780 E. Basin Street, Pahrump NV 89060",
    "1010 Ruby Vista Drive, Ste. 101, Elko NV 89801",
    "2533 N. Carson Street #100 Carson City, Nevada 89706",
    "2533 N. Carson Street #100 Carson City, Nevada 89701",
    "2533 N Carson St. Suite 100 Carson City NV 89706",
    "55 N. Center Street #3 Fernley, Nevada 89408",
    "1735 Kaiser Street, Fallon, Nevada 89406",
    "1010 Ruby Vista Drive #101, Elko NV 89801"
]


df = pd.DataFrame(addresses, columns=["agency_address"])
df['zip'] = df['agency_address'].apply(lambda x: x.split()[-1])

# Apply the function to the zip column and create a new column 'city'
df['city'] = df['zip'].apply(get_city_by_zip)


df

Unnamed: 0,agency_address,zip,city
0,"1010 Ruby Vista Drive, Suite 101 Elko, Nevada ...",89801,Elko
1,"2533 N. Carson Street #100 Carson City, Nevada...",89706,Carson City
2,"2533 N. Carson Street #100 Carson City, Nevada...",89706,Carson City
3,"740 Park Ave, Ely, NV 89301",89301,Ely
4,"1780 E. Basin Street, Pahrump NV 89060",89060,Pahrump
5,"1010 Ruby Vista Drive, Ste. 101, Elko NV 89801",89801,Elko
6,"2533 N. Carson Street #100 Carson City, Nevada...",89706,Carson City
7,"2533 N. Carson Street #100 Carson City, Nevada...",89701,Carson City
8,2533 N Carson St. Suite 100 Carson City NV 89706,89706,Carson City
9,"55 N. Center Street #3 Fernley, Nevada 89408",89408,Fernley


In [23]:
import os

def list_files(path):
    files = []
    for file in os.listdir(path):
        if os.path.isfile(os.path.join(path, file)):
            files.append(file)
    return files

# Provide the path you want to list the files for
path = r"C:\Users\kkurek\OneDrive - State of Nevada\Desktop\DCFS_Child\Clark_pdfs"

# Call the function
file_list = list_files(path)
print(len(file_list))
# file_list

191


In [74]:
import pdfplumber
import pandas as pd
import numpy as np

# Define the keys you want to extract
keys = [
    'Date',
    'Agency Name',
    'Agency Address',
    'Date of written notification to the Division of Child and Family Services and Legislative Auditor',
    'Internal reference UNITY Case Number',
    'Internal reference UNITY Case Number or Report Number',
    'Child Fatality Date of Death',
    'Near Fatality Date of Near Fatality'
]

def scrape_individual_pdf(pages_text, keys):

    # Initialize a dictionary with the keys and empty values
    data = {key: '' for key in keys}

    for page_text in pages_text:
        lines = page_text.split('\n')
        for line in lines:
            # Split each line into a key-value pair
            if ': ' in line:
                key, value = line.split(': ', 1)
                # Only add the key-value pair to the dictionary if the key is in the list of keys
                if key in keys and not data[key]:
                    data[key] = value

    # Now you can create a DataFrame from your data
    df = pd.DataFrame(data, index=[0])

    # Replace empty strings with NaN
    df.replace('', np.nan, inplace=True)
    
    return df

def loop_pdf_scrape(file_list, path):
    
    df_list = []
    for file in file_list:
        
        # make sure full directory is appended before opening
        file = path + "\\" + file
        
        with pdfplumber.open(file) as pdf:
            # Extract text from each page
            pages_text = [page.extract_text() for page in pdf.pages]
            individual_df = scrape_individual_pdf(pages_text, keys)
            df_list.append(individual_df)
            
    return df_list

# list all local pdfs
path = r"C:\Users\kkurek\OneDrive - State of Nevada\Desktop\DCFS_Child\Clark_pdfs"
file_list = list_files(path)

# scrape each individual pdf from file_list
df_list = loop_pdf_scrape(file_list, path)

# rename columns
rename_cols = {'date_of_written_notification_to_the_division_of_child_and_family_services_and_legislative_auditor': 
                  'date_of_written_notification',
              'near_fatality_date_of_near_fatality': 
                   'near_fatality_date'}

# renamed time columns
time_cols = ['date', 
             'date_of_written_notification', 
             'child_fatality_date_of_death', 
             'near_fatality_date']

def cleaning_df(df_list, rename_cols, time_cols):

    # concatenate final list of dataframes
    df = pd.concat(df_list).reset_index(drop=True)
    
    # make column names pretty
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_', regex=False).str.replace('(', '', regex=False).str.replace(')', '', regex=False)
    
    # rename long columns
    df = df.rename(columns=rename_cols)
    
    # convert to pandas datetime dtype
    df[time_cols] = df[time_cols].apply(pd.to_datetime, format="%m/%d/%Y", errors='coerce')
    
    # unify 'Internal reference UNITY Case Number' & 'Internal reference UNITY Case Number or Report Number' cols
    col1, col2 = 'internal_reference_unity_case_number_or_report_number', 'internal_reference_unity_case_number'
    df[col2] = df[col2].fillna(df[col1])
    
    # drop old column
    df = df.drop([col1], axis=1)
    
    # sort values by date, newest at top
    df = df.sort_values(by='date', ascending=False).reset_index(drop=True)
    
    return df

final_df = cleaning_df(df_list, rename_cols, time_cols)
print(final_df.shape)
final_df.head()

(191, 7)


Unnamed: 0,date,agency_name,agency_address,date_of_written_notification,internal_reference_unity_case_number,child_fatality_date_of_death,near_fatality_date
0,2023-06-08,Clark County Department of Family Services,"500 S. Grand Central Pkwy, 5th Floor, Las Vega...",2023-06-08,1486611,NaT,2023-06-04
1,2023-06-06,Clark County Department of Family Services,"500 S. Grand Central Pkwy, 5th Floor, Las Vega...",2023-06-06,1504446,2023-06-04,2023-05-31
2,2023-06-06,Clark County Department of Family Services,"500 S. Grand Central Pkwy, 5th Floor, Las Vega...",2023-06-06,1511413,NaT,NaT
3,2023-05-31,Clark County Department of Family Services,"500 S. Grand Central Pkwy, 5th Floor, Las Vega...",2023-05-31,1511295,NaT,2023-05-25
4,2023-05-16,Clark County Department of Family Services,"500 S. Grand Central Pkwy, 5th Floor, Las Vega...",2023-05-03,1468484,NaT,NaT


In [75]:
final_df.tail()

Unnamed: 0,date,agency_name,agency_address,date_of_written_notification,internal_reference_unity_case_number,child_fatality_date_of_death,near_fatality_date
186,2021-01-26,Clark County Department of Family Services (CC...,121 S. Martin Luther King Blvd.,2021-01-26,1475771,NaT,2021-01-17
187,2021-01-25,Clark County Department of Family Services (CC...,121 S. Martin Luther King Blvd.,2021-01-25,1439777,2021-01-22,NaT
188,2021-01-19,Clark County Department of Family Services (CC...,121 S. Martin Luther King Blvd.,2021-01-19,1458420,2021-01-16,NaT
189,2021-01-12,Clark County Department of Family Services (CC...,121 S. Martin Luther King Blvd.,2021-01-12,1478970,NaT,2021-01-04
190,2021-01-04,Clark County Department of Family Services (CC...,121 S. Martin Luther King Blvd.,2021-01-04,1478934,2021-01-01,NaT


In [77]:
print(final_df['internal_reference_unity_case_number'].isna().sum())

1
