In [1]:
# Import necessary modules
import sys
import os

# Append the directory of your script to the Python system path
# If your notebook is in the 'research' directory and your script is in the 'scripts' directory, 
# you will go up one level from 'research' and then into 'scripts'
sys.path.append(os.path.abspath('../scripts'))

# Now you can import your module
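from child_fatality_scrape import * # wildcard import: brings in every public name from the script; list_files (used in a later cell) presumably comes from here - prefer importing the needed names explicitly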


In [5]:
import fitz  # PyMuPDF
import re

# Function to count dates in the "G." section of a PDF
def count_dates_in_g_section(pdf_data, footer_date_count=1):
    """Count the dates that appear in the "G." section of a PDF.

    The text of every page is extracted and concatenated. The section is the
    text following the first "G." up to (not including) the next "H.", or up
    to the end of the document when no "H." follows.

    Args:
        pdf_data (bytes): Raw binary contents of the PDF file.
        footer_date_count (int): Number of matched dates to discount because
            they belong to the page footer rather than to the section content.
            Defaults to 1, the adjustment this notebook has always applied.

    Returns:
        int: Number of dates found in the section after discounting the
        footer dates; never negative. 0 when no "G." section is found.
    """
    # Open the PDF from the binary data and make sure it is closed even if
    # text extraction fails part-way through.
    doc = fitz.open(stream=pdf_data, filetype="pdf")
    try:
        # Extract text from each page
        text = "".join(page.get_text() for page in doc)
    finally:
        doc.close()

    # Find the section starting with "G." and ending before "H." or the end of the document
    section_g = re.search(r'(?<=G\.).+?(?=H\.|\Z)', text, re.DOTALL)
    if not section_g:
        return 0  # If "G." section is not found, return 0

    # Dates written with slashes, e.g. MM/DD/YYYY or MM/DD/YY
    date_pattern = re.compile(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b')
    dates = date_pattern.findall(section_g.group(0))

    # The footer carries a timestamp such as "0400 12/03/2021" whose date is
    # matched too, so it is discounted. Clamp at zero: a section with no dates
    # at all used to yield -1.
    return max(len(dates) - footer_date_count, 0)

# Paths to the sample PDF files passed to count_pdf_paths below.
# These are bare file names, so they are opened relative to the notebook's
# current working directory.
pdf_paths = [
    "Public_Disclosure_Fatality_1377353.pdf",
    "1453180_5_2_23.pdf",
    "Public_Disclosure_1510126_F_4.24.23_v2.pdf",
    "2023_04_13_ID_1446073.pdf",
]


def count_pdf_paths(pdf_paths):
    """Map each PDF path to the number of dates in its "G." section.

    Args:
        pdf_paths: Iterable of paths to PDF files on disk.

    Returns:
        dict: One entry per path, keyed by the path exactly as given, whose
        value is the result of count_dates_in_g_section for that file.
    """
    def _read_bytes(path):
        # Load the whole file as binary; the handle is closed before counting
        with open(path, "rb") as handle:
            return handle.read()

    return {
        path: count_dates_in_g_section(_read_bytes(path))
        for path in pdf_paths
    }

count_pdf_paths(pdf_paths)


{'Public_Disclosure_Fatality_1377353.pdf': 12,
 '1453180_5_2_23.pdf': 8,
 'Public_Disclosure_1510126_F_4.24.23_v2.pdf': 0,
 '2023_04_13_ID_1446073.pdf': 7}

In [6]:
# Collect the PDF paths under the Clark_pdfs output folder using list_files.
# list_files is not defined in this notebook; presumably it comes from the
# wildcard import of child_fatality_scrape - verify.
# NOTE(review): this is an absolute path on one developer's machine, so the cell
# will not run elsewhere as written; consider a path relative to the repository.
# This rebinds pdf_paths, replacing any list assigned to that name earlier.
pdf_paths = list_files("/Users/kevinkurek/Desktop/github/state_of_nv_child_fatalities/output_files/Clark_pdfs", 
                       append_base_path=True)

In [25]:
import pandas as pd

def turn_pdf_counts_into_dataframe(pdf_paths):
    """Build a table of "G." section date counts, one row per PDF.

    Each path is split into its file name and the name of the folder that
    directly contains it. The folder name, with '_pdfs' removed, becomes the
    region (e.g. '.../Clark_pdfs/x.pdf' gives region 'Clark', file 'x.pdf').

    Args:
        pdf_paths: Iterable of PDF file paths, forwarded to count_pdf_paths.

    Returns:
        pandas.DataFrame: Columns 'original_region', 'original_pdf' and
        'prior_cases_count', in that order, with one row per counted path.
        A path with no directory component gets an empty region, and an
        empty input yields an empty frame with the same three columns
        (both cases previously raised).
    """
    count_pdf_dict = count_pdf_paths(pdf_paths)

    records = []
    for path, prior_cases_count in count_pdf_dict.items():
        # os.path handles the platform's separators and paths without a parent
        # folder, which a fixed split on '/' did not.
        parent_dir, file_name = os.path.split(path)
        # Remove '_pdfs' from the folder name (plain text, not a regex)
        region = os.path.basename(parent_dir).replace('_pdfs', '')
        records.append({
            'original_region': region,
            'original_pdf': file_name,
            'prior_cases_count': prior_cases_count,
        })

    # Passing columns explicitly fixes the order and keeps the schema stable
    # even when there are no records.
    return pd.DataFrame(
        records,
        columns=['original_region', 'original_pdf', 'prior_cases_count'],
    )

turn_pdf_counts_into_dataframe(pdf_paths=pdf_paths)

Unnamed: 0,original_region,original_pdf,prior_cases_count
0,Clark,2022-04-15_ID_1488030.pdf,2
1,Clark,2022-05-08_ID_1497671.pdf,0
2,Clark,2021_12_11_ID_1483068.pdf,2
3,Clark,2021_11_10_ID_1489652.pdf,1
4,Clark,2021_02_13_ID_1480482.pdf,0
...,...,...,...
197,Clark,2021_04_06_ID_1471160.pdf,2
198,Clark,2021-06-06_ID1454929_1(1).pdf,2
199,Clark,2023_04_28_ID_1352542.pdf,0
200,Clark,2021_03_12_ID_1481546.pdf,0


In [34]:
import os

def list_files_and_folders(directory):
    """Return the paths of the sub-folders directly inside a directory.

    Despite the name, plain files are filtered out: only entries that are
    directories are returned.

    Args:
        directory (str): Path of the directory to inspect.

    Returns:
        list: Each sub-folder's name joined onto `directory`, in the order
        os.listdir reports them.
    """
    folder_paths = []
    for entry_name in os.listdir(directory):
        entry_path = os.path.join(directory, entry_name)
        if os.path.isdir(entry_path):
            folder_paths.append(entry_path)
    return folder_paths

list_files_and_folders("../output_files/")

['../output_files/Washoe_pdfs',
 '../output_files/Clark_pdfs',
 '../output_files/Rural_pdfs']

In [31]:
import os

def list_csv_files(directory):
    """
    Collect the names of the CSV files that sit directly in a directory.

    Args:
        directory (str): Path of the directory to scan.

    Returns:
        List of file names (not full paths) ending in '.csv'. Directories
        whose name ends in '.csv' are excluded.
    """
    csv_names = []
    for entry_name in os.listdir(directory):
        # Cheap name check first; skip anything without the .csv suffix
        if not entry_name.endswith('.csv'):
            continue
        # Keep only regular files, not folders that happen to end in .csv
        if os.path.isfile(os.path.join(directory, entry_name)):
            csv_names.append(entry_name)

    return csv_names

list_csv_files("../output_files/")

['child_fatality_Washoe.csv',
 'child_fatality_Rural.csv',
 'child_fatality_Clark.csv']