### This script takes a large multi case multi report pdf file and does the following:
    -Find all specimen IDs and their page number in the multi-report pdf file
    -Identifes pages taht include the OncoKids report based on the keyword pattern
    -Extracts OncoKids report for each case based on the page number (start and end) and case ID pattern (specimen_pattern)
    -Stores each pdf using specimen ID as their name
    -Input: large pdf file, year, specimen_pattern (format of case ID)
    -Output: split OncoKids pdf files for each case ID

In [67]:
import re
import pandas as pd
import os
import sys

# Add the src directory to the path so we can import our modules
sys.path.append('..')

# Import our custom modules
from src.pdf_utils import (
    search_specimen_in_pdf,
    generate_table,
    print_unique_values,
    find_pages_btw_keywords,
    extract_pdf_pages
)
from src.file_utils import get_matching_files, ensure_directory_exists

In [68]:
### define parameters
work_dir=".."
years = ["2022"]
output_path = work_dir+"/data/CoPath_All_pdfs/OncoKids_split"
specimen_pattern=r'[SCHM]\d{2}-\d{1,5}'

# Ensure output directory exists
ensure_directory_exists(output_path)

## construct the result tables 

In [69]:
# Create empty DataFrame for final results
final_results = pd.DataFrame()

# Process each year
for year in years:
    print(f"\nProcessing year {year}")
    pdf_input_path = work_dir+"/data/CoPath_All_pdfs/" + year
    file_pattern = f"PDMReport.{year}*.pdf"
    
    # Get all matching files for this year
    pdf_files = get_matching_files(pdf_input_path, file_pattern)
    print(f"Found {len(pdf_files)} files to process for year {year}")


Processing year 2022
Found 28 files to process for year 2022


In [70]:
# Process each file in the current year
for pdf_file in pdf_files:
    print(f"\nProcessing file: {os.path.basename(pdf_file)}")
    
    try:
        # Process specimen IDs
        result_Specimen = search_specimen_in_pdf(pdf_file, pattern=specimen_pattern)
        result_Specimen = pd.DataFrame(result_Specimen).drop_duplicates(subset="Page")
        result_Specimen.columns = ["Specimen", "Specimen_Page"]
        
        # Process OncoKids pages
        regex1 = r'OncoKids Cancer Panel (?![is])'
        regex2 = r'The OncoKids Cancer Panel is'
        result_oncokids = find_pages_btw_keywords(pdf_file, regex1, regex2, max_dist=7)
        
        # Convert to DataFrame with explicit column names
        if result_oncokids:  # Check if any results were found
            result_oncokids_df = pd.DataFrame(result_oncokids)
            
            # Merge results
            results_tbl = pd.merge(
                result_oncokids_df,
                result_Specimen,
                left_on="start_page_number",
                right_on="Specimen_Page",
                how="left"
            )
            
            # Clean results
            results_tbl = results_tbl[results_tbl["Specimen_Page"].notnull()]
            
            # Convert to integers
            results_tbl['start_page_number'] = results_tbl['start_page_number'].astype(int)
            results_tbl['end_page_number'] = results_tbl['end_page_number'].astype(int)
            results_tbl['Specimen_Page'] = results_tbl['Specimen_Page'].astype(int)
            
            # Add file and year information
            results_tbl['source_file'] = os.path.basename(pdf_file)
            results_tbl['year'] = year
            
            # Append to final results
            final_results = pd.concat([final_results, results_tbl], ignore_index=True)
            
            print(f"Found {len(results_tbl)} OncoKids reports in this file")
            
            # Extract and save individual PDFs
            output_template = os.path.join(output_path, "OncoKids_{year}_{specimen}.pdf")
            
            # Extract pages for each report
            with open(pdf_file, "rb") as infile:
                for _, row in results_tbl.iterrows():
                    output_filename = output_template.format(year=year, specimen=row['Specimen'])
                    # Extract and save the pages using our utility function
                    extract_pages(pdf_file, output_filename, row['start_page_number'], row['end_page_number'])
                    print(f"Saved {os.path.basename(output_filename)}")
        else:
            print("No OncoKids reports found in this file")
            
    except Exception as e:
        print(f"Error processing file {os.path.basename(pdf_file)}: {str(e)}")
        continue


Processing file: PDMReport.20221228020139.pdf
Found 1 OncoKids reports in this file
Error processing file PDMReport.20221228020139.pdf: name 'extract_pages' is not defined

Processing file: PDMReport.20221130020300.pdf
Found 4 OncoKids reports in this file
Error processing file PDMReport.20221130020300.pdf: name 'extract_pages' is not defined

Processing file: PDMReport.20221128020217.pdf
Found 1 OncoKids reports in this file
Error processing file PDMReport.20221128020217.pdf: name 'extract_pages' is not defined

Processing file: PDMReport.20221212110132.pdf
Found 1 OncoKids reports in this file
Error processing file PDMReport.20221212110132.pdf: name 'extract_pages' is not defined

Processing file: PDMReport.20221207020257.pdf
Found 2 OncoKids reports in this file
Error processing file PDMReport.20221207020257.pdf: name 'extract_pages' is not defined

Processing file: PDMReport.20221223020310.pdf
Found 3 OncoKids reports in this file
Error processing file PDMReport.20221223020310.pdf

In [71]:
print(f"\nTotal processing complete:")
print(f"Total OncoKids reports found: {len(final_results)}")

if len(final_results) > 0:
    print(f"Reports by year:")
    year_counts = final_results['year'].value_counts()
    for year in years:
        count = year_counts.get(year, 0)  # Use get() to handle missing years
        print(f"- {year}: {count} reports")
else:
    print("No reports found in any year")

# Optional: Save the final results to a CSV file
if len(final_results) > 0:
    results_filename = os.path.join(output_path, "oncokids_report_summary.csv")
    final_results.to_csv(results_filename, index=False)
    print(f"\nSummary saved to: {results_filename}")


Total processing complete:
Total OncoKids reports found: 63
Reports by year:
- 2022: 63 reports

Summary saved to: ../data/CoPath_All_pdfs/OncoKids_split/oncokids_report_summary.csv
