<a href="https://colab.research.google.com/github/jmccentee1488/Tawhiri/blob/master/Extraction_Script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import os
import csv
import pdfminer
import camelot
import fitz  # PyMuPDF for image extraction
import pandas as pd
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument


# Input and output locations.
# BASE_DIR is the root folder that is scanned (recursively) for PDFs;
# OUTPUT_DIR receives every CSV this script writes.
BASE_DIR = "/content/Bio Mechanical PDF"
OUTPUT_DIR = "/content/extracted_data"
# Ledger of file names that were already processed; used to skip them on later runs.
METADATA_FILE = os.path.join(OUTPUT_DIR, "processed_files.csv")

# Create the output directory up front (no error if it already exists)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One CSV per kind of extracted data: text rows, table rows, image-occurrence rows
TEXT_CSV = os.path.join(OUTPUT_DIR, "extracted_text.csv")
TABLE_CSV = os.path.join(OUTPUT_DIR, "extracted_tables.csv")
IMAGE_CSV = os.path.join(OUTPUT_DIR, "extracted_images.csv")

# Function to scan all PDFs in subdirectories
def get_pdf_files(directory):
    """Return the paths of all PDF files found under ``directory``.

    The directory tree is walked recursively with ``os.walk``. The ``.pdf``
    extension is matched case-insensitively so that files named ``X.PDF`` or
    ``x.Pdf`` are not silently skipped.

    Args:
        directory: Root folder to scan.

    Returns:
        A list of paths (``root`` joined with the file name), in the order
        ``os.walk`` yields them. Empty when the folder does not exist or
        contains no PDFs.
    """
    return [
        os.path.join(root, file)
        for root, _, files in os.walk(directory)
        for file in files
        if file.lower().endswith(".pdf")
    ]

# Function to check if a file has already been processed
def is_processed(file_name):
    """Tell whether ``file_name`` is already listed in the processed-files CSV.

    Args:
        file_name: Name to look up in the ``File_Name`` column of
            ``METADATA_FILE``.

    Returns:
        True when the name appears in that column; False when the CSV does
        not exist, is completely empty, or does not list the name.
    """
    if not os.path.exists(METADATA_FILE):
        return False

    try:
        ledger = pd.read_csv(METADATA_FILE)
    except pd.errors.EmptyDataError:  # Handle empty CSV
        return False

    return file_name in ledger['File_Name'].values

# Function to normalize extracted text
def normalize_text(text):
    """Collapse all whitespace in ``text`` into single spaces.

    Line breaks, tabs and repeated spaces each become one space, and leading
    or trailing whitespace is dropped.

    Args:
        text: The string to clean up.

    Returns:
        The words of ``text`` joined by single spaces ("" if there are none).
    """
    # str.split() with no argument splits on any whitespace run (newlines
    # included) and ignores leading/trailing whitespace.
    words = text.split()
    return " ".join(words)

# Function to extract text from a PDF
def extract_text_from_pdf(file_path):
    """Read the text of a PDF and return it whitespace-normalized.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The result of ``normalize_text`` applied to the extracted text, or
        "" when extraction or normalization raises (the error is printed,
        not propagated).
    """
    try:
        raw_text = extract_text(file_path)
        cleaned = normalize_text(raw_text)
    except Exception as e:
        print(f"Error extracting text from {file_path}: {e}")
        return ""
    return cleaned

# Function to extract tables from a PDF
def extract_tables_from_pdf(file_path):
    """Extract every table Camelot detects in a PDF.

    Args:
        file_path: Path of the PDF to scan; all pages are searched.

    Returns:
        A list of ``(page_number, table_csv)`` tuples, one per detected
        table, where ``table_csv`` is the table's DataFrame rendered as CSV
        text without the index. Returns an empty list when extraction fails
        (the error is printed, matching the text and image extractors, so
        one bad PDF does not abort the whole run).
    """
    try:
        tables = camelot.read_pdf(file_path, pages="all")
        extracted_tables = []
        for table_index, table in enumerate(tables, start=1):
            # Record the page the table was found on. The running table
            # index is only a fallback for table objects without a `page`
            # attribute; it is not a page number.
            page_number = getattr(table, "page", table_index)
            extracted_tables.append((page_number, table.df.to_csv(index=False)))
        return extracted_tables
    except Exception as e:
        print(f"Error extracting tables from {file_path}: {e}")
        return []

# Function to extract images from a PDF
def extract_images_from_pdf(file_path):
    """List every embedded-image occurrence in a PDF.

    No image data is saved; one record is produced per image reference found
    on each page.

    Args:
        file_path: Path of the PDF to inspect.

    Returns:
        A list of ``[file_name, folder_name, page_number]`` records
        (``page_number`` is 1-based), or an empty list when the PDF cannot
        be opened or read (the error is printed, not propagated).
    """
    file_name = os.path.basename(file_path)
    folder_name = os.path.dirname(file_path)
    try:
        # The context manager closes the document even when a page fails,
        # so file handles are not leaked across a long batch run.
        with fitz.open(file_path) as doc:
            image_records = []
            for page_num, page in enumerate(doc, start=1):
                images_on_page = len(page.get_images(full=True))
                image_records.extend(
                    [file_name, folder_name, page_num]
                    for _ in range(images_on_page)
                )
            return image_records
    except Exception as e:
        print(f"Error extracting images from {file_path}: {e}")
        return []

def _info_value_to_text(value):
    """Convert one PDF info-dictionary value to stripped text ("" if unusable)."""
    if hasattr(value, "resolve"):
        # Indirect object reference: fetch the object it points to.
        value = value.resolve()
    if isinstance(value, bytes):
        if value.startswith(b"\xfe\xff"):
            # A UTF-16BE byte-order mark introduces a Unicode PDF text string.
            value = value[2:].decode("utf-16-be", errors="replace")
        else:
            try:
                value = value.decode("utf-8")
            except UnicodeDecodeError:
                # latin-1 maps every byte, so this cannot fail.
                value = value.decode("latin-1")
    if not isinstance(value, str):
        return ""
    return value.strip()


def _year_from_pdf_date(date_text):
    """Return the four-digit year of a PDF date such as ``D:20200115...``."""
    if date_text.startswith("D:"):
        date_text = date_text[2:]
    year = date_text[:4]
    return year if len(year) == 4 and year.isdigit() else 'Unknown'


def extract_metadata(file_path):
    """Extracts metadata from a PDF file.

    Args:
        file_path: Path of the PDF to inspect.

    Returns:
        A dict with the keys ``File_Name``, ``Folder_Name``, ``Keywords``,
        ``Author`` and ``Year``. ``File_Name`` and ``Folder_Name`` are always
        derived from ``file_path``; the other three fall back to 'Unknown'
        when the PDF does not provide them or cannot be parsed (the error is
        printed, not propagated).
    """
    # Path-derived fields are filled in before parsing so callers can rely on
    # them even when the PDF itself is unreadable.
    metadata = {
        'File_Name': os.path.basename(file_path),
        'Folder_Name': os.path.dirname(file_path),
        'Keywords': 'Unknown',
        'Author': 'Unknown',
        'Year': 'Unknown',
    }

    found = {}
    try:
        with open(file_path, 'rb') as f:
            parser = PDFParser(f)
            document = PDFDocument(parser)

            # pdfminer.six exposes `info` as a list of dictionaries (one per
            # trailer); a bare dict is still accepted for safety.
            info = document.info
            if isinstance(info, dict):
                info_dicts = [info]
            else:
                info_dicts = [entry for entry in (info or []) if isinstance(entry, dict)]

            # Values are converted while the file is still open because
            # indirect references are resolved through the parser.
            for key in ('Keywords', 'Author', 'CreationDate'):
                for entry in info_dicts:
                    text = _info_value_to_text(entry.get(key))
                    if text:
                        found[key] = text
                        break
    except Exception as e:
        print(f"Error extracting metadata from {file_path}: {e}")
        return metadata

    metadata['Keywords'] = found.get('Keywords', 'Unknown')
    metadata['Author'] = found.get('Author', 'Unknown')
    if 'CreationDate' in found:
        metadata['Year'] = _year_from_pdf_date(found['CreationDate'])

    return metadata

def _append_rows(csv_path, header, rows):
    """Append ``rows`` to ``csv_path``, writing ``header`` first if the file is new or empty."""
    # Decide before opening: opening in append mode creates the file, so an
    # existence check made afterwards can never be true.
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    with open(csv_path, "a", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        if needs_header:
            writer.writerow(header)
        writer.writerows(rows)


pdf_files = get_pdf_files(BASE_DIR)

# Rows collected during this run; they are written out once the loop finishes.
data_records = []
table_records = []
image_records = []

for file_path in pdf_files:
    file_name = os.path.basename(file_path)
    folder_name = os.path.dirname(file_path)

    if is_processed(file_name):
        print(f"Skipping {file_name}, already processed.")
        continue

    print(f"Processing {file_name}...")

    # Extract text, tables, image occurrences and document metadata
    text_content = extract_text_from_pdf(file_path)
    tables = extract_tables_from_pdf(file_path)
    images = extract_images_from_pdf(file_path)
    metadata = extract_metadata(file_path)

    # One summary row per PDF. The name and folder come from the path itself
    # rather than from `metadata`, which may lack those keys when metadata
    # extraction failed.
    data_records.append([
        file_name,
        folder_name,
        folder_name,  # Category (using folder name for now)
        text_content,
        metadata.get('Author', 'Unknown'),
        metadata.get('Year', 'Unknown'),
        len(images),  # Image count
        len(tables),  # Table count
        metadata.get('Keywords', 'Unknown'),
    ])

    # One row per extracted table
    table_records.extend(
        [file_name, page_num, table_content] for page_num, table_content in tables
    )

    # One row per image occurrence
    image_records.extend(images)

# Append this run's results to the output CSVs (headers only for new/empty files)
_append_rows(
    TEXT_CSV,
    ["File_Name", "Folder_Name", "Category", "Normalized Text", "Author", "Year", "Image_Count", "Table_Count", "Keywords"],
    data_records,
)
_append_rows(TABLE_CSV, ["File_Name", "Page Number", "Table Content"], table_records)
_append_rows(IMAGE_CSV, ["File_Name", "Category", "Page Number"], image_records)

# Update the list of processed files so later runs skip them
_append_rows(METADATA_FILE, ["File_Name"], [[record[0]] for record in data_records])

print("Extraction complete. Data saved to CSV.")

Skipping Muscle_activity_during_the_golf_swing.pdf, already processed.
Processing Enhance_Information_Propagation_for_Grap.pdf...
Skipping assafspu,+Journal+manager,+1262-3672-1-CE.pdf, already processed.
Skipping Three_Dimensional_Kinematics_Observed_Be.pdf, already processed.
Skipping A_Comprehensive_Survey_on_Graph_Neural_N.pdf, already processed.
Skipping SAJSM+37+Masoudi+final+2.pdf, already processed.
Skipping SAJSM+36+Radulovic+final.pdf, already processed.
Processing Estimating_Upper_Extremity_Joint_Contrib.pdf...
Processing Convolutional_Graph_Neural_Networks.pdf...
Extraction complete. Data saved to CSV.


In [None]:
!pip install pymupdf

In [None]:
!pip install "camelot-py[base]"

In [None]:
!pip install pdfminer.six

In [12]:
# Re-scan BASE_DIR and print the PDF paths that were found (quick check of the input set)
pdf_files = get_pdf_files(BASE_DIR)
print(pdf_files)

['/content/Bio Mechanical PDF/Muscle_activity_during_the_golf_swing.pdf', '/content/Bio Mechanical PDF/assafspu,+Journal+manager,+1262-3672-1-CE.pdf', '/content/Bio Mechanical PDF/Three_Dimensional_Kinematics_Observed_Be.pdf', '/content/Bio Mechanical PDF/SAJSM+37+Masoudi+final+2.pdf', '/content/Bio Mechanical PDF/SAJSM+36+Radulovic+final.pdf']


Document: Comprehensive Guide to PDF Extraction Script

Overview

This document provides a detailed explanation of the PDF extraction script, which processes PDF files located in the directory configured as BASE_DIR and its subdirectories. In this notebook BASE_DIR is /content/Bio Mechanical PDF; the original local setup used /Users/johnmcentee/Project_Golfusion/PDF_Directory. The script extracts text, tables, and image metadata, organizing the data into structured CSV files for further analysis.

Functionalities of the Script

1. Recursive Directory Scanning

The script scans all subdirectories within the configured base directory (BASE_DIR, set to /content/Bio Mechanical PDF in this notebook) to find all available PDF files.

Uses the os.walk() function to recursively iterate over all folders.

Collects paths to PDF files for processing.

2. Deduplication of Processed Files

A metadata file (processed_files.csv) is maintained to track processed files.

If a file has already been processed, it is skipped to avoid duplicate work.

Uses pandas to check existing records before proceeding.

3. Text Extraction

Utilizes pdfminer.six to extract textual content from PDFs.

Extracted text is normalized by:

Removing excessive whitespace.

Eliminating unnecessary line breaks.

Converting text to a uniform format.

The extracted text is stored in extracted_text.csv.

CSV Structure for Extracted Text (extracted_text.csv)

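| File_Name | Folder_Name | Category | Normalized Text | Author | Year | Image_Count | Table_Count | Keywords |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| sample.pdf | Folder1 | Folder1 | Golf swing ... | Unknown | Unknown | 3 | 1 | Unknown |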

4. Table Extraction

Uses Camelot to detect and extract tabular data from PDFs.

Extracts tables from all pages and converts them into structured CSV format.

Tables are stored in extracted_tables.csv.

CSV Structure for Extracted Tables (extracted_tables.csv)

| File_Name | Page Number | Table Content |
| --- | --- | --- |
| sample.pdf | 2 | "Column1, Column2 ... " |

5. Image Extraction

Uses PyMuPDF (fitz) to locate images embedded in PDFs. The image files themselves are not saved; only a record of each image occurrence is kept.

Identifies all images and logs their file name, category (folder name), and page number.

Metadata is stored in extracted_images.csv.

CSV Structure for Extracted Images (extracted_images.csv)

| File_Name | Category | Page Number |
| --- | --- | --- |
| sample.pdf | Folder1 | 2 |

6. Storing Extracted Data

Three structured CSV files are generated in the extracted_data/ directory:

extracted_text.csv (contains extracted text content)

extracted_tables.csv (contains extracted tabular data)

extracted_images.csv (contains metadata of extracted images)

A metadata file (processed_files.csv) maintains a record of processed files to prevent reprocessing.

Technologies and Libraries Used

os: For file and directory operations.

csv: For handling CSV file writing.

pandas: For efficient data management and deduplication.

pdfminer.six: For extracting text from PDF files.

camelot: For extracting tables from PDF files.

PyMuPDF (fitz): For extracting images from PDF files.

Execution Process

Locate all PDFs in the configured base directory (BASE_DIR, set to /content/Bio Mechanical PDF in this notebook) and its subfolders.

Check if each file has been processed using processed_files.csv.

Extract and normalize text, then save to extracted_text.csv.

Extract tables, then save to extracted_tables.csv.

Extract images, then save metadata to extracted_images.csv.

Update processed_files.csv with newly processed files.

Enhancements & Next Steps

Integrate NLP: Implement Named Entity Recognition (NER) or Semantic Search to analyze extracted text.

Database Storage: Instead of CSV, store extracted data in BigQuery or SQLite for better querying.

Image Processing: Extract and store actual image files for advanced analysis.

Conclusion

This script provides an automated and efficient pipeline for extracting structured data from PDFs, making it easier to analyze and integrate into machine learning models or business intelligence tools. The modular design allows future enhancements such as AI-based insights, database integration, and real-time querying.