<a href="https://colab.research.google.com/github/jmccentee1488/Tawhiri/blob/master/Extraction_Script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pymupdf

In [None]:
!pip install "camelot-py[base]"

In [14]:
import os
import csv
import pdfminer
import camelot
import fitz  # PyMuPDF for image extraction
import pandas as pd
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
import re # Import the 're' module for regular expressions

# Define directories
BASE_DIR = "/content/Bio_ Mechanical PDF"
OUTPUT_DIR = "/content/extracted_data"
METADATA_FILE = os.path.join(OUTPUT_DIR, "processed_files.csv")

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, "images"), exist_ok=True) # Create images subdirectory

# CSV Paths
TEXT_CSV = os.path.join(OUTPUT_DIR, "extracted_text.csv")
TABLE_CSV = os.path.join(OUTPUT_DIR, "extracted_tables.csv")
IMAGE_CSV = os.path.join(OUTPUT_DIR, "extracted_images.csv")

# Function to scan all PDFs in subdirectories
def get_pdf_files(directory):
    pdf_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                pdf_files.append(os.path.join(root, file))
    return pdf_files

# Function to check if a file has already been processed
def is_processed(file_name):
    if os.path.exists(METADATA_FILE):
        processed_files = pd.read_csv(METADATA_FILE)
        return file_name in processed_files['File_Name'].values
    return False

# Function to normalize extracted text
def normalize_text(text):
    text = text.replace("\n", " ").strip()  # Remove line breaks
    text = " ".join(text.split())  # Normalize spacing
    return text

# Function to extract text from a PDF
def extract_text_from_pdf(file_path):
    try:
        text = extract_text(file_path)
        return normalize_text(text)
    except Exception as e:
        print(f"Error extracting text from {file_path}: {e}")
        return ""

# Function to extract tables from a PDF
def extract_tables_from_pdf(file_path):
    tables = camelot.read_pdf(file_path, pages="all")
    extracted_tables = []
    for i, table in enumerate(tables):
        df = table.df
        extracted_tables.append((i+1, df.to_csv(index=False)))  # Store table with page number
    return extracted_tables

# Function to extract images from a PDF (Modified to save images)
def extract_images_from_pdf(file_path):
    try:
        doc = fitz.open(file_path)
        image_records = []
        for page_num, page in enumerate(doc):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                # Create a filename for the extracted image
                image_name = f"{os.path.splitext(os.path.basename(file_path))[0]}_page_{page_num + 1}_{img_index}.{image_ext}"
                image_path = os.path.join(OUTPUT_DIR, "images", image_name)

                # Save the image
                with open(image_path, "wb") as image_file:
                    image_file.write(image_bytes)

                image_records.append([os.path.basename(file_path), os.path.dirname(file_path), page_num + 1, image_name])
        return image_records
    except Exception as e:
        print(f"Error extracting images from {file_path}: {e}")
        return []

# ... (rest of your code)

def extract_metadata(file_path):
    """Extracts metadata from a PDF file."""
    # Placeholder - Implement your metadata extraction logic here
    # Using PyMuPDF (fitz)
    doc = fitz.open(file_path)
    metadata = doc.metadata

    # Check for 'author' key and handle if missing
    author = metadata.get('author', 'Unknown')

    # Check for 'keywords' key and handle if missing
    keywords = metadata.get('keywords', 'Unknown')

    # Check for 'creationDate' key and handle if missing
    creation_date = metadata.get('creationDate', None)
    year = None
    if creation_date:
        match = re.search(r'D:(\d{4})', creation_date)  # Search for year pattern
        if match:
            year = match.group(1)

    return {
        'File_Name': os.path.basename(file_path),
        'Folder_Name': os.path.dirname(file_path),
        'Author': author, # Provide default value if missing
        'Year': year, # Provide default value if missing
        'Keywords': keywords # Provide default value if missing
    }

# ... (rest of your code)



# Main loop for PDF extraction
pdf_files = get_pdf_files(BASE_DIR)

data_records = []
table_records = []
image_records = []

for file_path in pdf_files:
    file_name = os.path.basename(file_path)
    folder_name = os.path.dirname(file_path)

    if is_processed(file_name):
        print(f"Skipping {file_name}, already processed.")
        continue

    print(f"Processing {file_name}...")

    # Extract text and tables
    text_content = extract_text_from_pdf(file_path)
    tables = extract_tables_from_pdf(file_path)
    images = extract_images_from_pdf(file_path)
    metadata = extract_metadata(file_path)

    # Save extracted text as structured data
    data_records.append([
        metadata['File_Name'],
        metadata['Folder_Name'],
        metadata['Folder_Name'],  # Category (using folder name for now)
        text_content,
        metadata['Author'],
        metadata['Year'],
        len(images), # Image count
        len(tables), # Table count
        metadata['Keywords'] # Keywords
    ])

    # Save extracted tables separately
    for page_num, table_content in tables:
        table_records.append([file_name, page_num, table_content])

    # Save extracted images separately
    image_records.extend(images)

# Save text-based extractions to CSV
with open(TEXT_CSV, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["File_Name", "Folder_Name", "Category", "Normalized Text", "Author", "Year", "Image_Count", "Table_Count", "Keywords"])
    writer.writerows(data_records)

# Save table-based extractions to CSV
with open(TABLE_CSV, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["File_Name", "Page Number", "Table Content"])
    writer.writerows(table_records)

# Save image-based extractions to CSV
with open(IMAGE_CSV, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["File_Name", "Category", "Page Number", "Image_Name"]) # Include image name
    writer.writerows(image_records)

# Update the list of processed files
with open(METADATA_FILE, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["File_Name"])
    for record in data_records:
        writer.writerow([record[0]])

print("Extraction complete. Data saved to CSV.")

Processing Muscle_activity_during_the_golf_swing.pdf...
Processing assafspu,+Journal+manager,+1262-3672-1-CE.pdf...
Processing SAJSM+37+Masoudi+final+2.pdf...




Processing SAJSM+36+Radulovic+final.pdf...
Processing assafspu,+Journal+manager,+216_sasma_2006_18_3_45779_80_91.pdf...
Processing assafspu,+Journal+manager,+527-914-1-CE.pdf...
Extraction complete. Data saved to CSV.


In [None]:
!pip install pdfminer.six

In [4]:
pdf_files = get_pdf_files(BASE_DIR)
print(pdf_files)

[]


Document: Comprehensive Guide to PDF Extraction Script

Overview

This document provides a detailed explanation of the PDF extraction script, which processes PDF files located in /Users/johnmcentee/Project_Golfusion/PDF_Directory and its subdirectories. The script extracts text, tables, and images, organizing the data into structured CSV files for further analysis.

Functionalities of the Script

1. Recursive Directory Scanning

The script scans all subdirectories within /Users/johnmcentee/Project_Golfusion/PDF_Directory to find all available PDF files.

Uses the os.walk() function to recursively iterate over all folders.

Collects paths to PDF files for processing.

2. Deduplication of Processed Files

A metadata file (processed_files.csv) is maintained to track processed files.

If a file has already been processed, it is skipped to avoid duplicate work.

Uses pandas to check existing records before proceeding.

3. Text Extraction

Utilizes pdfminer.six to extract textual content from PDFs.

Extracted text is normalized by:

Removing excessive whitespace.

Eliminating unnecessary line breaks.

Converting text to a uniform format.

The extracted text is stored in extracted_text.csv.

CSV Structure for Extracted Text (extracted_text.csv)

File_Name

Folder_Name

Category

Normalized Text

Author

Year Published

sample.pdf

Folder1

Folder1

Golf swing ...

Unknown

Unknown

4. Table Extraction

Uses Camelot to detect and extract tabular data from PDFs.

Extracts tables from all pages and converts them into structured CSV format.

Tables are stored in extracted_tables.csv.

CSV Structure for Extracted Tables (extracted_tables.csv)

File_Name

Page Number

Table Content

sample.pdf

2

"Column1, Column2 ... "

5. Image Extraction

Uses PyMuPDF (fitz) to extract images embedded in PDFs.

Identifies all images and logs their file name, category (folder name), and page number.

Metadata is stored in extracted_images.csv.

CSV Structure for Extracted Images (extracted_images.csv)

File_Name

Category

Page Number

sample.pdf

Folder1

2

6. Storing Extracted Data

Three structured CSV files are generated in the extracted_data/ directory:

extracted_text.csv (contains extracted text content)

extracted_tables.csv (contains extracted tabular data)

extracted_images.csv (contains metadata of extracted images)

A metadata file (processed_files.csv) maintains a record of processed files to prevent reprocessing.

Technologies and Libraries Used

os: For file and directory operations.

csv: For handling CSV file writing.

pandas: For efficient data management and deduplication.

pdfminer.six: For extracting text from PDF files.

camelot: For extracting tables from PDF files.

PyMuPDF (fitz): For extracting images from PDF files.

Execution Process

Locate all PDFs in /Users/johnmcentee/Project_Golfusion/PDF_Directory and subfolders.

Check if each file has been processed using processed_files.csv.

Extract and normalize text, then save to extracted_text.csv.

Extract tables, then save to extracted_tables.csv.

Extract images, then save metadata to extracted_images.csv.

Update processed_files.csv with newly processed files.

Enhancements & Next Steps

Integrate NLP: Implement Named Entity Recognition (NER) or Semantic Search to analyze extracted text.

Database Storage: Instead of CSV, store extracted data in BigQuery or SQLite for better querying.

Image Processing: Extract and store actual image files for advanced analysis.

Conclusion

This script provides an automated and efficient pipeline for extracting structured data from PDFs, making it easier to analyze and integrate into machine learning models or business intelligence tools. The modular design allows future enhancements such as AI-based insights, database integration, and real-time querying.

In [12]:
import os
import re
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from PIL import Image, ImageEnhance
import cv2
import nltk
from io import StringIO

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Directory where extracted CSVs are stored
OUTPUT_DIR = "/content/extracted_data" # Update this to your CSV directory

# Define EXTRACTION_DIR here, likely to be the same as OUTPUT_DIR
EXTRACTION_DIR = OUTPUT_DIR  # Or set to another path if needed

PROCESSED_DIR = os.path.join(EXTRACTION_DIR, "processed_data")
os.makedirs(PROCESSED_DIR, exist_ok=True)

# File paths
TEXT_CSV = os.path.join(EXTRACTION_DIR, "extracted_text.csv")
TABLE_CSV = os.path.join(EXTRACTION_DIR, "extracted_tables.csv")
IMAGE_CSV = os.path.join(EXTRACTION_DIR, "extracted_images.csv")

# Processed file paths
PROCESSED_TEXT_CSV = os.path.join(PROCESSED_DIR, "processed_text.csv")
PROCESSED_TABLE_CSV = os.path.join(PROCESSED_DIR, "processed_tables.csv")
PROCESSED_IMAGE_DIR = os.path.join(PROCESSED_DIR, "processed_images")
os.makedirs(PROCESSED_IMAGE_DIR, exist_ok=True)

# --- 1. Text Cleaning ---
def clean_text(text):
    """Clean and normalize text data."""
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    words = text.split()
    cleaned_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(cleaned_words)

def preprocess_text():
    """Clean and preprocess extracted text."""
    print("Processing text data...")
    df = pd.read_csv(TEXT_CSV)
    df['Normalized Text'] = df['Normalized Text'].apply(clean_text)
    df.to_csv(PROCESSED_TEXT_CSV, index=False)
    print(f"Processed text saved to {PROCESSED_TEXT_CSV}")

# --- 2. Table Cleaning ---
def clean_table(dataframe):
    """Clean and normalize table data."""
    # Fill missing values with 'Unknown'
    dataframe.fillna('Unknown', inplace=True)
    # Example: Normalize weights if a 'Weight' column exists
    if 'Weight' in dataframe.columns:
        dataframe['Weight'] = dataframe['Weight'].apply(
            lambda x: x * 0.453592 if 'lbs' in str(x) else x
        )
    return dataframe

def preprocess_tables():
    """Clean and preprocess extracted table data."""
    print("Processing table data...")
    df = pd.read_csv(TABLE_CSV)
    # Assumes the table content is stored as strings in one column
    df['Cleaned Table Content'] = df['Table Content'].apply(
        lambda x: clean_table(pd.read_csv(StringIO(x))).to_csv(index=False)
    )
    df.to_csv(PROCESSED_TABLE_CSV, index=False)
    print(f"Processed tables saved to {PROCESSED_TABLE_CSV}")

# --- 3. Image Preprocessing ---
def preprocess_images():
    """Preprocess images and save them to the processed directory."""
    print("Processing images...")
    df = pd.read_csv(IMAGE_CSV)

    # Assuming the image name is the last column in your CSV
    image_name_column = df.columns[-1]  # Get the name of the last column

    for _, row in df.iterrows():
        file_name = row['File_Name']
        category = row['Category']
        page_number = row['Page Number']
        image_name = row[image_name_column] # Get the extracted image filename using the dynamic column name

        # Construct the input image path (using the extracted image filename)
        input_image_path = os.path.join(EXTRACTION_DIR, "images", image_name)

        # Construct the output image path
        output_image_path = os.path.join(PROCESSED_IMAGE_DIR, f"{os.path.splitext(file_name)[0]}_page_{page_number}.jpg")

        # Assuming you have a preprocess_image function
        #preprocess_image(input_image_path, output_image_path)

        print(f"Processed images saved to {PROCESSED_IMAGE_DIR}")

# --- Main Pipeline Execution ---
def main():
    """Run the full preprocessing pipeline."""
    if not os.path.exists(TEXT_CSV):
        print(f"Text CSV not found at {TEXT_CSV}. Skipping text processing.")
    else:
        preprocess_text()

    if not os.path.exists(TABLE_CSV):
        print(f"Table CSV not found at {TABLE_CSV}. Skipping table processing.")
    else:
        preprocess_tables()

    if not os.path.exists(IMAGE_CSV):
        print(f"Image CSV not found at {IMAGE_CSV}. Skipping image processing.")
    else:
        preprocess_images()

    print("Preprocessing complete.")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Processing text data...
Processed text saved to /content/extracted_data/processed_data/processed_text.csv
Processing table data...
Processed tables saved to /content/extracted_data/processed_data/processed_tables.csv
Processing images...
Processed images saved to /content/extracted_data/processed_data/processed_images
Processed images saved to /content/extracted_data/processed_data/processed_images
Processed images saved to /content/extracted_data/processed_data/processed_images
Processed images saved to /content/extracted_data/processed_data/processed_images
Processed images saved to /content/extracted_data/processed_data/processed_images
Processed images saved to /content/extracted_data/processed_data/processed_images
Processed images saved to /content/extracted_data/processed_data/processed_images
Processed images saved to /content/extracted_data/processed_data/processed_images
Processed images saved to /content/extracted_data/processed_data/processed_images
Processed images saved t

  dataframe.fillna('Unknown', inplace=True)
  dataframe.fillna('Unknown', inplace=True)
  dataframe.fillna('Unknown', inplace=True)
  dataframe.fillna('Unknown', inplace=True)
  dataframe.fillna('Unknown', inplace=True)
  dataframe.fillna('Unknown', inplace=True)
  dataframe.fillna('Unknown', inplace=True)
  dataframe.fillna('Unknown', inplace=True)
  dataframe.fillna('Unknown', inplace=True)
  dataframe.fillna('Unknown', inplace=True)
