In [6]:
import fitz  # PyMuPDF
import pdfplumber
import os
import cv2
import numpy as np

In [7]:
# === CONFIGURATION ===

# Source PDF and the root folder that receives every extraction artefact.
pdf_path = r"C:\kathir\all_file_1\temp_1.pdf"
output_dir = r"C:\kathir\all_file_1\output_new_approach"

# Embedded raster images and cropped graph regions each get a sub-folder.
images_dir, graphs_dir = (
    os.path.join(output_dir, leaf) for leaf in ("images", "graphs")
)

# Create the whole tree up front; exist_ok keeps the cell safe to re-run.
for folder in (output_dir, images_dir, graphs_dir):
    os.makedirs(folder, exist_ok=True)

In [8]:
output_sequence = []

# === [STEP 1] PROCESS WITH PyMuPDF ===

# Tunables for the colour-based graph detection.
GRAPH_RENDER_ZOOM = 4            # page is rasterised at this scale factor
GRAPH_MIN_SIDE_PX = 200          # regions narrower/shorter than this are dropped
GRAPH_MAX_ASPECT = 5             # regions more elongated than this are dropped
GRAPH_DILATE_KERNEL = (15, 15)
GRAPH_DILATE_ITERATIONS = 2
# (lower, upper) HSV bounds passed to cv2.inRange. OpenCV's 8-bit hue axis
# runs 0-179 and red straddles the wrap-around, so it needs two bands.
GRAPH_HSV_RANGES = (
    ((0, 50, 50), (10, 255, 255)),      # red (low hue end)
    ((170, 50, 50), (180, 255, 255)),   # red (high hue end)
    ((100, 150, 0), (140, 255, 255)),   # blue
    ((20, 100, 100), (30, 255, 255)),   # yellow
    ((40, 40, 40), (80, 255, 255)),     # green
)


def _column_ordered_text(page):
    """Return the text of each text block on `page`, left column first.

    A block belongs to the left column when its left edge lies left of the
    page's vertical midline, otherwise to the right column. Inside a column
    the blocks are ordered top to bottom by the top edge of their bbox.
    """
    split_x = page.rect.width / 2
    left, right = [], []

    for block in page.get_text("dict")["blocks"]:
        if block["type"] != 0:  # only type 0 blocks are processed as text
            continue
        x0, y0 = block["bbox"][0], block["bbox"][1]
        text = " ".join(
            span["text"] for line in block["lines"] for span in line["spans"]
        ).strip()
        (left if x0 < split_x else right).append((y0, text))

    left.sort(key=lambda item: item[0])
    right.sort(key=lambda item: item[0])
    return [text for _, text in left + right]


def _save_embedded_images(doc, page, page_label, target_dir):
    """Write the raster images referenced by `page` to `target_dir`.

    Returns one output-log line per image written. `page_label` is the
    1-based page number used in file names and log lines.
    """
    entries = []
    for img_index, img in enumerate(page.get_images(full=True), start=1):
        base_image = doc.extract_image(img[0])  # img[0] is the image xref
        if not base_image:
            # Nothing extractable for this xref; skip instead of a KeyError.
            continue
        image_filename = f"page_{page_label}_img_{img_index}.{base_image['ext']}"
        image_path = os.path.join(target_dir, image_filename)
        with open(image_path, "wb") as f:
            f.write(base_image["image"])
        entries.append(f"[Image Page {page_label} Image {img_index}] -> {image_path}")
    return entries


def _render_page_bgr(page, zoom):
    """Rasterise `page` at `zoom` and return it as a BGR uint8 array.

    The pixmap is converted in memory, so no temporary PNG is written to
    (and left behind in) the output folder.
    """
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
    rgb = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
        pix.height, pix.width, pix.n
    )
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)


def _save_graph_crops(image_bgr, page_label, target_dir):
    """Crop coloured regions out of a rendered page and save them as PNGs.

    Pixels falling in any GRAPH_HSV_RANGES band are masked, the mask is
    dilated so nearby coloured marks merge, and the bounding box of every
    external contour that passes the size/aspect filter is cropped and
    written to `target_dir`. Returns one output-log line per saved crop.
    Crops are numbered sequentially over the accepted regions only, so file
    names and labels have no gaps.
    """
    hsv = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2HSV)

    color_mask = np.zeros(hsv.shape[:2], dtype=np.uint8)
    for lower, upper in GRAPH_HSV_RANGES:
        color_mask = cv2.bitwise_or(color_mask, cv2.inRange(hsv, lower, upper))

    kernel = np.ones(GRAPH_DILATE_KERNEL, np.uint8)
    dilated_mask = cv2.dilate(color_mask, kernel, iterations=GRAPH_DILATE_ITERATIONS)
    contours, _ = cv2.findContours(
        dilated_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    entries = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # The size checks come first, so w and h are non-zero by the time
        # the aspect ratios are evaluated.
        if (
            w < GRAPH_MIN_SIDE_PX
            or h < GRAPH_MIN_SIDE_PX
            or w / h > GRAPH_MAX_ASPECT
            or h / w > GRAPH_MAX_ASPECT
        ):
            continue
        graph_no = len(entries) + 1
        graph_path = os.path.join(target_dir, f"page_{page_label}_graph_{graph_no}.png")
        cv2.imwrite(graph_path, image_bgr[y:y + h, x:x + w])
        entries.append(f"[Graph Page {page_label} Graph {graph_no}] -> {graph_path}")
    return entries


# The context manager closes the document even if a page fails part-way.
with fitz.open(pdf_path) as doc:
    for page_num in range(len(doc)):
        page = doc[page_num]
        page_label = page_num + 1

        # --- TEXT BLOCKS (Column-wise) ---
        for block_no, text in enumerate(_column_ordered_text(page), start=1):
            output_sequence.append(f"[Text Page {page_label} Block {block_no}]\n{text}")

        # --- RASTER IMAGE EXTRACTION (normal images only) ---
        output_sequence.extend(
            _save_embedded_images(doc, page, page_label, images_dir)
        )

        # --- CROP GRAPHS USING OPENCV ---
        page_bgr = _render_page_bgr(page, GRAPH_RENDER_ZOOM)
        output_sequence.extend(_save_graph_crops(page_bgr, page_label, graphs_dir))

In [9]:
# === [STEP 2] TABLE EXTRACTION ===
# Appends to output_sequence in place, so re-running this cell without
# re-running Step 1 first adds the table entries a second time.
with pdfplumber.open(pdf_path) as pdf:
    for page_num, page in enumerate(pdf.pages):
        for t_index, table in enumerate(page.extract_tables(), start=1):
            output_sequence.append(f"[Table Page {page_num + 1} Table {t_index}]")
            # One tab-separated entry per row; empty/None cells become "".
            output_sequence.extend(
                "\t".join(cell or "" for cell in row) for row in table
            )

# === [STEP 3] WRITE TEXT OUTPUT ===
# Every collected entry is written followed by a blank line.
output_file = os.path.join(output_dir, "pdf_extracted_output.txt")
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(item + "\n\n" for item in output_sequence)

print(f"\u2705 Extraction Complete!\nText: {output_file}\nImages: {images_dir}\nGraphs: {graphs_dir}")


✅ Extraction Complete!
Text: C:\kathir\all_file_1\output_new_approach\pdf_extracted_output.txt
Images: C:\kathir\all_file_1\output_new_approach\images
Graphs: C:\kathir\all_file_1\output_new_approach\graphs


## Version 1 - scanned PDF checks

In [10]:
# import os
# from PyPDF2 import PdfReader
# from docx import Document

# def check_pdf_type(pdf_path):
#     try:
#         reader = PdfReader(pdf_path)
#         for page in reader.pages:
#             text = page.extract_text()
#             if text and text.strip():
#                 return "unscanned"
#         return "scanned"
#     except Exception as e:
#         return f"error ({e})"

# def check_doc_type(doc_path):
#     try:
#         doc = Document(doc_path)
#         for para in doc.paragraphs:
#             if para.text.strip():
#                 return "unscanned"
#         return "scanned"
#     except Exception as e:
#         return f"error ({e})"

# def check_file_type(file_path):
#     ext = os.path.splitext(file_path)[1].lower()
#     if ext == '.pdf':
#         return check_pdf_type(file_path)
#     elif ext in ['.doc', '.docx']:
#         return check_doc_type(file_path)
#     else:
#         return "skipped (unsupported type)"

# def check_all_files_in_directory(directory_path):
#     for file_name in os.listdir(directory_path):
#         file_path = os.path.join(directory_path, file_name)
#         if os.path.isfile(file_path):
#             result = check_file_type(file_path)
#             print(f"{file_name}: {result}")

# # === USAGE ===
# directory_path = r"C:\Users\VMSKNLFST\Documents\GitHub\Doc_parser"  # Change this to your directory
# check_all_files_in_directory(directory_path)