Loading the PDF

In [6]:
pdf_path = r"R:\Job\FileSure Internship Project\Form ADT-1-29092023_signed.pdf"

Checking that if correctly extracting information from the PDF

In [7]:
import fitz 

def extract_text_preserving_order(pdf_path):
    doc = fitz.open(pdf_path)
    all_text = ""
    for page_num, page in enumerate(doc):
        blocks = page.get_text("blocks")
        blocks = sorted(blocks, key=lambda b: (b[1], b[0]))

        print(f"\n--- Page {page_num + 1} ---")
        for block in blocks:
            block_text = block[4].strip()
            if block_text:
                print(block_text)
                all_text += block_text + "\n"
    return all_text


extract_text_preserving_order(pdf_path)



--- Page 1 ---
FORM NO. ADT-1
Notice to the Registrar by 
company for appointment of 
auditor
[Pursuant to section 139 of the Companies Act, 
2013 and Rule 4(2) of the Companies  
(Audit and Auditors) Rules, 2014]
Form language
English
Hindi
Refer the instruction kit for filing the form.
Pre-fill
U74999KA2016PTC095981
1.(a) *Corporate identity number (CIN) of company
(b)  Global location number (GLN) of company
2.(a)  Name of the company
ALUPA FOODS PRIVATE LIMITED
DHANYALAXMI RICE MILL, 
5-110A, PUTTUR, 
UDUPI 
Udupi 
Karnataka 
576105
(b) Address of the registered office  
      of  the company
*
(c)   email id of the company
mail@alupafoods.in
3.(a)   Whether company is falling under any class of companies as per section 139(2)   
Yes
No
*
(b)   Nature of appointment    
*
Appointment/Re-appointment in AGM
*
4.   Whether joint auditors have been appointed
Yes
No
Number of auditor(s) appointed
*
1
Individual
Auditor's Firm
I. (a) *Category of Auditor
(b) *Income Tax permanent accoun

"FORM NO. ADT-1\nNotice to the Registrar by \ncompany for appointment of \nauditor\n[Pursuant to section 139 of the Companies Act, \n2013 and Rule 4(2) of the Companies  \n(Audit and Auditors) Rules, 2014]\nForm language\nEnglish\nHindi\nRefer the instruction kit for filing the form.\nPre-fill\nU74999KA2016PTC095981\n1.(a) *Corporate identity number (CIN) of company\n(b)  Global location number (GLN) of company\n2.(a)  Name of the company\nALUPA FOODS PRIVATE LIMITED\nDHANYALAXMI RICE MILL, \n5-110A, PUTTUR, \nUDUPI \nUdupi \nKarnataka \n576105\n(b) Address of the registered office  \n      of  the company\n*\n(c)   email id of the company\nmail@alupafoods.in\n3.(a)   Whether company is falling under any class of companies as per section 139(2)   \nYes\nNo\n*\n(b)   Nature of appointment    \n*\nAppointment/Re-appointment in AGM\n*\n4.   Whether joint auditors have been appointed\nYes\nNo\nNumber of auditor(s) appointed\n*\n1\nIndividual\nAuditor's Firm\nI. (a) *Category of Auditor\n(b

Creating a json file with upper extracted information

In [8]:
import fitz
import pandas as pd

def extract_text_as_table(pdf_path, y_threshold=10):
    doc = fitz.open(pdf_path)
    all_rows = []

    for page_num, page in enumerate(doc, start=1):
        blocks = page.get_text("blocks")
        blocks = [b for b in blocks if b[4].strip()]  # Filter empty text
        # Sort blocks top to bottom, left to right
        blocks = sorted(blocks, key=lambda b: (b[1], b[0]))

        # Group blocks into rows based on y0 proximity
        rows = []
        current_row = []
        last_y = None

        for b in blocks:
            x0, y0, x1, y1, text, block_no, block_type = b
            if last_y is None:
                current_row.append(b)
                last_y = y0
            else:
                if abs(y0 - last_y) <= y_threshold:
                    current_row.append(b)
                    last_y = (last_y + y0) / 2  # average for smoothing
                else:
                    rows.append(current_row)
                    current_row = [b]
                    last_y = y0
        if current_row:
            rows.append(current_row)

        # For each row, sort blocks by x0 and collect texts as columns
        for row_blocks in rows:
            row_blocks = sorted(row_blocks, key=lambda b: b[0])  # sort by x0
            row_texts = [b[4].strip() for b in row_blocks]
            all_rows.append([page_num] + row_texts)

    # Normalize rows to equal columns
    max_cols = max(len(r) for r in all_rows)
    for r in all_rows:
        while len(r) < max_cols:
            r.append("")

    columns = ["Page"] + [f"Col_{i}" for i in range(1, max_cols)]
    df = pd.DataFrame(all_rows, columns=columns)
    return df

# File path


# Extract table
df_table = extract_text_as_table(pdf_path)



# Save as JSON
df_table.to_json("pdf_extracted_table.json", orient="records", indent=4)

Creating New Organized Json File and deleting the old json file

In [9]:
input_file = "pdf_extracted_table.json"
output_file = "final_qa_output.json"

In [10]:
import json
import re
import os

def clean(text):
    return re.sub(r'\s+', ' ', text.replace('\n', ' ')).strip()

def is_unwanted_line(text):
    # Matches "7.", "10.", etc.
    if re.fullmatch(r'\d+\.', text):
        return True
    # Matches "Page 2 of 3", "Page 10 of 10"
    if re.fullmatch(r'Page \d+ of \d+', text, re.IGNORECASE):
        return True
    return False

def is_blank_answer(text):
    return not text or text == "*"



with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

qa_dict = {}

for item in data:
    col1 = clean(item.get("Col_1", ""))
    col2 = clean(item.get("Col_2", ""))
    col3 = clean(item.get("Col_3", ""))

    # Skip rows where all blank or '*'
    if is_blank_answer(col1) and is_blank_answer(col2) and is_blank_answer(col3):
        continue

    # Skip unwanted lines
    if is_unwanted_line(col1):
        continue

    # Skip if no answers in col2 and col3
    if is_blank_answer(col2) and is_blank_answer(col3):
        continue

    # Compose answer from col2 and col3
    answer = col2
    if not is_blank_answer(col3):
        answer += " " + col3

    qa_dict[col1] = answer.strip()

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(qa_dict, f, indent=4, ensure_ascii=False)

try:
    os.remove(input_file)
    print(f"Saved output to '{output_file}' and deleted '{input_file}'")
except Exception as e:
    print(f"Could not delete input file: {e}")


Saved output to 'final_qa_output.json' and deleted 'pdf_extracted_table.json'


Summary of the Json File using hugging face model and creating a summary txt file

In [11]:
import json
from pathlib import Path
from transformers import pipeline, AutoTokenizer


with open("final_qa_output.json", "r", encoding="utf-8") as f:
    qa_data = json.load(f)


def fuse_qa(q, a):
    q = q.strip("?").strip()
    return f"{q} is {a.strip()}."

full_text = " ".join([fuse_qa(q, a) for q, a in qa_data.items() if a.strip() != "*"])


tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
encoded_input = tokenizer.encode(full_text, max_length=512, truncation=True, add_special_tokens=True)
truncated_text = tokenizer.decode(encoded_input, skip_special_tokens=True)

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
summary = summarizer(
    truncated_text,
    max_length=500,
    min_length=200,
    length_penalty=2.0,
    no_repeat_ngram_size=3,
    do_sample=False
)[0]["summary_text"]

# Save result
output_path = Path("Summary_of_Json.md")
output_path.write_text(f"# Automated Summary\n\n{summary}", encoding="utf-8")

print("Generated summary ")

Device set to use cuda:0


Generated summary 


Extractinng the Embedded pdf

In [12]:
import fitz  
import os
import pandas as pd
from PIL import Image
import io

def extract_all_from_pdf(pdf_path, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    doc = fitz.open(pdf_path)

    text_output = os.path.join(output_folder, "extracted_text.txt")
    with open(text_output, "w", encoding="utf-8") as f:
        for page in doc:
            f.write(page.get_text())

    print(f"Text extracted to: {text_output}")

    # 2. Extract Images
    images_folder = os.path.join(output_folder, "images")
    os.makedirs(images_folder, exist_ok=True)
    
    for i, page in enumerate(doc):
        img_list = page.get_images(full=True)
        for img_index, img in enumerate(img_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            img_bytes = base_image["image"]
            img_ext = base_image["ext"]
            img_path = os.path.join(images_folder, f"page_{i+1}_img_{img_index}.{img_ext}")
            
            with open(img_path, "wb") as f:
                f.write(img_bytes)
    
    print(f"Images extracted to: {images_folder}")

    # 3. Extract Tables (using PyMuPDF + Pandas)
    tables_folder = os.path.join(output_folder, "tables")
    os.makedirs(tables_folder, exist_ok=True)
    
    for i, page in enumerate(doc):
        tables = page.find_tables()
        if tables.tables:
            for table_num, table in enumerate(tables.tables):
                df = table.to_pandas()
                csv_path = os.path.join(tables_folder, f"page_{i+1}_table_{table_num}.csv")
                df.to_csv(csv_path, index=False)
    
    print(f"Tables extracted to: {tables_folder}")

    # 4. Extract Attachments
    attachments_folder = os.path.join(output_folder, "attachments")
    os.makedirs(attachments_folder, exist_ok=True)
    
    for i in range(doc.embfile_count()):
        try:
            info = doc.embfile_info(i)
            filename = info.get("filename", f"attachment_{i}")
            filename = "".join(c for c in filename if c.isalnum() or c in ('.', '_', '-')).strip()
            
            if not filename:
                filename = f"attachment_{i}"
            
            file_data = doc.embfile_get(i)
            attachment_path = os.path.join(attachments_folder, filename)
            
            with open(attachment_path, "wb") as f:
                f.write(file_data)
            
            print(f"Attachment saved: {attachment_path}")
        except Exception as e:
            print(f"Failed to extract attachment {i}: {e}")

    doc.close()
    print("All content extracted successfully!")



output_folder = "extracted_content"
extract_all_from_pdf(pdf_path, output_folder)

Text extracted to: extracted_content\extracted_text.txt
Images extracted to: extracted_content\images
Tables extracted to: extracted_content\tables
Attachment saved: extracted_content\attachments\IntimationLetterSigned.pdf
Attachment saved: extracted_content\attachments\Consentsigned.pdf
Attachment saved: extracted_content\attachments\ResolutionforappointmentofAuditorSigned.pdf
Failed to extract attachment 3: 'utf-8' codec can't encode characters in position 14-15: surrogates not allowed
All content extracted successfully!


In [13]:
import os
from pathlib import Path
from transformers import pipeline

def generate_summary(extraction_folder, output_file=r"R:\Job\FileSure Internship Project\Codes\Embedded_System_Summary.md"):
    summary = {}
    extraction_folder = Path(extraction_folder)
    text_path = extraction_folder / "extracted_text.txt"

    # Detect text
    if text_path.exists():
        text = text_path.read_text(encoding="utf-8")
        summary["text"] = {
            "word_count": len(text.split()),
            "char_count": len(text),
            "sample": text[:100] + "..." if len(text) > 100 else text
        }

    # Detect subfolders and their contents
    for item in extraction_folder.iterdir():
        if item.is_dir():
            content_type = item.name.lower()
            files = list(item.iterdir())
            file_list = [f.name for f in files if f.is_file()]
            if not file_list:
                continue

            # Heuristics: map folder type based on file extensions
            if any(f.suffix.lower() == ".csv" for f in files):
                summary["tables"] = {
                    "folder": item.name,
                    "count": len(file_list),
                    "sample": file_list[:2]
                }
            elif any(f.suffix.lower() in [".png", ".jpg", ".jpeg"] for f in files):
                summary["images"] = {
                    "folder": item.name,
                    "count": len(file_list),
                    "sample": file_list[:2]
                }
            elif any(f.suffix for f in files):
                summary["attachments"] = {
                    "folder": item.name,
                    "count": len(file_list),
                    "sample": file_list[:10]
                }

    # Optional: AI-based text summarization
    try:
        if "text" in summary and len(summary["text"]["sample"]) > 50:
            summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0)
            ai_summary = summarizer(summary["text"]["sample"], max_length=36, min_length=30, do_sample=False)[0]["summary_text"]
            summary["ai_summary"] = ai_summary
    except Exception as e:
        summary["ai_summary"] = f"[AI summary failed: {str(e)}]"

    # Write summary
    output_path = extraction_folder / output_file
    with output_path.open("w", encoding="utf-8") as f:
        f.write("# PDF Extraction Summary\n\n")
        if "text" in summary:
            f.write("## Text\n")
            f.write(f"- Words: {summary['text']['word_count']}\n")
            f.write(f"- Chars: {summary['text']['char_count']}\n")
            f.write(f"- Sample: {summary['text']['sample']}\n\n")
        for key in ["tables", "images", "attachments"]:
            if key in summary:
                f.write(f"## {key.capitalize()}\n")
                f.write(f"- Folder: {summary[key]['folder']}\n")
                f.write(f"- Count: {summary[key]['count']}\n")
                f.write(f"- Sample: {summary[key]['sample']}\n\n")
        if "ai_summary" in summary:
            f.write("## AI Summary\n")
            f.write(summary["ai_summary"] + "\n")

    print(f"Summary saved at: {output_path}")

# Example usage
generate_summary("extracted_content")


Device set to use cuda:0


Summary saved at: R:\Job\FileSure Internship Project\Codes\Embedded_System_Summary.md
