In [2]:
import os
import pdfplumber
import pandas as pd
import re
import uuid
from typing import List, Dict
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Directory scanned for source PDFs, and directory that receives the
# extracted text/table files.
DATA_FOLDER = "/Users/saurabhjain/Desktop/RAG/Data"
OUTPUT_FOLDER = "output_extracted"

# Create the output directory up front; an already-existing one is fine.
os.makedirs(name=OUTPUT_FOLDER, exist_ok=True)

# -----------------------------
# Clean text function
# -----------------------------
def clean_text(text):
    """Normalize raw PDF text into one whitespace-collapsed string.

    Processing order:
      1. ``http://`` / ``https://`` links are removed.
      2. ``Page <n> of <m>`` footers are removed.
      3. Every run of whitespace (including newlines) becomes one space and
         leading/trailing whitespace is dropped.
      4. The "fi" and "fl" typographic ligatures are expanded to ASCII.

    Args:
        text: Raw extracted text. ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned text as a single line.
    """
    if not text:
        return ""

    text = re.sub(r"https?://\S+", " ", text)
    # Use \s+ between the tokens: this runs before whitespace is collapsed,
    # so a footer broken across a line or padded with extra spaces must
    # still be recognised.
    text = re.sub(r"Page\s+\d+\s+of\s+\d+", " ", text)
    # split()/join collapses all whitespace runs and trims both ends in one
    # pass, so no separate regex pass or strip() is needed.
    text = " ".join(text.split())
    # U+FB01 (fi) and U+FB02 (fl) ligatures.
    return text.replace("\ufb01", "fi").replace("\ufb02", "fl")

# -----------------------------
# Extract text & tables
# -----------------------------
def extract_from_pdf(pdf_path, filename):
    """Extract the text and tables of one PDF and save them in OUTPUT_FOLDER.

    The text of all pages is joined, passed through ``clean_text`` and
    written to ``<filename>.txt``. Each table with a header row and at least
    one data row is written as a Markdown file named
    ``<filename>_page<P>_table<T>.md``, where ``P`` is the 1-based page
    number and ``T`` is the 0-based index of the table on that page.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.
        filename: Base name (no extension) used for every output file.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    all_text = []
    all_tables = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # A falsy extract_text() result is treated as an empty page.
            text = page.extract_text() or ""
            if text.strip():
                all_text.append(text)

            for idx, table in enumerate(page.extract_tables()):
                # Row 0 becomes the header, so at least one more row is
                # required for the table to carry any data.
                if table and len(table) > 1:
                    df = pd.DataFrame(table[1:], columns=table[0])
                    all_tables.append((page_num, idx, df))

    # Save cleaned text. The encoding is explicit so the output does not
    # depend on the platform's locale default.
    cleaned_text = clean_text("\n".join(all_text))
    txt_path = os.path.join(OUTPUT_FOLDER, f"{filename}.txt")
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(cleaned_text)
    print(f"✅ Saved text: {txt_path}")

    # Save tables as Markdown, one file per table.
    for page_num, idx, df in all_tables:
        md_content = df.to_markdown(index=False)
        md_filename = f"{filename}_page{page_num}_table{idx}.md"
        md_path = os.path.join(OUTPUT_FOLDER, md_filename)
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(f"# Table from {filename} (Page {page_num}, Table {idx})\n\n")
            f.write(md_content)
        print(f"✅ Saved table: {md_path}")

# -----------------------------
# MAIN
# -----------------------------
if __name__ == "__main__":
    # Sort the listing: os.listdir() returns entries in arbitrary order, and
    # a fixed order keeps runs (and their logs) reproducible.
    for file in sorted(os.listdir(DATA_FOLDER)):
        # Compare the extension case-insensitively so ".PDF" is not skipped.
        if file.lower().endswith(".pdf"):
            pdf_path = os.path.join(DATA_FOLDER, file)
            # The base name (extension removed) prefixes every output file.
            filename = os.path.splitext(file)[0]
            print(f"\n📂 Processing {file} ...")
            extract_from_pdf(pdf_path, filename)


Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats



📂 Processing Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2024 Financial Results.pdf ...


Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBB

✅ Saved text: output_extracted/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2024 Financial Results.txt
✅ Saved table: output_extracted/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2024 Financial Results_page1_table0.md
✅ Saved table: output_extracted/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2024 Financial Results_page2_table0.md
✅ Saved table: output_extracted/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2024 Financial Results_page3_table0.md
✅ Saved table: output_extracted/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2024 Financial Results_page3_table2.md
✅ Saved table: output_extracted/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2024 Financial Results_page4_table0.md
✅ Saved table: output_extracted/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full 

Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats


✅ Saved text: output_extracted/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023 Financial Results.txt
✅ Saved table: output_extracted/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023 Financial Results_page1_table0.md
✅ Saved table: output_extracted/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023 Financial Results_page2_table0.md
✅ Saved table: output_extracted/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023 Financial Results_page3_table0.md
✅ Saved table: output_extracted/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023 Financial Results_page3_table2.md
✅ Saved table: output_extracted/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023 Financial Results_page4_table0.md
✅ Saved table: output_extracted/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full 

In [3]:

# Directory holding the extracted ".txt" files, and directory that receives
# the per-report segmented Markdown files.
OUTPUT_FOLDER = "output_extracted"
SEGMENTED_FOLDER = "segmented_reports"

# Create the destination directory; an already-existing one is fine.
os.makedirs(name=SEGMENTED_FOLDER, exist_ok=True)

# -----------------------------
# Section Segmentation
# -----------------------------
def segment_sections(text):
    """Split a cleaned report into its named sections.

    Each section starts at its heading (matched case-insensitively) and runs
    lazily up to the first following terminator heading, or to the end of
    the text when no terminator occurs.

    Terminator headings are matched case-sensitively, exactly as spelled
    below. Matching them case-insensitively would let ordinary words in the
    body ("about", "guidance", "connect") end a section early. Where a
    terminator is the tail of a longer known heading, the heading's prefix
    is matched optionally so it does not dangle at the end of the previous
    section.

    Args:
        text: Report text; expected to be whitespace-collapsed, since the
            heading patterns use single literal spaces.

    Returns:
        dict mapping section key to section text (heading included,
        surrounding whitespace stripped). Sections whose heading is not
        found are omitted; keys keep the order in which they are defined.
    """
    # Terminators that are tails of longer headings used as section starts.
    company_stop = r"(?:Fourth Quarter and Full Year )?Company Highlights"
    guidance_stop = r"(?:FY\d{2} )?Guidance"

    # (section key, start-heading regex, terminator-heading regexes)
    layout = (
        ("financial_highlights",
         r"Fourth Quarter and Full Year Financial Highlights",
         (company_stop, guidance_stop)),
        ("company_highlights",
         r"Fourth Quarter and Full Year Company Highlights",
         (guidance_stop, r"About")),
        ("guidance",
         r"FY\d{2} Guidance",
         (r"Share Repurchase", r"Conference Call")),
        ("share_repurchases",
         r"Share Repurchase Authorization",
         (r"Conference Call", r"About")),
        ("about_company",
         r"About Pure Storage",
         (r"Analyst Recognition", r"Connect")),
        ("analyst_recognition",
         r"Analyst Recognition",
         (r"Connect",)),
    )

    sections = {}
    for key, start, stops in layout:
        # (?-i:...) switches case-insensitivity off for the terminators only.
        pattern = rf"({start}.*?)(?=(?-i:{'|'.join(stops)})|$)"
        match = re.search(pattern, text, flags=re.I | re.S)
        if match:
            sections[key] = match.group(1).strip()

    return sections


# -----------------------------
# MAIN: Segment text files
# -----------------------------
if __name__ == "__main__":
    # Sort the listing: os.listdir() returns entries in arbitrary order, and
    # a fixed order keeps runs (and their logs) reproducible.
    for file in sorted(os.listdir(OUTPUT_FOLDER)):
        if file.endswith(".txt"):
            file_path = os.path.join(OUTPUT_FOLDER, file)
            # Explicit encoding so reading and writing do not depend on the
            # platform's locale default.
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()

            sections = segment_sections(text)
            seg_file = os.path.join(SEGMENTED_FOLDER, f"{os.path.splitext(file)[0]}_segmented.md")

            with open(seg_file, "w", encoding="utf-8") as f:
                f.write(f"# Segmented Report: {file}\n\n")
                for sec, content in sections.items():
                    # Section keys become "Title Case" second-level headings.
                    f.write(f"## {sec.replace('_',' ').title()}\n\n{content}\n\n")

            print(f"✅ Segmented report saved: {seg_file}")


✅ Segmented report saved: segmented_reports/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2024 Financial Results_segmented.md
✅ Segmented report saved: segmented_reports/Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023 Financial Results_segmented.md


Loaded 39 raw documents
✅ Created 6854 chunks (100 chars)
✅ Created 1293 chunks (400 chars)

Sample Chunk Metadata: {'source_file': 'Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023 Financial Results_page5_table0.md', 'folder': 'output_extracted', 'chunk_id': 'f60fd2dd-9fdc-4d64-993f-f5e7f35ab0b0', 'chunk_index': 0, 'chunk_size': 100, 'parent_source': 'Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023 Financial Results_page5_table0.md'}
Sample Chunk Text:
 # Table from Pure Storage, Inc. - Pure Storage Announces Fiscal Fourth Quarter and Full Year 2023
