In [None]:
# Google Colab link:
#https://colab.research.google.com/drive/1DZLzhJK9ucD2k07WkxbuoJ09h2J29PBM

In [None]:
!pip install torch==2.7.1
!pip install torchvision==0.22.1
!pip install marker-pdf
!pip3 install Pillow




[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: C:\Users\HP\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip


In [None]:
# Run the marker CLI on sample.pdf, requesting JSON output in the `output` directory.
# The later cells read the resulting sample.json from an `output/sample/` folder.
!marker_single "sample.pdf" --output_format json --output_dir output --format_lines

In [9]:
import json
import os
import base64
import re
from bs4 import BeautifulSoup

def clean_html(html):
    """Return the text content of an HTML fragment.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted. Text nodes are stripped and joined with a single space.
    """
    document = BeautifulSoup(html, "html.parser")
    for unwanted in document.find_all(["script", "style"]):
        unwanted.decompose()
    return document.get_text(separator=" ", strip=True)

def sanitize_id(block_id):
    """Make a block/image id usable as a file-name fragment.

    Every ``/``, ``:`` and backslash in ``block_id`` is replaced by ``_``.
    """
    return block_id.translate(str.maketrans("/:\\", "___"))

def decode_and_save_images(images, block_id, images_dir):
    """Decode a block's base64 images and write them to ``images_dir`` as PNG files.

    Args:
        images: Mapping of image id -> base64 string, or ``None``/empty.
        block_id: Id of the owning block; used as the file-name prefix.
        images_dir: Directory the files are written into.

    Returns:
        Dict mapping each successfully saved image id to its path, with
        backslashes normalized to forward slashes. Images that fail to decode
        or save are reported on stdout and omitted (best-effort).
    """
    saved_paths = {}
    for img_id, base64_str in (images or {}).items():
        try:
            # Decode BEFORE opening the target file: opening first would leave
            # an empty .png behind whenever the payload is not valid base64.
            img_bytes = base64.b64decode(base64_str)
            img_filename = f"{sanitize_id(block_id)}_{sanitize_id(img_id)}.png"
            img_path = os.path.join(images_dir, img_filename)
            with open(img_path, "wb") as img_file:
                img_file.write(img_bytes)
            saved_paths[img_id] = img_path.replace("\\", "/")
        except Exception as e:
            # Deliberately broad: one bad image must not abort the whole run.
            print(f"❌ Error decoding image {img_id} in block {block_id}: {e}")
    return saved_paths

def extract_parent_title(path_parts):
    """Pick the last path part that mentions a chapter or a section.

    The parts are scanned from the end. The first one whose lower-cased text
    contains ``"chapter"`` or ``"section"`` is returned with surrounding
    whitespace stripped. An empty string is returned when none qualifies.
    """
    keywords = ("chapter", "section")
    candidates = (
        part.strip()
        for part in reversed(path_parts)
        if any(keyword in part.lower() for keyword in keywords)
    )
    return next(candidates, "")

def flatten_blocks(blocks, page_title="", page_num="", context_path="", parent_title="", flat_list=None, images_dir=None):
    """Flatten a (possibly nested) list of block dicts into a list of simplified records.

    Each block yields one record with ``id``, ``text``, ``blockType``,
    ``section_path`` and ``parent_title``; children are visited depth-first
    right after their parent.

    Side effects:
        * ``bbox`` and ``polygon`` keys are removed from the input blocks in place.
        * ``decode_and_save_images`` is called for every block (not only
          pictures), which writes image files into ``images_dir``.

    Args:
        blocks: List of block dicts for one nesting level.
        page_title: Only forwarded to recursive calls; not read in this function.
        page_num: Page label inserted into every ``section_path``.
        context_path: Forwarded to recursive calls but never read in this function.
        parent_title: Prefix of every ``section_path`` at this level and the
            initial value of the running section title.
        flat_list: Accumulator shared across recursive calls; a new list is
            created when ``None``.
        images_dir: Directory passed to ``decode_and_save_images``.

    Returns:
        ``flat_list`` with the records for ``blocks`` and all descendants appended.
    """
    if flat_list is None:
        flat_list = []

    # Running section title for this nesting level. It is a local, so updates
    # made inside a recursive call are not seen by the caller.
    last_section = parent_title
    # Paragraph numbering restarts at 1 on every call (i.e. per nesting level).
    para_counter = 1

    for idx, block in enumerate(blocks):
        block_id = block.get("id", f"block_{idx}")
        block_type = block.get("block_type", "Unknown")
        html = block.get("html", "")
        text = block.get("text", "")

        # Remove unnecessary keys (mutates the caller's block dict)
        block.pop("bbox", None)
        block.pop("polygon", None)

        # Decode and save image
        image_paths = decode_and_save_images(block.get("images"), block_id, images_dir)

        # HTML wins when present; otherwise fall back to the plain-text field.
        cleaned_text = clean_html(html) if html else text.strip()

        # A header becomes the section title for the blocks that follow it.
        if block_type == "SectionHeader":
            last_section = cleaned_text.strip()

        # Skip plain text for structured tables
        if block_type == "Table" and "structured_table" in block:
            cleaned_text = ""

        # Path layout: "<parent_title> > Page N[ > <section>][ > para K]".
        # Note the prefix is the `parent_title` argument, not `last_section`.
        contextual_path = f"{parent_title} > Page {page_num}"
        if last_section and block_type != "SectionHeader":
            contextual_path += f" > {last_section}"
        if block_type == "Text":
            contextual_path += f" > para {para_counter}"
            para_counter += 1

        flat_block = {
            "id": block_id,
            "text": cleaned_text,
            "blockType": block_type,
            "section_path": contextual_path,
            "parent_title": last_section
        }

        # Only the first saved image is recorded, and only for Picture blocks.
        if block_type == "Picture" and image_paths:
            flat_block["image_path"] = list(image_paths.values())[0]

        if "structured_table" in block:
            flat_block["structured_table"] = block["structured_table"]

        flat_list.append(flat_block)

        # Recurse into children; results land in the shared `flat_list`, so the
        # return value of the recursive call is intentionally unused.
        if "children" in block and isinstance(block["children"], list):
            flatten_blocks(
                block["children"],
                page_title=page_title,
                page_num=page_num,
                context_path=contextual_path,
                parent_title=last_section,
                flat_list=flat_list,
                images_dir=images_dir
            )
    return flat_list

def assign_prev_next(flat_blocks):
    """Attach neighbouring paragraph text to every ``Text`` block, in place.

    ``Text`` blocks are bucketed by their ``section_path`` with the final
    ``" > "``-separated component removed. Within each bucket, in input order,
    a block receives ``prev_text`` / ``next_text`` holding the ``text`` of its
    predecessor / successor, or ``""`` at the ends. Other block types are left
    untouched.
    """
    buckets = {}
    for entry in flat_blocks:
        # Everything except the last path component identifies the bucket
        *ancestors, _leaf = entry["section_path"].split(" > ")
        bucket_key = " > ".join(ancestors)
        if entry["blockType"] != "Text":
            continue
        buckets.setdefault(bucket_key, []).append(entry)

    for siblings in buckets.values():
        final_pos = len(siblings) - 1
        for pos, entry in enumerate(siblings):
            entry["prev_text"] = "" if pos == 0 else siblings[pos - 1]["text"]
            entry["next_text"] = "" if pos == final_pos else siblings[pos + 1]["text"]

def clean_and_flatten_json(input_json_path):
    """Flatten a nested block JSON file into ``final_cleaned_output.json``.

    The output file and an ``images`` sub-directory are created next to
    ``input_json_path``. Each entry of the document's top-level ``children``
    is treated as a page: its number is taken from a ``/page/<n>`` fragment of
    the page id (``"?"`` when absent) and its blocks are flattened with
    ``flatten_blocks``. Prev/next context is then attached with
    ``assign_prev_next`` and the result is written as UTF-8 JSON.
    """
    target_dir = os.path.dirname(input_json_path)
    images_dir = os.path.join(target_dir, "images")
    os.makedirs(images_dir, exist_ok=True)
    output_json_path = os.path.join(target_dir, "final_cleaned_output.json")

    with open(input_json_path, "r", encoding="utf-8") as source:
        document = json.load(source)

    flat_blocks = []
    for page in document.get("children", []):
        title = page.get("title", "")
        number_match = re.search(r'/page/(\d+)', page.get("id", ""))
        page_records = flatten_blocks(
            page.get("children", []),
            page_title=title,
            page_num=number_match.group(1) if number_match else "?",
            parent_title=extract_parent_title([title]),
            flat_list=[],
            images_dir=images_dir,
        )
        flat_blocks += page_records

    assign_prev_next(flat_blocks)

    with open(output_json_path, "w", encoding="utf-8") as sink:
        json.dump(flat_blocks, sink, indent=2, ensure_ascii=False)

    print(f"\n✅ Final cleaned and flattened JSON saved to: {output_json_path}")

# 🔁 Run this
# Flattens the marker JSON at the given path; the result and the decoded images
# are written next to it. The path is absolute and machine-specific, so adjust
# it before running elsewhere.
clean_and_flatten_json("D:/aksharaplus/output/sample/sample.json")



✅ Final cleaned and flattened JSON saved to: D:/aksharaplus/output/sample\final_cleaned_output.json


In [2]:
import json
import re

def fix_structure_with_tables(input_path, output_path, header_cell_count=3):
    """Rebuild section paths, prev/next context and table payloads of a flat block list.

    The JSON list at ``input_path`` is walked in order:

    * ``SectionHeader`` blocks close the running section and start a new one.
    * ``Text`` / ``ListItem`` / ``Table`` blocks are buffered until the section
      ends, then given ``section_path``, ``parent_title``, ``prev_text`` and
      ``next_text`` based on their neighbours in that buffer.
    * The ``TableCell`` blocks directly following a ``Table`` are folded into
      the table's ``text`` as ``{"table": {"columns": [...], "rows": [...]}}``
      and are not emitted themselves; stray ``TableCell`` blocks are dropped.
    * Any other block type is emitted immediately with empty prev/next text.

    Args:
        input_path: Path of the flat block JSON to read.
        output_path: Path the restructured JSON is written to.
        header_cell_count: Number of leading non-empty table cells treated as
            column headers (heuristic; must be >= 1). The remaining cells are
            grouped into rows of ``1 + number_of_headers`` cells whose first
            cell (a row index) is discarded.

    Raises:
        ValueError: If ``header_cell_count`` is smaller than 1.

    Note:
        Buffered blocks are emitted only when their section ends, while other
        block types are emitted right away, so the output order can differ from
        the input order.
    """
    if header_cell_count < 1:
        raise ValueError("header_cell_count must be >= 1")

    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    fixed_data = []
    current_parent_title = ""
    page_number = ""
    # Entries are (page_number_when_seen, block). The page is captured when the
    # block is buffered because the flush happens later -- typically when the
    # next section header is reached, possibly on a later page.
    buffer = []

    def flush_buffer():
        """Finalize the buffered blocks under the current section and emit them."""
        last_pos = len(buffer) - 1
        for pos, (block_page, buffered) in enumerate(buffer):
            kind = buffered["blockType"]
            if kind in ("Text", "ListItem", "Table"):
                # `pos` counts every buffered block, so a table also consumes a
                # paragraph number (kept for compatibility with existing output).
                leaf = "table" if kind == "Table" else f"para {pos + 1}"
                buffered["section_path"] = f"> Page {block_page} > {current_parent_title} > {leaf}"
                buffered["parent_title"] = current_parent_title
                buffered["prev_text"] = buffer[pos - 1][1]["text"] if pos > 0 else ""
                buffered["next_text"] = buffer[pos + 1][1]["text"] if pos < last_pos else ""
            fixed_data.append(buffered)
        buffer.clear()

    total = len(data)
    i = 0
    while i < total:
        block = data[i]
        # Keep the previous page number when the id carries no /page/<n> part
        page_match = re.search(r'/page/(\d+)', block["id"])
        if page_match:
            page_number = page_match.group(1)
        block_type = block["blockType"]

        if block_type == "SectionHeader":
            # Close the previous section before switching the title
            flush_buffer()
            current_parent_title = block["text"].strip()
            block["section_path"] = f"> Page {page_number}"
            block["parent_title"] = current_parent_title
            fixed_data.append(block)

        elif block_type in ("Text", "ListItem"):
            buffer.append((page_number, block))

        elif block_type == "Table":
            # Collect the non-empty text of the TableCell run that follows
            table_cells = []
            j = i + 1
            while j < total and data[j]["blockType"] == "TableCell":
                cell_text = data[j]["text"].strip()
                if cell_text:
                    table_cells.append(cell_text)
                j += 1

            # Heuristic: the first cells are the header row
            headers = table_cells[:header_cell_count]
            num_columns = len(headers)
            cell_data = table_cells[header_cell_count:]

            # Each data row is stored as [index, col1, ..., colN]; drop the
            # index and keep only complete rows.
            rows = []
            for k in range(0, len(cell_data), num_columns + 1):
                row = cell_data[k + 1:k + 1 + num_columns]
                if len(row) == num_columns:
                    rows.append(row)

            block["text"] = {
                "table": {
                    "columns": headers,
                    "rows": rows
                }
            }

            buffer.append((page_number, block))
            i = j - 1  # resume after the consumed TableCell run

        elif block_type == "TableCell":
            # Cell not preceded by a Table block: dropped
            pass

        else:
            # Other blocks like PageFooter, Image, etc.
            block["section_path"] = f"> Page {page_number} > {current_parent_title}"
            block["parent_title"] = current_parent_title
            block["prev_text"] = ""
            block["next_text"] = ""
            fixed_data.append(block)

        i += 1

    flush_buffer()

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(fixed_data, f, indent=2, ensure_ascii=False)

    print(f"✅ Updated structured JSON saved to: {output_path}")

# 🔁 Run the fixer
fix_structure_with_tables(
    input_path="D:/aksharaplus/output/sample/final_cleaned_output.json",
    output_path="D:/aksharaplus/output/sample/final_structured_output_with_tables.json"
)


✅ Updated structured JSON saved to: D:/aksharaplus/output/sample/final_structured_output_with_tables.json


In [1]:
import json

# Read the structured block list produced by the table-fixing step
with open("D:/aksharaplus/output/sample/final_structured_output_with_tables.json", "r", encoding="utf-8") as f:
    data = json.load(f)


def _is_kept(block):
    """Decide whether a block survives the filter.

    A block is kept when its ``blockType`` is a string other than
    ``"PageFooter"`` and it is either a ``"Picture"`` or carries a non-blank
    string ``text``.
    """
    kind = block.get("blockType")
    if not isinstance(kind, str) or kind == "PageFooter":
        return False
    if kind == "Picture":
        return True
    body = block.get("text")
    return isinstance(body, str) and body.strip() != ""


cleaned_blocks = [block for block in data if _is_kept(block)]

# Persist the surviving blocks
with open("D:/aksharaplus/output/sample/final_cleaned_output_filtered.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_blocks, f, indent=2, ensure_ascii=False)

print(f"✅ Cleaned {len(data) - len(cleaned_blocks)} blocks.")
print("✅ Saved to: D:/aksharaplus/output/sample/final_cleaned_output_filtered.json")

✅ Cleaned 727 blocks.
✅ Saved to: D:/aksharaplus/output/sample/final_cleaned_output_filtered.json


In [2]:
import json

# Load the filtered block list written by the previous step
input_file = "D:/aksharaplus/output/sample/final_cleaned_output_filtered.json"
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Preview the structure. The sample is guarded so an empty file prints None
# instead of raising IndexError.
print(f"Total entries: {len(data)}")
print("Sample entry:", data[0] if data else None)

Total entries: 3153
Sample entry: {'id': '/page/14/SectionHeader/0', 'text': 'CHAPTER 1 Introduction', 'blockType': 'SectionHeader', 'section_path': '> Page 14', 'parent_title': 'CHAPTER 1 Introduction'}


In [7]:
import json

def clean_text(text):
    """Normalize a block's text to a single-line string.

    Args:
        text: A string, a dict (serialized to JSON), ``None`` or any other
            value (converted with ``str``).

    Returns:
        The text with the Unicode hyphen replaced by ``-`` (and the split word
        "hand- coded" rejoined) and every run of whitespace collapsed to one
        space. ``None`` yields an empty string.
    """
    if text is None:
        # Avoid leaking the literal string "None" into the cleaned data
        return ""
    if isinstance(text, dict):
        # ensure_ascii=False keeps non-ASCII characters literal; with the
        # default they are written as \uXXXX escapes and the hyphen
        # replacements below would silently never match.
        text = json.dumps(text, ensure_ascii=False)
    elif not isinstance(text, str):
        text = str(text)
    # Fix common typographical errors
    text = text.replace("hand‐ coded", "hand-coded").replace("‐", "-")
    # Normalize whitespace
    return " ".join(text.split())

def _clean_optional(value):
    """Clean a context field; missing or empty values become an empty string."""
    return clean_text(value) if value else ""


# Reduce every block to the fields used downstream, with normalized text
cleaned_data = []
for item in data:
    cleaned_data.append({
        "text": clean_text(item["text"]),
        "blockType": item["blockType"],
        "parent_title": item.get("parent_title", ""),
        "prev_text": _clean_optional(item.get("prev_text")),
        "next_text": _clean_optional(item.get("next_text")),
    })

# Persist the cleaned records
with open("D:/aksharaplus/output/sample/cleaned_data.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, indent=2)

In [9]:
def _extend_without_overlap(collected, pieces):
    """Append ``pieces`` to ``collected``, skipping a prefix that already ends ``collected``.

    Consecutive items share text (one item's ``next_text`` is the following
    item's ``text``), so appending prev/text/next for every item verbatim
    would repeat each paragraph up to three times in the combined output.
    """
    for size in range(min(len(collected), len(pieces)), 0, -1):
        if collected[-size:] == pieces[:size]:
            collected.extend(pieces[size:])
            return
    collected.extend(pieces)


# Collect paragraph text per section title; headers and other block types are skipped
grouped_data = {}
for item in cleaned_data:
    if item["blockType"] in ["Text", "ListItem"]:  # Focus on content, exclude headers
        parent = item["parent_title"] or "General"
        # prev/next context is kept (it may carry neighbouring table text) but
        # merged so that shared neighbours are emitted only once
        pieces = [part for part in (item["prev_text"], item["text"], item["next_text"]) if part]
        _extend_without_overlap(grouped_data.setdefault(parent, []), pieces)

# Combine texts under each parent_title
combined_data = [
    {"parent_title": parent, "text": " ".join(texts)}
    for parent, texts in grouped_data.items()
]

# Save grouped data
with open("D:/aksharaplus/output/sample/grouped_data.json", "w", encoding="utf-8") as f:
    json.dump(combined_data, f, indent=2)

In [19]:
import json
import re

# Load grouped_data.json (a list of {"parent_title", "text"} records)
with open("D:/aksharaplus/output/sample/grouped_data.json", "r", encoding="utf-8") as f:
    grouped_data = json.load(f)

# Debugging: List all parent_title values
parent_titles = sorted(set(item["parent_title"] for item in grouped_data))
print("All parent titles:", parent_titles)

# Filter entries where parent_title is a single letter (A-Z);
# presumably these are the book's alphabetical index sections -- TODO confirm
index_entries = [item for item in grouped_data if re.match(r"^[A-Z]$", item["parent_title"])]

# Debugging: Verify index entries found
print(f"Found {len(index_entries)} index entries (A-Z)")
if index_entries:
    print("Sample index entries:", [item["parent_title"] for item in index_entries[:3]])

# Output records: {"term", "pages", "subtopics": [{"subtopic", "pages"}], "parent_title"}
index_terms = []
for item in index_entries:
    parent_title = item["parent_title"]
    text = item["text"]
    # Debugging: Print first 100 chars of text
    print(f"Processing parent_title '{parent_title}': {text[:100]}...")

    # NOTE(review): despite the name, `lines` holds the pieces of `text` split
    # on single spaces, so each "line" is a space-delimited token rather than a
    # full "term, pages" index line -- confirm this is intended.
    lines = text.split(" ")
    # Parser state for the main term currently being collected
    current_main_term = None
    current_subtopics = []
    current_pages = ""

    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Match main term or subtopic with pages (e.g., "A/B testing, 359" or "evaluating and comparing, 191")
        # Groups: (1) term, (2) optional ", pages" suffix, (3) the pages themselves
        # (digits, '-', ',', whitespace, 'x', 'i').
        # NOTE(review): because the suffix is optional, this pattern appears to
        # match any non-empty token without embedded newlines, so the `else`
        # branch below is likely rarely reached -- confirm.
        match = re.match(r"(.+?)(,\s*([\d\-,\sxi]+))?$", line)
        if match:
            term, _, pages = match.groups()
            term = term.strip()
            pages = pages.strip() if pages else ""

            # Check if this is a main term (starts with parent_title letter, ignoring case)
            if term.lower().startswith(parent_title.lower()):
                if current_main_term:
                    # Save previous main term and its subtopics
                    index_terms.append({
                        "term": current_main_term,
                        "pages": current_pages,
                        "subtopics": current_subtopics,
                        "parent_title": parent_title
                    })
                # Start collecting a new main term
                current_main_term = term
                current_subtopics = []
                current_pages = pages
            else:
                # This is a subtopic under the current main term;
                # it is discarded when no main term has been seen yet
                if current_main_term:
                    current_subtopics.append({
                        "subtopic": term,
                        "pages": pages
                    })
        else:
            # Line might be a continuation or malformed; log for debugging
            print(f"Unmatched line in '{parent_title}': {line}")

    # Save the last main term
    if current_main_term:
        index_terms.append({
            "term": current_main_term,
            "pages": current_pages,
            "subtopics": current_subtopics,
            "parent_title": parent_title
        })

# Save index terms
with open("D:/aksharaplus/output/sample/index_terms.json", "w", encoding="utf-8") as f:
    json.dump(index_terms, f, indent=2)

# Debugging output
print(f"Extracted {len(index_terms)} index terms")
print("Sample terms:", index_terms[:5])

All parent titles: ['A', 'A First Application: Classifying Iris Species', 'About the Authors', 'Accessing Attributes in a Grid-Searched Pipeline', 'Accessing Step Attributes', 'Advanced Tokenization, Stemming, and Lemmatization', 'Agglomerative Clustering', 'Anaconda', 'Analyzing KNeighborsClassifier', 'Analyzing KNeighborsRegressor', 'Analyzing decision trees', 'Analyzing the result of cross-validation', 'Applying Bag-of-Words to a Toy Dataset', 'Applying Data Transformations', 'Applying NMF to face images', 'Applying NMF to synthetic data', 'Applying PCA to the cancer dataset for visualization', 'Approaching a Machine Learning Problem', 'Automatic Feature Selection', 'B', 'Bag-of-Words for Movie Reviews', 'Bag-of-Words with More Than One Word (n-Grams)', 'Benefits of Cross-Validation', 'Binning, Discretization, Linear Models, and Trees', 'Building Pipelines', 'Building Your First Model: k-Nearest Neighbors', 'Building Your Own Estimator', 'Building decision trees', 'C', 'CHAPTER 1 In

In [20]:
import json

# Read the per-section texts
with open("D:/aksharaplus/output/sample/grouped_data.json", "r", encoding="utf-8") as f:
    grouped_data = json.load(f)

# One record per section, carrying only its text
finetune_data = []
for item in grouped_data:
    finetune_data.append(dict(text=item["text"]))

# Write one JSON object per line (JSONL)
with open("D:/aksharaplus/output/sample/finetune_data.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in finetune_data)

# Debugging: show how many records were written and the first three
print(f"Created {len(finetune_data)} JSONL entries")
print("Sample entries:")
for entry in finetune_data[:3]:
    print(entry)

Created 397 JSONL entries
Sample entries:
{'text': "Machine learning is about extracting knowledge from data. It is a research field at the intersection of statistics, artificial intelligence, and computer science and is also known as predictive analytics or statistical learning. The application of machine learning methods has in recent years become ubiquitous in everyday life. From auto- matic recommendations of which movies to watch, to what food to order or which products to buy, to personalized online radio and recognizing your friends in your photos, many modern websites and devices have machine learning algorithms at their core. When you look at a complex website like Facebook, Amazon, or Netflix, it is very likely that every part of the site contains multiple machine learning models. Outside of commercial applications, machine learning has had a tremendous influ- ence on the way data-driven research is done today. The tools introduced in this book have been applied to diverse sc

In [21]:
import json
import re

# Read the per-section texts
with open("D:/aksharaplus/output/sample/grouped_data.json", "r", encoding="utf-8") as f:
    grouped_data = json.load(f)

# Keep only sections whose title is not a single capital letter (A-Z),
# carrying just their text
finetune_data = []
for item in grouped_data:
    if re.match(r"^[A-Z]$", item["parent_title"]) is None:
        finetune_data.append(dict(text=item["text"]))

# Write one JSON object per line (JSONL); this overwrites any earlier export
with open("D:/aksharaplus/output/sample/finetune_data.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in finetune_data)

# Debugging: show how many records were written and the first three
print(f"Created {len(finetune_data)} JSONL entries")
print("Sample entries:")
for entry in finetune_data[:3]:
    print(entry)

Created 374 JSONL entries
Sample entries:
{'text': "Machine learning is about extracting knowledge from data. It is a research field at the intersection of statistics, artificial intelligence, and computer science and is also known as predictive analytics or statistical learning. The application of machine learning methods has in recent years become ubiquitous in everyday life. From auto- matic recommendations of which movies to watch, to what food to order or which products to buy, to personalized online radio and recognizing your friends in your photos, many modern websites and devices have machine learning algorithms at their core. When you look at a complex website like Facebook, Amazon, or Netflix, it is very likely that every part of the site contains multiple machine learning models. Outside of commercial applications, machine learning has had a tremendous influ- ence on the way data-driven research is done today. The tools introduced in this book have been applied to diverse sc