In [None]:
# Standard library
import base64
import logging
import os
import re
from datetime import datetime
from pathlib import Path
from typing import List, Optional

# Third-party
import google.generativeai as genai
import pytz
from docling.chunking import HybridChunker
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import DoclingDocument, ImageRefMode
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure

# Alias for the previously misspelled class name, so any cell that still
# refers to `DocumentConveter` keeps working.
DocumentConveter = DocumentConverter




In [None]:
# Configure the root logger at DEBUG level; each record is rendered as
# "timestamp - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Logger used by the remaining cells of this notebook.
_log = logging.getLogger(__name__)


In [None]:
# Location of the PDF handed to the converter. Empty placeholder here --
# presumably to be filled in before the notebook is run.
SOURCE = ''

# Second name bound to the same value as SOURCE.
pdf_path = SOURCE


In [None]:
# PDF pipeline settings: OCR through Tesseract with the 'eng' and 'hin'
# language codes, table-structure recognition switched on, and page/picture
# images generated at scale 3.0.
pipeline_options = PdfPipelineOptions(
    ocr_options=TesseractOcrOptions(lang=['eng', 'hin']),
    do_ocr=True,
    do_table_structure=True,
    images_scale=3.0,
    generate_page_images=True,
    generate_picture_images=True,
)

In [None]:
# Converter that applies `pipeline_options` to PDF inputs.
# The class is spelled `DocumentConverter`; the earlier `DocumentConveter`
# spelling does not exist in docling.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    }
)

In [None]:
def clean_ocr_text(text:str)->str:
    '''Replace or remove glyph-name artefacts left in OCR/PDF-extracted text.

    Steps, in order:

    1. Slash-prefixed sequences listed in ``unicode_map`` (e.g. ``/uni092F``)
       are replaced with their mapped text. Longer sequences are tried first,
       so compound entries such as ``uni093F/g7021`` win over their prefixes.
    2. Remaining ``/gNNNN`` glyph IDs (four digits) are removed.
    3. Remaining ``/uniXXXX`` escapes (four hex digits) are removed.
    4. Every run of whitespace -- newlines included -- is collapsed to a
       single space and the result is stripped. Callers that need to keep
       line structure should therefore clean line by line.

    Args:
        text: Raw text that may contain glyph-name artefacts.

    Returns:
        The cleaned, single-line text.
    '''
    unicode_map={
        'uni092F': 'य', 'uni093F': 'ि', 'uni092Fा': 'या', 'uni093F/g7021': '',
        'uni0927': 'ध',
    }

    # One alternation, longest key first, so the whole mapping is applied in
    # a single pass and replaced text is never re-scanned.
    known_codes = sorted(unicode_map, key=len, reverse=True)
    known_pattern = '/(?:' + '|'.join(re.escape(code) for code in known_codes) + ')'
    # m.group(0)[1:] drops the leading '/' to recover the dictionary key.
    text = re.sub(known_pattern, lambda m: unicode_map[m.group(0)[1:]], text)

    # Anything still unmapped is noise: drop glyph IDs, then leftover escapes.
    text = re.sub(r'/g\d{4}', '', text)
    text = re.sub(r'/uni[0-9A-Fa-f]{4}', '', text)

    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Connect to the local MongoDB server and bind the databases/collections
# used by later cells.
mongo_uri='mongodb://127.0.0.1:27017/Markdown-parsed'

try:
    client=MongoClient(mongo_uri,serverSelectionTimeoutMS=5000)
    markdown_db=client['Markdown-parsed']
    posts_db=client['postReviewSystem']
    markdown_collection=markdown_db['markdown_files']
    posts_collection=posts_db['posts']
    # Performs a real round-trip to the server, so an unreachable server
    # surfaces here (bounded by the 5000 ms server-selection timeout).
    client.list_database_names()
    _log.info(f"connected to mongodb at {mongo_uri}")
except ConnectionFailure as e:
    _log.error(f"failed to connect to mongodb at {mongo_uri}: {e}")
    # Re-raise so the notebook stops at this cell with the traceback visible.
    raise

In [None]:
# Convert SOURCE and, when the resulting document exposes a `text`
# attribute, replace it with the cleaned version.
try:
    result=doc_converter.convert(SOURCE)
    has_text=hasattr(result.document,'text')
    # The fallback string is only a placeholder for the debug log below.
    raw_text=result.document.text if has_text else 'no text attribute'
    _log.debug(f'raw extarcted text (forst 500 chars ): {raw_text[:500]}')
    if has_text:
        result.document.text=clean_ocr_text(raw_text)

except FileNotFoundError:
    _log.error(f"pdf file not found at {SOURCE}")
    # Re-raise so the run stops at this cell with the traceback visible.
    raise

except Exception as e:
    _log.error(f'error during document conversion {e}')
    raise

In [4]:
# Write the converted document to parsed-doc/<source stem>-with-images.md,
# with images embedded in the markdown.
output_dir=Path('parsed-doc')
output_dir.mkdir(parents=True,exist_ok=True)
doc_filename=Path(SOURCE).stem
# Braces (not parentheses) so the stem is interpolated; no trailing space.
md_filename=output_dir/f'{doc_filename}-with-images.md'

try:
    result.document.save_as_markdown(md_filename,image_mode=ImageRefMode.EMBEDDED)
    _log.info(f'markdown content has been saved to {md_filename}')

except Exception as e:
    _log.error(f'error saving markdown {e}')
    # Re-raise so the run stops at this cell with the traceback visible.
    raise

NameError: name 'Path' is not defined

In [None]:
# Directory that will receive the images extracted from the markdown.
image_dir=output_dir/'images'
image_dir.mkdir(parents=True,exist_ok=True)

try:
    with open(md_filename,'r',encoding='utf-8') as f:
        raw_markdown=f.read()

    # The markdown was saved with embedded base64 images. Base64 data may
    # legitimately contain sequences such as "/g1234", which clean_ocr_text
    # would delete and thereby corrupt the image, so the image spans are
    # split out and passed through untouched.
    embedded_image=re.compile(r'(!\[.*?\]\(data:image/\w+;base64,[^)]+\))')
    segments=embedded_image.split(raw_markdown)

    # re.split with one capturing group alternates text / image / text ...,
    # so odd indices are the images. Text is cleaned one line at a time
    # because clean_ocr_text collapses newlines and would otherwise flatten
    # the whole document onto a single line. Note that each cleaned line is
    # also stripped of leading/trailing whitespace.
    markdown_content=''.join(
        segment if index%2 else '\n'.join(clean_ocr_text(line) for line in segment.split('\n'))
        for index,segment in enumerate(segments)
    )
    _log.debug(f'cleaned markdown content (first 500 character ):{markdown_content[:500]}')
    _log.debug(f"total length of markdown_content:{len(markdown_content)} characters")

except Exception as e:
    _log.error(f'error reading markdown file :{e}')
    # Re-raise so the run stops at this cell with the traceback visible.
    raise

In [None]:
def process_base64_images(content:str,image_dir:Path,doc_filename:str)->str:
    '''Write embedded base64 images to files and link to them from the markdown.

    Every ``![alt](data:image/<type>;base64,<data>)`` occurrence in
    ``content`` is decoded and written to
    ``image_dir/<doc_filename>_image_<n>.<type>``, where ``n`` counts the
    images saved so far. The occurrence is then replaced by
    ``![alt](images/<file name>)``. An image that cannot be decoded or
    written is logged and left unchanged in the returned text.

    Args:
        content: Markdown text that may contain embedded base64 images.
        image_dir: Existing directory the image files are written to. The
            generated links always use the ``images/`` prefix, whatever
            this directory is called.
        doc_filename: Prefix used for the generated image file names.

    Returns:
        The markdown with each successfully saved image replaced by a
        relative file link.
    '''
    base64_pattern = r'!\[(.*?)\]\(data:image/(\w+);base64,([^)]+)\)'
    image_count = 0

    def _save_and_link(match):
        # Save one matched image; return its replacement markdown.
        nonlocal image_count
        alt_text, image_type, base64_data = match.groups()
        try:
            image_data = base64.b64decode(base64_data)
            image_filename = f'{doc_filename}_image_{image_count}.{image_type}'
            image_path = image_dir / image_filename
            with open(image_path, 'wb') as f:
                f.write(image_data)
        except Exception as e:
            _log.error(f'error decoding base64 images {image_count} :{e}')
            # Keep the original embedded image so no content is lost.
            return match.group(0)
        _log.info(f"Saved image to {image_path}")
        image_count += 1
        return f"![{alt_text}](images/{image_filename})"

    # re.sub rewrites each occurrence exactly once, so two identical embedded
    # images each get their own file and link.
    return re.sub(base64_pattern, _save_and_link, content)
# Path of the "-with-images-links" markdown file, placed inside output_dir.
updated_md_filename = output_dir.joinpath(f"{doc_filename}-with-images-links.md")


SyntaxError: incomplete input (516051123.py, line 8)