In [1]:
# First, set up imports and paths
import sys
import os

# Make the project's api directory importable from the notebook kernel.
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if "/home/jovyan/api" not in sys.path:
    sys.path.append("/home/jovyan/api")  # Add api directory to Python path

from jobs import process_with_marker
import hashlib
from datetime import datetime
from pymongo import MongoClient

# Open a client against the "db" MongoDB host, then resolve the database and
# collection handles that the rest of the notebook works with.
client = MongoClient("mongodb://db:27017/")
db = client.get_database("chelle")
raw_assets = db.get_collection("raw_assets")


# Function to simulate file upload and create asset record
def setup_test_file(
    file_path: str,
    raw_dir: str = "/home/jovyan/api/filestore/raw",
    collection=None,
) -> str:
    """Simulate a file upload: copy the file into the raw filestore and upsert its asset record.

    The file is stored under ``<sha256-of-content><lowercased extension>`` inside
    ``raw_dir``, and a record describing it is upserted into the asset
    collection keyed on ``file_hash``. Calling this again for the same content
    overwrites the stored copy and resets the record's ``status`` to
    ``"uploaded"`` and ``processed`` to ``False``.

    Args:
        file_path: Path of the local file to "upload".
        raw_dir: Directory the stored copy is written to; created if missing.
            Defaults to the notebook's original hard-coded filestore path.
        collection: Object exposing ``update_one(filter, update, upsert=...)``.
            When ``None`` (the default), the notebook-level ``raw_assets``
            collection is used.

    Returns:
        The hex-encoded SHA-256 digest of the file content.

    Raises:
        OSError: If the source file cannot be read or the copy cannot be written.
    """
    # Local import: the notebook's import cell only brings in `datetime` itself.
    from datetime import timezone

    # Resolve the global lazily so callers that pass their own collection
    # do not need a live MongoDB connection.
    target = raw_assets if collection is None else collection

    # Read file and calculate hash
    with open(file_path, "rb") as f:
        content = f.read()
    file_hash = hashlib.sha256(content).hexdigest()

    # Get file details
    filename = os.path.basename(file_path)
    file_size = len(content)
    file_ext = os.path.splitext(filename)[1].lower()

    # Determine mime type; anything unrecognised is treated as opaque bytes
    mime_types = {
        ".pdf": "application/pdf",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    }
    file_type = mime_types.get(file_ext, "application/octet-stream")

    # Save file to filestore under a content-derived name
    os.makedirs(raw_dir, exist_ok=True)
    stored_filename = f"{file_hash}{file_ext}"
    stored_path = os.path.join(raw_dir, stored_filename)

    with open(stored_path, "wb") as f:
        f.write(content)

    # Create asset record
    asset_record = {
        "original_name": filename,
        "stored_name": stored_filename,
        "file_path": stored_path,
        "file_hash": file_hash,
        "file_type": file_type,
        "file_size": file_size,
        # Timezone-aware UTC: PyMongo interprets naive datetimes as UTC, so a
        # naive local-time value would be stored shifted by the local offset.
        "upload_date": datetime.now(timezone.utc),
        "status": "uploaded",
        "processed": False,
    }

    # Insert or update record
    target.update_one({"file_hash": file_hash}, {"$set": asset_record}, upsert=True)

    return file_hash


# Test with a file
test_file = "/home/jovyan/nb/Health Byte_ Product Roadmap.docx"  # Replace with your test file path
file_hash = setup_test_file(test_file)

# Process with marker
result = process_with_marker(file_hash)

# Check results. find_one returns None when nothing matches, so fail with a
# clear message instead of an AttributeError on the first .get() below.
asset = raw_assets.find_one({"file_hash": file_hash})
if asset is None:
    raise LookupError(f"No raw_assets record found for file_hash {file_hash}")

print("Status:", asset.get("status"))
print("Processed:", asset.get("processed"))
print("Processing details:", asset.get("processing_details"))

# If processed successfully, check the extracted content
if asset.get("processed"):
    # `or {}` also covers a record whose processed_paths is stored as None
    processed_paths = asset.get("processed_paths") or {}

    # Check markdown content
    if "markdown" in processed_paths:
        # Explicit encoding so the preview does not depend on the kernel's locale.
        # NOTE(review): assumes the markdown is written as UTF-8 - confirm against
        # process_with_marker; undecodable bytes are replaced rather than raising.
        with open(processed_paths["markdown"], "r", encoding="utf-8", errors="replace") as f:
            print("\nFirst 500 characters of markdown content:")
            print(f.read(500))

    # Check images
    if "images" in processed_paths:
        print("\nExtracted images:")
        # presumably a mapping of image name -> path; verify against process_with_marker
        for img_name, img_path in processed_paths["images"].items():
            print(f"- {img_name}: {img_path}")

DEBUG:pymongo.topology:{"topologyId": {"$oid": "6724f2c0a8704f1f0cbd0e73"}, "message": "Starting topology monitoring"}
DEBUG:pymongo.topology:{"topologyId": {"$oid": "6724f2c0a8704f1f0cbd0e73"}, "previousDescription": "<TopologyDescription id: 6724f2c0a8704f1f0cbd0e73, topology_type: Unknown, servers: []>", "newDescription": "<TopologyDescription id: 6724f2c0a8704f1f0cbd0e73, topology_type: Unknown, servers: [<ServerDescription ('db', 27017) server_type: Unknown, rtt: None>]>", "message": "Topology description changed"}
DEBUG:pymongo.topology:{"topologyId": {"$oid": "6724f2c0a8704f1f0cbd0e73"}, "serverHost": "db", "serverPort": 27017, "message": "Starting server monitoring"}
DEBUG:pymongo.connection:{"clientId": {"$oid": "6724f2c0a8704f1f0cbd0e73"}, "message": "Connection pool created", "serverHost": "db", "serverPort": 27017}
DEBUG:pymongo.topology:{"topologyId": {"$oid": "6724f2c0a8704f1f0cbd0e73"}, "driverConnectionId": 1, "serverHost": "db", "serverPort": 27017, "awaited": false, "

KeyboardInterrupt: 

DEBUG:pymongo.topology:{"topologyId": {"$oid": "6724f2c0a8704f1f0cbd0e74"}, "serverHost": "localhost", "serverPort": 27017, "awaited": false, "durationMS": 1.155124991782941, "failure": "\"AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')\"", "message": "Server heartbeat failed"}
DEBUG:pymongo.topology:{"topologyId": {"$oid": "6724f2c0a8704f1f0cbd0e74"}, "previousDescription": "<TopologyDescription id: 6724f2c0a8704f1f0cbd0e74, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>", "newDescription": "<TopologyDescription id: 6724f2c0a8704f1f0cbd0e74, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:2701

In [3]:
!pip uninstall PyMuPDF -y
!pip install PyMuPDF==1.22.0

Found existing installation: PyMuPDF 1.24.13
Uninstalling PyMuPDF-1.24.13:
  Successfully uninstalled PyMuPDF-1.24.13
Collecting PyMuPDF==1.22.0
  Downloading PyMuPDF-1.22.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (8.3 kB)
Downloading PyMuPDF-1.22.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (14.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.22.0
