In [1]:
# Import required libraries
import os
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
from datetime import datetime

# Connect to MongoDB and select the `raw_assets` collection of the `chelle` database
client = MongoClient('mongodb://db:27017/')
db = client.chelle
raw_assets = db.raw_assets

# Fetch the file object. find_one() with no filter returns a single document,
# or None when the collection holds no documents.
file = raw_assets.find_one()
if file is None:
    # Fail here with a clear message instead of an opaque
    # "'NoneType' object is not subscriptable" further down the cell.
    raise LookupError("No documents found in collection 'raw_assets' of database 'chelle'; nothing to summarize.")

# Function to format dates for better readability
def format_datetime(dt):
    """Render a datetime as an ISO 8601 string; pass anything else through.

    Args:
        dt: Value to format. Only ``datetime`` instances are converted.

    Returns:
        ``dt.isoformat()`` when ``dt`` is a ``datetime``; otherwise ``dt``
        itself, unchanged (e.g. ``None`` for a missing date field).
    """
    return dt.isoformat() if isinstance(dt, datetime) else dt

# Function to convert app paths to jupyter paths
def convert_path(app_path, app_root='/app/filestore', jupyter_root='/home/jovyan/api/filestore'):
    """Translate a path under the app's filestore root to the Jupyter-side root.

    Only a leading ``app_root`` is rewritten. A plain ``str.replace`` would
    also rewrite the same text appearing later in the path, or a sibling
    directory that merely shares the prefix (``/app/filestore_backup``), and
    so produce a path that does not exist.

    Args:
        app_path: Path as stored by the app. May be ``None`` or empty.
        app_root: Root directory on the app side, without a trailing slash.
        jupyter_root: Directory that ``app_root`` maps to on the Jupyter side.

    Returns:
        ``None`` when ``app_path`` is falsy; the translated path when
        ``app_path`` is ``app_root`` or lies beneath it; otherwise
        ``app_path`` unchanged.
    """
    if not app_path:
        return None
    # Require a path-component boundary after the root so that
    # '/app/filestore_backup/...' is not treated as being inside the root.
    if app_path == app_root or app_path.startswith(app_root + '/'):
        return jupyter_root + app_path[len(app_root):]
    return app_path

# Function to get a flattened view of processed paths with converted paths
def get_processed_paths(file):
    """Summarize a document's ``processed_paths`` entry as a flat dict.

    Args:
        file: Mapping with an optional ``processed_paths`` sub-mapping that
            may hold ``markdown``, ``images``, ``tables`` and ``meta`` keys.

    Returns:
        Dict with ``markdown_path`` and ``meta_path`` (each passed through
        ``convert_path``) and ``image_count`` / ``table_count`` (the number
        of entries under ``images`` / ``tables``; 0 when absent or null).
    """
    # `or {}` rather than a .get() default: a key that is present but null
    # would otherwise reach len()/.get() as None and raise.
    paths = file.get('processed_paths') or {}
    return {
        'markdown_path': convert_path(paths.get('markdown')),
        'image_count': len(paths.get('images') or {}),
        'table_count': len(paths.get('tables') or {}),
        'meta_path': convert_path(paths.get('meta'))
    }

# Build a sectioned, human-readable summary of the fetched document.
# '_id', 'original_name', 'file_type', 'file_size' and 'upload_date' are
# indexed directly, so a document missing any of them raises KeyError;
# every other field falls back to a default via .get().
file_summary = {
    'Basic Info': {
        'ID': str(file['_id']),
        'Name': file['original_name'],
        'Type': file['file_type'],
        # Divides by 1024 and labels the result KB — presumably file_size is in bytes; TODO confirm
        'Size': f"{file['file_size'] / 1024:.2f} KB",
        'Status': file.get('status', 'unknown')
    },
    'Dates': {
        'Upload Date': format_datetime(file['upload_date']),
        # format_datetime passes None through when 'processed_date' is absent
        'Processed Date': format_datetime(file.get('processed_date'))
    },
    'Processing': {
        'Is Processed': file.get('processed', False),
        'Has Images': file.get('has_images', False),
        'Image Count': file.get('image_count', 0),
        'Has Tables': file.get('has_tables', False),
        'Table Count': file.get('table_count', 0)
    },
    # Counts here are derived from 'processed_paths', independently of the
    # 'image_count' / 'table_count' fields shown under 'Processing'.
    'Paths': get_processed_paths(file)
}

# Print the summary: one heading per section, then indented "key: value" lines
print("=== File Summary ===")
for section, data in file_summary.items():
    print(f"\n{section}:")
    for key, value in data.items():
        print(f"  {key}: {value}")

# Preview the extracted markdown when the file is reachable from this kernel.
# `or {}` guards against a 'processed_paths' key that is present but null.
markdown_path = convert_path((file.get('processed_paths') or {}).get('markdown'))
if markdown_path and os.path.exists(markdown_path):
    # Explicit encoding: open()'s default depends on the kernel's locale.
    # Assumes the file is UTF-8 — TODO confirm. errors='replace' keeps this
    # preview from failing on a stray undecodable byte.
    with open(markdown_path, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()
    print("\n=== First 500 characters of content ===")
    # Append the ellipsis only when something was actually cut off
    print(content[:500] + ("..." if len(content) > 500 else ""))

# If there are tables, load each one into a DataFrame and preview it.
# `or {}` guards against a 'processed_paths' key that is present but null.
table_paths = (file.get('processed_paths') or {}).get('tables')
if table_paths:
    print("\n=== Tables ===")
    for table_name, table_path in table_paths.items():
        jupyter_path = convert_path(table_path)
        print(f"\nTable: {table_name}")
        # convert_path() returns None for an empty path and os.path.exists(None)
        # raises TypeError, so test for a usable path first. A missing file is
        # reported rather than silently skipped.
        if not jupyter_path or not os.path.exists(jupyter_path):
            print(f"Missing file: {jupyter_path}")
            continue
        df = pd.read_csv(jupyter_path)
        print(df.head())
        print(f"Shape: {df.shape}")

# If there are images, list each one with its Jupyter-side path and whether it exists.
# `or {}` guards against a 'processed_paths' key that is present but null.
image_paths = (file.get('processed_paths') or {}).get('images')
if image_paths:
    print("\n=== Images ===")
    for img_name, img_path in image_paths.items():
        jupyter_path = convert_path(img_path)
        print(f"Image: {img_name}")
        print(f"Path: {jupyter_path}")
        # convert_path() returns None for an empty path; os.path.exists(None)
        # would raise TypeError, so report such entries as not existing.
        print(f"Exists: {bool(jupyter_path) and os.path.exists(jupyter_path)}")

=== File Summary ===

Basic Info:
  ID: 672d59878ccbfaf2d9b5abbd
  Name: sample.docx
  Type: application/vnd.openxmlformats-officedocument.wordprocessingml.document
  Size: 152.87 KB
  Status: processing

Dates:
  Upload Date: 2024-11-08T00:21:27.595000
  Processed Date: 2024-11-08T00:29:16.526000

Processing:
  Is Processed: True
  Has Images: True
  Image Count: 1
  Has Tables: True
  Table Count: 1

Paths:
  markdown_path: /home/jovyan/api/filestore/processed/f0d2591e65a7e60e4c0dc5e4656a95905e4e891215cf134f61ebf6e98cdbc1f7/content.md
  image_count: 1
  table_count: 1
  meta_path: /home/jovyan/api/filestore/processed/f0d2591e65a7e60e4c0dc5e4656a95905e4e891215cf134f61ebf6e98cdbc1f7/meta.json

=== First 500 characters of content ===
# Product Roadmap

# Next 3 Months (Q4 2024)

### Feature Development

1. Implement sleep tracking functionality
- Analyze sleep patterns and quality - Integrate with existing health metrics 2. Enhance data visualization
- Introduce interactive graphs and c

In [22]:
# Import required libraries
import os
import json
import requests
from pymongo import MongoClient

# Connect to MongoDB and select the `raw_assets` collection of the `chelle` database
client = MongoClient('mongodb://db:27017/')
db = client.chelle
raw_assets = db.raw_assets

# Fetch the file object. find_one() with no filter returns a single document,
# or None when the collection holds no documents.
file = raw_assets.find_one()
if file is None:
    # Fail here with a clear message instead of an opaque
    # "'NoneType' object is not subscriptable" when the paths are read below.
    raise LookupError("No documents found in collection 'raw_assets' of database 'chelle'; nothing to extract metadata from.")

# Read the prompt template
with open('/home/jovyan/api/prompts/assets/metadata.txt', 'r') as f:
    prompt_template = f.read()

# Get the file content
markdown_path = file['processed_paths']['markdown'].replace('/app/filestore', '/home/jovyan/api/filestore')
with open(markdown_path, 'r') as f:
    file_content = f.read()

# Prepare the prompt by combining template and content
prompt = prompt_template + "\n\nDocument Content:\n" + file_content

# Call the chat API with the full prompt as a single query.
response = requests.post(
    'http://api:8000/chat',
    json={
        'query': prompt,
        'messages': []  # No prior conversation context needed
    },
    # requests applies no timeout by default, so a stalled API would hang the
    # kernel indefinitely. (connect, read) in seconds; the read limit is
    # generous on the assumption that the reply can take a while — tune as needed.
    timeout=(5, 300),
)

# Parse the response
if response.ok:
    chat_response = response.json()
    response_text = chat_response['message']

    # Print the raw response first
    print("=== Raw Chat Response ===")
    print(response_text)

    print("\n=== Attempting to parse JSON from response ===")
    # Look for the start of a JSON object in the response
    json_start = response_text.find('{')
    if json_start < 0:
        # Previously this case printed nothing at all; report it explicitly.
        print("Could not parse JSON from response. The response might be in a different format.")
    else:
        try:
            # raw_decode() parses one complete JSON value starting at json_start
            # and returns the index where it ended, so trailing prose — even
            # prose containing '}' — no longer breaks the parse the way a
            # first-'{'-to-last-'}' slice did.
            metadata, json_end = json.JSONDecoder().raw_decode(response_text, json_start)
        except json.JSONDecodeError:
            print("Could not parse JSON from response. The response might be in a different format.")
        else:
            json_str = response_text[json_start:json_end]

            print("\nExtracted Metadata:")
            print(json.dumps(metadata, indent=2))

            # Optionally, update the MongoDB document with the new metadata
            print("\nWould you like to update the MongoDB document with this metadata?")
            print("To update, run: raw_assets.update_one({'_id': file['_id']}, {'$set': {'extracted_metadata': metadata}})")
else:
    # Non-2xx/3xx status: show the code and body to aid debugging
    print(f"Error calling chat API: {response.status_code}")
    print(response.text)

=== Raw Chat Response ===
{
  "summary": "This appears to be a comprehensive product roadmap document (95% confidence) for a health and fitness app, outlining planned features, user experience enhancements, infrastructure improvements, and business expansion initiatives over the next 12 months. The document demonstrates high structural quality (90/100) with clear sections, consistent formatting, and visualizations. It covers current and upcoming app capabilities in detail, suggesting the roadmap targets a technical/product audience.",
  "documentMetadata": {
    "primaryType": {
      "category": "technical",
      "subType": "product roadmap",
      "confidence": 95,
      "customType": null,
      "customTypeJustification": null
    },
    "contentProperties": {
      "targetAudience": [
        {
          "type": "product team",
          "confidence": 90
        },
        {
          "type": "technical",
          "confidence": 85
        }
      ],
      "formalityLevel": {
    