In [None]:
import json

import boto3

from get_smartsearch_data import *

In [None]:
# Fetch the OCR payload for one hard-coded file UUID. get_ocr_data is not
# defined in this file; presumably it is provided by the star import of
# get_smartsearch_data -- verify against that module.
response = get_ocr_data('6f1b02b9-3eef-4d21-b4bf-1db8487d7e56')

# Bare expression: reads the file-level OCR text from the payload. In a
# notebook cell its value is what gets displayed as the cell output.
response['currentVersion']['file']['text']

In [None]:
# Bedrock Tools Integration
class ClaimEvidenceTools:
    def __init__(self):
        self.bedrock_runtime = boto3.client('bedrock-runtime')
        self.system_prompt = self._get_system_prompt()

    def _get_system_prompt(self):
        return """You are a VA Claim Evidence Assistant designed to help process and analyze veteran claim documents. You have access to specialized tools for interacting with the VA's Claim Evidence API system.

YOUR CAPABILITIES:
1. **OCR Data Extraction**: Extract and analyze text from document files using OCR technology
2. **File Metadata Retrieval**: Get detailed information about files including properties and administrative data
3. **Folder Search**: Search veteran folders using OpenSearch queries with full query DSL support
4. **Document Upload**: Upload summary documents to veteran files

DATA STRUCTURE KNOWLEDGE:

**OCR Data Structure:**
- OCR data contains hierarchical structure: file ‚Üí pages ‚Üí lines ‚Üí words with geometry and confidence data
- Full document text available at currentVersion.file.text
- Page-by-page breakdown with coordinate information for precise positioning

**File Metadata Structure (get_metadata response):**
Complete file metadata includes the same structure as search results but with full detail:
- `owner`: {type: str, id: str}
- `uuid`: Unique file identifier
- `currentVersionUuid`: Current version identifier
- `currentVersion`:
  - `systemData`: {uploadedDateTime, contentName, mimeType, uploadSource}
  - `providerData`: {subject, documentTypeId, ocrStatus, newMail, systemSource, noEvidentiaryValue, modifiedDateTime, archiveBin, readByCurrentUser, veteranLastName, dateVaReceivedDocument, veteranFirstName, contentSource, actionable}


**Search Results Structure:**
Search results return an object with:
- `page`: Pagination info {totalPages, requestedResultsPerPage, currentPage, totalResults}
- `files`: Array of document objects, each containing:
  - `uuid`: Unique file identifier (use this for get_ocr_data and get_metadata calls)
  - `currentVersionUuid`: Current version identifier
  - `owner`: Document ownership info {type, id}
  - `currentVersion.systemData`: Technical metadata
    - `uploadedDateTime`: When uploaded to system (format: "2019-12-19T17:54:29")
    - `contentName`: Original filename
    - `mimeType`: File type (PDF, image, etc.)
    - `uploadSource`: Where it came from
  - `currentVersion.providerData`: Business metadata
    - `subject`: Document subject/title
    - `documentTypeId`: Numeric document type (181=summary, 719=exam request, etc.)
    - `ocrStatus`: OCR processing status
    - `newMail`: Whether document is new mail
    - `systemSource`: Source system identifier
    - `modifiedDateTime`: Last modification timestamp (format: "2025-02-03T20:14:48")
    - `readByCurrentUser`: User has viewed this document
    - `veteranFirstName`, `veteranLastName`: Veteran identification
    - `dateVaReceivedDocument`: When VA received the document (format: "2019-12-19")
    - `contentSource`: Source system (VBMS, etc.)
    - `actionable`: Whether document requires action
    - `associatedClaimIds`: Array of related claim IDs

**Search Query Structure:**
The search_folder tool uses a custom API structure with specific filter syntax. Use these as examples:
```json
{
  "pageRequest": {
    "resultsPerPage": 30,  // Number of results per page (default: 30)
    "page": 1              // Page number (starts at 1)
  },
  "filters": {
    "providerData.documentTypeId": {
      "evaluationType": "EQUALS",
      "value": "[\"719\", \"181\"]"  // JSON-encoded array of strings
    },
    "providerData.ocrStatus": {
      "evaluationType": "EQUALS", 
      "value": "[\"Searchable\"]"   // JSON-encoded array
    },
    "textContent": {
      "evaluationType": "QUERY",
      "query": {
        "includesAll": ["medical", "exam"],           // All terms must be present
        "includesAtLeastOne": ["request", "report"],  // At least one term must be present
        "excludes": ["cancelled", "denied"]           // None of these terms should be present
      }
    }
  },
  "dateFilters": {
    "dateVaReceivedDocument": {
      "gte": "2024-01-01",            // Greater than or equal to date
      "lte": "2024-12-31"             // Less than or equal to date
    }
  },
  "sort": [
    {
      "field": "dateVaReceivedDocument",  // Field to sort by
      "order": "desc"                     // "asc" or "desc"
    }
  ]
}"""
    def filter_ocr_data(self, ocr_data):
        """Reduce an OCR payload to the text fields only.

        Each entry of ``currentVersion.file.pages`` is cut down to its
        ``pageNumber`` and ``text`` keys (whichever are present); every other
        key is dropped. If no page-level text survives, the file-level
        ``text`` (when present) is kept instead, so a payload that only
        carries the full document text is not reduced to an empty object.

        Args:
            ocr_data: The OCR response. Anything that is not a dict with a
                ``currentVersion`` key (for example an error payload) is
                returned unchanged.

        Returns:
            dict: ``{"currentVersion": {"file": {...}}}`` with the filtered
            content, the untouched input when it has no ``currentVersion``,
            or ``{"error": ...}`` if the payload has an unexpected shape.
        """
        try:
            if not isinstance(ocr_data, dict) or 'currentVersion' not in ocr_data:
                return ocr_data

            file_data = ocr_data.get('currentVersion', {}).get('file', {})
            filtered_file = {}

            # Keep only pageNumber and text for every page, in that key order.
            pages = file_data['pages'] if 'pages' in file_data else None
            if isinstance(pages, list):
                filtered_file['pages'] = [
                    {key: page[key] for key in ('pageNumber', 'text') if key in page}
                    for page in pages
                ]

            # Fallback: without any page-level text the result would carry no
            # text at all, so preserve the file-level text in that case only
            # (adding it unconditionally would duplicate the document).
            has_page_text = any(
                'text' in page for page in filtered_file.get('pages', [])
            )
            if not has_page_text and 'text' in file_data:
                filtered_file['text'] = file_data['text']

            return {"currentVersion": {"file": filtered_file}}

        except Exception as e:
            # Best-effort boundary: report the problem to the caller (and the
            # model) as data rather than aborting the tool call.
            print(f"Error filtering OCR data: {str(e)}")
            return {"error": f"Error filtering OCR data: {str(e)}"}
    
    def get_tool_definitions(self):
        """Build the tool specifications passed to Bedrock as ``toolConfig.tools``.

        Returns:
            list[dict]: One ``{"toolSpec": {...}}`` entry per tool, in the
            order get_ocr_data, get_metadata, search_folder,
            upload_summary_doc. Each entry has a name, a description and a
            JSON-schema ``inputSchema`` whose property names are the keys
            ``execute_tool`` reads from the tool input.
        """

        def tool_spec(name, description, properties, required):
            # Wrap an object schema in the toolSpec envelope.
            return {
                "toolSpec": {
                    "name": name,
                    "description": description,
                    "inputSchema": {
                        "json": {
                            "type": "object",
                            "properties": properties,
                            "required": required,
                        }
                    },
                }
            }

        def date_range():
            # A fresh dict per call so the three date filters share no state.
            return {
                "type": "object",
                "properties": {
                    "gte": {"type": "string", "format": "date", "description": "Greater than or equal to date (YYYY-MM-DD)"},
                    "lte": {"type": "string", "format": "date", "description": "Less than or equal to date (YYYY-MM-DD)"},
                },
            }

        page_request_schema = {
            "type": "object",
            "properties": {
                "resultsPerPage": {
                    "type": "integer",
                    # NOTE(review): the description advertises 30 while the
                    # schema default is 5 -- confirm which one is intended.
                    "description": "Number of results per page (default: 30)",
                    "default": 5,
                },
                "page": {
                    "type": "integer",
                    "description": "Page number starting from 1",
                    "default": 1,
                },
            },
            "required": ["resultsPerPage", "page"],
        }

        query_body_schema = {
            "type": "object",
            "description": "Search query with pageRequest and REQUIRED filters field (can be empty), optional dateFilters and sort",
            "properties": {
                "pageRequest": page_request_schema,
                "filters": {
                    "type": "object",
                    "description": "REQUIRED field - can be empty object {} or contain optional filter specifications",
                },
                "dateFilters": {
                    "type": "object",
                    "description": "Optional date-based filters using gte/lte operators",
                    "properties": {
                        "dateVaReceivedDocument": date_range(),
                        "uploadedDateTime": date_range(),
                        "modifiedDateTime": date_range(),
                    },
                },
                "sort": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "field": {"type": "string", "description": "Field to sort by (e.g., 'dateVaReceivedDocument', 'uploadedDateTime', 'modifiedDateTime')"},
                            "order": {"type": "string", "enum": ["asc", "desc"], "description": "Sort order"},
                        },
                    },
                },
            },
            "required": ["pageRequest", "filters"],
        }

        return [
            tool_spec(
                "get_ocr_data",
                # The result is passed through filter_ocr_data before the
                # model sees it, so the description must not promise
                # geometry or confidence data.
                "Extract OCR text from a file using its UUID. Returns page-level text only (pageNumber and text for each page); geometry, line/word detail, and confidence scores are stripped from the result.",
                {
                    "file_id": {
                        "type": "string",
                        "description": "The UUID of the file to extract OCR data from (use the 'uuid' field from search results)",
                    }
                },
                ["file_id"],
            ),
            tool_spec(
                "get_metadata",
                "Retrieve detailed metadata for a specific file including file properties, upload info, and business data.",
                {
                    "file_id": {
                        "type": "string",
                        "description": "The UUID of the file to get metadata for (use the 'uuid' field from search results)",
                    }
                },
                ["file_id"],
            ),
            tool_spec(
                "search_folder",
                "Search for files within a veteran's folder using custom search API with pagination, optional filters, and date filtering. The 'filters' field is REQUIRED in the query_body (can be empty object). Returns object with 'page' (pagination info) and 'files' (array of documents).",
                {
                    "filenumber": {
                        "type": "string",
                        "description": "The veteran's file number to search within",
                    },
                    "query_body": query_body_schema,
                },
                ["filenumber", "query_body"],
            ),
            tool_spec(
                "upload_summary_doc",
                "Upload a summary document to a veteran's file using their participant ID.",
                {
                    "participant_id": {
                        "type": "string",
                        "description": "The veteran's participant ID",
                    },
                    "local_file": {
                        "type": "string",
                        "description": "Path to the local file to upload",
                    },
                },
                ["participant_id", "local_file"],
            ),
        ]
    
    def execute_tool(self, tool_name: str, parameters: dict):
        """Run one of the supported tools and return its result.

        Args:
            tool_name: One of ``get_ocr_data``, ``get_metadata``,
                ``search_folder`` or ``upload_summary_doc``.
            parameters: The tool input; must contain the arguments listed for
                the tool below. ``None`` is treated as an empty mapping.

        Returns:
            The tool's return value (OCR results are passed through
            ``filter_ocr_data`` first). Any failure -- unknown tool, missing
            argument, or an exception raised by the tool itself -- is
            returned as ``{"error": "<message>"}`` instead of being raised,
            so the message can be handed back to the model.
        """
        # Required argument names plus a lazily evaluated call per tool. The
        # lambdas defer the lookup of the underlying functions until a tool
        # is actually invoked.
        dispatch = {
            "get_ocr_data": (
                ("file_id",),
                lambda p: self.filter_ocr_data(get_ocr_data(p["file_id"])),
            ),
            "get_metadata": (
                ("file_id",),
                lambda p: get_metadata(p["file_id"]),
            ),
            "search_folder": (
                ("filenumber", "query_body"),
                lambda p: search_folder(p["filenumber"], p["query_body"]),
            ),
            "upload_summary_doc": (
                ("participant_id", "local_file"),
                lambda p: upload_summary_doc(p["participant_id"], p["local_file"]),
            ),
        }
        try:
            if tool_name not in dispatch:
                raise ValueError(f"Unknown tool: {tool_name}")
            required, call = dispatch[tool_name]

            # Validate up front: a bare KeyError would stringify to just the
            # quoted key name, which tells the model nothing actionable.
            supplied = parameters if parameters is not None else {}
            missing = [key for key in required if key not in supplied]
            if missing:
                raise ValueError(
                    f"Missing required parameter(s) for {tool_name}: {', '.join(missing)}"
                )
            return call(supplied)
        except Exception as e:
            return {"error": str(e)}
    
    def summarize_search_results(self, search_results):
        """Condense search results into a short, readable summary.

        Args:
            search_results: Either a list of document objects, or a search
                response object holding the documents under ``files`` (and
                optionally pagination data under ``page``).

        Returns:
            dict: ``{"total_documents": int, "documents": [...]}`` where each
            document summary holds uuid, content_name, subject,
            document_type_id, date_received, veteran_name, actionable,
            read_by_user and ocr_status. When a response object with ``page``
            is given, that pagination data is added under ``"page"``.
            ``{"error": ...}`` is returned for any other input shape or if a
            document cannot be read.
        """
        try:
            page_info = None
            if isinstance(search_results, list):
                documents = search_results
            elif isinstance(search_results, dict) and isinstance(search_results.get("files"), list):
                documents = search_results["files"]
                page_info = search_results.get("page")
            else:
                return {"error": "Search results not in expected format"}

            summary = {
                "total_documents": len(documents),
                "documents": [],
            }

            for doc in documents:
                # `or {}` / `or ""` also cover keys that are present but null,
                # which a plain .get(key, default) does not.
                version = doc.get("currentVersion") or {}
                system_data = version.get("systemData") or {}
                provider_data = version.get("providerData") or {}
                first_name = provider_data.get("veteranFirstName") or ""
                last_name = provider_data.get("veteranLastName") or ""

                summary["documents"].append({
                    "uuid": doc.get("uuid"),
                    "content_name": system_data.get("contentName"),
                    "subject": provider_data.get("subject"),
                    "document_type_id": provider_data.get("documentTypeId"),
                    "date_received": provider_data.get("dateVaReceivedDocument"),
                    "veteran_name": f"{first_name} {last_name}".strip(),
                    "actionable": provider_data.get("actionable"),
                    "read_by_user": provider_data.get("readByCurrentUser"),
                    "ocr_status": provider_data.get("ocrStatus"),
                })

            if page_info is not None:
                summary["page"] = page_info
            return summary
        except Exception as e:
            return {"error": f"Error summarizing results: {str(e)}"}
    
    def extract_text_from_ocr(self, ocr_data):
        """Pull readable text out of an OCR payload.

        The file-level ``text`` is used when the key is present. Otherwise
        the text is assembled from the pages that carry a ``text`` key, each
        prefixed with ``Page <n>:`` and separated by blank lines. A page
        without ``pageNumber`` is labelled with its 1-based position in the
        list instead of failing the whole extraction.

        Args:
            ocr_data: Dict expected to contain ``currentVersion.file``.

        Returns:
            dict: ``{"full_text", "total_pages", "pages_count"}`` on success.
            ``pages_count`` is the number of page entries when file-level
            text is used, and the number of pages contributing text
            otherwise. ``{"error": ...}`` for a missing
            ``currentVersion.file`` or an unreadable payload.
        """
        try:
            if not isinstance(ocr_data, dict):
                return {"error": "Invalid OCR data structure"}
            version = ocr_data.get('currentVersion')
            if not isinstance(version, dict) or 'file' not in version:
                return {"error": "Invalid OCR data structure"}

            file_data = version['file']
            pages = file_data.get('pages', [])
            total_pages = file_data.get('totalPages', 0)

            # Prefer the ready-made full text when the payload provides it.
            if 'text' in file_data:
                return {
                    "full_text": file_data['text'],
                    "total_pages": total_pages,
                    "pages_count": len(pages),
                }

            # Otherwise stitch the document together page by page.
            pages_text = [
                f"Page {page.get('pageNumber', position)}: {page['text']}"
                for position, page in enumerate(pages, start=1)
                if 'text' in page
            ]
            return {
                "full_text": "\n\n".join(pages_text),
                "total_pages": total_pages,
                "pages_count": len(pages_text),
            }
        except Exception as e:
            return {"error": f"Error extracting text: {str(e)}"}
    
    def chat_with_tools(self, message: str, model_id: str = "anthropic.claude-3-5-sonnet-20240620-v1:0", max_iterations: int = 10):
        """Chat with Claude using your tools and system prompt with support for multiple tool calls"""
        
        # Prepare the conversation with system prompt
        messages = [
            {
                "role": "user",
                "content": [{"text": message}]
            }
        ]
        
        iteration_count = 0
        
        while iteration_count < max_iterations:
            # Call Bedrock with tools and system prompt
            response = self.bedrock_runtime.converse(
                modelId=model_id,
                messages=messages,
                system=[{"text": self.system_prompt}],
                toolConfig={
                    "tools": self.get_tool_definitions()
                }
            )
            
            # Process the response
            stop_reason = response['stopReason']
            iteration_count += 1
            
            print(f"üîÑ Iteration {iteration_count}, Stop reason: {stop_reason}")
            
            if stop_reason == 'tool_use':
                # Claude wants to use tool(s)
                tool_requests = []
                
                # Add Claude's message (which contains tool use requests)
                messages.append(response['output']['message'])
                
                for content in response['output']['message']['content']:
                    if 'toolUse' in content:
                        tool_use = content['toolUse']
                        tool_name = tool_use['name']
                        tool_input = tool_use['input']
                        tool_id = tool_use['toolUseId']
                        
                        print(f"üîß Using tool: {tool_name}")
                        print(f"üìù Input: {tool_input}")
                        
                        # Execute the tool
                        result = self.execute_tool(tool_name, tool_input)
                        
                        # Add some error handling for large results
                        result_str = json.dumps(result)
                        if len(result_str) > 50000:  # Truncate very large results
                            print(f"‚ö†Ô∏è  Large result truncated ({len(result_str)} chars)")
                            if isinstance(result, dict) and 'error' not in result:
                                # For large successful results, provide a summary
                                result = {
                                    "summary": f"Large dataset with {len(result)} items" if isinstance(result, list) else "Large response received",
                                    "first_items": result[:3] if isinstance(result, list) else str(result)[:1000],
                                    "total_size": len(result) if isinstance(result, list) else len(str(result))
                                }
                                result_str = json.dumps(result)
                        
                        tool_requests.append({
                            "toolResult": {
                                "toolUseId": tool_id,
                                "content": [{"text": result_str}]
                            }
                        })
                        
                        print(f"‚úÖ Tool completed: {tool_name}")
                
                # Send tool results back to Claude
                messages.append({
                    "role": "user",
                    "content": tool_requests
                })
                
                # Continue the loop to get Claude's next response
                continue
                
            elif stop_reason == 'end_turn':
                # Claude is done and provided a final response
                final_text = ""
                for content in response['output']['message']['content']:
                    if 'text' in content:
                        final_text += content['text']
                
                print(f"‚úÖ Final response received after {iteration_count} iterations")
                return final_text
                
            elif stop_reason == 'max_tokens':
                # Hit token limit
                print(f"‚ö†Ô∏è  Hit max tokens limit after {iteration_count} iterations")
                final_text = ""
                for content in response['output']['message']['content']:
                    if 'text' in content:
                        final_text += content['text']
                return final_text + "\n\n[Response truncated due to length limit]"
                
            else:
                # Other stop reasons (stop_sequence, etc.)
                print(f"üîö Stopped with reason: {stop_reason}")
                final_text = ""
                for content in response['output']['message']['content']:
                    if 'text' in content:
                        final_text += content['text']
                return final_text
        
        # If we hit max iterations
        print(f"‚ö†Ô∏è  Hit maximum iterations ({max_iterations})")
        return "I've reached the maximum number of tool calls. Please try breaking your request into smaller parts."

In [None]:
# Initialize the tools
# Constructing ClaimEvidenceTools creates the Bedrock runtime client and
# builds the system prompt (see ClaimEvidenceTools.__init__).
tools = ClaimEvidenceTools()

# Example 1: Find specific file
# chat_with_tools runs the tool-use loop (at most 10 model calls by default)
# and returns the model's final text, which is printed below.
response = tools.chat_with_tools("""
Search for the file '6f1b02b9-3eef-4d21-b4bf-1db8487d7e56' and give me the first page of OCR output.
""")
print(response)

In [None]:
# Example 2: Find latest three claims for veteran
# Relies on `tools` created in the earlier cell. The request spans several
# tool calls (a folder search, then OCR lookups per uuid), all handled inside
# the chat_with_tools loop; only the final text is returned and printed.
response = tools.chat_with_tools("""
Search in the folder for the veteran 032010268 and find me the 3 latest claims, use the uuid from each of them and find me the pages from the ocr text and summarize them. 
""")
print(response)