In [1]:
import boto3
import json
import argparse
import sys
from botocore.exceptions import ClientError

def _find_matches(content, needle):
    """
    Collect the matches found in the text of a single chat history file.

    Parameters:
      content (str): Decoded text of one file.
      needle (str): The search term, already lower-cased by the caller.

    Returns:
      list: Matching message dicts when the content parses as JSON,
            otherwise the matching raw text lines.
    """
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        # Not valid JSON: fall back to a plain line-by-line text search.
        return [line for line in content.splitlines() if needle in line.lower()]

    # A file may hold one message (dict) or many (list); wrap the single
    # message in a list so both shapes go through the same loop.
    if isinstance(data, dict) and "message_text" in data:
        messages = [data]
    elif isinstance(data, list):
        messages = data
    else:
        messages = []

    matches = []
    for msg in messages:
        if not isinstance(msg, dict):
            continue
        text = msg.get("message_text")
        # "message_text" can be null or a non-string in the JSON; calling
        # .lower() on it would raise, so only real strings are searched.
        if isinstance(text, str) and needle in text.lower():
            matches.append(msg)
    return matches


def query_chat_history(bucket_name, prefix, search_term, s3_client=None):
    """
    Query chat history files in an S3 bucket for a given search term.

    The search is a case-insensitive substring match. JSON files are searched
    through the "message_text" field of each message; files that are not valid
    JSON are searched line by line. Objects that cannot be retrieved or are not
    valid UTF-8 are reported on stdout and skipped, so one bad object does not
    abort the whole scan.

    Parameters:
      bucket_name (str): The S3 bucket name.
      prefix (str): The prefix (or folder path) containing your chat files.
      search_term (str): The term to search for in the chat messages.
      s3_client: Optional S3 client to use. It must provide get_paginator()
                 and get_object(); when omitted, boto3.client('s3') is created.

    Returns:
      list: A list of matched messages (dicts for JSON files, strings for
            plain-text lines), in listing order.
    """
    client = s3_client if s3_client is not None else boto3.client('s3')

    # Lower-case the term once instead of once per message/line.
    needle = search_term.lower()

    paginator = client.get_paginator('list_objects_v2')
    matched_messages = []

    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # Pages without a "Contents" entry simply contribute no objects.
        for obj in page.get("Contents", []):
            key = obj["Key"]
            # Keys ending in "/" are folder placeholder objects, not chat
            # files; fetching them only wastes a request.
            if key.endswith("/"):
                continue

            print(f"Processing file: {key}")
            try:
                response = client.get_object(Bucket=bucket_name, Key=key)
                raw = response['Body'].read()
            except ClientError as e:
                print(f"Error retrieving {key}: {e}")
                continue

            try:
                content = raw.decode('utf-8')
            except UnicodeDecodeError as e:
                # Binary or differently-encoded object: skip it rather than crash.
                print(f"Skipping {key}: not valid UTF-8 ({e})")
                continue

            matched_messages.extend(_find_matches(content, needle))

    return matched_messages

def main():
    """
    Entry point: pick the query parameters, run the search and print the matches.

    Inside a Jupyter notebook / qtconsole kernel (IPython shell class
    'ZMQInteractiveShell') built-in default parameters are used. Everywhere
    else (plain Python, terminal IPython, any other shell) the parameters are
    read from the command line.
    """
    # get_ipython only exists when running under IPython; plain Python raises NameError.
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        shell_name = None

    # Only the notebook/qtconsole shell counts as interactive.
    interactive_mode = shell_name == 'ZMQInteractiveShell'

    if not interactive_mode:
        cli = argparse.ArgumentParser(description='Query chat history from an S3 bucket.')
        cli.add_argument('--bucket', required=True, help='S3 bucket name where chat history is stored.')
        cli.add_argument('--prefix', default='', help='Prefix for the chat history files (e.g., "Chat History/2025/02/05/").')
        cli.add_argument('--query', required=True, help='Search term to query in the chat history.')
        options = cli.parse_args()
        bucket_name, key_prefix, search_term = options.bucket, options.prefix, options.query
    else:
        # Defaults for interactive mode; edit these values as needed.
        print("Running in interactive mode. Using default parameters.")
        bucket_name = "my-tax-chat-logs-123"
        key_prefix = "Chat History/2025/02/05/"
        search_term = "taxes"

    results = query_chat_history(bucket_name, key_prefix, search_term)

    print(f"\nFound {len(results)} matches for '{search_term}':")
    for hit in results:
        print(hit)

# Run main() only when this file is executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()


Running in interactive mode. Using default parameters.
Processing file: Chat History/2025/02/05/
Processing file: Chat History/2025/02/05/structure.json

Found 1 matches for 'taxes':
{'conversation_id': 'abc123', 'timestamp': '2025-02-04T12:34:56Z', 'sender_type': 'user', 'message_text': 'Hello, how do I file my 2025 taxes?', 'metadata': {'user_id': 'user-456', 'tax_year': 2025, 'language': 'en'}}
