In [2]:
import boto3
import json
import argparse
import sys
from botocore.exceptions import ClientError

def _find_matches(content, needle):
    """
    Return the entries of one file's text that contain ``needle``.

    Parameters:
      content (str): The decoded text of a single chat history file.
      needle (str): The search term, already lowercased by the caller.

    Returns:
      list: Message dicts when ``content`` is valid JSON, otherwise the raw
      text lines that contain ``needle`` (case-insensitive).
    """
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        # Not valid JSON: fall back to a plain line-by-line text search.
        return [line for line in content.splitlines() if needle in line.lower()]

    # A single JSON object is treated as a one-message file; a JSON array is
    # treated as a list of messages; any other JSON value holds no messages.
    if isinstance(data, dict):
        messages = [data]
    elif isinstance(data, list):
        messages = data
    else:
        messages = []

    matches = []
    for msg in messages:
        if not isinstance(msg, dict):
            continue
        text = msg.get("message_text")
        # Only string fields are searchable; a missing, null or numeric
        # "message_text" is skipped instead of raising AttributeError.
        if isinstance(text, str) and needle in text.lower():
            matches.append(msg)
    return matches


def query_chat_history(bucket_name, prefix, search_term):
    """
    Query chat history files in an S3 bucket for a given search term.

    Every object listed under ``prefix`` is downloaded, decoded as UTF-8 and
    searched with a case-insensitive substring match. Objects that cannot be
    retrieved or are not valid UTF-8 are reported on stdout and skipped, so
    one bad file does not abort the whole query.

    Parameters:
      bucket_name (str): The S3 bucket name.
      prefix (str): The prefix (or folder path) containing your chat files.
      search_term (str): The term to search for in the chat messages.

    Returns:
      list: A list of matched messages. Entries are dicts (those with a
      matching "message_text") for JSON files and plain text lines for files
      that are not valid JSON.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')

    # Lowercase the search term once rather than for every message/line.
    needle = search_term.lower()
    matched_messages = []

    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # A page has no "Contents" key when nothing matches the prefix.
        for obj in page.get("Contents", []):
            key = obj["Key"]
            print(f"Processing file: {key}")
            try:
                response = s3.get_object(Bucket=bucket_name, Key=key)
                content = response['Body'].read().decode('utf-8')
            except ClientError as e:
                print(f"Error retrieving {key}: {e}")
                continue
            except UnicodeDecodeError as e:
                # Non-text object under the prefix; nothing to search in it.
                print(f"Skipping {key}: not valid UTF-8 text ({e})")
                continue

            matched_messages.extend(_find_matches(content, needle))

    return matched_messages

def _print_history(conversation_history):
    """Print one numbered summary line (query and match count) per history entry."""
    for idx, entry in enumerate(conversation_history, start=1):
        print(f"{idx}. Query: {entry['query']} - {len(entry['results'])} matches")


def interactive_mode(bucket, prefix):
    """
    Interactive loop for entering queries and storing the conversation history.

    Prompts for search terms until the user types 'exit' (case-insensitive)
    or input ends (EOF / keyboard interrupt). Each term is run through
    query_chat_history, its matches are printed, and a running history of
    queries and their results is shown after every search and once more on
    exit.

    Parameters:
      bucket (str): The S3 bucket name to search.
      prefix (str): The key prefix under which the chat files live.

    Returns:
      None
    """
    conversation_history = []  # List to store each query and its responses
    print("Entering interactive chat mode. Type 'exit' to quit.")
    while True:
        try:
            search_term = input("\nEnter search term (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin, Ctrl-D or Ctrl-C at the prompt: leave the loop
            # cleanly so the final history is still printed.
            print()
            break

        if search_term.lower() == 'exit':
            break

        if not search_term:
            # An empty string is a substring of everything, so a blank entry
            # would download and print every message under the prefix.
            print("Please enter a non-empty search term.")
            continue

        results = query_chat_history(bucket, prefix, search_term)
        print(f"\nFound {len(results)} matches for '{search_term}':")
        for match in results:
            print(match)

        # Save the query and its results in the conversation history.
        conversation_history.append({
            'query': search_term,
            'results': results
        })

        print("\nConversation history so far:")
        _print_history(conversation_history)

    print("Exiting interactive mode. Final conversation history:")
    _print_history(conversation_history)

def main():
    """
    Entry point: choose between notebook mode and command-line mode.

    When running under a Jupyter kernel (IPython shell class
    'ZMQInteractiveShell'), command-line arguments are not parsed; the
    interactive loop starts with built-in default bucket and prefix values.
    In every other environment the --bucket/--prefix/--query arguments are
    parsed: a non-empty --query runs a single search and prints the matches,
    otherwise the interactive loop is started for the given bucket/prefix.
    """
    # get_ipython only exists as a global when running under IPython; a
    # NameError therefore means a plain Python interpreter.
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        shell_name = None

    # Only the notebook/qtconsole shell counts as interactive; terminal
    # IPython and any other shell go through the argparse path.
    running_in_notebook = shell_name == 'ZMQInteractiveShell'

    if running_in_notebook:
        # Defaults for interactive mode; you can modify these as needed.
        print("Running in interactive mode. Using default parameters.")
        interactive_mode("my-tax-chat-logs-123", "Chat History/2025/02/05/")
        return

    cli = argparse.ArgumentParser(description='Query chat history from an S3 bucket.')
    cli.add_argument('--bucket', required=True, help='S3 bucket name where chat history is stored.')
    cli.add_argument('--prefix', default='', help='Prefix for the chat history files (e.g., "Chat History/2025/02/05/").')
    cli.add_argument('--query', required=False, help='Search term to query in the chat history.')
    options = cli.parse_args()

    if not options.query:
        # No query on the command line: fall back to the interactive loop.
        interactive_mode(options.bucket, options.prefix)
        return

    # One-shot mode: run the single query and print every match.
    hits = query_chat_history(options.bucket, options.prefix, options.query)
    print(f"\nFound {len(hits)} matches for '{options.query}':")
    for hit in hits:
        print(hit)

# Run main() only when this file is executed directly (as a script or as a
# notebook cell, where __name__ is also '__main__'), not when it is imported.
if __name__ == '__main__':
    main()


Running in interactive mode. Using default parameters.
Entering interactive chat mode. Type 'exit' to quit.
Exiting interactive mode. Final conversation history:
