From d055a021e92a94b2d5c0e11ed20987cf746fabb8 Mon Sep 17 00:00:00 2001
From: Erik M Jacobs <erikmjacobs@gmail.com>
Date: Thu, 21 Aug 2025 14:28:00 -0400
Subject: [PATCH 1/6] attempt to anonymize transcripts

---
 src/app/endpoints/conversations.py            |  16 +-
 src/app/endpoints/query.py                    |  17 +-
 src/models/database/conversations.py          |   4 +-
 src/models/database/user_mapping.py           |  28 ++
 src/utils/endpoints.py                        |   8 +-
 src/utils/transcripts.py                      |  21 +-
 src/utils/user_anonymization.py               | 122 ++++++++
 .../app/endpoints/test_query_anonymization.py | 197 +++++++++++++
 .../utils/test_endpoints_anonymization.py     | 171 +++++++++++
 .../utils/test_transcripts_anonymization.py   | 271 ++++++++++++++++++
 tests/unit/utils/test_user_anonymization.py   | 225 +++++++++++++++
 11 files changed, 1062 insertions(+), 18 deletions(-)
 create mode 100644 src/models/database/user_mapping.py
 create mode 100644 src/utils/user_anonymization.py
 create mode 100644 tests/unit/app/endpoints/test_query_anonymization.py
 create mode 100644 tests/unit/utils/test_endpoints_anonymization.py
 create mode 100644 tests/unit/utils/test_transcripts_anonymization.py
 create mode 100644 tests/unit/utils/test_user_anonymization.py

diff --git a/src/app/endpoints/conversations.py b/src/app/endpoints/conversations.py
index bbb7825d..c58ebcce 100644
--- a/src/app/endpoints/conversations.py
+++ b/src/app/endpoints/conversations.py
@@ -20,6 +20,7 @@
 from app.database import get_session
 from utils.endpoints import check_configuration_loaded, validate_conversation_ownership
 from utils.suid import check_suid
+from utils.user_anonymization import get_anonymous_user_id
 
 logger = logging.getLogger("app.endpoints.handlers")
 router = APIRouter(tags=["conversations"])
@@ -154,13 +155,22 @@ def get_conversations_list_endpoint_handler(
 
     user_id, _, _ = auth
 
-    logger.info("Retrieving conversations for user %s", user_id)
+    # Get anonymous user ID for database lookup
+    anonymous_user_id = get_anonymous_user_id(user_id)
+
+    logger.info(
+        "Retrieving conversations for user %s (anonymous: %s)",
+        user_id[:8] + "..." if len(user_id) > 8 else user_id,
+        anonymous_user_id,
+    )
 
     with get_session() as session:
         try:
-            # Get all conversations for this user
+            # Get all conversations for this user using anonymous ID
             user_conversations = (
-                session.query(UserConversation).filter_by(user_id=user_id).all()
+                session.query(UserConversation)
+                .filter_by(anonymous_user_id=anonymous_user_id)
+                .all()
             )
 
             # Return conversation summaries with metadata
diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py
index eae3ae28..92afdf8b 100644
--- a/src/app/endpoints/query.py
+++ b/src/app/endpoints/query.py
@@ -24,16 +24,17 @@
 from configuration import configuration
 from app.database import get_session
 import metrics
+import constants
 from models.database.conversations import UserConversation
 from models.responses import QueryResponse, UnauthorizedResponse, ForbiddenResponse
 from models.requests import QueryRequest, Attachment
-import constants
 from utils.endpoints import (
     check_configuration_loaded,
     get_agent,
     get_system_prompt,
     validate_conversation_ownership,
 )
+from utils.user_anonymization import get_anonymous_user_id
 from utils.mcp_headers import mcp_headers_dependency, handle_mcp_headers_with_toolgroups
 from utils.transcripts import store_transcript
 from utils.types import TurnSummary
@@ -76,25 +77,31 @@ def is_transcripts_enabled() -> bool:
 def persist_user_conversation_details(
     user_id: str, conversation_id: str, model: str, provider_id: str
 ) -> None:
-    """Associate conversation to user in the database."""
+    """Associate conversation to user in the database using anonymous user ID."""
+    # Get anonymous user ID for database storage
+    anonymous_user_id = get_anonymous_user_id(user_id)
+
     with get_session() as session:
         existing_conversation = (
             session.query(UserConversation)
-            .filter_by(id=conversation_id, user_id=user_id)
+            .filter_by(id=conversation_id, anonymous_user_id=anonymous_user_id)
             .first()
         )
 
         if not existing_conversation:
             conversation = UserConversation(
                 id=conversation_id,
-                user_id=user_id,
+                anonymous_user_id=anonymous_user_id,
                 last_used_model=model,
                 last_used_provider=provider_id,
                 message_count=1,
             )
             session.add(conversation)
             logger.debug(
-                "Associated conversation %s to user %s", conversation_id, user_id
+                "Associated conversation %s to anonymous user %s (original: %s)",
+                conversation_id,
+                anonymous_user_id,
+                user_id[:8] + "..." if len(user_id) > 8 else user_id,
             )
         else:
             existing_conversation.last_used_model = model
diff --git a/src/models/database/conversations.py b/src/models/database/conversations.py
index 1cce8a64..ce0ff795 100644
--- a/src/models/database/conversations.py
+++ b/src/models/database/conversations.py
@@ -16,8 +16,8 @@ class UserConversation(Base):  # pylint: disable=too-few-public-methods
     # The conversation ID
     id: Mapped[str] = mapped_column(primary_key=True)
 
-    # The user ID associated with the conversation
-    user_id: Mapped[str] = mapped_column(index=True)
+    # The anonymous user ID associated with the conversation
+    anonymous_user_id: Mapped[str] = mapped_column(index=True)
 
     # The last provider/model used in the conversation
     last_used_model: Mapped[str] = mapped_column()
diff --git a/src/models/database/user_mapping.py b/src/models/database/user_mapping.py
new file mode 100644
index 00000000..f9f245ef
--- /dev/null
+++ b/src/models/database/user_mapping.py
@@ -0,0 +1,28 @@
+"""User ID anonymization mapping model."""
+
+from datetime import datetime
+
+from sqlalchemy.orm import Mapped, mapped_column
+from sqlalchemy import DateTime, func, Index
+
+from models.database.base import Base
+
+
+class UserMapping(Base):  # pylint: disable=too-few-public-methods
+    """Model for mapping real user IDs to anonymous UUIDs."""
+
+    __tablename__ = "user_mapping"
+
+    # Anonymous UUID used for all storage/analytics (primary key)
+    anonymous_id: Mapped[str] = mapped_column(primary_key=True)
+
+    # Original user ID from authentication (hashed for security)
+    user_id_hash: Mapped[str] = mapped_column(index=True, unique=True)
+
+    created_at: Mapped[datetime] = mapped_column(
+        DateTime(timezone=True),
+        server_default=func.now(),  # pylint: disable=not-callable
+    )
+
+    # Index for efficient lookups
+    __table_args__ = (Index("ix_user_mapping_hash_lookup", "user_id_hash"),)
diff --git a/src/utils/endpoints.py b/src/utils/endpoints.py
index 993dc4e8..78a949df 100644
--- a/src/utils/endpoints.py
+++ b/src/utils/endpoints.py
@@ -13,6 +13,7 @@
 from configuration import AppConfig
 from utils.suid import get_suid
 from utils.types import GraniteToolParser
+from utils.user_anonymization import get_anonymous_user_id
 
 
 logger = logging.getLogger("utils.endpoints")
@@ -21,11 +22,14 @@
 def validate_conversation_ownership(
     user_id: str, conversation_id: str
 ) -> UserConversation | None:
-    """Validate that the conversation belongs to the user."""
+    """Validate that the conversation belongs to the user using anonymous ID lookup."""
+    # Get anonymous user ID for database lookup
+    anonymous_user_id = get_anonymous_user_id(user_id)
+
     with get_session() as session:
         conversation = (
             session.query(UserConversation)
-            .filter_by(id=conversation_id, user_id=user_id)
+            .filter_by(id=conversation_id, anonymous_user_id=anonymous_user_id)
             .first()
         )
         return conversation
diff --git a/src/utils/transcripts.py b/src/utils/transcripts.py
index e29d4319..17e9638c 100644
--- a/src/utils/transcripts.py
+++ b/src/utils/transcripts.py
@@ -14,15 +14,16 @@
 from models.requests import Attachment, QueryRequest
 from utils.suid import get_suid
 from utils.types import TurnSummary
+from utils.user_anonymization import get_anonymous_user_id
 
 logger = logging.getLogger("utils.transcripts")
 
 
-def construct_transcripts_path(user_id: str, conversation_id: str) -> Path:
-    """Construct path to transcripts."""
+def construct_transcripts_path(anonymous_user_id: str, conversation_id: str) -> Path:
+    """Construct path to transcripts using anonymous user ID."""
     # these two normalizations are required by Snyk as it detects
     # this Path sanitization pattern
-    uid = os.path.normpath("/" + user_id).lstrip("/")
+    uid = os.path.normpath("/" + anonymous_user_id).lstrip("/")
     cid = os.path.normpath("/" + conversation_id).lstrip("/")
     file_path = (
         configuration.user_data_collection_configuration.transcripts_storage or ""
@@ -46,7 +47,7 @@ def store_transcript(  # pylint: disable=too-many-arguments,too-many-positional-
     """Store transcript in the local filesystem.
 
     Args:
-        user_id: The user ID (UUID).
+        user_id: The original user ID from authentication (will be anonymized).
         conversation_id: The conversation ID (UUID).
         query_is_valid: The result of the query validation.
         query: The query (without attachments).
@@ -56,7 +57,15 @@ def store_transcript(  # pylint: disable=too-many-arguments,too-many-positional-
         truncated: The flag indicating if the history was truncated.
         attachments: The list of `Attachment` objects.
     """
-    transcripts_path = construct_transcripts_path(user_id, conversation_id)
+    # Get anonymous user ID for storage
+    anonymous_user_id = get_anonymous_user_id(user_id)
+    logger.debug(
+        "Anonymized user %s to %s for transcript storage",
+        user_id[:8] + "..." if len(user_id) > 8 else user_id,
+        anonymous_user_id,
+    )
+
+    transcripts_path = construct_transcripts_path(anonymous_user_id, conversation_id)
     transcripts_path.mkdir(parents=True, exist_ok=True)
 
     data_to_store = {
@@ -65,7 +74,7 @@ def store_transcript(  # pylint: disable=too-many-arguments,too-many-positional-
             "model": model_id,
             "query_provider": query_request.provider,
             "query_model": query_request.model,
-            "user_id": user_id,
+            "anonymous_user_id": anonymous_user_id,  # Store anonymous ID only
             "conversation_id": conversation_id,
             "timestamp": datetime.now(UTC).isoformat(),
         },
diff --git a/src/utils/user_anonymization.py b/src/utils/user_anonymization.py
new file mode 100644
index 00000000..7ab8929f
--- /dev/null
+++ b/src/utils/user_anonymization.py
@@ -0,0 +1,122 @@
+"""User ID anonymization utilities."""
+
+import hashlib
+import logging
+from typing import Optional
+
+from sqlalchemy.exc import IntegrityError
+
+from models.database.user_mapping import UserMapping
+from app.database import get_session
+from utils.suid import get_suid
+
+logger = logging.getLogger("utils.user_anonymization")
+
+
+def _hash_user_id(user_id: str) -> str:
+    """
+    Create a consistent hash of the user ID for mapping purposes.
+
+    Uses SHA-256 with a fixed salt to ensure consistent hashing
+    while preventing rainbow table attacks.
+    """
+    # Use a fixed salt - in production, this should be configurable
+    salt = "lightspeed_user_anonymization_salt_v1"
+    hash_input = f"{salt}:{user_id}".encode("utf-8")
+    return hashlib.sha256(hash_input).hexdigest()
+
+
+def get_anonymous_user_id(auth_user_id: str) -> str:
+    """
+    Get or create an anonymous UUID for a user ID from authentication.
+
+    This function:
+    1. Hashes the original user ID for secure storage
+    2. Looks up existing anonymous mapping
+    3. Creates new anonymous UUID if none exists
+    4. Returns the anonymous UUID for use in storage/analytics
+
+    Args:
+        auth_user_id: The original user ID from authentication
+
+    Returns:
+        Anonymous UUID string for this user
+    """
+    user_id_hash = _hash_user_id(auth_user_id)
+
+    with get_session() as session:
+        # Try to find existing mapping
+        existing_mapping = (
+            session.query(UserMapping).filter_by(user_id_hash=user_id_hash).first()
+        )
+
+        if existing_mapping:
+            logger.debug(
+                "Found existing anonymous ID for user hash %s", user_id_hash[:8] + "..."
+            )
+            return existing_mapping.anonymous_id
+
+        # Create new anonymous mapping
+        anonymous_id = get_suid()
+        new_mapping = UserMapping(anonymous_id=anonymous_id, user_id_hash=user_id_hash)
+
+        try:
+            session.add(new_mapping)
+            session.commit()
+            logger.info(
+                "Created new anonymous ID %s for user hash %s",
+                anonymous_id,
+                user_id_hash[:8] + "...",
+            )
+            return anonymous_id
+
+        except IntegrityError as e:
+            session.rollback()
+            # Race condition - another thread created the mapping
+            logger.warning("Race condition creating user mapping: %s", e)
+
+            # Try to fetch the mapping created by the other thread
+            existing_mapping = (
+                session.query(UserMapping).filter_by(user_id_hash=user_id_hash).first()
+            )
+
+            if existing_mapping:
+                return existing_mapping.anonymous_id
+
+            # If we still can't find it, something is wrong
+            logger.error(
+                "Failed to create or retrieve user mapping for hash %s",
+                user_id_hash[:8] + "...",
+            )
+            raise RuntimeError("Unable to create or retrieve anonymous user ID") from e
+
+
+def get_user_count() -> int:
+    """
+    Get the total number of unique users in the system.
+
+    Returns:
+        Total count of unique anonymous users
+    """
+    with get_session() as session:
+        return session.query(UserMapping).count()
+
+
+def find_anonymous_user_id(auth_user_id: str) -> Optional[str]:
+    """
+    Find existing anonymous ID for a user without creating a new one.
+
+    Args:
+        auth_user_id: The original user ID from authentication
+
+    Returns:
+        Anonymous UUID if found, None otherwise
+    """
+    user_id_hash = _hash_user_id(auth_user_id)
+
+    with get_session() as session:
+        existing_mapping = (
+            session.query(UserMapping).filter_by(user_id_hash=user_id_hash).first()
+        )
+
+        return existing_mapping.anonymous_id if existing_mapping else None
diff --git a/tests/unit/app/endpoints/test_query_anonymization.py b/tests/unit/app/endpoints/test_query_anonymization.py
new file mode 100644
index 00000000..61c76467
--- /dev/null
+++ b/tests/unit/app/endpoints/test_query_anonymization.py
@@ -0,0 +1,197 @@
+"""Tests for query endpoint anonymization functionality."""
+
+import logging
+from unittest.mock import patch, MagicMock
+
+from sqlalchemy import inspect
+
+from app.endpoints.query import persist_user_conversation_details
+from models.database.conversations import UserConversation
+
+
+class TestQueryAnonymization:
+    """Test query endpoint user anonymization."""
+
+    @patch("app.endpoints.query.get_anonymous_user_id")
+    @patch("app.endpoints.query.get_session")
+    def test_persist_user_conversation_details_new_conversation(
+        self, mock_get_session, mock_get_anonymous
+    ):
+        """Test persisting new conversation uses anonymous user ID."""
+        # Setup mocks
+        mock_get_anonymous.return_value = "anon-new-conv-123"
+        mock_session = MagicMock()
+        mock_get_session.return_value.__enter__.return_value = mock_session
+        mock_session.query.return_value.filter_by.return_value.first.return_value = None
+
+        # Call function
+        persist_user_conversation_details(
+            user_id="new_user@example.com",
+            conversation_id="new-conv-456",
+            model="test-model",
+            provider_id="test-provider",
+        )
+
+        # Verify anonymous user ID was obtained
+        mock_get_anonymous.assert_called_once_with("new_user@example.com")
+
+        # Verify database query used anonymous ID
+        mock_session.query.assert_called_once_with(UserConversation)
+        filter_call = mock_session.query.return_value.filter_by.call_args
+        assert filter_call[1]["id"] == "new-conv-456"
+        assert filter_call[1]["anonymous_user_id"] == "anon-new-conv-123"
+
+        # Verify new conversation was created with anonymous ID
+        mock_session.add.assert_called_once()
+        added_conversation = mock_session.add.call_args[0][0]
+        assert isinstance(added_conversation, UserConversation)
+        assert added_conversation.id == "new-conv-456"
+        assert added_conversation.anonymous_user_id == "anon-new-conv-123"
+        assert added_conversation.last_used_model == "test-model"
+        assert added_conversation.last_used_provider == "test-provider"
+        assert added_conversation.message_count == 1
+
+        mock_session.commit.assert_called_once()
+
+    @patch("app.endpoints.query.get_anonymous_user_id")
+    @patch("app.endpoints.query.get_session")
+    def test_persist_user_conversation_details_existing_conversation(
+        self, mock_get_session, mock_get_anonymous
+    ):
+        """Test updating existing conversation uses anonymous user ID."""
+        # Setup existing conversation
+        existing_conv = MagicMock(spec=UserConversation)
+        existing_conv.last_used_model = "old-model"
+        existing_conv.last_used_provider = "old-provider"
+        existing_conv.message_count = 5
+
+        mock_get_anonymous.return_value = "anon-existing-789"
+        mock_session = MagicMock()
+        mock_get_session.return_value.__enter__.return_value = mock_session
+        mock_session.query.return_value.filter_by.return_value.first.return_value = (
+            existing_conv
+        )
+
+        # Call function
+        persist_user_conversation_details(
+            user_id="existing_user@example.com",
+            conversation_id="existing-conv-123",
+            model="new-model",
+            provider_id="new-provider",
+        )
+
+        # Verify anonymous user ID was used for lookup
+        mock_get_anonymous.assert_called_once_with("existing_user@example.com")
+        filter_call = mock_session.query.return_value.filter_by.call_args
+        assert filter_call[1]["anonymous_user_id"] == "anon-existing-789"
+
+        # Verify existing conversation was updated
+        assert existing_conv.last_used_model == "new-model"
+        assert existing_conv.last_used_provider == "new-provider"
+        assert existing_conv.message_count == 6  # Incremented
+
+        # Verify no new conversation was added
+        mock_session.add.assert_not_called()
+        mock_session.commit.assert_called_once()
+
+    @patch("app.endpoints.query.get_anonymous_user_id")
+    def test_persist_user_conversation_logs_anonymization(
+        self, mock_get_anonymous, caplog
+    ):
+        """Test that conversation persistence logs anonymization."""
+
+        mock_get_anonymous.return_value = "anon-logging-456"
+
+        with caplog.at_level(logging.DEBUG):
+            with patch("app.endpoints.query.get_session") as mock_get_session:
+                mock_session = MagicMock()
+                mock_get_session.return_value.__enter__.return_value = mock_session
+                mock_session.query.return_value.filter_by.return_value.first.return_value = (
+                    None
+                )
+
+                persist_user_conversation_details(
+                    user_id="logging_test_user@example.com",
+                    conversation_id="logging-conv-789",
+                    model="logging-model",
+                    provider_id="logging-provider",
+                )
+
+        # Check that anonymization is logged
+        log_messages = [record.message for record in caplog.records]
+        anonymization_logs = [
+            msg
+            for msg in log_messages
+            if "Associated conversation" in msg and "anonymous user" in msg
+        ]
+        assert len(anonymization_logs) > 0
+
+        # Verify log contains both anonymous and truncated original user
+        log_msg = anonymization_logs[0]
+        assert "anon-logging-456" in log_msg
+        assert "logging-conv-789" in log_msg
+        assert "logging_..." in log_msg  # Truncated original user ID
+
+    def test_conversation_model_uses_anonymous_field(self):
+        """Test that UserConversation model uses anonymous_user_id field."""
+        # Verify the model has the correct field
+        conversation = UserConversation()
+        assert hasattr(conversation, "anonymous_user_id")
+
+        # Verify we can set the anonymous user ID
+        conversation.anonymous_user_id = "test-anon-uuid"
+        assert conversation.anonymous_user_id == "test-anon-uuid"
+
+        # Verify the field is mapped correctly (this tests the database column)
+        mapper = inspect(UserConversation)
+        assert "anonymous_user_id" in mapper.columns
+
+    @patch("app.endpoints.query.get_anonymous_user_id")
+    @patch("app.endpoints.query.get_session")
+    def test_persist_user_conversation_different_users_same_conversation_id(
+        self, mock_get_session, mock_get_anonymous
+    ):
+        """Test that different users can't access each other's conversations."""
+
+        # Simulate two different users with same conversation ID (edge case)
+        def mock_anonymous_side_effect(user_id):
+            if user_id == "user1@example.com":
+                return "anon-user1-123"
+            if user_id == "user2@example.com":
+                return "anon-user2-456"
+            return "anon-unknown"
+
+        mock_get_anonymous.side_effect = mock_anonymous_side_effect
+
+        mock_session = MagicMock()
+        mock_get_session.return_value.__enter__.return_value = mock_session
+        mock_session.query.return_value.filter_by.return_value.first.return_value = None
+
+        # User 1 creates conversation
+        persist_user_conversation_details(
+            user_id="user1@example.com",
+            conversation_id="shared-conv-id",
+            model="model1",
+            provider_id="provider1",
+        )
+
+        # Verify first user's anonymous ID was used
+        first_call = mock_session.query.return_value.filter_by.call_args_list[0]
+        assert first_call[1]["anonymous_user_id"] == "anon-user1-123"
+
+        # Reset mock for second call
+        mock_session.reset_mock()
+        mock_session.query.return_value.filter_by.return_value.first.return_value = None
+
+        # User 2 tries to create conversation with same ID
+        persist_user_conversation_details(
+            user_id="user2@example.com",
+            conversation_id="shared-conv-id",  # Same conversation ID
+            model="model2",
+            provider_id="provider2",
+        )
+
+        # Verify second user's anonymous ID was used (different from first)
+        second_call = mock_session.query.return_value.filter_by.call_args_list[0]
+        assert second_call[1]["anonymous_user_id"] == "anon-user2-456"
+        assert second_call[1]["anonymous_user_id"] != "anon-user1-123"
diff --git a/tests/unit/utils/test_endpoints_anonymization.py b/tests/unit/utils/test_endpoints_anonymization.py
new file mode 100644
index 00000000..1e14f7b0
--- /dev/null
+++ b/tests/unit/utils/test_endpoints_anonymization.py
@@ -0,0 +1,171 @@
+"""Tests for endpoint utilities anonymization functionality."""
+
+from unittest.mock import patch, MagicMock
+import pytest
+
+from utils.endpoints import validate_conversation_ownership
+from models.database.conversations import UserConversation
+
+
+class TestEndpointsAnonymization:
+    """Test endpoint utilities user anonymization."""
+
+    @patch("utils.endpoints.get_anonymous_user_id")
+    @patch("utils.endpoints.get_session")
+    def test_validate_conversation_ownership_uses_anonymous_id(
+        self, mock_get_session, mock_get_anonymous
+    ):
+        """Test that conversation ownership validation uses anonymous user ID."""
+        # Setup mocks
+        conversation = MagicMock(spec=UserConversation)
+        conversation.id = "test-conv-123"
+        conversation.anonymous_user_id = "anon-owner-456"
+
+        mock_get_anonymous.return_value = "anon-owner-456"
+        mock_session = MagicMock()
+        mock_get_session.return_value.__enter__.return_value = mock_session
+        mock_session.query.return_value.filter_by.return_value.first.return_value = (
+            conversation
+        )
+
+        # Call validation
+        result = validate_conversation_ownership(
+            user_id="owner@example.com", conversation_id="test-conv-123"
+        )
+
+        # Verify anonymous user ID was obtained
+        mock_get_anonymous.assert_called_once_with("owner@example.com")
+
+        # Verify database query used anonymous ID
+        mock_session.query.assert_called_once_with(UserConversation)
+        filter_call = mock_session.query.return_value.filter_by.call_args
+        assert filter_call[1]["id"] == "test-conv-123"
+        assert filter_call[1]["anonymous_user_id"] == "anon-owner-456"
+
+        # Verify correct conversation returned
+        assert result == conversation
+
+    @patch("utils.endpoints.get_anonymous_user_id")
+    @patch("utils.endpoints.get_session")
+    def test_validate_conversation_ownership_not_owned(
+        self, mock_get_session, mock_get_anonymous
+    ):
+        """Test validation fails when user doesn't own conversation."""
+        mock_get_anonymous.return_value = "anon-not-owner-789"
+        mock_session = MagicMock()
+        mock_get_session.return_value.__enter__.return_value = mock_session
+        mock_session.query.return_value.filter_by.return_value.first.return_value = None
+
+        # Call validation
+        result = validate_conversation_ownership(
+            user_id="notowner@example.com", conversation_id="not-owned-conv"
+        )
+
+        # Verify anonymous user ID was used for lookup
+        mock_get_anonymous.assert_called_once_with("notowner@example.com")
+        filter_call = mock_session.query.return_value.filter_by.call_args
+        assert filter_call[1]["anonymous_user_id"] == "anon-not-owner-789"
+
+        # Verify None returned for non-owned conversation
+        assert result is None
+
+    @patch("utils.endpoints.get_anonymous_user_id")
+    @patch("utils.endpoints.get_session")
+    def test_validate_conversation_ownership_different_users(
+        self, mock_get_session, mock_get_anonymous
+    ):
+        """Test that different users get different anonymous IDs for validation."""
+
+        def mock_anonymous_side_effect(user_id):
+            if user_id == "user1@example.com":
+                return "anon-user1-validation"
+            if user_id == "user2@example.com":
+                return "anon-user2-validation"
+            return "anon-unknown"
+
+        mock_get_anonymous.side_effect = mock_anonymous_side_effect
+
+        mock_session = MagicMock()
+        mock_get_session.return_value.__enter__.return_value = mock_session
+        mock_session.query.return_value.filter_by.return_value.first.return_value = None
+
+        # Validate for user 1
+        result1 = validate_conversation_ownership(
+            user_id="user1@example.com", conversation_id="shared-conv-test"
+        )
+
+        first_call = mock_session.query.return_value.filter_by.call_args
+        assert first_call[1]["anonymous_user_id"] == "anon-user1-validation"
+
+        # Reset mock for second call
+        mock_session.reset_mock()
+        mock_session.query.return_value.filter_by.return_value.first.return_value = None
+
+        # Validate for user 2
+        result2 = validate_conversation_ownership(
+            user_id="user2@example.com",
+            conversation_id="shared-conv-test",  # Same conversation ID
+        )
+
+        second_call = mock_session.query.return_value.filter_by.call_args
+        assert second_call[1]["anonymous_user_id"] == "anon-user2-validation"
+
+        # Both should return None since neither owns the conversation
+        assert result1 is None
+        assert result2 is None
+
+        # But they used different anonymous IDs (checking the calls, not constant comparison)
+        assert first_call[1]["anonymous_user_id"] != second_call[1]["anonymous_user_id"]
+
+    @patch("utils.endpoints.get_anonymous_user_id")
+    @patch("utils.endpoints.get_session")
+    def test_validate_conversation_ownership_preserves_conversation_data(
+        self, mock_get_session, mock_get_anonymous
+    ):
+        """Test that validation preserves all conversation data."""
+        # Create conversation with full data
+        conversation = UserConversation()
+        conversation.id = "full-data-conv"
+        conversation.anonymous_user_id = "anon-full-data"
+        conversation.last_used_model = "test-model"
+        conversation.last_used_provider = "test-provider"
+        conversation.message_count = 42
+
+        mock_get_anonymous.return_value = "anon-full-data"
+        mock_session = MagicMock()
+        mock_get_session.return_value.__enter__.return_value = mock_session
+        mock_session.query.return_value.filter_by.return_value.first.return_value = (
+            conversation
+        )
+
+        # Validate ownership
+        result = validate_conversation_ownership(
+            user_id="datauser@example.com", conversation_id="full-data-conv"
+        )
+
+        # Verify all conversation data is preserved
+        assert result.id == "full-data-conv"
+        assert result.anonymous_user_id == "anon-full-data"
+        assert result.last_used_model == "test-model"
+        assert result.last_used_provider == "test-provider"
+        assert result.message_count == 42
+
+    @patch("utils.endpoints.get_anonymous_user_id")
+    def test_validate_conversation_ownership_error_handling(self, mock_get_anonymous):
+        """Test that validation handles errors gracefully."""
+        mock_get_anonymous.return_value = "anon-error-test"
+
+        # Simulate database error
+        with patch("utils.endpoints.get_session") as mock_get_session:
+            mock_session = MagicMock()
+            mock_get_session.return_value.__enter__.return_value = mock_session
+            mock_session.query.side_effect = Exception("Database error")
+
+            # Should propagate the exception
+            with pytest.raises(Exception, match="Database error"):
+                validate_conversation_ownership(
+                    user_id="erroruser@example.com", conversation_id="error-conv"
+                )
+
+            # Verify anonymous ID was still obtained before error
+            mock_get_anonymous.assert_called_once_with("erroruser@example.com")
diff --git a/tests/unit/utils/test_transcripts_anonymization.py b/tests/unit/utils/test_transcripts_anonymization.py
new file mode 100644
index 00000000..869b1c55
--- /dev/null
+++ b/tests/unit/utils/test_transcripts_anonymization.py
@@ -0,0 +1,271 @@
+"""Tests for transcript anonymization functionality."""
+
+import json
+import tempfile
+from pathlib import Path
+from unittest.mock import patch
+
+
+from models.requests import QueryRequest, Attachment
+from utils.transcripts import store_transcript, construct_transcripts_path
+from utils.types import TurnSummary
+
+
+class TestTranscriptAnonymization:
+    """Test transcript storage with user anonymization."""
+
+    @patch("utils.transcripts.get_anonymous_user_id")
+    @patch("utils.transcripts.get_suid")
+    @patch("utils.transcripts.configuration")
+    def test_store_transcript_anonymizes_user_id(
+        self, mock_config, mock_get_suid, mock_get_anonymous
+    ):
+        """Test that store_transcript uses anonymous user ID."""
+        # Setup mocks
+        mock_get_anonymous.return_value = "anon-uuid-123"
+        mock_get_suid.return_value = "transcript-uuid"
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            mock_config.user_data_collection_configuration.transcripts_storage = (
+                temp_dir
+            )
+
+            # Create test data
+            query_request = QueryRequest(
+                query="Test query", model="test-model", provider="test-provider"
+            )
+
+            summary = TurnSummary(llm_response="Test response", tool_calls=[])
+
+            # Call store_transcript
+            store_transcript(
+                user_id="original_user@example.com",
+                conversation_id="conv-123",
+                model_id="test-model",
+                provider_id="test-provider",
+                query_is_valid=True,
+                query="Test query",
+                query_request=query_request,
+                summary=summary,
+                rag_chunks=[],
+                truncated=False,
+                attachments=[],
+            )
+
+            # Verify anonymous user ID was used
+            mock_get_anonymous.assert_called_once_with("original_user@example.com")
+
+            # Verify file was created with anonymous path
+            expected_path = (
+                Path(temp_dir) / "anon-uuid-123" / "conv-123" / "transcript-uuid.json"
+            )
+            assert expected_path.exists()
+
+            # Verify file content contains anonymous ID, not original
+            with open(expected_path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            assert data["metadata"]["anonymous_user_id"] == "anon-uuid-123"
+            assert "user_id" not in data["metadata"]
+            assert data["metadata"]["conversation_id"] == "conv-123"
+
+    def test_construct_transcripts_path_with_anonymous_id(self):
+        """Test that transcript path construction uses anonymous ID."""
+        # This test verifies the path construction function directly
+        result = construct_transcripts_path("anon-uuid-456", "conv-789")
+
+        # Path should contain the anonymous ID, not original user ID
+        assert "anon-uuid-456" in str(result)
+        assert "conv-789" in str(result)
+
+    @patch("utils.transcripts.get_anonymous_user_id")
+    @patch("utils.transcripts.get_suid")
+    @patch("utils.transcripts.configuration")
+    def test_store_transcript_preserves_other_metadata(
+        self, mock_config, mock_get_suid, mock_get_anonymous
+    ):
+        """Test that anonymization preserves all other metadata."""
+        mock_get_anonymous.return_value = "anon-preserved-test"
+        mock_get_suid.return_value = "metadata-test-uuid"
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            mock_config.user_data_collection_configuration.transcripts_storage = (
+                temp_dir
+            )
+
+            query_request = QueryRequest(
+                query="Metadata test",
+                model="metadata-model",
+                provider="metadata-provider",
+            )
+
+            summary = TurnSummary(llm_response="Metadata response", tool_calls=[])
+
+            # Store transcript with rich metadata
+            store_transcript(
+                user_id="metadata_user@example.com",
+                conversation_id="metadata-conv",
+                model_id="actual-model",
+                provider_id="actual-provider",
+                query_is_valid=True,
+                query="Metadata test query",
+                query_request=query_request,
+                summary=summary,
+                rag_chunks=["chunk1", "chunk2"],
+                truncated=True,
+                attachments=[],
+            )
+
+            # Read and verify all metadata is preserved
+            transcript_file = (
+                Path(temp_dir)
+                / "anon-preserved-test"
+                / "metadata-conv"
+                / "metadata-test-uuid.json"
+            )
+            with open(transcript_file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            metadata = data["metadata"]
+            assert metadata["provider"] == "actual-provider"
+            assert metadata["model"] == "actual-model"
+            assert metadata["query_provider"] == "metadata-provider"
+            assert metadata["query_model"] == "metadata-model"
+            assert metadata["conversation_id"] == "metadata-conv"
+            assert "timestamp" in metadata
+
+            # Verify other data preserved
+            assert data["redacted_query"] == "Metadata test query"
+            assert data["query_is_valid"] is True
+            assert data["llm_response"] == "Metadata response"
+            assert data["rag_chunks"] == ["chunk1", "chunk2"]
+            assert data["truncated"] is True
+
+    @patch("utils.transcripts.get_anonymous_user_id")
+    @patch("utils.transcripts.get_suid")
+    @patch("utils.transcripts.configuration")
+    def test_store_transcript_with_attachments(
+        self, mock_config, mock_get_suid, mock_get_anonymous
+    ):
+        """Test transcript storage with attachments preserves anonymization."""
+        mock_get_anonymous.return_value = "anon-attachments-test"
+        mock_get_suid.return_value = "attachments-uuid"
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            mock_config.user_data_collection_configuration.transcripts_storage = (
+                temp_dir
+            )
+
+            # Create attachment
+            attachment = Attachment(
+                attachment_type="text",
+                content_type="text/plain",
+                content="Test attachment content",
+            )
+
+            query_request = QueryRequest(
+                query="Query with attachment",
+                model="attachment-model",
+                provider="attachment-provider",
+            )
+
+            summary = TurnSummary(
+                llm_response="Response with attachment", tool_calls=[]
+            )
+
+            store_transcript(
+                user_id="attachment_user@example.com",
+                conversation_id="attachment-conv",
+                model_id="attachment-actual-model",
+                provider_id="attachment-actual-provider",
+                query_is_valid=True,
+                query="Query with attachment",
+                query_request=query_request,
+                summary=summary,
+                rag_chunks=[],
+                truncated=False,
+                attachments=[attachment],
+            )
+
+            # Verify attachment data preserved with anonymization
+            transcript_file = (
+                Path(temp_dir)
+                / "anon-attachments-test"
+                / "attachment-conv"
+                / "attachments-uuid.json"
+            )
+            with open(transcript_file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            assert data["metadata"]["anonymous_user_id"] == "anon-attachments-test"
+            assert len(data["attachments"]) == 1
+            assert data["attachments"][0]["attachment_type"] == "text"
+            assert data["attachments"][0]["content"] == "Test attachment content"
+
+    def test_path_sanitization_with_anonymous_ids(self):
+        """Test that path sanitization works correctly with anonymous UUIDs."""
+        # Test with various UUID formats and potential path injection
+        test_cases = [
+            ("anon-uuid-123", "conv-456"),
+            ("../anon-malicious", "conv-normal"),  # Path traversal attempt
+            ("anon/with/slashes", "conv-789"),  # Embedded slashes
+            ("anon-normal", "../conv-malicious"),  # Conversation ID traversal attempt
+        ]
+
+        for anon_id, conv_id in test_cases:
+            result = construct_transcripts_path(anon_id, conv_id)
+            result_str = str(result)
+
+            # Should not contain path traversal sequences
+            assert "../" not in result_str
+            assert not result_str.startswith("/")
+
+    @patch("utils.transcripts.get_anonymous_user_id")
+    def test_logging_shows_anonymization(self, mock_get_anonymous, caplog):
+        """Test that logging shows anonymization is happening."""
+        import logging  # pylint: disable=import-outside-toplevel
+
+        mock_get_anonymous.return_value = "anon-logging-test"
+
+        with caplog.at_level(logging.DEBUG):
+            # Create minimal test setup
+            with tempfile.TemporaryDirectory() as temp_dir:
+                with patch("utils.transcripts.configuration") as mock_config:
+                    mock_config.user_data_collection_configuration.transcripts_storage = (
+                        temp_dir
+                    )
+
+                    with patch(
+                        "utils.transcripts.get_suid", return_value="log-test-uuid"
+                    ):
+                        query_request = QueryRequest(
+                            query="Log test", model="log-model", provider="log-provider"
+                        )
+
+                        summary = TurnSummary(
+                            llm_response="Log response", tool_calls=[]
+                        )
+
+                        store_transcript(
+                            user_id="log_test_user@example.com",
+                            conversation_id="log-conv",
+                            model_id="log-model",
+                            provider_id="log-provider",
+                            query_is_valid=True,
+                            query="Log test query",
+                            query_request=query_request,
+                            summary=summary,
+                            rag_chunks=[],
+                            truncated=False,
+                            attachments=[],
+                        )
+
+        # Check that anonymization is logged
+        log_messages = [record.message for record in caplog.records]
+        anonymization_logs = [msg for msg in log_messages if "Anonymized user" in msg]
+        assert len(anonymization_logs) > 0
+
+        # Verify the log shows original user is being anonymized
+        anonymization_log = anonymization_logs[0]
+        assert "anon-logging-test" in anonymization_log
+        assert "log_test..." in anonymization_log  # Truncated original user ID
diff --git a/tests/unit/utils/test_user_anonymization.py b/tests/unit/utils/test_user_anonymization.py
new file mode 100644
index 00000000..bdbce8f5
--- /dev/null
+++ b/tests/unit/utils/test_user_anonymization.py
@@ -0,0 +1,225 @@
+"""Tests for user anonymization utilities."""
+
+import hashlib
+from unittest.mock import patch, MagicMock
+
+import pytest
+from sqlalchemy.exc import IntegrityError
+
+from utils.user_anonymization import (
+    get_anonymous_user_id,
+    get_user_count,
+    find_anonymous_user_id,
+    _hash_user_id,
+)
+from models.database.user_mapping import UserMapping
+
+
+class TestUserAnonymization:
+    """Test user anonymization functionality."""
+
+    def test_hash_user_id_consistency(self):
+        """Test that user ID hashing is consistent."""
+        user_id = "test@example.com"
+        hash1 = _hash_user_id(user_id)
+        hash2 = _hash_user_id(user_id)
+
+        assert hash1 == hash2
+        assert len(hash1) == 64  # SHA-256 hex length
+        assert hash1 != user_id  # Should be different from original
+
+    def test_hash_user_id_different_for_different_users(self):
+        """Test that different user IDs produce different hashes."""
+        user1_hash = _hash_user_id("user1@example.com")
+        user2_hash = _hash_user_id("user2@example.com")
+
+        assert user1_hash != user2_hash
+
+    @patch("utils.user_anonymization.get_session")
+    @patch("utils.user_anonymization.get_suid")
+    def test_get_anonymous_user_id_new_user(self, mock_get_suid, mock_get_session):
+        """Test creating new anonymous ID for first-time user."""
+        # Setup mocks
+        mock_session = MagicMock()
+        mock_get_session.return_value.__enter__.return_value = mock_session
+        mock_session.query.return_value.filter_by.return_value.first.return_value = None
+        mock_get_suid.return_value = "anon-123-456"
+
+        user_id = "new_user@example.com"
+        result = get_anonymous_user_id(user_id)
+
+        # Verify result
+        assert result == "anon-123-456"
+
+        # Verify database interactions
+        mock_session.add.assert_called_once()
+        mock_session.commit.assert_called_once()
+
+        # Verify the UserMapping was created correctly
+        added_mapping = mock_session.add.call_args[0][0]
+        assert isinstance(added_mapping, UserMapping)
+        assert added_mapping.anonymous_id == "anon-123-456"
+        assert added_mapping.user_id_hash == _hash_user_id(user_id)
+
+    @patch("utils.user_anonymization.get_session")
+    def test_get_anonymous_user_id_existing_user(self, mock_get_session):
+        """Test retrieving existing anonymous ID for returning user."""
+        # Setup existing mapping
+        existing_mapping = UserMapping()
+        existing_mapping.anonymous_id = "existing-anon-789"
+        existing_mapping.user_id_hash = _hash_user_id("existing@example.com")
+
+        mock_session = MagicMock()
+        mock_get_session.return_value.__enter__.return_value = mock_session
+        mock_session.query.return_value.filter_by.return_value.first.return_value = (
+            existing_mapping
+        )
+
+        result = get_anonymous_user_id("existing@example.com")
+
+        # Verify result
+        assert result == "existing-anon-789"
+
+        # Verify no new mapping was created
+        mock_session.add.assert_not_called()
+        mock_session.commit.assert_not_called()
+
+    @patch("utils.user_anonymization.get_session")
+    @patch("utils.user_anonymization.get_suid")
+    def test_get_anonymous_user_id_race_condition(
+        self, mock_get_suid, mock_get_session
+    ):
+        """Test handling race condition when creating user mapping."""
+        # Setup mocks for race condition scenario
+        mock_session = MagicMock()
+        mock_get_session.return_value.__enter__.return_value = mock_session
+
+        # First query returns None (no existing mapping)
+        # Add raises IntegrityError (race condition)
+        # Second query returns existing mapping (created by other thread)
+        existing_mapping = UserMapping()
+        existing_mapping.anonymous_id = "race-condition-uuid"
+
+        mock_session.query.return_value.filter_by.return_value.first.side_effect = [
+            None,  # First check - no existing mapping
+            existing_mapping,  # Second check after race condition
+        ]
+        mock_session.add.side_effect = IntegrityError("Duplicate key", None, None)
+        mock_get_suid.return_value = "new-uuid"
+
+        result = get_anonymous_user_id("race_user@example.com")
+
+        # Should return the mapping created by the other thread
+        assert result == "race-condition-uuid"
+        mock_session.rollback.assert_called_once()
+
+    @patch("utils.user_anonymization.get_session")
+    @patch("utils.user_anonymization.get_suid")
+    def test_get_anonymous_user_id_race_condition_failure(
+        self, mock_get_suid, mock_get_session
+    ):
+        """Test handling race condition where retrieval also fails."""
+        mock_session = MagicMock()
+        mock_get_session.return_value.__enter__.return_value = mock_session
+
+        # First query returns None, add fails, second query also returns None
+        mock_session.query.return_value.filter_by.return_value.first.side_effect = [
+            None,
+            None,
+        ]
+        mock_session.add.side_effect = IntegrityError("Duplicate key", None, None)
+        mock_get_suid.return_value = "new-uuid"
+
+        with pytest.raises(
+            RuntimeError, match="Unable to create or retrieve anonymous user ID"
+        ):
+            get_anonymous_user_id("problematic_user@example.com")
+
+    @patch("utils.user_anonymization.get_session")
+    def test_get_user_count(self, mock_get_session):
+        """Test getting total user count."""
+        mock_session = MagicMock()
+        mock_get_session.return_value.__enter__.return_value = mock_session
+        mock_session.query.return_value.count.return_value = 42
+
+        result = get_user_count()
+
+        assert result == 42
+        mock_session.query.assert_called_once_with(UserMapping)
+
+    @patch("utils.user_anonymization.get_session")
+    def test_find_anonymous_user_id_existing(self, mock_get_session):
+        """Test finding existing anonymous ID without creating new one."""
+        existing_mapping = UserMapping()
+        existing_mapping.anonymous_id = "found-uuid"
+
+        mock_session = MagicMock()
+        mock_get_session.return_value.__enter__.return_value = mock_session
+        mock_session.query.return_value.filter_by.return_value.first.return_value = (
+            existing_mapping
+        )
+
+        result = find_anonymous_user_id("existing@example.com")
+
+        assert result == "found-uuid"
+
+    @patch("utils.user_anonymization.get_session")
+    def test_find_anonymous_user_id_not_found(self, mock_get_session):
+        """Test finding non-existing anonymous ID returns None."""
+        mock_session = MagicMock()
+        mock_get_session.return_value.__enter__.return_value = mock_session
+        mock_session.query.return_value.filter_by.return_value.first.return_value = None
+
+        result = find_anonymous_user_id("nonexistent@example.com")
+
+        assert result is None
+
+    def test_salt_prevents_rainbow_attacks(self):
+        """Test that salt makes rainbow table attacks impractical."""
+        # Common passwords/emails that might be in rainbow tables
+        common_ids = ["admin", "test@test.com", "user123", "john.doe@company.com"]
+
+        for user_id in common_ids:
+            hash_result = _hash_user_id(user_id)
+            # Hash should not match simple SHA-256 of just the user ID
+            simple_hash = hashlib.sha256(user_id.encode()).hexdigest()
+            assert hash_result != simple_hash
+
+    def test_anonymization_preserves_uniqueness(self):
+        """Test that different users get different anonymous IDs."""
+        users = [
+            "user1@example.com",
+            "user2@example.com",
+            "admin@company.com",
+            "test.user@domain.org",
+        ]
+
+        hashes = [_hash_user_id(user) for user in users]
+
+        # All hashes should be unique
+        assert len(set(hashes)) == len(hashes)
+
+
+class TestUserMappingModel:
+    """Test the UserMapping database model."""
+
+    def test_user_mapping_attributes(self):
+        """Test UserMapping model has correct attributes."""
+        mapping = UserMapping()
+
+        # Check that required attributes exist
+        assert hasattr(mapping, "anonymous_id")
+        assert hasattr(mapping, "user_id_hash")
+        assert hasattr(mapping, "created_at")
+
+        # Check table name
+        assert UserMapping.__tablename__ == "user_mapping"
+
+    def test_user_mapping_creation(self):
+        """Test creating UserMapping instance."""
+        mapping = UserMapping(
+            anonymous_id="test-uuid-123", user_id_hash="hashed_user_id"
+        )
+
+        assert mapping.anonymous_id == "test-uuid-123"
+        assert mapping.user_id_hash == "hashed_user_id"

From e1682760e2e529f43dd14f72d3d00422c32e6f0b Mon Sep 17 00:00:00 2001
From: Erik M Jacobs <erikmjacobs@gmail.com>
Date: Thu, 21 Aug 2025 15:44:41 -0400
Subject: [PATCH 2/6] coderabbit fixes

---
 src/app/endpoints/conversations.py            |  3 +-
 src/app/endpoints/query.py                    |  3 +-
 src/models/database/conversations.py          |  6 +-
 src/models/database/user_mapping.py           | 10 ++-
 src/utils/endpoints.py                        | 16 ++++-
 src/utils/transcripts.py                      |  3 +-
 src/utils/user_anonymization.py               | 26 ++++++--
 .../app/endpoints/test_query_anonymization.py | 17 ++++-
 tests/unit/utils/conftest.py                  |  6 ++
 .../utils/test_endpoints_anonymization.py     | 58 +++++++++++------
 .../utils/test_transcripts_anonymization.py   | 33 +++++++---
 tests/unit/utils/test_user_anonymization.py   | 62 ++++++++++++++++++-
 12 files changed, 191 insertions(+), 52 deletions(-)
 create mode 100644 tests/unit/utils/conftest.py

diff --git a/src/app/endpoints/conversations.py b/src/app/endpoints/conversations.py
index c58ebcce..98e40ee7 100644
--- a/src/app/endpoints/conversations.py
+++ b/src/app/endpoints/conversations.py
@@ -159,8 +159,7 @@ def get_conversations_list_endpoint_handler(
     anonymous_user_id = get_anonymous_user_id(user_id)
 
     logger.info(
-        "Retrieving conversations for user %s (anonymous: %s)",
-        user_id[:8] + "..." if len(user_id) > 8 else user_id,
+        "Retrieving conversations for anonymous user %s",
         anonymous_user_id,
     )
 
diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py
index 92afdf8b..5d67892a 100644
--- a/src/app/endpoints/query.py
+++ b/src/app/endpoints/query.py
@@ -98,10 +98,9 @@ def persist_user_conversation_details(
             )
             session.add(conversation)
             logger.debug(
-                "Associated conversation %s to anonymous user %s (original: %s)",
+                "Associated conversation %s to anonymous user %s",
                 conversation_id,
                 anonymous_user_id,
-                user_id[:8] + "..." if len(user_id) > 8 else user_id,
             )
         else:
             existing_conversation.last_used_model = model
diff --git a/src/models/database/conversations.py b/src/models/database/conversations.py
index ce0ff795..73d65ac4 100644
--- a/src/models/database/conversations.py
+++ b/src/models/database/conversations.py
@@ -3,7 +3,7 @@
 from datetime import datetime
 
 from sqlalchemy.orm import Mapped, mapped_column
-from sqlalchemy import DateTime, func
+from sqlalchemy import DateTime, func, ForeignKey
 
 from models.database.base import Base
 
@@ -17,7 +17,9 @@ class UserConversation(Base):  # pylint: disable=too-few-public-methods
     id: Mapped[str] = mapped_column(primary_key=True)
 
     # The anonymous user ID associated with the conversation
-    anonymous_user_id: Mapped[str] = mapped_column(index=True)
+    anonymous_user_id: Mapped[str] = mapped_column(
+        ForeignKey("user_mapping.anonymous_id"), index=True, nullable=False
+    )
 
     # The last provider/model used in the conversation
     last_used_model: Mapped[str] = mapped_column()
diff --git a/src/models/database/user_mapping.py b/src/models/database/user_mapping.py
index f9f245ef..b01f2408 100644
--- a/src/models/database/user_mapping.py
+++ b/src/models/database/user_mapping.py
@@ -3,7 +3,7 @@
 from datetime import datetime
 
 from sqlalchemy.orm import Mapped, mapped_column
-from sqlalchemy import DateTime, func, Index
+from sqlalchemy import DateTime, func, Index, String
 
 from models.database.base import Base
 
@@ -14,10 +14,14 @@ class UserMapping(Base):  # pylint: disable=too-few-public-methods
     __tablename__ = "user_mapping"
 
     # Anonymous UUID used for all storage/analytics (primary key)
-    anonymous_id: Mapped[str] = mapped_column(primary_key=True)
+    anonymous_id: Mapped[str] = mapped_column(
+        String(36), primary_key=True, nullable=False
+    )
 
     # Original user ID from authentication (hashed for security)
-    user_id_hash: Mapped[str] = mapped_column(index=True, unique=True)
+    user_id_hash: Mapped[str] = mapped_column(
+        String(64), index=True, unique=True, nullable=False
+    )
 
     created_at: Mapped[datetime] = mapped_column(
         DateTime(timezone=True),
diff --git a/src/utils/endpoints.py b/src/utils/endpoints.py
index 78a949df..79c1d98e 100644
--- a/src/utils/endpoints.py
+++ b/src/utils/endpoints.py
@@ -21,8 +21,13 @@
 
 def validate_conversation_ownership(
     user_id: str, conversation_id: str
-) -> UserConversation | None:
-    """Validate that the conversation belongs to the user using anonymous ID lookup."""
+) -> UserConversation:
+    """
+    Validate that the conversation belongs to the user using anonymous ID lookup.
+
+    Raises HTTPException(403) if conversation is not found or doesn't belong to user.
+    Returns the conversation object if valid.
+    """
     # Get anonymous user ID for database lookup
     anonymous_user_id = get_anonymous_user_id(user_id)
 
@@ -32,6 +37,13 @@ def validate_conversation_ownership(
             .filter_by(id=conversation_id, anonymous_user_id=anonymous_user_id)
             .first()
         )
+
+        if not conversation:
+            raise HTTPException(
+                status_code=status.HTTP_403_FORBIDDEN,
+                detail="Forbidden: conversation does not belong to user",
+            )
+
         return conversation
 
 
diff --git a/src/utils/transcripts.py b/src/utils/transcripts.py
index 17e9638c..dd28e51c 100644
--- a/src/utils/transcripts.py
+++ b/src/utils/transcripts.py
@@ -60,8 +60,7 @@ def store_transcript(  # pylint: disable=too-many-arguments,too-many-positional-
     # Get anonymous user ID for storage
     anonymous_user_id = get_anonymous_user_id(user_id)
     logger.debug(
-        "Anonymized user %s to %s for transcript storage",
-        user_id[:8] + "..." if len(user_id) > 8 else user_id,
+        "Storing transcript for anonymous user %s",
         anonymous_user_id,
     )
 
diff --git a/src/utils/user_anonymization.py b/src/utils/user_anonymization.py
index 7ab8929f..0e63467d 100644
--- a/src/utils/user_anonymization.py
+++ b/src/utils/user_anonymization.py
@@ -1,7 +1,9 @@
 """User ID anonymization utilities."""
 
 import hashlib
+import hmac
 import logging
+import os
 from typing import Optional
 
 from sqlalchemy.exc import IntegrityError
@@ -12,18 +14,30 @@
 
 logger = logging.getLogger("utils.user_anonymization")
 
+# Load HMAC pepper from environment - fail at startup if missing
+try:
+    _USER_ANON_PEPPER = os.environ["USER_ANON_PEPPER"].encode("utf-8")
+except KeyError as e:
+    raise RuntimeError(
+        "USER_ANON_PEPPER environment variable is required for user anonymization. "
+        "Set a secure random value (e.g., 'export USER_ANON_PEPPER=<your-secret-key>')"
+    ) from e
+
 
 def _hash_user_id(user_id: str) -> str:
     """
     Create a consistent hash of the user ID for mapping purposes.
 
-    Uses SHA-256 with a fixed salt to ensure consistent hashing
-    while preventing rainbow table attacks.
+    Uses HMAC-SHA256 with a server-secret pepper to ensure consistent hashing
+    while preventing rainbow table attacks and providing cryptographic security.
     """
-    # Use a fixed salt - in production, this should be configurable
-    salt = "lightspeed_user_anonymization_salt_v1"
-    hash_input = f"{salt}:{user_id}".encode("utf-8")
-    return hashlib.sha256(hash_input).hexdigest()
+    # Normalize user ID: trim whitespace and lowercase
+    normalized_user_id = user_id.strip().lower()
+
+    # Use HMAC-SHA256 with secret pepper
+    return hmac.new(
+        _USER_ANON_PEPPER, normalized_user_id.encode("utf-8"), hashlib.sha256
+    ).hexdigest()
 
 
 def get_anonymous_user_id(auth_user_id: str) -> str:
diff --git a/tests/unit/app/endpoints/test_query_anonymization.py b/tests/unit/app/endpoints/test_query_anonymization.py
index 61c76467..808b807f 100644
--- a/tests/unit/app/endpoints/test_query_anonymization.py
+++ b/tests/unit/app/endpoints/test_query_anonymization.py
@@ -1,14 +1,25 @@
 """Tests for query endpoint anonymization functionality."""
 
 import logging
+import os
 from unittest.mock import patch, MagicMock
 
+import pytest
 from sqlalchemy import inspect
 
 from app.endpoints.query import persist_user_conversation_details
 from models.database.conversations import UserConversation
 
 
+# Set up test environment variable before importing the module
+@pytest.fixture(autouse=True)
+def setup_test_pepper():
+    """Set up test pepper environment variable for all tests."""
+    test_pepper = "test-pepper-for-query-tests"
+    with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}):
+        yield
+
+
 class TestQueryAnonymization:
     """Test query endpoint user anonymization."""
 
@@ -126,11 +137,13 @@ def test_persist_user_conversation_logs_anonymization(
         ]
         assert len(anonymization_logs) > 0
 
-        # Verify log contains both anonymous and truncated original user
+        # Verify log contains anonymous user and conversation ID (but no original user ID)
         log_msg = anonymization_logs[0]
         assert "anon-logging-456" in log_msg
         assert "logging-conv-789" in log_msg
-        assert "logging_..." in log_msg  # Truncated original user ID
+        # Should NOT contain any reference to original user ID
+        assert "logging_test_user" not in log_msg
+        assert "logging_..." not in log_msg
 
     def test_conversation_model_uses_anonymous_field(self):
         """Test that UserConversation model uses anonymous_user_id field."""
diff --git a/tests/unit/utils/conftest.py b/tests/unit/utils/conftest.py
new file mode 100644
index 00000000..d39526e9
--- /dev/null
+++ b/tests/unit/utils/conftest.py
@@ -0,0 +1,6 @@
+"""Configuration for utils unit tests."""
+
+import os
+
+# Set environment variable for all tests in this directory
+os.environ.setdefault("USER_ANON_PEPPER", "test-pepper-for-utils-tests")
diff --git a/tests/unit/utils/test_endpoints_anonymization.py b/tests/unit/utils/test_endpoints_anonymization.py
index 1e14f7b0..dd5b23c1 100644
--- a/tests/unit/utils/test_endpoints_anonymization.py
+++ b/tests/unit/utils/test_endpoints_anonymization.py
@@ -1,10 +1,22 @@
 """Tests for endpoint utilities anonymization functionality."""
 
+import os
 from unittest.mock import patch, MagicMock
+
 import pytest
+from fastapi import HTTPException
 
-from utils.endpoints import validate_conversation_ownership
 from models.database.conversations import UserConversation
+from utils.endpoints import validate_conversation_ownership
+
+
+# Set up test environment variable before importing the module
+@pytest.fixture(autouse=True)
+def setup_test_pepper():
+    """Set up test pepper environment variable for all tests."""
+    test_pepper = "test-pepper-for-endpoints-tests"
+    with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}):
+        yield
 
 
 class TestEndpointsAnonymization:
@@ -50,24 +62,28 @@ def test_validate_conversation_ownership_uses_anonymous_id(
     def test_validate_conversation_ownership_not_owned(
         self, mock_get_session, mock_get_anonymous
     ):
-        """Test validation fails when user doesn't own conversation."""
+        """Test validation raises 403 when user doesn't own conversation."""
         mock_get_anonymous.return_value = "anon-not-owner-789"
         mock_session = MagicMock()
         mock_get_session.return_value.__enter__.return_value = mock_session
         mock_session.query.return_value.filter_by.return_value.first.return_value = None
 
-        # Call validation
-        result = validate_conversation_ownership(
-            user_id="notowner@example.com", conversation_id="not-owned-conv"
-        )
+        # Call validation should raise HTTPException
+        with pytest.raises(HTTPException) as exc_info:
+            validate_conversation_ownership(
+                user_id="notowner@example.com", conversation_id="not-owned-conv"
+            )
 
         # Verify anonymous user ID was used for lookup
         mock_get_anonymous.assert_called_once_with("notowner@example.com")
         filter_call = mock_session.query.return_value.filter_by.call_args
         assert filter_call[1]["anonymous_user_id"] == "anon-not-owner-789"
 
-        # Verify None returned for non-owned conversation
-        assert result is None
+        # Verify 403 error raised for non-owned conversation
+        assert exc_info.value.status_code == 403
+        assert "Forbidden: conversation does not belong to user" in str(
+            exc_info.value.detail
+        )
 
     @patch("utils.endpoints.get_anonymous_user_id")
     @patch("utils.endpoints.get_session")
@@ -89,10 +105,11 @@ def mock_anonymous_side_effect(user_id):
         mock_get_session.return_value.__enter__.return_value = mock_session
         mock_session.query.return_value.filter_by.return_value.first.return_value = None
 
-        # Validate for user 1
-        result1 = validate_conversation_ownership(
-            user_id="user1@example.com", conversation_id="shared-conv-test"
-        )
+        # Validate for user 1 - should raise 403
+        with pytest.raises(HTTPException) as exc_info1:
+            validate_conversation_ownership(
+                user_id="user1@example.com", conversation_id="shared-conv-test"
+            )
 
         first_call = mock_session.query.return_value.filter_by.call_args
         assert first_call[1]["anonymous_user_id"] == "anon-user1-validation"
@@ -101,18 +118,19 @@ def mock_anonymous_side_effect(user_id):
         mock_session.reset_mock()
         mock_session.query.return_value.filter_by.return_value.first.return_value = None
 
-        # Validate for user 2
-        result2 = validate_conversation_ownership(
-            user_id="user2@example.com",
-            conversation_id="shared-conv-test",  # Same conversation ID
-        )
+        # Validate for user 2 - should also raise 403
+        with pytest.raises(HTTPException) as exc_info2:
+            validate_conversation_ownership(
+                user_id="user2@example.com",
+                conversation_id="shared-conv-test",  # Same conversation ID
+            )
 
         second_call = mock_session.query.return_value.filter_by.call_args
         assert second_call[1]["anonymous_user_id"] == "anon-user2-validation"
 
-        # Both should return None since neither owns the conversation
-        assert result1 is None
-        assert result2 is None
+        # Both should raise 403 since neither owns the conversation
+        assert exc_info1.value.status_code == 403
+        assert exc_info2.value.status_code == 403
 
         # But they used different anonymous IDs (checking the calls, not constant comparison)
         assert first_call[1]["anonymous_user_id"] != second_call[1]["anonymous_user_id"]
diff --git a/tests/unit/utils/test_transcripts_anonymization.py b/tests/unit/utils/test_transcripts_anonymization.py
index 869b1c55..abadc4e5 100644
--- a/tests/unit/utils/test_transcripts_anonymization.py
+++ b/tests/unit/utils/test_transcripts_anonymization.py
@@ -1,16 +1,27 @@
 """Tests for transcript anonymization functionality."""
 
 import json
+import os
 import tempfile
 from pathlib import Path
 from unittest.mock import patch
 
+import pytest
 
 from models.requests import QueryRequest, Attachment
 from utils.transcripts import store_transcript, construct_transcripts_path
 from utils.types import TurnSummary
 
 
+# Set up test environment variable before importing the module
+@pytest.fixture(autouse=True)
+def setup_test_pepper():
+    """Set up test pepper environment variable for all tests."""
+    test_pepper = "test-pepper-for-transcript-tests"
+    with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}):
+        yield
+
+
 class TestTranscriptAnonymization:
     """Test transcript storage with user anonymization."""
 
@@ -260,12 +271,18 @@ def test_logging_shows_anonymization(self, mock_get_anonymous, caplog):
                             attachments=[],
                         )
 
-        # Check that anonymization is logged
+        # Check that transcript storage is logged
         log_messages = [record.message for record in caplog.records]
-        anonymization_logs = [msg for msg in log_messages if "Anonymized user" in msg]
-        assert len(anonymization_logs) > 0
-
-        # Verify the log shows original user is being anonymized
-        anonymization_log = anonymization_logs[0]
-        assert "anon-logging-test" in anonymization_log
-        assert "log_test..." in anonymization_log  # Truncated original user ID
+        storage_logs = [
+            msg
+            for msg in log_messages
+            if "Storing transcript for anonymous user" in msg
+        ]
+        assert len(storage_logs) > 0
+
+        # Verify the log shows only anonymous user ID (no original user ID)
+        storage_log = storage_logs[0]
+        assert "anon-logging-test" in storage_log
+        # Should NOT contain any reference to original user ID
+        assert "log_test_user" not in storage_log
+        assert "log_test..." not in storage_log
diff --git a/tests/unit/utils/test_user_anonymization.py b/tests/unit/utils/test_user_anonymization.py
index bdbce8f5..16f28b5d 100644
--- a/tests/unit/utils/test_user_anonymization.py
+++ b/tests/unit/utils/test_user_anonymization.py
@@ -1,18 +1,34 @@
 """Tests for user anonymization utilities."""
 
 import hashlib
+import hmac
+import importlib
+import os
 from unittest.mock import patch, MagicMock
 
 import pytest
 from sqlalchemy.exc import IntegrityError
 
+from models.database.user_mapping import UserMapping
 from utils.user_anonymization import (
     get_anonymous_user_id,
     get_user_count,
     find_anonymous_user_id,
     _hash_user_id,
 )
-from models.database.user_mapping import UserMapping
+
+
+# Set up test environment variable for each test
+@pytest.fixture(autouse=True)
+def setup_test_pepper():
+    """Set up test pepper environment variable for all tests."""
+    test_pepper = "test-pepper-for-anonymization-tests"
+    with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}):
+        # Reload the module to pick up the new environment variable
+        import utils.user_anonymization  # pylint: disable=import-outside-toplevel
+
+        importlib.reload(utils.user_anonymization)
+        yield
 
 
 class TestUserAnonymization:
@@ -174,8 +190,8 @@ def test_find_anonymous_user_id_not_found(self, mock_get_session):
 
         assert result is None
 
-    def test_salt_prevents_rainbow_attacks(self):
-        """Test that salt makes rainbow table attacks impractical."""
+    def test_hmac_prevents_rainbow_attacks(self):
+        """Test that HMAC makes rainbow table attacks impractical."""
         # Common passwords/emails that might be in rainbow tables
         common_ids = ["admin", "test@test.com", "user123", "john.doe@company.com"]
 
@@ -185,6 +201,12 @@ def test_salt_prevents_rainbow_attacks(self):
             simple_hash = hashlib.sha256(user_id.encode()).hexdigest()
             assert hash_result != simple_hash
 
+            # Hash should not match HMAC without pepper
+            hmac_without_pepper = hmac.new(
+                b"", user_id.strip().lower().encode(), hashlib.sha256
+            ).hexdigest()
+            assert hash_result != hmac_without_pepper
+
     def test_anonymization_preserves_uniqueness(self):
         """Test that different users get different anonymous IDs."""
         users = [
@@ -199,6 +221,40 @@ def test_anonymization_preserves_uniqueness(self):
         # All hashes should be unique
         assert len(set(hashes)) == len(hashes)
 
+    def test_user_id_normalization(self):
+        """Test that user ID normalization works correctly."""
+        # Test case variations should produce the same hash
+        variations = [
+            "User@Example.Com",
+            "user@example.com",
+            " user@example.com ",
+            "USER@EXAMPLE.COM",
+            "  USER@EXAMPLE.COM  ",
+        ]
+
+        hashes = [_hash_user_id(variation) for variation in variations]
+
+        # All variations should produce the same hash
+        assert (
+            len(set(hashes)) == 1
+        ), "All case/whitespace variations should hash the same"
+
+        # But different actual users should still be different
+        different_user = _hash_user_id("different@example.com")
+        assert different_user not in hashes
+
+    def test_missing_pepper_env_var(self):
+        """Test that missing pepper environment variable raises clear error."""
+        import utils.user_anonymization  # pylint: disable=import-outside-toplevel
+
+        # Test that module import fails with missing env var
+        with patch.dict(os.environ, {}, clear=True):
+            with pytest.raises(
+                RuntimeError, match="USER_ANON_PEPPER environment variable is required"
+            ):
+                # Force reimport to trigger the env var check
+                importlib.reload(utils.user_anonymization)
+
 
 class TestUserMappingModel:
     """Test the UserMapping database model."""

From fd32c3e44a1298c9ce655504c6b85101a563b02b Mon Sep 17 00:00:00 2001
From: Erik M Jacobs <erikmjacobs@gmail.com>
Date: Thu, 21 Aug 2025 16:14:54 -0400
Subject: [PATCH 3/6] coderabbit

---
 tests/conftest.py                           |  7 ++
 tests/unit/app/endpoints/test_query.py      |  6 +-
 tests/unit/utils/test_user_anonymization.py | 85 ++++++++++++---------
 3 files changed, 58 insertions(+), 40 deletions(-)
 create mode 100644 tests/conftest.py

diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 00000000..4be2f7fe
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,7 @@
+"""Global configuration for all tests."""
+
+import os
+
+# Set environment variable for all tests globally
+# This ensures the anonymization module can be imported in CI environments
+os.environ.setdefault("USER_ANON_PEPPER", "test-pepper-for-all-tests")
diff --git a/tests/unit/app/endpoints/test_query.py b/tests/unit/app/endpoints/test_query.py
index 3356f2b2..c6acb888 100644
--- a/tests/unit/app/endpoints/test_query.py
+++ b/tests/unit/app/endpoints/test_query.py
@@ -169,7 +169,7 @@ async def _test_query_endpoint_handler(mocker, store_transcript_to_file=False):
     # Assert the store_transcript function is called if transcripts are enabled
     if store_transcript_to_file:
         mock_transcript.assert_called_once_with(
-            user_id="mock_user_id",
+            anonymous_user_id="mock_user_id",
             conversation_id=conversation_id,
             model_id="fake_model_id",
             provider_id="fake_provider_id",
@@ -1409,7 +1409,7 @@ def test_no_tools_parameter_backward_compatibility():
         (
             UserConversation(
                 id="conv1",
-                user_id="user1",
+                anonymous_user_id="user1",
                 last_used_provider="foo",
                 last_used_model="bar",
                 message_count=1,
@@ -1428,7 +1428,7 @@ def test_no_tools_parameter_backward_compatibility():
         (
             UserConversation(
                 id="conv1",
-                user_id="user1",
+                anonymous_user_id="user1",
                 last_used_provider="foo",
                 last_used_model="bar",
                 message_count=1,
diff --git a/tests/unit/utils/test_user_anonymization.py b/tests/unit/utils/test_user_anonymization.py
index 16f28b5d..a0a559ea 100644
--- a/tests/unit/utils/test_user_anonymization.py
+++ b/tests/unit/utils/test_user_anonymization.py
@@ -10,12 +10,8 @@
 from sqlalchemy.exc import IntegrityError
 
 from models.database.user_mapping import UserMapping
-from utils.user_anonymization import (
-    get_anonymous_user_id,
-    get_user_count,
-    find_anonymous_user_id,
-    _hash_user_id,
-)
+
+# Functions under test are accessed via a module fixture (see `ua` below)
 
 
 # Set up test environment variable for each test
@@ -31,29 +27,40 @@ def setup_test_pepper():
         yield
 
 
-class TestUserAnonymization:
+@pytest.fixture
+def user_anon_module():
+    """Return a fresh reference to the reloaded utils.user_anonymization module."""
+    import utils.user_anonymization as _ua  # pylint: disable=import-outside-toplevel
+
+    importlib.reload(_ua)
+    return _ua
+
+
+class TestUserAnonymization:  # pylint: disable=redefined-outer-name,protected-access
     """Test user anonymization functionality."""
 
-    def test_hash_user_id_consistency(self):
+    def test_hash_user_id_consistency(self, user_anon_module):
         """Test that user ID hashing is consistent."""
         user_id = "test@example.com"
-        hash1 = _hash_user_id(user_id)
-        hash2 = _hash_user_id(user_id)
+        hash1 = user_anon_module._hash_user_id(user_id)
+        hash2 = user_anon_module._hash_user_id(user_id)
 
         assert hash1 == hash2
         assert len(hash1) == 64  # SHA-256 hex length
         assert hash1 != user_id  # Should be different from original
 
-    def test_hash_user_id_different_for_different_users(self):
+    def test_hash_user_id_different_for_different_users(self, user_anon_module):
         """Test that different user IDs produce different hashes."""
-        user1_hash = _hash_user_id("user1@example.com")
-        user2_hash = _hash_user_id("user2@example.com")
+        user1_hash = user_anon_module._hash_user_id("user1@example.com")
+        user2_hash = user_anon_module._hash_user_id("user2@example.com")
 
         assert user1_hash != user2_hash
 
     @patch("utils.user_anonymization.get_session")
     @patch("utils.user_anonymization.get_suid")
-    def test_get_anonymous_user_id_new_user(self, mock_get_suid, mock_get_session):
+    def test_get_anonymous_user_id_new_user(
+        self, mock_get_suid, mock_get_session, user_anon_module
+    ):
         """Test creating new anonymous ID for first-time user."""
         # Setup mocks
         mock_session = MagicMock()
@@ -62,7 +69,7 @@ def test_get_anonymous_user_id_new_user(self, mock_get_suid, mock_get_session):
         mock_get_suid.return_value = "anon-123-456"
 
         user_id = "new_user@example.com"
-        result = get_anonymous_user_id(user_id)
+        result = user_anon_module.get_anonymous_user_id(user_id)
 
         # Verify result
         assert result == "anon-123-456"
@@ -75,15 +82,19 @@ def test_get_anonymous_user_id_new_user(self, mock_get_suid, mock_get_session):
         added_mapping = mock_session.add.call_args[0][0]
         assert isinstance(added_mapping, UserMapping)
         assert added_mapping.anonymous_id == "anon-123-456"
-        assert added_mapping.user_id_hash == _hash_user_id(user_id)
+        assert added_mapping.user_id_hash == user_anon_module._hash_user_id(user_id)
 
     @patch("utils.user_anonymization.get_session")
-    def test_get_anonymous_user_id_existing_user(self, mock_get_session):
+    def test_get_anonymous_user_id_existing_user(
+        self, mock_get_session, user_anon_module
+    ):
         """Test retrieving existing anonymous ID for returning user."""
         # Setup existing mapping
         existing_mapping = UserMapping()
         existing_mapping.anonymous_id = "existing-anon-789"
-        existing_mapping.user_id_hash = _hash_user_id("existing@example.com")
+        existing_mapping.user_id_hash = user_anon_module._hash_user_id(
+            "existing@example.com"
+        )
 
         mock_session = MagicMock()
         mock_get_session.return_value.__enter__.return_value = mock_session
@@ -91,7 +102,7 @@ def test_get_anonymous_user_id_existing_user(self, mock_get_session):
             existing_mapping
         )
 
-        result = get_anonymous_user_id("existing@example.com")
+        result = user_anon_module.get_anonymous_user_id("existing@example.com")
 
         # Verify result
         assert result == "existing-anon-789"
@@ -103,7 +114,7 @@ def test_get_anonymous_user_id_existing_user(self, mock_get_session):
     @patch("utils.user_anonymization.get_session")
     @patch("utils.user_anonymization.get_suid")
     def test_get_anonymous_user_id_race_condition(
-        self, mock_get_suid, mock_get_session
+        self, mock_get_suid, mock_get_session, user_anon_module
     ):
         """Test handling race condition when creating user mapping."""
         # Setup mocks for race condition scenario
@@ -123,7 +134,7 @@ def test_get_anonymous_user_id_race_condition(
         mock_session.add.side_effect = IntegrityError("Duplicate key", None, None)
         mock_get_suid.return_value = "new-uuid"
 
-        result = get_anonymous_user_id("race_user@example.com")
+        result = user_anon_module.get_anonymous_user_id("race_user@example.com")
 
         # Should return the mapping created by the other thread
         assert result == "race-condition-uuid"
@@ -132,7 +143,7 @@ def test_get_anonymous_user_id_race_condition(
     @patch("utils.user_anonymization.get_session")
     @patch("utils.user_anonymization.get_suid")
     def test_get_anonymous_user_id_race_condition_failure(
-        self, mock_get_suid, mock_get_session
+        self, mock_get_suid, mock_get_session, user_anon_module
     ):
         """Test handling race condition where retrieval also fails."""
         mock_session = MagicMock()
@@ -149,22 +160,22 @@ def test_get_anonymous_user_id_race_condition_failure(
         with pytest.raises(
             RuntimeError, match="Unable to create or retrieve anonymous user ID"
         ):
-            get_anonymous_user_id("problematic_user@example.com")
+            user_anon_module.get_anonymous_user_id("problematic_user@example.com")
 
     @patch("utils.user_anonymization.get_session")
-    def test_get_user_count(self, mock_get_session):
+    def test_get_user_count(self, mock_get_session, user_anon_module):
         """Test getting total user count."""
         mock_session = MagicMock()
         mock_get_session.return_value.__enter__.return_value = mock_session
         mock_session.query.return_value.count.return_value = 42
 
-        result = get_user_count()
+        result = user_anon_module.get_user_count()
 
         assert result == 42
         mock_session.query.assert_called_once_with(UserMapping)
 
     @patch("utils.user_anonymization.get_session")
-    def test_find_anonymous_user_id_existing(self, mock_get_session):
+    def test_find_anonymous_user_id_existing(self, mock_get_session, user_anon_module):
         """Test finding existing anonymous ID without creating new one."""
         existing_mapping = UserMapping()
         existing_mapping.anonymous_id = "found-uuid"
@@ -175,28 +186,28 @@ def test_find_anonymous_user_id_existing(self, mock_get_session):
             existing_mapping
         )
 
-        result = find_anonymous_user_id("existing@example.com")
+        result = user_anon_module.find_anonymous_user_id("existing@example.com")
 
         assert result == "found-uuid"
 
     @patch("utils.user_anonymization.get_session")
-    def test_find_anonymous_user_id_not_found(self, mock_get_session):
+    def test_find_anonymous_user_id_not_found(self, mock_get_session, user_anon_module):
         """Test finding non-existing anonymous ID returns None."""
         mock_session = MagicMock()
         mock_get_session.return_value.__enter__.return_value = mock_session
         mock_session.query.return_value.filter_by.return_value.first.return_value = None
 
-        result = find_anonymous_user_id("nonexistent@example.com")
+        result = user_anon_module.find_anonymous_user_id("nonexistent@example.com")
 
         assert result is None
 
-    def test_hmac_prevents_rainbow_attacks(self):
+    def test_hmac_prevents_rainbow_attacks(self, user_anon_module):
         """Test that HMAC makes rainbow table attacks impractical."""
         # Common passwords/emails that might be in rainbow tables
         common_ids = ["admin", "test@test.com", "user123", "john.doe@company.com"]
 
         for user_id in common_ids:
-            hash_result = _hash_user_id(user_id)
+            hash_result = user_anon_module._hash_user_id(user_id)
             # Hash should not match simple SHA-256 of just the user ID
             simple_hash = hashlib.sha256(user_id.encode()).hexdigest()
             assert hash_result != simple_hash
@@ -207,7 +218,7 @@ def test_hmac_prevents_rainbow_attacks(self):
             ).hexdigest()
             assert hash_result != hmac_without_pepper
 
-    def test_anonymization_preserves_uniqueness(self):
+    def test_anonymization_preserves_uniqueness(self, user_anon_module):
         """Test that different users get different anonymous IDs."""
         users = [
             "user1@example.com",
@@ -216,12 +227,12 @@ def test_anonymization_preserves_uniqueness(self):
             "test.user@domain.org",
         ]
 
-        hashes = [_hash_user_id(user) for user in users]
+        hashes = [user_anon_module._hash_user_id(user) for user in users]
 
         # All hashes should be unique
         assert len(set(hashes)) == len(hashes)
 
-    def test_user_id_normalization(self):
+    def test_user_id_normalization(self, user_anon_module):
         """Test that user ID normalization works correctly."""
         # Test case variations should produce the same hash
         variations = [
@@ -232,7 +243,7 @@ def test_user_id_normalization(self):
             "  USER@EXAMPLE.COM  ",
         ]
 
-        hashes = [_hash_user_id(variation) for variation in variations]
+        hashes = [user_anon_module._hash_user_id(variation) for variation in variations]
 
         # All variations should produce the same hash
         assert (
@@ -240,7 +251,7 @@ def test_user_id_normalization(self):
         ), "All case/whitespace variations should hash the same"
 
         # But different actual users should still be different
-        different_user = _hash_user_id("different@example.com")
+        different_user = user_anon_module._hash_user_id("different@example.com")
         assert different_user not in hashes
 
     def test_missing_pepper_env_var(self):
@@ -256,7 +267,7 @@ def test_missing_pepper_env_var(self):
                 importlib.reload(utils.user_anonymization)
 
 
-class TestUserMappingModel:
+class TestUserMappingModel:  # pylint: disable=redefined-outer-name
     """Test the UserMapping database model."""
 
     def test_user_mapping_attributes(self):

From d6344dfdd787c9e727279d6ea139068ea8ed67aa Mon Sep 17 00:00:00 2001
From: Erik M Jacobs <erikmjacobs@gmail.com>
Date: Thu, 21 Aug 2025 17:18:59 -0400
Subject: [PATCH 4/6] env content

---
 .env.example        | 11 +++++++++++
 README.md           | 11 +++++++++++
 docker-compose.yaml |  1 +
 3 files changed, 23 insertions(+)
 create mode 100644 .env.example

diff --git a/.env.example b/.env.example
new file mode 100644
index 00000000..c268c75b
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,11 @@
+# Example Environment Variables for Lightspeed Stack
+# Copy this file to .env and set appropriate values
+
+# Required: User anonymization pepper (set to a secure random value)
+# This is used for HMAC-based user ID hashing to protect user privacy
+USER_ANON_PEPPER=your-secure-random-string-here
+
+# Optional: OpenAI API Key (if using OpenAI models)
+OPENAI_API_KEY=your-openai-api-key
+
+# Optional: Other environment variables as needed for your configuration
\ No newline at end of file
diff --git a/README.md b/README.md
index 1b0f9ecd..58b5caaf 100644
--- a/README.md
+++ b/README.md
@@ -79,6 +79,17 @@ Lightspeed Core Stack is based on the FastAPI framework (Uvicorn). The service i
     - please note that currently Python 3.14 is not officially supported
     - all sources are made (backward) compatible with Python 3.12; it is checked on CI
 
+## Environment Variables
+
+The following environment variable is required for the service to start:
+
+* `USER_ANON_PEPPER` - A secure random string used for user anonymization. Set this to a cryptographically secure random value:
+  ```bash
+  export USER_ANON_PEPPER="your-secure-random-string-here"
+  ```
+  
+  **Security Note**: This value should be treated as a secret and kept secure. It's used for HMAC-based user ID hashing to protect user privacy while enabling usage analytics.
+
 # Installation
 
 Installation steps depends on operation system. Please look at instructions for your system:
diff --git a/docker-compose.yaml b/docker-compose.yaml
index b3624526..5148c746 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -30,6 +30,7 @@ services:
       - ./lightspeed-stack.yaml:/app-root/lightspeed-stack.yaml:Z
     environment:
       - OPENAI_API_KEY=${OPENAI_API_KEY}
+      - USER_ANON_PEPPER=${USER_ANON_PEPPER:-default-pepper-for-development-only}
     depends_on:
         llama-stack:
           condition: service_healthy

From 26505ca78d7446b5dd2acf875ef73a9d62869cfa Mon Sep 17 00:00:00 2001
From: Erik M Jacobs <erikmjacobs@gmail.com>
Date: Tue, 26 Aug 2025 09:34:33 -0400
Subject: [PATCH 5/6] fix unit test failures

---
 src/app/endpoints/conversations.py            |  13 +-
 src/app/endpoints/feedback.py                 |  10 +-
 .../unit/app/endpoints/test_conversations.py  |  18 ++-
 tests/unit/app/endpoints/test_feedback.py     |   8 +-
 .../endpoints/test_feedback_anonymization.py  | 138 ++++++++++++++++++
 tests/unit/app/endpoints/test_query.py        |   4 +-
 tests/unit/utils/test_transcripts.py          |   9 +-
 .../utils/test_transcripts_anonymization.py   |  11 +-
 8 files changed, 199 insertions(+), 12 deletions(-)
 create mode 100644 tests/unit/app/endpoints/test_feedback_anonymization.py

diff --git a/src/app/endpoints/conversations.py b/src/app/endpoints/conversations.py
index 98e40ee7..324b0796 100644
--- a/src/app/endpoints/conversations.py
+++ b/src/app/endpoints/conversations.py
@@ -190,20 +190,27 @@ def get_conversations_list_endpoint_handler(
             ]
 
             logger.info(
-                "Found %d conversations for user %s", len(conversations), user_id
+                "Found %d conversations for anonymous user %s",
+                len(conversations),
+                anonymous_user_id,
             )
 
             return ConversationsListResponse(conversations=conversations)
 
         except Exception as e:
             logger.exception(
-                "Error retrieving conversations for user %s: %s", user_id, e
+                "Error retrieving conversations for anonymous user %s: %s",
+                anonymous_user_id,
+                e,
             )
             raise HTTPException(
                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                 detail={
                     "response": "Unknown error",
-                    "cause": f"Unknown error while getting conversations for user {user_id}",
+                    "cause": (
+                        f"Unknown error while getting conversations for "
+                        f"anonymous user {anonymous_user_id}"
+                    ),
                 },
             ) from e
 
diff --git a/src/app/endpoints/feedback.py b/src/app/endpoints/feedback.py
index 5d82267e..c9436f25 100644
--- a/src/app/endpoints/feedback.py
+++ b/src/app/endpoints/feedback.py
@@ -19,6 +19,7 @@
 )
 from models.requests import FeedbackRequest
 from utils.suid import get_suid
+from utils.user_anonymization import get_anonymous_user_id
 
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/feedback", tags=["feedback"])
@@ -131,7 +132,8 @@ def store_feedback(user_id: str, feedback: dict) -> None:
         user_id (str): Unique identifier of the user submitting feedback.
         feedback (dict): Feedback data to be stored, merged with user ID and timestamp.
     """
-    logger.debug("Storing feedback for user %s", user_id)
+    anonymous_user_id = get_anonymous_user_id(user_id)
+    logger.debug("Storing feedback for anonymous user %s", anonymous_user_id)
     # Creates storage path only if it doesn't exist. The `exist_ok=True` prevents
     # race conditions in case of multiple server instances trying to set up storage
     # at the same location.
@@ -141,7 +143,11 @@ def store_feedback(user_id: str, feedback: dict) -> None:
     storage_path.mkdir(parents=True, exist_ok=True)
 
     current_time = str(datetime.now(UTC))
-    data_to_store = {"user_id": user_id, "timestamp": current_time, **feedback}
+    data_to_store = {
+        "anonymous_user_id": anonymous_user_id,
+        "timestamp": current_time,
+        **feedback,
+    }
 
     # stores feedback in a file under unique uuid
     feedback_file_path = storage_path / f"{get_suid()}.json"
diff --git a/tests/unit/app/endpoints/test_conversations.py b/tests/unit/app/endpoints/test_conversations.py
index aed6fdc5..d612d5ee 100644
--- a/tests/unit/app/endpoints/test_conversations.py
+++ b/tests/unit/app/endpoints/test_conversations.py
@@ -46,6 +46,10 @@ def create_mock_conversation(
 
 def mock_database_session(mocker, query_result=None):
     """Helper function to mock get_session with proper context manager support."""
+    # Mock database initialization
+    mocker.patch("app.database.engine", mocker.Mock())
+    mocker.patch("app.database.SessionLocal", mocker.Mock())
+
     mock_session = mocker.Mock()
     if query_result is not None:
         mock_session.query.return_value.filter_by.return_value.all.return_value = (
@@ -530,6 +534,10 @@ def test_configuration_not_loaded(self, mocker):
     def test_successful_conversations_list_retrieval(self, mocker, setup_configuration):
         """Test successful retrieval of conversations list."""
         mocker.patch("app.endpoints.conversations.configuration", setup_configuration)
+        mocker.patch(
+            "app.endpoints.conversations.get_anonymous_user_id",
+            return_value="anon-test-user",
+        )
 
         # Mock database session and query results
         mock_conversations = [
@@ -570,6 +578,10 @@ def test_successful_conversations_list_retrieval(self, mocker, setup_configurati
     def test_empty_conversations_list(self, mocker, setup_configuration):
         """Test when user has no conversations."""
         mocker.patch("app.endpoints.conversations.configuration", setup_configuration)
+        mocker.patch(
+            "app.endpoints.conversations.get_anonymous_user_id",
+            return_value="anon-test-user",
+        )
 
         # Mock database session with no results
         mock_database_session(mocker, [])
@@ -583,6 +595,10 @@ def test_empty_conversations_list(self, mocker, setup_configuration):
     def test_database_exception(self, mocker, setup_configuration):
         """Test when database query raises an exception."""
         mocker.patch("app.endpoints.conversations.configuration", setup_configuration)
+        mocker.patch(
+            "app.endpoints.conversations.get_anonymous_user_id",
+            return_value="anon-test-user",
+        )
 
         # Mock database session to raise exception
         mock_session = mock_database_session(mocker)
@@ -594,6 +610,6 @@ def test_database_exception(self, mocker, setup_configuration):
         assert exc_info.value.status_code == status.HTTP_500_INTERNAL_SERVER_ERROR
         assert "Unknown error" in exc_info.value.detail["response"]
         assert (
-            "Unknown error while getting conversations for user"
+            "Unknown error while getting conversations for anonymous user"
             in exc_info.value.detail["cause"]
         )
diff --git a/tests/unit/app/endpoints/test_feedback.py b/tests/unit/app/endpoints/test_feedback.py
index 337552c0..763f3b81 100644
--- a/tests/unit/app/endpoints/test_feedback.py
+++ b/tests/unit/app/endpoints/test_feedback.py
@@ -137,6 +137,9 @@ def test_store_feedback(mocker, feedback_request_data):
     mocker.patch("builtins.open", mocker.mock_open())
     mocker.patch("app.endpoints.feedback.Path", return_value=mocker.MagicMock())
     mocker.patch("app.endpoints.feedback.get_suid", return_value="fake-uuid")
+    mocker.patch(
+        "app.endpoints.feedback.get_anonymous_user_id", return_value="anon-test-user"
+    )
 
     # Patch json to inspect stored data
     mock_json = mocker.patch("app.endpoints.feedback.json")
@@ -146,7 +149,7 @@ def test_store_feedback(mocker, feedback_request_data):
     store_feedback(user_id, feedback_request_data)
 
     expected_data = {
-        "user_id": user_id,
+        "anonymous_user_id": "anon-test-user",
         "timestamp": mocker.ANY,
         **feedback_request_data,
     }
@@ -182,6 +185,9 @@ def test_store_feedback_on_io_error(mocker, feedback_request_data):
     configuration.user_data_collection_configuration.feedback_storage = "fake-path"
     mocker.patch("app.endpoints.feedback.Path", return_value=mocker.MagicMock())
     mocker.patch("builtins.open", side_effect=PermissionError("EACCES"))
+    mocker.patch(
+        "app.endpoints.feedback.get_anonymous_user_id", return_value="anon-test-user"
+    )
 
     user_id = "test_user_id"
 
diff --git a/tests/unit/app/endpoints/test_feedback_anonymization.py b/tests/unit/app/endpoints/test_feedback_anonymization.py
new file mode 100644
index 00000000..26e2b4af
--- /dev/null
+++ b/tests/unit/app/endpoints/test_feedback_anonymization.py
@@ -0,0 +1,138 @@
+"""Tests for feedback endpoint anonymization functionality."""
+
+import os
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+from app.endpoints.feedback import store_feedback
+
+
+# Set up test environment variable before importing the module
+@pytest.fixture(autouse=True)
+def setup_test_pepper():
+    """Set up test pepper environment variable for all tests."""
+    test_pepper = "test-pepper-for-feedback-tests"
+    with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}):
+        yield
+
+
+class TestFeedbackAnonymization:
+    """Test feedback storage with user anonymization."""
+
+    @patch("app.endpoints.feedback.get_anonymous_user_id")
+    @patch("app.endpoints.feedback.get_suid")
+    @patch("app.endpoints.feedback.json")
+    @patch("app.endpoints.feedback.Path")
+    def test_store_feedback_anonymizes_user_id(
+        self, mock_path, mock_json, mock_get_suid, mock_get_anonymous
+    ):
+        """Test that store_feedback uses anonymous user ID."""
+        # Setup mocks
+        mock_get_anonymous.return_value = "anon-feedback-123"
+        mock_get_suid.return_value = "feedback-uuid"
+        mock_path.return_value = MagicMock()
+
+        # Mock configuration
+        with (
+            patch("app.endpoints.feedback.configuration") as mock_config,
+            patch("builtins.open"),
+        ):
+            mock_config.user_data_collection_configuration.feedback_storage = (
+                "/tmp/feedback"
+            )
+
+            # Call store_feedback
+            store_feedback(
+                user_id="original_user@example.com",
+                feedback={
+                    "feedback": "This is test feedback",
+                    "sentiment": 1,
+                    "categories": ["helpful"],
+                },
+            )
+
+            # Verify anonymous user ID was used
+            mock_get_anonymous.assert_called_once_with("original_user@example.com")
+
+            # Verify stored data uses anonymous ID
+            stored_data = mock_json.dump.call_args[0][0]
+            assert stored_data["anonymous_user_id"] == "anon-feedback-123"
+            assert "user_id" not in stored_data
+            assert stored_data["feedback"] == "This is test feedback"
+            assert stored_data["sentiment"] == 1
+            assert stored_data["categories"] == ["helpful"]
+
+    @patch("app.endpoints.feedback.get_anonymous_user_id")
+    def test_store_feedback_different_users_get_different_anonymous_ids(
+        self, mock_get_anonymous
+    ):
+        """Test that different users get different anonymous IDs for feedback."""
+
+        def mock_anonymous_side_effect(user_id):
+            if user_id == "user1@example.com":
+                return "anon-feedback-user1"
+            if user_id == "user2@example.com":
+                return "anon-feedback-user2"
+            return "anon-unknown"
+
+        mock_get_anonymous.side_effect = mock_anonymous_side_effect
+
+        with (
+            patch("app.endpoints.feedback.json") as mock_json,
+            patch("app.endpoints.feedback.Path") as mock_path,
+            patch("builtins.open"),
+            patch("app.endpoints.feedback.get_suid", return_value="uuid"),
+            patch("app.endpoints.feedback.configuration") as mock_config,
+        ):
+
+            mock_config.user_data_collection_configuration.feedback_storage = (
+                "/tmp/feedback"
+            )
+            mock_path.return_value = MagicMock()
+
+            # Store feedback for user 1
+            store_feedback("user1@example.com", {"feedback": "Test 1"})
+            first_call_data = mock_json.dump.call_args[0][0]
+
+            # Reset mock for second call
+            mock_json.reset_mock()
+
+            # Store feedback for user 2
+            store_feedback("user2@example.com", {"feedback": "Test 2"})
+            second_call_data = mock_json.dump.call_args[0][0]
+
+            # Verify different anonymous IDs were used
+            assert first_call_data["anonymous_user_id"] == "anon-feedback-user1"
+            assert second_call_data["anonymous_user_id"] == "anon-feedback-user2"
+            assert (
+                first_call_data["anonymous_user_id"]
+                != second_call_data["anonymous_user_id"]
+            )
+
+    @patch("app.endpoints.feedback.get_anonymous_user_id")
+    @patch("app.endpoints.feedback.logger")
+    def test_feedback_logging_uses_anonymous_id(self, mock_logger, mock_get_anonymous):
+        """Test that feedback logging uses anonymous user ID."""
+        mock_get_anonymous.return_value = "anon-feedback-logging"
+
+        with (
+            patch("app.endpoints.feedback.json"),
+            patch("app.endpoints.feedback.Path") as mock_path,
+            patch("builtins.open"),
+            patch("app.endpoints.feedback.get_suid", return_value="uuid"),
+            patch("app.endpoints.feedback.configuration") as mock_config,
+        ):
+
+            mock_config.user_data_collection_configuration.feedback_storage = (
+                "/tmp/feedback"
+            )
+            mock_path.return_value = MagicMock()
+
+            # Store feedback
+            store_feedback("user@example.com", {"feedback": "Test feedback"})
+
+            # Verify logging uses anonymous ID
+            mock_logger.debug.assert_called_once_with(
+                "Storing feedback for anonymous user %s", "anon-feedback-logging"
+            )
diff --git a/tests/unit/app/endpoints/test_query.py b/tests/unit/app/endpoints/test_query.py
index c6acb888..30f6dc8a 100644
--- a/tests/unit/app/endpoints/test_query.py
+++ b/tests/unit/app/endpoints/test_query.py
@@ -169,7 +169,7 @@ async def _test_query_endpoint_handler(mocker, store_transcript_to_file=False):
     # Assert the store_transcript function is called if transcripts are enabled
     if store_transcript_to_file:
         mock_transcript.assert_called_once_with(
-            anonymous_user_id="mock_user_id",
+            user_id="mock_user_id",
             conversation_id=conversation_id,
             model_id="fake_model_id",
             provider_id="fake_provider_id",
@@ -177,9 +177,9 @@ async def _test_query_endpoint_handler(mocker, store_transcript_to_file=False):
             query=query,
             query_request=query_request,
             summary=summary,
-            attachments=[],
             rag_chunks=[],
             truncated=False,
+            attachments=[],
         )
     else:
         mock_transcript.assert_not_called()
diff --git a/tests/unit/utils/test_transcripts.py b/tests/unit/utils/test_transcripts.py
index b30b430a..4e2eaeb9 100644
--- a/tests/unit/utils/test_transcripts.py
+++ b/tests/unit/utils/test_transcripts.py
@@ -50,11 +50,18 @@ def test_construct_transcripts_path(mocker):
 def test_store_transcript(mocker):
     """Test the store_transcript function."""
 
+    # Mock database initialization to prevent the error
+    mocker.patch("app.database.engine", mocker.Mock())
+    mocker.patch("app.database.SessionLocal", mocker.Mock())
+
     mocker.patch("builtins.open", mocker.mock_open())
     mocker.patch(
         "utils.transcripts.construct_transcripts_path",
         return_value=mocker.MagicMock(),
     )
+    mocker.patch(
+        "utils.transcripts.get_anonymous_user_id", return_value="anon-user-123"
+    )
 
     # Mock the JSON to assert the data is stored correctly
     mock_json = mocker.patch("utils.transcripts.json")
@@ -104,7 +111,7 @@ def test_store_transcript(mocker):
                 "model": "fake-model",
                 "query_provider": query_request.provider,
                 "query_model": query_request.model,
-                "user_id": user_id,
+                "anonymous_user_id": "anon-user-123",
                 "conversation_id": conversation_id,
                 "timestamp": mocker.ANY,
             },
diff --git a/tests/unit/utils/test_transcripts_anonymization.py b/tests/unit/utils/test_transcripts_anonymization.py
index abadc4e5..c564eea0 100644
--- a/tests/unit/utils/test_transcripts_anonymization.py
+++ b/tests/unit/utils/test_transcripts_anonymization.py
@@ -213,8 +213,14 @@ def test_store_transcript_with_attachments(
             assert data["attachments"][0]["attachment_type"] == "text"
             assert data["attachments"][0]["content"] == "Test attachment content"
 
-    def test_path_sanitization_with_anonymous_ids(self):
+    @patch("utils.transcripts.configuration")
+    def test_path_sanitization_with_anonymous_ids(self, mock_config):
         """Test that path sanitization works correctly with anonymous UUIDs."""
+        # Setup mock configuration
+        mock_config.user_data_collection_configuration.transcripts_storage = (
+            "/tmp/transcripts"
+        )
+
         # Test with various UUID formats and potential path injection
         test_cases = [
             ("anon-uuid-123", "conv-456"),
@@ -229,7 +235,8 @@ def test_path_sanitization_with_anonymous_ids(self):
 
             # Should not contain path traversal sequences
             assert "../" not in result_str
-            assert not result_str.startswith("/")
+            # Paths should be absolute (start with /) since we use /tmp/transcripts as base
+            assert result_str.startswith("/tmp/transcripts/")
 
     @patch("utils.transcripts.get_anonymous_user_id")
     def test_logging_shows_anonymization(self, mock_get_anonymous, caplog):

From d641687493dfde84e7191520c7a629bf25983319 Mon Sep 17 00:00:00 2001
From: Erik M Jacobs <erikmjacobs@gmail.com>
Date: Tue, 26 Aug 2025 09:57:52 -0400
Subject: [PATCH 6/6] fix bad merge

---
 src/app/endpoints/query.py           | 14 ----------
 src/app/endpoints/streaming_query.py | 14 ----------
 src/utils/endpoints.py               | 38 ++++++++++++++++++++++------
 3 files changed, 30 insertions(+), 36 deletions(-)

diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py
index a06f6ee0..395f7e3e 100644
--- a/src/app/endpoints/query.py
+++ b/src/app/endpoints/query.py
@@ -195,20 +195,6 @@ async def query_endpoint_handler(
             ),
         )
 
-        if user_conversation is None:
-            logger.warning(
-                "User %s attempted to query conversation %s they don't own",
-                user_id,
-                query_request.conversation_id,
-            )
-            raise HTTPException(
-                status_code=status.HTTP_403_FORBIDDEN,
-                detail={
-                    "response": "Access denied",
-                    "cause": "You do not have permission to access this conversation",
-                },
-            )
-
     try:
         # try to get Llama Stack client
         client = AsyncLlamaStackClientHolder().get_client()
diff --git a/src/app/endpoints/streaming_query.py b/src/app/endpoints/streaming_query.py
index a7fe6788..2a36282f 100644
--- a/src/app/endpoints/streaming_query.py
+++ b/src/app/endpoints/streaming_query.py
@@ -552,20 +552,6 @@ async def streaming_query_endpoint_handler(  # pylint: disable=too-many-locals
             user_id=user_id, conversation_id=query_request.conversation_id
         )
 
-        if user_conversation is None:
-            logger.warning(
-                "User %s attempted to query conversation %s they don't own",
-                user_id,
-                query_request.conversation_id,
-            )
-            raise HTTPException(
-                status_code=status.HTTP_403_FORBIDDEN,
-                detail={
-                    "response": "Access denied",
-                    "cause": "You do not have permission to access this conversation",
-                },
-            )
-
     try:
         # try to get Llama Stack client
         client = AsyncLlamaStackClientHolder().get_client()
diff --git a/src/utils/endpoints.py b/src/utils/endpoints.py
index 3f201b3a..4523ba19 100644
--- a/src/utils/endpoints.py
+++ b/src/utils/endpoints.py
@@ -29,22 +29,44 @@ def validate_conversation_ownership(
     If `others_allowed` is True, it allows conversations that do not belong to the user,
     which is useful for admin access.
 
-    Returns the conversation object if valid.
+    Returns the conversation object if valid, raises HTTPException if not.
     """
     # Get anonymous user ID for database lookup
     anonymous_user_id = get_anonymous_user_id(user_id)
 
     with get_session() as session:
-        conversation = session.query(UserConversation)
-
-        filtered_conversation_query = (
-            conversation_query.filter_by(id=conversation_id, anonymous_user_id=anonymous_user_id)
-            if others_allowed
-            else conversation_query.filter_by(id=conversation_id, anonymous_user_id=anonymous_user_id)
-        )
+        conversation_query = session.query(UserConversation)
+
+        if others_allowed:
+            # If others_allowed is True, we can access any conversation by ID
+            filtered_conversation_query = conversation_query.filter_by(
+                id=conversation_id
+            )
+        else:
+            # If others_allowed is False, we can only access conversations belonging to this user
+            filtered_conversation_query = conversation_query.filter_by(
+                id=conversation_id, anonymous_user_id=anonymous_user_id
+            )
 
         conversation: UserConversation | None = filtered_conversation_query.first()
 
+        if conversation is None:
+            logger.warning(
+                "User %s attempted to access conversation %s they don't own",
+                user_id,
+                conversation_id,
+            )
+            raise HTTPException(
+                status_code=status.HTTP_403_FORBIDDEN,
+                detail={
+                    "response": "Forbidden: conversation does not belong to user",
+                    "cause": (
+                        f"User {user_id} does not have access to "
+                        f"conversation {conversation_id}"
+                    ),
+                },
+            )
+
         return conversation