From d055a021e92a94b2d5c0e11ed20987cf746fabb8 Mon Sep 17 00:00:00 2001 From: Erik M Jacobs Date: Thu, 21 Aug 2025 14:28:00 -0400 Subject: [PATCH 1/6] attempt to anonymize transcripts --- src/app/endpoints/conversations.py | 16 +- src/app/endpoints/query.py | 17 +- src/models/database/conversations.py | 4 +- src/models/database/user_mapping.py | 28 ++ src/utils/endpoints.py | 8 +- src/utils/transcripts.py | 21 +- src/utils/user_anonymization.py | 122 ++++++++ .../app/endpoints/test_query_anonymization.py | 197 +++++++++++++ .../utils/test_endpoints_anonymization.py | 171 +++++++++++ .../utils/test_transcripts_anonymization.py | 271 ++++++++++++++++++ tests/unit/utils/test_user_anonymization.py | 225 +++++++++++++++ 11 files changed, 1062 insertions(+), 18 deletions(-) create mode 100644 src/models/database/user_mapping.py create mode 100644 src/utils/user_anonymization.py create mode 100644 tests/unit/app/endpoints/test_query_anonymization.py create mode 100644 tests/unit/utils/test_endpoints_anonymization.py create mode 100644 tests/unit/utils/test_transcripts_anonymization.py create mode 100644 tests/unit/utils/test_user_anonymization.py diff --git a/src/app/endpoints/conversations.py b/src/app/endpoints/conversations.py index bbb7825d..c58ebcce 100644 --- a/src/app/endpoints/conversations.py +++ b/src/app/endpoints/conversations.py @@ -20,6 +20,7 @@ from app.database import get_session from utils.endpoints import check_configuration_loaded, validate_conversation_ownership from utils.suid import check_suid +from utils.user_anonymization import get_anonymous_user_id logger = logging.getLogger("app.endpoints.handlers") router = APIRouter(tags=["conversations"]) @@ -154,13 +155,22 @@ def get_conversations_list_endpoint_handler( user_id, _, _ = auth - logger.info("Retrieving conversations for user %s", user_id) + # Get anonymous user ID for database lookup + anonymous_user_id = get_anonymous_user_id(user_id) + + logger.info( + "Retrieving conversations for user %s (anonymous: %s)", + user_id[:8] + "..." if len(user_id) > 8 else user_id, + anonymous_user_id, + ) with get_session() as session: try: - # Get all conversations for this user + # Get all conversations for this user using anonymous ID user_conversations = ( - session.query(UserConversation).filter_by(user_id=user_id).all() + session.query(UserConversation) + .filter_by(anonymous_user_id=anonymous_user_id) + .all() ) # Return conversation summaries with metadata diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py index eae3ae28..92afdf8b 100644 --- a/src/app/endpoints/query.py +++ b/src/app/endpoints/query.py @@ -24,16 +24,17 @@ from configuration import configuration from app.database import get_session import metrics +import constants from models.database.conversations import UserConversation from models.responses import QueryResponse, UnauthorizedResponse, ForbiddenResponse from models.requests import QueryRequest, Attachment -import constants from utils.endpoints import ( check_configuration_loaded, get_agent, get_system_prompt, validate_conversation_ownership, ) +from utils.user_anonymization import get_anonymous_user_id from utils.mcp_headers import mcp_headers_dependency, handle_mcp_headers_with_toolgroups from utils.transcripts import store_transcript from utils.types import TurnSummary @@ -76,25 +77,31 @@ def is_transcripts_enabled() -> bool: def persist_user_conversation_details( user_id: str, conversation_id: str, model: str, provider_id: str ) -> None: - """Associate conversation to user in the database.""" + """Associate conversation to user in the database using anonymous user ID.""" + # Get anonymous user ID for database storage + anonymous_user_id = get_anonymous_user_id(user_id) + with get_session() as session: existing_conversation = ( session.query(UserConversation) - .filter_by(id=conversation_id, user_id=user_id) + .filter_by(id=conversation_id, anonymous_user_id=anonymous_user_id) .first() ) if not existing_conversation: conversation = UserConversation( id=conversation_id, - user_id=user_id, + anonymous_user_id=anonymous_user_id, last_used_model=model, last_used_provider=provider_id, message_count=1, ) session.add(conversation) logger.debug( - "Associated conversation %s to user %s", conversation_id, user_id + "Associated conversation %s to anonymous user %s (original: %s)", + conversation_id, + anonymous_user_id, + user_id[:8] + "..." if len(user_id) > 8 else user_id, ) else: existing_conversation.last_used_model = model diff --git a/src/models/database/conversations.py b/src/models/database/conversations.py index 1cce8a64..ce0ff795 100644 --- a/src/models/database/conversations.py +++ b/src/models/database/conversations.py @@ -16,8 +16,8 @@ class UserConversation(Base): # pylint: disable=too-few-public-methods # The conversation ID id: Mapped[str] = mapped_column(primary_key=True) - # The user ID associated with the conversation - user_id: Mapped[str] = mapped_column(index=True) + # The anonymous user ID associated with the conversation + anonymous_user_id: Mapped[str] = mapped_column(index=True) # The last provider/model used in the conversation last_used_model: Mapped[str] = mapped_column() diff --git a/src/models/database/user_mapping.py b/src/models/database/user_mapping.py new file mode 100644 index 00000000..f9f245ef --- /dev/null +++ b/src/models/database/user_mapping.py @@ -0,0 +1,28 @@ +"""User ID anonymization mapping model.""" + +from datetime import datetime + +from sqlalchemy.orm import Mapped, mapped_column +from sqlalchemy import DateTime, func, Index + +from models.database.base import Base + + +class UserMapping(Base): # pylint: disable=too-few-public-methods + """Model for mapping real user IDs to anonymous UUIDs.""" + + __tablename__ = "user_mapping" + + # Anonymous UUID used for all storage/analytics (primary key) + anonymous_id: Mapped[str] = mapped_column(primary_key=True) + + # Original user ID from authentication (hashed for security) + user_id_hash: Mapped[str] = mapped_column(index=True, unique=True) + + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), + server_default=func.now(), # pylint: disable=not-callable + ) + + # Index for efficient lookups + __table_args__ = (Index("ix_user_mapping_hash_lookup", "user_id_hash"),) diff --git a/src/utils/endpoints.py b/src/utils/endpoints.py index 993dc4e8..78a949df 100644 --- a/src/utils/endpoints.py +++ b/src/utils/endpoints.py @@ -13,6 +13,7 @@ from configuration import AppConfig from utils.suid import get_suid from utils.types import GraniteToolParser +from utils.user_anonymization import get_anonymous_user_id logger = logging.getLogger("utils.endpoints") @@ -21,11 +22,14 @@ def validate_conversation_ownership( user_id: str, conversation_id: str ) -> UserConversation | None: - """Validate that the conversation belongs to the user.""" + """Validate that the conversation belongs to the user using anonymous ID lookup.""" + # Get anonymous user ID for database lookup + anonymous_user_id = get_anonymous_user_id(user_id) + with get_session() as session: conversation = ( session.query(UserConversation) - .filter_by(id=conversation_id, user_id=user_id) + .filter_by(id=conversation_id, anonymous_user_id=anonymous_user_id) .first() ) return conversation diff --git a/src/utils/transcripts.py b/src/utils/transcripts.py index e29d4319..17e9638c 100644 --- a/src/utils/transcripts.py +++ b/src/utils/transcripts.py @@ -14,15 +14,16 @@ from models.requests import Attachment, QueryRequest from utils.suid import get_suid from utils.types import TurnSummary +from utils.user_anonymization import get_anonymous_user_id logger = logging.getLogger("utils.transcripts") -def construct_transcripts_path(user_id: str, conversation_id: str) -> Path: - """Construct path to transcripts.""" +def construct_transcripts_path(anonymous_user_id: str, conversation_id: str) -> Path: + """Construct path to transcripts using anonymous user ID.""" # these two normalizations are required by Snyk as it detects # this Path sanitization pattern - uid = os.path.normpath("/" + user_id).lstrip("/") + uid = os.path.normpath("/" + anonymous_user_id).lstrip("/") cid = os.path.normpath("/" + conversation_id).lstrip("/") file_path = ( configuration.user_data_collection_configuration.transcripts_storage or "" @@ -46,7 +47,7 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional- """Store transcript in the local filesystem. Args: - user_id: The user ID (UUID). + user_id: The original user ID from authentication (will be anonymized). conversation_id: The conversation ID (UUID). query_is_valid: The result of the query validation. query: The query (without attachments). @@ -56,7 +57,15 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional- truncated: The flag indicating if the history was truncated. attachments: The list of `Attachment` objects. """ - transcripts_path = construct_transcripts_path(user_id, conversation_id) + # Get anonymous user ID for storage + anonymous_user_id = get_anonymous_user_id(user_id) + logger.debug( + "Anonymized user %s to %s for transcript storage", + user_id[:8] + "..." if len(user_id) > 8 else user_id, + anonymous_user_id, + ) + + transcripts_path = construct_transcripts_path(anonymous_user_id, conversation_id) transcripts_path.mkdir(parents=True, exist_ok=True) data_to_store = { @@ -65,7 +74,7 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional- "model": model_id, "query_provider": query_request.provider, "query_model": query_request.model, - "user_id": user_id, + "anonymous_user_id": anonymous_user_id, # Store anonymous ID only "conversation_id": conversation_id, "timestamp": datetime.now(UTC).isoformat(), }, diff --git a/src/utils/user_anonymization.py b/src/utils/user_anonymization.py new file mode 100644 index 00000000..7ab8929f --- /dev/null +++ b/src/utils/user_anonymization.py @@ -0,0 +1,122 @@ +"""User ID anonymization utilities.""" + +import hashlib +import logging +from typing import Optional + +from sqlalchemy.exc import IntegrityError + +from models.database.user_mapping import UserMapping +from app.database import get_session +from utils.suid import get_suid + +logger = logging.getLogger("utils.user_anonymization") + + +def _hash_user_id(user_id: str) -> str: + """ + Create a consistent hash of the user ID for mapping purposes. + + Uses SHA-256 with a fixed salt to ensure consistent hashing + while preventing rainbow table attacks. + """ + # Use a fixed salt - in production, this should be configurable + salt = "lightspeed_user_anonymization_salt_v1" + hash_input = f"{salt}:{user_id}".encode("utf-8") + return hashlib.sha256(hash_input).hexdigest() + + +def get_anonymous_user_id(auth_user_id: str) -> str: + """ + Get or create an anonymous UUID for a user ID from authentication. + + This function: + 1. Hashes the original user ID for secure storage + 2. Looks up existing anonymous mapping + 3. Creates new anonymous UUID if none exists + 4. Returns the anonymous UUID for use in storage/analytics + + Args: + auth_user_id: The original user ID from authentication + + Returns: + Anonymous UUID string for this user + """ + user_id_hash = _hash_user_id(auth_user_id) + + with get_session() as session: + # Try to find existing mapping + existing_mapping = ( + session.query(UserMapping).filter_by(user_id_hash=user_id_hash).first() + ) + + if existing_mapping: + logger.debug( + "Found existing anonymous ID for user hash %s", user_id_hash[:8] + "..." + ) + return existing_mapping.anonymous_id + + # Create new anonymous mapping + anonymous_id = get_suid() + new_mapping = UserMapping(anonymous_id=anonymous_id, user_id_hash=user_id_hash) + + try: + session.add(new_mapping) + session.commit() + logger.info( + "Created new anonymous ID %s for user hash %s", + anonymous_id, + user_id_hash[:8] + "...", + ) + return anonymous_id + + except IntegrityError as e: + session.rollback() + # Race condition - another thread created the mapping + logger.warning("Race condition creating user mapping: %s", e) + + # Try to fetch the mapping created by the other thread + existing_mapping = ( + session.query(UserMapping).filter_by(user_id_hash=user_id_hash).first() + ) + + if existing_mapping: + return existing_mapping.anonymous_id + + # If we still can't find it, something is wrong + logger.error( + "Failed to create or retrieve user mapping for hash %s", + user_id_hash[:8] + "...", + ) + raise RuntimeError("Unable to create or retrieve anonymous user ID") from e + + +def get_user_count() -> int: + """ + Get the total number of unique users in the system. + + Returns: + Total count of unique anonymous users + """ + with get_session() as session: + return session.query(UserMapping).count() + + +def find_anonymous_user_id(auth_user_id: str) -> Optional[str]: + """ + Find existing anonymous ID for a user without creating a new one. + + Args: + auth_user_id: The original user ID from authentication + + Returns: + Anonymous UUID if found, None otherwise + """ + user_id_hash = _hash_user_id(auth_user_id) + + with get_session() as session: + existing_mapping = ( + session.query(UserMapping).filter_by(user_id_hash=user_id_hash).first() + ) + + return existing_mapping.anonymous_id if existing_mapping else None diff --git a/tests/unit/app/endpoints/test_query_anonymization.py b/tests/unit/app/endpoints/test_query_anonymization.py new file mode 100644 index 00000000..61c76467 --- /dev/null +++ b/tests/unit/app/endpoints/test_query_anonymization.py @@ -0,0 +1,197 @@ +"""Tests for query endpoint anonymization functionality.""" + +import logging +from unittest.mock import patch, MagicMock + +from sqlalchemy import inspect + +from app.endpoints.query import persist_user_conversation_details +from models.database.conversations import UserConversation + + +class TestQueryAnonymization: + """Test query endpoint user anonymization.""" + + @patch("app.endpoints.query.get_anonymous_user_id") + @patch("app.endpoints.query.get_session") + def test_persist_user_conversation_details_new_conversation( + self, mock_get_session, mock_get_anonymous + ): + """Test persisting new conversation uses anonymous user ID.""" + # Setup mocks + mock_get_anonymous.return_value = "anon-new-conv-123" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = None + + # Call function + persist_user_conversation_details( + user_id="new_user@example.com", + conversation_id="new-conv-456", + model="test-model", + provider_id="test-provider", + ) + + # Verify anonymous user ID was obtained + mock_get_anonymous.assert_called_once_with("new_user@example.com") + + # Verify database query used anonymous ID + mock_session.query.assert_called_once_with(UserConversation) + filter_call = mock_session.query.return_value.filter_by.call_args + assert filter_call[1]["id"] == "new-conv-456" + assert filter_call[1]["anonymous_user_id"] == "anon-new-conv-123" + + # Verify new conversation was created with anonymous ID + mock_session.add.assert_called_once() + added_conversation = mock_session.add.call_args[0][0] + assert isinstance(added_conversation, UserConversation) + assert added_conversation.id == "new-conv-456" + assert added_conversation.anonymous_user_id == "anon-new-conv-123" + assert added_conversation.last_used_model == "test-model" + assert added_conversation.last_used_provider == "test-provider" + assert added_conversation.message_count == 1 + + mock_session.commit.assert_called_once() + + @patch("app.endpoints.query.get_anonymous_user_id") + @patch("app.endpoints.query.get_session") + def test_persist_user_conversation_details_existing_conversation( + self, mock_get_session, mock_get_anonymous + ): + """Test updating existing conversation uses anonymous user ID.""" + # Setup existing conversation + existing_conv = MagicMock(spec=UserConversation) + existing_conv.last_used_model = "old-model" + existing_conv.last_used_provider = "old-provider" + existing_conv.message_count = 5 + + mock_get_anonymous.return_value = "anon-existing-789" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = ( + existing_conv + ) + + # Call function + persist_user_conversation_details( + user_id="existing_user@example.com", + conversation_id="existing-conv-123", + model="new-model", + provider_id="new-provider", + ) + + # Verify anonymous user ID was used for lookup + mock_get_anonymous.assert_called_once_with("existing_user@example.com") + filter_call = mock_session.query.return_value.filter_by.call_args + assert filter_call[1]["anonymous_user_id"] == "anon-existing-789" + + # Verify existing conversation was updated + assert existing_conv.last_used_model == "new-model" + assert existing_conv.last_used_provider == "new-provider" + assert existing_conv.message_count == 6 # Incremented + + # Verify no new conversation was added + mock_session.add.assert_not_called() + mock_session.commit.assert_called_once() + + @patch("app.endpoints.query.get_anonymous_user_id") + def test_persist_user_conversation_logs_anonymization( + self, mock_get_anonymous, caplog + ): + """Test that conversation persistence logs anonymization.""" + + mock_get_anonymous.return_value = "anon-logging-456" + + with caplog.at_level(logging.DEBUG): + with patch("app.endpoints.query.get_session") as mock_get_session: + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = ( + None + ) + + persist_user_conversation_details( + user_id="logging_test_user@example.com", + conversation_id="logging-conv-789", + model="logging-model", + provider_id="logging-provider", + ) + + # Check that anonymization is logged + log_messages = [record.message for record in caplog.records] + anonymization_logs = [ + msg + for msg in log_messages + if "Associated conversation" in msg and "anonymous user" in msg + ] + assert len(anonymization_logs) > 0 + + # Verify log contains both anonymous and truncated original user + log_msg = anonymization_logs[0] + assert "anon-logging-456" in log_msg + assert "logging-conv-789" in log_msg + assert "logging_..." in log_msg # Truncated original user ID + + def test_conversation_model_uses_anonymous_field(self): + """Test that UserConversation model uses anonymous_user_id field.""" + # Verify the model has the correct field + conversation = UserConversation() + assert hasattr(conversation, "anonymous_user_id") + + # Verify we can set the anonymous user ID + conversation.anonymous_user_id = "test-anon-uuid" + assert conversation.anonymous_user_id == "test-anon-uuid" + + # Verify the field is mapped correctly (this tests the database column) + mapper = inspect(UserConversation) + assert "anonymous_user_id" in mapper.columns + + @patch("app.endpoints.query.get_anonymous_user_id") + @patch("app.endpoints.query.get_session") + def test_persist_user_conversation_different_users_same_conversation_id( + self, mock_get_session, mock_get_anonymous + ): + """Test that different users can't access each other's conversations.""" + + # Simulate two different users with same conversation ID (edge case) + def mock_anonymous_side_effect(user_id): + if user_id == "user1@example.com": + return "anon-user1-123" + if user_id == "user2@example.com": + return "anon-user2-456" + return "anon-unknown" + + mock_get_anonymous.side_effect = mock_anonymous_side_effect + + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = None + + # User 1 creates conversation + persist_user_conversation_details( + user_id="user1@example.com", + conversation_id="shared-conv-id", + model="model1", + provider_id="provider1", + ) + + # Verify first user's anonymous ID was used + first_call = mock_session.query.return_value.filter_by.call_args_list[0] + assert first_call[1]["anonymous_user_id"] == "anon-user1-123" + + # Reset mock for second call + mock_session.reset_mock() + mock_session.query.return_value.filter_by.return_value.first.return_value = None + + # User 2 tries to create conversation with same ID + persist_user_conversation_details( + user_id="user2@example.com", + conversation_id="shared-conv-id", # Same conversation ID + model="model2", + provider_id="provider2", + ) + + # Verify second user's anonymous ID was used (different from first) + second_call = mock_session.query.return_value.filter_by.call_args_list[0] + assert second_call[1]["anonymous_user_id"] == "anon-user2-456" + assert second_call[1]["anonymous_user_id"] != "anon-user1-123" diff --git a/tests/unit/utils/test_endpoints_anonymization.py b/tests/unit/utils/test_endpoints_anonymization.py new file mode 100644 index 00000000..1e14f7b0 --- /dev/null +++ b/tests/unit/utils/test_endpoints_anonymization.py @@ -0,0 +1,171 @@ +"""Tests for endpoint utilities anonymization functionality.""" + +from unittest.mock import patch, MagicMock +import pytest + +from utils.endpoints import validate_conversation_ownership +from models.database.conversations import UserConversation + + +class TestEndpointsAnonymization: + """Test endpoint utilities user anonymization.""" + + @patch("utils.endpoints.get_anonymous_user_id") + @patch("utils.endpoints.get_session") + def test_validate_conversation_ownership_uses_anonymous_id( + self, mock_get_session, mock_get_anonymous + ): + """Test that conversation ownership validation uses anonymous user ID.""" + # Setup mocks + conversation = MagicMock(spec=UserConversation) + conversation.id = "test-conv-123" + conversation.anonymous_user_id = "anon-owner-456" + + mock_get_anonymous.return_value = "anon-owner-456" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = ( + conversation + ) + + # Call validation + result = validate_conversation_ownership( + user_id="owner@example.com", conversation_id="test-conv-123" + ) + + # Verify anonymous user ID was obtained + mock_get_anonymous.assert_called_once_with("owner@example.com") + + # Verify database query used anonymous ID + mock_session.query.assert_called_once_with(UserConversation) + filter_call = mock_session.query.return_value.filter_by.call_args + assert filter_call[1]["id"] == "test-conv-123" + assert filter_call[1]["anonymous_user_id"] == "anon-owner-456" + + # Verify correct conversation returned + assert result == conversation + + @patch("utils.endpoints.get_anonymous_user_id") + @patch("utils.endpoints.get_session") + def test_validate_conversation_ownership_not_owned( + self, mock_get_session, mock_get_anonymous + ): + """Test validation fails when user doesn't own conversation.""" + mock_get_anonymous.return_value = "anon-not-owner-789" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = None + + # Call validation + result = validate_conversation_ownership( + user_id="notowner@example.com", conversation_id="not-owned-conv" + ) + + # Verify anonymous user ID was used for lookup + mock_get_anonymous.assert_called_once_with("notowner@example.com") + filter_call = mock_session.query.return_value.filter_by.call_args + assert filter_call[1]["anonymous_user_id"] == "anon-not-owner-789" + + # Verify None returned for non-owned conversation + assert result is None + + @patch("utils.endpoints.get_anonymous_user_id") + @patch("utils.endpoints.get_session") + def test_validate_conversation_ownership_different_users( + self, mock_get_session, mock_get_anonymous + ): + """Test that different users get different anonymous IDs for validation.""" + + def mock_anonymous_side_effect(user_id): + if user_id == "user1@example.com": + return "anon-user1-validation" + if user_id == "user2@example.com": + return "anon-user2-validation" + return "anon-unknown" + + mock_get_anonymous.side_effect = mock_anonymous_side_effect + + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = None + + # Validate for user 1 + result1 = validate_conversation_ownership( + user_id="user1@example.com", conversation_id="shared-conv-test" + ) + + first_call = mock_session.query.return_value.filter_by.call_args + assert first_call[1]["anonymous_user_id"] == "anon-user1-validation" + + # Reset mock for second call + mock_session.reset_mock() + mock_session.query.return_value.filter_by.return_value.first.return_value = None + + # Validate for user 2 + result2 = validate_conversation_ownership( + user_id="user2@example.com", + conversation_id="shared-conv-test", # Same conversation ID + ) + + second_call = mock_session.query.return_value.filter_by.call_args + assert second_call[1]["anonymous_user_id"] == "anon-user2-validation" + + # Both should return None since neither owns the conversation + assert result1 is None + assert result2 is None + + # But they used different anonymous IDs (checking the calls, not constant comparison) + assert first_call[1]["anonymous_user_id"] != second_call[1]["anonymous_user_id"] + + @patch("utils.endpoints.get_anonymous_user_id") + @patch("utils.endpoints.get_session") + def test_validate_conversation_ownership_preserves_conversation_data( + self, mock_get_session, mock_get_anonymous + ): + """Test that validation preserves all conversation data.""" + # Create conversation with full data + conversation = UserConversation() + conversation.id = "full-data-conv" + conversation.anonymous_user_id = "anon-full-data" + conversation.last_used_model = "test-model" + conversation.last_used_provider = "test-provider" + conversation.message_count = 42 + + mock_get_anonymous.return_value = "anon-full-data" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = ( + conversation + ) + + # Validate ownership + result = validate_conversation_ownership( + user_id="datauser@example.com", conversation_id="full-data-conv" + ) + + # Verify all conversation data is preserved + assert result.id == "full-data-conv" + assert result.anonymous_user_id == "anon-full-data" + assert result.last_used_model == "test-model" + assert result.last_used_provider == "test-provider" + assert result.message_count == 42 + + @patch("utils.endpoints.get_anonymous_user_id") + def test_validate_conversation_ownership_error_handling(self, mock_get_anonymous): + """Test that validation handles errors gracefully.""" + mock_get_anonymous.return_value = "anon-error-test" + + # Simulate database error + with patch("utils.endpoints.get_session") as mock_get_session: + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.side_effect = Exception("Database error") + + # Should propagate the exception + with pytest.raises(Exception, match="Database error"): + validate_conversation_ownership( + user_id="erroruser@example.com", conversation_id="error-conv" + ) + + # Verify anonymous ID was still obtained before error + mock_get_anonymous.assert_called_once_with("erroruser@example.com") diff --git a/tests/unit/utils/test_transcripts_anonymization.py b/tests/unit/utils/test_transcripts_anonymization.py new file mode 100644 index 00000000..869b1c55 --- /dev/null +++ b/tests/unit/utils/test_transcripts_anonymization.py @@ -0,0 +1,271 @@ +"""Tests for transcript anonymization functionality.""" + +import json +import tempfile +from pathlib import Path +from unittest.mock import patch + + +from models.requests import QueryRequest, Attachment +from utils.transcripts import store_transcript, construct_transcripts_path +from utils.types import TurnSummary + + +class TestTranscriptAnonymization: + """Test transcript storage with user anonymization.""" + + @patch("utils.transcripts.get_anonymous_user_id") + @patch("utils.transcripts.get_suid") + @patch("utils.transcripts.configuration") + def test_store_transcript_anonymizes_user_id( + self, mock_config, mock_get_suid, mock_get_anonymous + ): + """Test that store_transcript uses anonymous user ID.""" + # Setup mocks + mock_get_anonymous.return_value = "anon-uuid-123" + mock_get_suid.return_value = "transcript-uuid" + + with tempfile.TemporaryDirectory() as temp_dir: + mock_config.user_data_collection_configuration.transcripts_storage = ( + temp_dir + ) + + # Create test data + query_request = QueryRequest( + query="Test query", model="test-model", provider="test-provider" + ) + + summary = TurnSummary(llm_response="Test response", tool_calls=[]) + + # Call store_transcript + store_transcript( + user_id="original_user@example.com", + conversation_id="conv-123", + model_id="test-model", + provider_id="test-provider", + query_is_valid=True, + query="Test query", + query_request=query_request, + summary=summary, + rag_chunks=[], + truncated=False, + attachments=[], + ) + + # Verify anonymous user ID was used + mock_get_anonymous.assert_called_once_with("original_user@example.com") + + # Verify file was created with anonymous path + expected_path = ( + Path(temp_dir) / "anon-uuid-123" / "conv-123" / "transcript-uuid.json" + ) + assert expected_path.exists() + + # Verify file content contains anonymous ID, not original + with open(expected_path, "r", encoding="utf-8") as f: + data = json.load(f) + + assert data["metadata"]["anonymous_user_id"] == "anon-uuid-123" + assert "user_id" not in data["metadata"] + assert data["metadata"]["conversation_id"] == "conv-123" + + def test_construct_transcripts_path_with_anonymous_id(self): + """Test that transcript path construction uses anonymous ID.""" + # This test verifies the path construction function directly + result = construct_transcripts_path("anon-uuid-456", "conv-789") + + # Path should contain the anonymous ID, not original user ID + assert "anon-uuid-456" in str(result) + assert "conv-789" in str(result) + + @patch("utils.transcripts.get_anonymous_user_id") + @patch("utils.transcripts.get_suid") + @patch("utils.transcripts.configuration") + def test_store_transcript_preserves_other_metadata( + self, mock_config, mock_get_suid, mock_get_anonymous + ): + """Test that anonymization preserves all other metadata.""" + mock_get_anonymous.return_value = "anon-preserved-test" + mock_get_suid.return_value = "metadata-test-uuid" + + with tempfile.TemporaryDirectory() as temp_dir: + mock_config.user_data_collection_configuration.transcripts_storage = ( + temp_dir + ) + + query_request = QueryRequest( + query="Metadata test", + model="metadata-model", + provider="metadata-provider", + ) + + summary = TurnSummary(llm_response="Metadata response", tool_calls=[]) + + # Store transcript with rich metadata + store_transcript( + user_id="metadata_user@example.com", + conversation_id="metadata-conv", + model_id="actual-model", + provider_id="actual-provider", + query_is_valid=True, + query="Metadata test query", + query_request=query_request, + summary=summary, + rag_chunks=["chunk1", "chunk2"], + truncated=True, + attachments=[], + ) + + # Read and verify all metadata is preserved + transcript_file = ( + Path(temp_dir) + / "anon-preserved-test" + / "metadata-conv" + / "metadata-test-uuid.json" + ) + with open(transcript_file, "r", encoding="utf-8") as f: + data = json.load(f) + + metadata = data["metadata"] + assert metadata["provider"] == "actual-provider" + assert metadata["model"] == "actual-model" + assert metadata["query_provider"] == "metadata-provider" + assert metadata["query_model"] == "metadata-model" + assert metadata["conversation_id"] == "metadata-conv" + assert "timestamp" in metadata + + # Verify other data preserved + assert data["redacted_query"] == "Metadata test query" + assert data["query_is_valid"] is True + assert data["llm_response"] == "Metadata response" + assert data["rag_chunks"] == ["chunk1", "chunk2"] + assert data["truncated"] is True + + @patch("utils.transcripts.get_anonymous_user_id") + @patch("utils.transcripts.get_suid") + @patch("utils.transcripts.configuration") + def test_store_transcript_with_attachments( + self, mock_config, mock_get_suid, mock_get_anonymous + ): + """Test transcript storage with attachments preserves anonymization.""" + mock_get_anonymous.return_value = "anon-attachments-test" + mock_get_suid.return_value = "attachments-uuid" + + with tempfile.TemporaryDirectory() as temp_dir: + mock_config.user_data_collection_configuration.transcripts_storage = ( + temp_dir + ) + + # Create attachment + attachment = Attachment( + attachment_type="text", + content_type="text/plain", + content="Test attachment content", + ) + + query_request = QueryRequest( + query="Query with attachment", + model="attachment-model", + provider="attachment-provider", + ) + + summary = TurnSummary( + llm_response="Response with attachment", tool_calls=[] + ) + + store_transcript( + user_id="attachment_user@example.com", + conversation_id="attachment-conv", + model_id="attachment-actual-model", + provider_id="attachment-actual-provider", + query_is_valid=True, + query="Query with attachment", + query_request=query_request, + summary=summary, + rag_chunks=[], + truncated=False, + attachments=[attachment], + ) + + # Verify attachment data preserved with anonymization + transcript_file = ( + Path(temp_dir) + / "anon-attachments-test" + / "attachment-conv" + / "attachments-uuid.json" + ) + with open(transcript_file, "r", encoding="utf-8") as f: + data = json.load(f) + + assert data["metadata"]["anonymous_user_id"] == "anon-attachments-test" + assert len(data["attachments"]) == 1 + assert data["attachments"][0]["attachment_type"] == "text" + assert data["attachments"][0]["content"] == "Test attachment content" + + def test_path_sanitization_with_anonymous_ids(self): + """Test that path sanitization works correctly with anonymous UUIDs.""" + # Test with various UUID formats and potential path injection + test_cases = [ + ("anon-uuid-123", "conv-456"), + ("../anon-malicious", "conv-normal"), # Path traversal attempt + ("anon/with/slashes", "conv-789"), # Embedded slashes + ("anon-normal", "../conv-malicious"), # Conversation ID traversal attempt + ] + + for anon_id, conv_id in test_cases: + result = construct_transcripts_path(anon_id, conv_id) + result_str = str(result) + + # Should not contain path traversal sequences + assert "../" not in result_str + assert not result_str.startswith("/") + + @patch("utils.transcripts.get_anonymous_user_id") + def test_logging_shows_anonymization(self, mock_get_anonymous, caplog): + """Test that logging shows anonymization is happening.""" + import logging # pylint: disable=import-outside-toplevel + + mock_get_anonymous.return_value = "anon-logging-test" + + with caplog.at_level(logging.DEBUG): + # Create minimal test setup + with tempfile.TemporaryDirectory() as temp_dir: + with patch("utils.transcripts.configuration") as mock_config: + mock_config.user_data_collection_configuration.transcripts_storage = ( + temp_dir + ) + + with patch( + "utils.transcripts.get_suid", return_value="log-test-uuid" + ): + query_request = QueryRequest( + query="Log test", model="log-model", provider="log-provider" + ) + + summary = TurnSummary( + llm_response="Log response", tool_calls=[] + ) + + store_transcript( + user_id="log_test_user@example.com", + conversation_id="log-conv", + model_id="log-model", + provider_id="log-provider", + query_is_valid=True, + query="Log test query", + query_request=query_request, + summary=summary, + rag_chunks=[], + truncated=False, + attachments=[], + ) + + # Check that anonymization is logged + log_messages = [record.message for record in caplog.records] + anonymization_logs = [msg for msg in log_messages if "Anonymized user" in msg] + assert len(anonymization_logs) > 0 + + # Verify the log shows original user is being anonymized + anonymization_log = anonymization_logs[0] + assert "anon-logging-test" in anonymization_log + assert "log_test..." in anonymization_log # Truncated original user ID diff --git a/tests/unit/utils/test_user_anonymization.py b/tests/unit/utils/test_user_anonymization.py new file mode 100644 index 00000000..bdbce8f5 --- /dev/null +++ b/tests/unit/utils/test_user_anonymization.py @@ -0,0 +1,225 @@ +"""Tests for user anonymization utilities.""" + +import hashlib +from unittest.mock import patch, MagicMock + +import pytest +from sqlalchemy.exc import IntegrityError + +from utils.user_anonymization import ( + get_anonymous_user_id, + get_user_count, + find_anonymous_user_id, + _hash_user_id, +) +from models.database.user_mapping import UserMapping + + +class TestUserAnonymization: + """Test user anonymization functionality.""" + + def test_hash_user_id_consistency(self): + """Test that user ID hashing is consistent.""" + user_id = "test@example.com" + hash1 = _hash_user_id(user_id) + hash2 = _hash_user_id(user_id) + + assert hash1 == hash2 + assert len(hash1) == 64 # SHA-256 hex length + assert hash1 != user_id # Should be different from original + + def test_hash_user_id_different_for_different_users(self): + """Test that different user IDs produce different hashes.""" + user1_hash = _hash_user_id("user1@example.com") + user2_hash = _hash_user_id("user2@example.com") + + assert user1_hash != user2_hash + + @patch("utils.user_anonymization.get_session") + @patch("utils.user_anonymization.get_suid") + def test_get_anonymous_user_id_new_user(self, mock_get_suid, mock_get_session): + """Test creating new anonymous ID for first-time user.""" + # Setup mocks + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = None + mock_get_suid.return_value = "anon-123-456" + + user_id = "new_user@example.com" + result = get_anonymous_user_id(user_id) + + # Verify result + assert result == "anon-123-456" + + # Verify database interactions + mock_session.add.assert_called_once() + mock_session.commit.assert_called_once() + + # Verify the UserMapping was created correctly + added_mapping = mock_session.add.call_args[0][0] + assert isinstance(added_mapping, UserMapping) + assert added_mapping.anonymous_id == "anon-123-456" + assert added_mapping.user_id_hash == _hash_user_id(user_id) + + @patch("utils.user_anonymization.get_session") + def test_get_anonymous_user_id_existing_user(self, mock_get_session): + """Test retrieving existing anonymous ID for returning user.""" + # Setup existing mapping + existing_mapping = UserMapping() + existing_mapping.anonymous_id = "existing-anon-789" + existing_mapping.user_id_hash = _hash_user_id("existing@example.com") + + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = ( + existing_mapping + ) + + result = get_anonymous_user_id("existing@example.com") + + # Verify result + assert result == "existing-anon-789" + + # Verify no new mapping was created + mock_session.add.assert_not_called() + mock_session.commit.assert_not_called() + + @patch("utils.user_anonymization.get_session") + @patch("utils.user_anonymization.get_suid") + def test_get_anonymous_user_id_race_condition( + self, mock_get_suid, mock_get_session + ): + """Test handling race condition when creating user mapping.""" + # Setup mocks for race condition scenario + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + + # First query returns None (no existing mapping) + # Add raises IntegrityError (race condition) + # Second query returns existing mapping (created by other thread) + existing_mapping = UserMapping() + existing_mapping.anonymous_id = "race-condition-uuid" + + mock_session.query.return_value.filter_by.return_value.first.side_effect = [ + None, # First check - no existing mapping + existing_mapping, # Second check after race condition + ] + mock_session.add.side_effect = IntegrityError("Duplicate key", None, None) + mock_get_suid.return_value = "new-uuid" + + result = get_anonymous_user_id("race_user@example.com") + + # Should return the mapping created by the other thread + assert result == "race-condition-uuid" + mock_session.rollback.assert_called_once() + + @patch("utils.user_anonymization.get_session") + @patch("utils.user_anonymization.get_suid") + def test_get_anonymous_user_id_race_condition_failure( + self, mock_get_suid, mock_get_session + ): + """Test handling race condition where retrieval also fails.""" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + + # First query returns None, add fails, second query also returns None + mock_session.query.return_value.filter_by.return_value.first.side_effect = [ + None, + None, + ] + mock_session.add.side_effect = IntegrityError("Duplicate key", None, None) + mock_get_suid.return_value = "new-uuid" + + with pytest.raises( + RuntimeError, match="Unable to create or retrieve anonymous user ID" + ): + get_anonymous_user_id("problematic_user@example.com") + + @patch("utils.user_anonymization.get_session") + def test_get_user_count(self, mock_get_session): + """Test getting total user count.""" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.count.return_value = 42 + + result = get_user_count() + + assert result == 42 + mock_session.query.assert_called_once_with(UserMapping) + + @patch("utils.user_anonymization.get_session") + def test_find_anonymous_user_id_existing(self, mock_get_session): + """Test finding existing anonymous ID without creating new one.""" + existing_mapping = UserMapping() + existing_mapping.anonymous_id = "found-uuid" + + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = ( + existing_mapping + ) + + result = find_anonymous_user_id("existing@example.com") + + assert result == "found-uuid" + + @patch("utils.user_anonymization.get_session") + def test_find_anonymous_user_id_not_found(self, mock_get_session): + """Test finding non-existing anonymous ID returns None.""" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = None + + result = find_anonymous_user_id("nonexistent@example.com") + + assert result is None + + def test_salt_prevents_rainbow_attacks(self): + """Test that salt makes rainbow table attacks impractical.""" + # Common passwords/emails that might be in rainbow tables + common_ids = ["admin", "test@test.com", "user123", "john.doe@company.com"] + + for user_id in common_ids: + hash_result = _hash_user_id(user_id) + # Hash should not match simple SHA-256 of just the user ID + simple_hash = hashlib.sha256(user_id.encode()).hexdigest() + assert hash_result != simple_hash + + def test_anonymization_preserves_uniqueness(self): + """Test that different users get different anonymous IDs.""" + users = [ + "user1@example.com", + "user2@example.com", + "admin@company.com", + "test.user@domain.org", + ] + + hashes = [_hash_user_id(user) for user in users] + + # All hashes should be unique + assert len(set(hashes)) == len(hashes) + + +class TestUserMappingModel: + """Test the UserMapping database model.""" + + def test_user_mapping_attributes(self): + """Test UserMapping model has correct attributes.""" + mapping = UserMapping() + + # Check that required attributes exist + assert hasattr(mapping, "anonymous_id") + assert hasattr(mapping, "user_id_hash") + assert hasattr(mapping, "created_at") + + # Check table name + assert UserMapping.__tablename__ == "user_mapping" + + def test_user_mapping_creation(self): + """Test creating UserMapping instance.""" + mapping = UserMapping( + anonymous_id="test-uuid-123", user_id_hash="hashed_user_id" + ) + + assert mapping.anonymous_id == "test-uuid-123" + assert mapping.user_id_hash == "hashed_user_id" From e1682760e2e529f43dd14f72d3d00422c32e6f0b Mon Sep 17 00:00:00 2001 From: Erik M Jacobs Date: Thu, 21 Aug 2025 15:44:41 -0400 Subject: [PATCH 2/6] coderabbit fixes --- src/app/endpoints/conversations.py | 3 +- src/app/endpoints/query.py | 3 +- src/models/database/conversations.py | 6 +- src/models/database/user_mapping.py | 10 ++- src/utils/endpoints.py | 16 ++++- src/utils/transcripts.py | 3 +- src/utils/user_anonymization.py | 26 ++++++-- .../app/endpoints/test_query_anonymization.py | 17 ++++- tests/unit/utils/conftest.py | 6 ++ .../utils/test_endpoints_anonymization.py | 58 +++++++++++------ .../utils/test_transcripts_anonymization.py | 33 +++++++--- tests/unit/utils/test_user_anonymization.py | 62 ++++++++++++++++++- 12 files changed, 191 insertions(+), 52 deletions(-) create mode 100644 tests/unit/utils/conftest.py diff --git a/src/app/endpoints/conversations.py b/src/app/endpoints/conversations.py index c58ebcce..98e40ee7 100644 --- a/src/app/endpoints/conversations.py +++ b/src/app/endpoints/conversations.py @@ -159,8 +159,7 @@ def get_conversations_list_endpoint_handler( anonymous_user_id = get_anonymous_user_id(user_id) logger.info( - "Retrieving conversations for user %s (anonymous: %s)", - user_id[:8] + "..." if len(user_id) > 8 else user_id, + "Retrieving conversations for anonymous user %s", anonymous_user_id, ) diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py index 92afdf8b..5d67892a 100644 --- a/src/app/endpoints/query.py +++ b/src/app/endpoints/query.py @@ -98,10 +98,9 @@ def persist_user_conversation_details( ) session.add(conversation) logger.debug( - "Associated conversation %s to anonymous user %s (original: %s)", + "Associated conversation %s to anonymous user %s", conversation_id, anonymous_user_id, - user_id[:8] + "..." if len(user_id) > 8 else user_id, ) else: existing_conversation.last_used_model = model diff --git a/src/models/database/conversations.py b/src/models/database/conversations.py index ce0ff795..73d65ac4 100644 --- a/src/models/database/conversations.py +++ b/src/models/database/conversations.py @@ -3,7 +3,7 @@ from datetime import datetime from sqlalchemy.orm import Mapped, mapped_column -from sqlalchemy import DateTime, func +from sqlalchemy import DateTime, func, ForeignKey from models.database.base import Base @@ -17,7 +17,9 @@ class UserConversation(Base): # pylint: disable=too-few-public-methods id: Mapped[str] = mapped_column(primary_key=True) # The anonymous user ID associated with the conversation - anonymous_user_id: Mapped[str] = mapped_column(index=True) + anonymous_user_id: Mapped[str] = mapped_column( + ForeignKey("user_mapping.anonymous_id"), index=True, nullable=False + ) # The last provider/model used in the conversation last_used_model: Mapped[str] = mapped_column() diff --git a/src/models/database/user_mapping.py b/src/models/database/user_mapping.py index f9f245ef..b01f2408 100644 --- a/src/models/database/user_mapping.py +++ b/src/models/database/user_mapping.py @@ -3,7 +3,7 @@ from datetime import datetime from sqlalchemy.orm import Mapped, mapped_column -from sqlalchemy import DateTime, func, Index +from sqlalchemy import DateTime, func, Index, String from models.database.base import Base @@ -14,10 +14,14 @@ class UserMapping(Base): # pylint: disable=too-few-public-methods __tablename__ = "user_mapping" # Anonymous UUID used for all storage/analytics (primary key) - anonymous_id: Mapped[str] = mapped_column(primary_key=True) + anonymous_id: Mapped[str] = mapped_column( + String(36), primary_key=True, nullable=False + ) # Original user ID from authentication (hashed for security) - user_id_hash: Mapped[str] = mapped_column(index=True, unique=True) + user_id_hash: Mapped[str] = mapped_column( + String(64), index=True, unique=True, nullable=False + ) created_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), diff --git a/src/utils/endpoints.py b/src/utils/endpoints.py index 78a949df..79c1d98e 100644 --- a/src/utils/endpoints.py +++ b/src/utils/endpoints.py @@ -21,8 +21,13 @@ def validate_conversation_ownership( user_id: str, conversation_id: str -) -> UserConversation | None: - """Validate that the conversation belongs to the user using anonymous ID lookup.""" +) -> UserConversation: + """ + Validate that the conversation belongs to the user using anonymous ID lookup. + + Raises HTTPException(403) if conversation is not found or doesn't belong to user. + Returns the conversation object if valid. + """ # Get anonymous user ID for database lookup anonymous_user_id = get_anonymous_user_id(user_id) @@ -32,6 +37,13 @@ def validate_conversation_ownership( .filter_by(id=conversation_id, anonymous_user_id=anonymous_user_id) .first() ) + + if not conversation: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Forbidden: conversation does not belong to user", + ) + return conversation diff --git a/src/utils/transcripts.py b/src/utils/transcripts.py index 17e9638c..dd28e51c 100644 --- a/src/utils/transcripts.py +++ b/src/utils/transcripts.py @@ -60,8 +60,7 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional- # Get anonymous user ID for storage anonymous_user_id = get_anonymous_user_id(user_id) logger.debug( - "Anonymized user %s to %s for transcript storage", - user_id[:8] + "..." if len(user_id) > 8 else user_id, + "Storing transcript for anonymous user %s", anonymous_user_id, ) diff --git a/src/utils/user_anonymization.py b/src/utils/user_anonymization.py index 7ab8929f..0e63467d 100644 --- a/src/utils/user_anonymization.py +++ b/src/utils/user_anonymization.py @@ -1,7 +1,9 @@ """User ID anonymization utilities.""" import hashlib +import hmac import logging +import os from typing import Optional from sqlalchemy.exc import IntegrityError @@ -12,18 +14,30 @@ logger = logging.getLogger("utils.user_anonymization") +# Load HMAC pepper from environment - fail at startup if missing +try: + _USER_ANON_PEPPER = os.environ["USER_ANON_PEPPER"].encode("utf-8") +except KeyError as e: + raise RuntimeError( + "USER_ANON_PEPPER environment variable is required for user anonymization. " + "Set a secure random value (e.g., 'export USER_ANON_PEPPER=')" + ) from e + def _hash_user_id(user_id: str) -> str: """ Create a consistent hash of the user ID for mapping purposes. - Uses SHA-256 with a fixed salt to ensure consistent hashing - while preventing rainbow table attacks. + Uses HMAC-SHA256 with a server-secret pepper to ensure consistent hashing + while preventing rainbow table attacks and providing cryptographic security. """ - # Use a fixed salt - in production, this should be configurable - salt = "lightspeed_user_anonymization_salt_v1" - hash_input = f"{salt}:{user_id}".encode("utf-8") - return hashlib.sha256(hash_input).hexdigest() + # Normalize user ID: trim whitespace and lowercase + normalized_user_id = user_id.strip().lower() + + # Use HMAC-SHA256 with secret pepper + return hmac.new( + _USER_ANON_PEPPER, normalized_user_id.encode("utf-8"), hashlib.sha256 + ).hexdigest() def get_anonymous_user_id(auth_user_id: str) -> str: diff --git a/tests/unit/app/endpoints/test_query_anonymization.py b/tests/unit/app/endpoints/test_query_anonymization.py index 61c76467..808b807f 100644 --- a/tests/unit/app/endpoints/test_query_anonymization.py +++ b/tests/unit/app/endpoints/test_query_anonymization.py @@ -1,14 +1,25 @@ """Tests for query endpoint anonymization functionality.""" import logging +import os from unittest.mock import patch, MagicMock +import pytest from sqlalchemy import inspect from app.endpoints.query import persist_user_conversation_details from models.database.conversations import UserConversation +# Set up test environment variable before importing the module +@pytest.fixture(autouse=True) +def setup_test_pepper(): + """Set up test pepper environment variable for all tests.""" + test_pepper = "test-pepper-for-query-tests" + with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}): + yield + + class TestQueryAnonymization: """Test query endpoint user anonymization.""" @@ -126,11 +137,13 @@ def test_persist_user_conversation_logs_anonymization( ] assert len(anonymization_logs) > 0 - # Verify log contains both anonymous and truncated original user + # Verify log contains anonymous user and conversation ID (but no original user ID) log_msg = anonymization_logs[0] assert "anon-logging-456" in log_msg assert "logging-conv-789" in log_msg - assert "logging_..." in log_msg # Truncated original user ID + # Should NOT contain any reference to original user ID + assert "logging_test_user" not in log_msg + assert "logging_..." not in log_msg def test_conversation_model_uses_anonymous_field(self): """Test that UserConversation model uses anonymous_user_id field.""" diff --git a/tests/unit/utils/conftest.py b/tests/unit/utils/conftest.py new file mode 100644 index 00000000..d39526e9 --- /dev/null +++ b/tests/unit/utils/conftest.py @@ -0,0 +1,6 @@ +"""Configuration for utils unit tests.""" + +import os + +# Set environment variable for all tests in this directory +os.environ.setdefault("USER_ANON_PEPPER", "test-pepper-for-utils-tests") diff --git a/tests/unit/utils/test_endpoints_anonymization.py b/tests/unit/utils/test_endpoints_anonymization.py index 1e14f7b0..dd5b23c1 100644 --- a/tests/unit/utils/test_endpoints_anonymization.py +++ b/tests/unit/utils/test_endpoints_anonymization.py @@ -1,10 +1,22 @@ """Tests for endpoint utilities anonymization functionality.""" +import os from unittest.mock import patch, MagicMock + import pytest +from fastapi import HTTPException -from utils.endpoints import validate_conversation_ownership from models.database.conversations import UserConversation +from utils.endpoints import validate_conversation_ownership + + +# Set up test environment variable before importing the module +@pytest.fixture(autouse=True) +def setup_test_pepper(): + """Set up test pepper environment variable for all tests.""" + test_pepper = "test-pepper-for-endpoints-tests" + with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}): + yield class TestEndpointsAnonymization: @@ -50,24 +62,28 @@ def test_validate_conversation_ownership_uses_anonymous_id( def test_validate_conversation_ownership_not_owned( self, mock_get_session, mock_get_anonymous ): - """Test validation fails when user doesn't own conversation.""" + """Test validation raises 403 when user doesn't own conversation.""" mock_get_anonymous.return_value = "anon-not-owner-789" mock_session = MagicMock() mock_get_session.return_value.__enter__.return_value = mock_session mock_session.query.return_value.filter_by.return_value.first.return_value = None - # Call validation - result = validate_conversation_ownership( - user_id="notowner@example.com", conversation_id="not-owned-conv" - ) + # Call validation should raise HTTPException + with pytest.raises(HTTPException) as exc_info: + validate_conversation_ownership( + user_id="notowner@example.com", conversation_id="not-owned-conv" + ) # Verify anonymous user ID was used for lookup mock_get_anonymous.assert_called_once_with("notowner@example.com") filter_call = mock_session.query.return_value.filter_by.call_args assert filter_call[1]["anonymous_user_id"] == "anon-not-owner-789" - # Verify None returned for non-owned conversation - assert result is None + # Verify 403 error raised for non-owned conversation + assert exc_info.value.status_code == 403 + assert "Forbidden: conversation does not belong to user" in str( + exc_info.value.detail + ) @patch("utils.endpoints.get_anonymous_user_id") @patch("utils.endpoints.get_session") @@ -89,10 +105,11 @@ def mock_anonymous_side_effect(user_id): mock_get_session.return_value.__enter__.return_value = mock_session mock_session.query.return_value.filter_by.return_value.first.return_value = None - # Validate for user 1 - result1 = validate_conversation_ownership( - user_id="user1@example.com", conversation_id="shared-conv-test" - ) + # Validate for user 1 - should raise 403 + with pytest.raises(HTTPException) as exc_info1: + validate_conversation_ownership( + user_id="user1@example.com", conversation_id="shared-conv-test" + ) first_call = mock_session.query.return_value.filter_by.call_args assert first_call[1]["anonymous_user_id"] == "anon-user1-validation" @@ -101,18 +118,19 @@ def mock_anonymous_side_effect(user_id): mock_session.reset_mock() mock_session.query.return_value.filter_by.return_value.first.return_value = None - # Validate for user 2 - result2 = validate_conversation_ownership( - user_id="user2@example.com", - conversation_id="shared-conv-test", # Same conversation ID - ) + # Validate for user 2 - should also raise 403 + with pytest.raises(HTTPException) as exc_info2: + validate_conversation_ownership( + user_id="user2@example.com", + conversation_id="shared-conv-test", # Same conversation ID + ) second_call = mock_session.query.return_value.filter_by.call_args assert second_call[1]["anonymous_user_id"] == "anon-user2-validation" - # Both should return None since neither owns the conversation - assert result1 is None - assert result2 is None + # Both should raise 403 since neither owns the conversation + assert exc_info1.value.status_code == 403 + assert exc_info2.value.status_code == 403 # But they used different anonymous IDs (checking the calls, not constant comparison) assert first_call[1]["anonymous_user_id"] != second_call[1]["anonymous_user_id"] diff --git a/tests/unit/utils/test_transcripts_anonymization.py b/tests/unit/utils/test_transcripts_anonymization.py index 869b1c55..abadc4e5 100644 --- a/tests/unit/utils/test_transcripts_anonymization.py +++ b/tests/unit/utils/test_transcripts_anonymization.py @@ -1,16 +1,27 @@ """Tests for transcript anonymization functionality.""" import json +import os import tempfile from pathlib import Path from unittest.mock import patch +import pytest from models.requests import QueryRequest, Attachment from utils.transcripts import store_transcript, construct_transcripts_path from utils.types import TurnSummary +# Set up test environment variable before importing the module +@pytest.fixture(autouse=True) +def setup_test_pepper(): + """Set up test pepper environment variable for all tests.""" + test_pepper = "test-pepper-for-transcript-tests" + with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}): + yield + + class TestTranscriptAnonymization: """Test transcript storage with user anonymization.""" @@ -260,12 +271,18 @@ def test_logging_shows_anonymization(self, mock_get_anonymous, caplog): attachments=[], ) - # Check that anonymization is logged + # Check that transcript storage is logged log_messages = [record.message for record in caplog.records] - anonymization_logs = [msg for msg in log_messages if "Anonymized user" in msg] - assert len(anonymization_logs) > 0 - - # Verify the log shows original user is being anonymized - anonymization_log = anonymization_logs[0] - assert "anon-logging-test" in anonymization_log - assert "log_test..." in anonymization_log # Truncated original user ID + storage_logs = [ + msg + for msg in log_messages + if "Storing transcript for anonymous user" in msg + ] + assert len(storage_logs) > 0 + + # Verify the log shows only anonymous user ID (no original user ID) + storage_log = storage_logs[0] + assert "anon-logging-test" in storage_log + # Should NOT contain any reference to original user ID + assert "log_test_user" not in storage_log + assert "log_test..." not in storage_log diff --git a/tests/unit/utils/test_user_anonymization.py b/tests/unit/utils/test_user_anonymization.py index bdbce8f5..16f28b5d 100644 --- a/tests/unit/utils/test_user_anonymization.py +++ b/tests/unit/utils/test_user_anonymization.py @@ -1,18 +1,34 @@ """Tests for user anonymization utilities.""" import hashlib +import hmac +import importlib +import os from unittest.mock import patch, MagicMock import pytest from sqlalchemy.exc import IntegrityError +from models.database.user_mapping import UserMapping from utils.user_anonymization import ( get_anonymous_user_id, get_user_count, find_anonymous_user_id, _hash_user_id, ) -from models.database.user_mapping import UserMapping + + +# Set up test environment variable for each test +@pytest.fixture(autouse=True) +def setup_test_pepper(): + """Set up test pepper environment variable for all tests.""" + test_pepper = "test-pepper-for-anonymization-tests" + with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}): + # Reload the module to pick up the new environment variable + import utils.user_anonymization # pylint: disable=import-outside-toplevel + + importlib.reload(utils.user_anonymization) + yield class TestUserAnonymization: @@ -174,8 +190,8 @@ def test_find_anonymous_user_id_not_found(self, mock_get_session): assert result is None - def test_salt_prevents_rainbow_attacks(self): - """Test that salt makes rainbow table attacks impractical.""" + def test_hmac_prevents_rainbow_attacks(self): + """Test that HMAC makes rainbow table attacks impractical.""" # Common passwords/emails that might be in rainbow tables common_ids = ["admin", "test@test.com", "user123", "john.doe@company.com"] @@ -185,6 +201,12 @@ def test_salt_prevents_rainbow_attacks(self): simple_hash = hashlib.sha256(user_id.encode()).hexdigest() assert hash_result != simple_hash + # Hash should not match HMAC without pepper + hmac_without_pepper = hmac.new( + b"", user_id.strip().lower().encode(), hashlib.sha256 + ).hexdigest() + assert hash_result != hmac_without_pepper + def test_anonymization_preserves_uniqueness(self): """Test that different users get different anonymous IDs.""" users = [ @@ -199,6 +221,40 @@ def test_anonymization_preserves_uniqueness(self): # All hashes should be unique assert len(set(hashes)) == len(hashes) + def test_user_id_normalization(self): + """Test that user ID normalization works correctly.""" + # Test case variations should produce the same hash + variations = [ + "User@Example.Com", + "user@example.com", + " user@example.com ", + "USER@EXAMPLE.COM", + " USER@EXAMPLE.COM ", + ] + + hashes = [_hash_user_id(variation) for variation in variations] + + # All variations should produce the same hash + assert ( + len(set(hashes)) == 1 + ), "All case/whitespace variations should hash the same" + + # But different actual users should still be different + different_user = _hash_user_id("different@example.com") + assert different_user not in hashes + + def test_missing_pepper_env_var(self): + """Test that missing pepper environment variable raises clear error.""" + import utils.user_anonymization # pylint: disable=import-outside-toplevel + + # Test that module import fails with missing env var + with patch.dict(os.environ, {}, clear=True): + with pytest.raises( + RuntimeError, match="USER_ANON_PEPPER environment variable is required" + ): + # Force reimport to trigger the env var check + importlib.reload(utils.user_anonymization) + class TestUserMappingModel: """Test the UserMapping database model.""" From fd32c3e44a1298c9ce655504c6b85101a563b02b Mon Sep 17 00:00:00 2001 From: Erik M Jacobs Date: Thu, 21 Aug 2025 16:14:54 -0400 Subject: [PATCH 3/6] coderabbit --- tests/conftest.py | 7 ++ tests/unit/app/endpoints/test_query.py | 6 +- tests/unit/utils/test_user_anonymization.py | 85 ++++++++++++--------- 3 files changed, 58 insertions(+), 40 deletions(-) create mode 100644 tests/conftest.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..4be2f7fe --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,7 @@ +"""Global configuration for all tests.""" + +import os + +# Set environment variable for all tests globally +# This ensures the anonymization module can be imported in CI environments +os.environ.setdefault("USER_ANON_PEPPER", "test-pepper-for-all-tests") diff --git a/tests/unit/app/endpoints/test_query.py b/tests/unit/app/endpoints/test_query.py index 3356f2b2..c6acb888 100644 --- a/tests/unit/app/endpoints/test_query.py +++ b/tests/unit/app/endpoints/test_query.py @@ -169,7 +169,7 @@ async def _test_query_endpoint_handler(mocker, store_transcript_to_file=False): # Assert the store_transcript function is called if transcripts are enabled if store_transcript_to_file: mock_transcript.assert_called_once_with( - user_id="mock_user_id", + anonymous_user_id="mock_user_id", conversation_id=conversation_id, model_id="fake_model_id", provider_id="fake_provider_id", @@ -1409,7 +1409,7 @@ def test_no_tools_parameter_backward_compatibility(): ( UserConversation( id="conv1", - user_id="user1", + anonymous_user_id="user1", last_used_provider="foo", last_used_model="bar", message_count=1, @@ -1428,7 +1428,7 @@ def test_no_tools_parameter_backward_compatibility(): ( UserConversation( id="conv1", - user_id="user1", + anonymous_user_id="user1", last_used_provider="foo", last_used_model="bar", message_count=1, diff --git a/tests/unit/utils/test_user_anonymization.py b/tests/unit/utils/test_user_anonymization.py index 16f28b5d..a0a559ea 100644 --- a/tests/unit/utils/test_user_anonymization.py +++ b/tests/unit/utils/test_user_anonymization.py @@ -10,12 +10,8 @@ from sqlalchemy.exc import IntegrityError from models.database.user_mapping import UserMapping -from utils.user_anonymization import ( - get_anonymous_user_id, - get_user_count, - find_anonymous_user_id, - _hash_user_id, -) + +# Functions under test are accessed via a module fixture (see `ua` below) # Set up test environment variable for each test @@ -31,29 +27,40 @@ def setup_test_pepper(): yield -class TestUserAnonymization: +@pytest.fixture +def user_anon_module(): + """Return a fresh reference to the reloaded utils.user_anonymization module.""" + import utils.user_anonymization as _ua # pylint: disable=import-outside-toplevel + + importlib.reload(_ua) + return _ua + + +class TestUserAnonymization: # pylint: disable=redefined-outer-name,protected-access """Test user anonymization functionality.""" - def test_hash_user_id_consistency(self): + def test_hash_user_id_consistency(self, user_anon_module): """Test that user ID hashing is consistent.""" user_id = "test@example.com" - hash1 = _hash_user_id(user_id) - hash2 = _hash_user_id(user_id) + hash1 = user_anon_module._hash_user_id(user_id) + hash2 = user_anon_module._hash_user_id(user_id) assert hash1 == hash2 assert len(hash1) == 64 # SHA-256 hex length assert hash1 != user_id # Should be different from original - def test_hash_user_id_different_for_different_users(self): + def test_hash_user_id_different_for_different_users(self, user_anon_module): """Test that different user IDs produce different hashes.""" - user1_hash = _hash_user_id("user1@example.com") - user2_hash = _hash_user_id("user2@example.com") + user1_hash = user_anon_module._hash_user_id("user1@example.com") + user2_hash = user_anon_module._hash_user_id("user2@example.com") assert user1_hash != user2_hash @patch("utils.user_anonymization.get_session") @patch("utils.user_anonymization.get_suid") - def test_get_anonymous_user_id_new_user(self, mock_get_suid, mock_get_session): + def test_get_anonymous_user_id_new_user( + self, mock_get_suid, mock_get_session, user_anon_module + ): """Test creating new anonymous ID for first-time user.""" # Setup mocks mock_session = MagicMock() @@ -62,7 +69,7 @@ def test_get_anonymous_user_id_new_user(self, mock_get_suid, mock_get_session): mock_get_suid.return_value = "anon-123-456" user_id = "new_user@example.com" - result = get_anonymous_user_id(user_id) + result = user_anon_module.get_anonymous_user_id(user_id) # Verify result assert result == "anon-123-456" @@ -75,15 +82,19 @@ def test_get_anonymous_user_id_new_user(self, mock_get_suid, mock_get_session): added_mapping = mock_session.add.call_args[0][0] assert isinstance(added_mapping, UserMapping) assert added_mapping.anonymous_id == "anon-123-456" - assert added_mapping.user_id_hash == _hash_user_id(user_id) + assert added_mapping.user_id_hash == user_anon_module._hash_user_id(user_id) @patch("utils.user_anonymization.get_session") - def test_get_anonymous_user_id_existing_user(self, mock_get_session): + def test_get_anonymous_user_id_existing_user( + self, mock_get_session, user_anon_module + ): """Test retrieving existing anonymous ID for returning user.""" # Setup existing mapping existing_mapping = UserMapping() existing_mapping.anonymous_id = "existing-anon-789" - existing_mapping.user_id_hash = _hash_user_id("existing@example.com") + existing_mapping.user_id_hash = user_anon_module._hash_user_id( + "existing@example.com" + ) mock_session = MagicMock() mock_get_session.return_value.__enter__.return_value = mock_session @@ -91,7 +102,7 @@ def test_get_anonymous_user_id_existing_user(self, mock_get_session): existing_mapping ) - result = get_anonymous_user_id("existing@example.com") + result = user_anon_module.get_anonymous_user_id("existing@example.com") # Verify result assert result == "existing-anon-789" @@ -103,7 +114,7 @@ def test_get_anonymous_user_id_existing_user(self, mock_get_session): @patch("utils.user_anonymization.get_session") @patch("utils.user_anonymization.get_suid") def test_get_anonymous_user_id_race_condition( - self, mock_get_suid, mock_get_session + self, mock_get_suid, mock_get_session, user_anon_module ): """Test handling race condition when creating user mapping.""" # Setup mocks for race condition scenario @@ -123,7 +134,7 @@ def test_get_anonymous_user_id_race_condition( mock_session.add.side_effect = IntegrityError("Duplicate key", None, None) mock_get_suid.return_value = "new-uuid" - result = get_anonymous_user_id("race_user@example.com") + result = user_anon_module.get_anonymous_user_id("race_user@example.com") # Should return the mapping created by the other thread assert result == "race-condition-uuid" @@ -132,7 +143,7 @@ def test_get_anonymous_user_id_race_condition( @patch("utils.user_anonymization.get_session") @patch("utils.user_anonymization.get_suid") def test_get_anonymous_user_id_race_condition_failure( - self, mock_get_suid, mock_get_session + self, mock_get_suid, mock_get_session, user_anon_module ): """Test handling race condition where retrieval also fails.""" mock_session = MagicMock() @@ -149,22 +160,22 @@ def test_get_anonymous_user_id_race_condition_failure( with pytest.raises( RuntimeError, match="Unable to create or retrieve anonymous user ID" ): - get_anonymous_user_id("problematic_user@example.com") + user_anon_module.get_anonymous_user_id("problematic_user@example.com") @patch("utils.user_anonymization.get_session") - def test_get_user_count(self, mock_get_session): + def test_get_user_count(self, mock_get_session, user_anon_module): """Test getting total user count.""" mock_session = MagicMock() mock_get_session.return_value.__enter__.return_value = mock_session mock_session.query.return_value.count.return_value = 42 - result = get_user_count() + result = user_anon_module.get_user_count() assert result == 42 mock_session.query.assert_called_once_with(UserMapping) @patch("utils.user_anonymization.get_session") - def test_find_anonymous_user_id_existing(self, mock_get_session): + def test_find_anonymous_user_id_existing(self, mock_get_session, user_anon_module): """Test finding existing anonymous ID without creating new one.""" existing_mapping = UserMapping() existing_mapping.anonymous_id = "found-uuid" @@ -175,28 +186,28 @@ def test_find_anonymous_user_id_existing(self, mock_get_session): existing_mapping ) - result = find_anonymous_user_id("existing@example.com") + result = user_anon_module.find_anonymous_user_id("existing@example.com") assert result == "found-uuid" @patch("utils.user_anonymization.get_session") - def test_find_anonymous_user_id_not_found(self, mock_get_session): + def test_find_anonymous_user_id_not_found(self, mock_get_session, user_anon_module): """Test finding non-existing anonymous ID returns None.""" mock_session = MagicMock() mock_get_session.return_value.__enter__.return_value = mock_session mock_session.query.return_value.filter_by.return_value.first.return_value = None - result = find_anonymous_user_id("nonexistent@example.com") + result = user_anon_module.find_anonymous_user_id("nonexistent@example.com") assert result is None - def test_hmac_prevents_rainbow_attacks(self): + def test_hmac_prevents_rainbow_attacks(self, user_anon_module): """Test that HMAC makes rainbow table attacks impractical.""" # Common passwords/emails that might be in rainbow tables common_ids = ["admin", "test@test.com", "user123", "john.doe@company.com"] for user_id in common_ids: - hash_result = _hash_user_id(user_id) + hash_result = user_anon_module._hash_user_id(user_id) # Hash should not match simple SHA-256 of just the user ID simple_hash = hashlib.sha256(user_id.encode()).hexdigest() assert hash_result != simple_hash @@ -207,7 +218,7 @@ def test_hmac_prevents_rainbow_attacks(self): ).hexdigest() assert hash_result != hmac_without_pepper - def test_anonymization_preserves_uniqueness(self): + def test_anonymization_preserves_uniqueness(self, user_anon_module): """Test that different users get different anonymous IDs.""" users = [ "user1@example.com", @@ -216,12 +227,12 @@ def test_anonymization_preserves_uniqueness(self): "test.user@domain.org", ] - hashes = [_hash_user_id(user) for user in users] + hashes = [user_anon_module._hash_user_id(user) for user in users] # All hashes should be unique assert len(set(hashes)) == len(hashes) - def test_user_id_normalization(self): + def test_user_id_normalization(self, user_anon_module): """Test that user ID normalization works correctly.""" # Test case variations should produce the same hash variations = [ @@ -232,7 +243,7 @@ def test_user_id_normalization(self): " USER@EXAMPLE.COM ", ] - hashes = [_hash_user_id(variation) for variation in variations] + hashes = [user_anon_module._hash_user_id(variation) for variation in variations] # All variations should produce the same hash assert ( @@ -240,7 +251,7 @@ def test_user_id_normalization(self): ), "All case/whitespace variations should hash the same" # But different actual users should still be different - different_user = _hash_user_id("different@example.com") + different_user = user_anon_module._hash_user_id("different@example.com") assert different_user not in hashes def test_missing_pepper_env_var(self): @@ -256,7 +267,7 @@ def test_missing_pepper_env_var(self): importlib.reload(utils.user_anonymization) -class TestUserMappingModel: +class TestUserMappingModel: # pylint: disable=redefined-outer-name """Test the UserMapping database model.""" def test_user_mapping_attributes(self): From d6344dfdd787c9e727279d6ea139068ea8ed67aa Mon Sep 17 00:00:00 2001 From: Erik M Jacobs Date: Thu, 21 Aug 2025 17:18:59 -0400 Subject: [PATCH 4/6] env content --- .env.example | 11 +++++++++++ README.md | 11 +++++++++++ docker-compose.yaml | 1 + 3 files changed, 23 insertions(+) create mode 100644 .env.example diff --git a/.env.example b/.env.example new file mode 100644 index 00000000..c268c75b --- /dev/null +++ b/.env.example @@ -0,0 +1,11 @@ +# Example Environment Variables for Lightspeed Stack +# Copy this file to .env and set appropriate values + +# Required: User anonymization pepper (set to a secure random value) +# This is used for HMAC-based user ID hashing to protect user privacy +USER_ANON_PEPPER=your-secure-random-string-here + +# Optional: OpenAI API Key (if using OpenAI models) +OPENAI_API_KEY=your-openai-api-key + +# Optional: Other environment variables as needed for your configuration \ No newline at end of file diff --git a/README.md b/README.md index 1b0f9ecd..58b5caaf 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,17 @@ Lightspeed Core Stack is based on the FastAPI framework (Uvicorn). The service i - please note that currently Python 3.14 is not officially supported - all sources are made (backward) compatible with Python 3.12; it is checked on CI +## Environment Variables + +The following environment variable is required for the service to start: + +* `USER_ANON_PEPPER` - A secure random string used for user anonymization. Set this to a cryptographically secure random value: + ```bash + export USER_ANON_PEPPER="your-secure-random-string-here" + ``` + + **Security Note**: This value should be treated as a secret and kept secure. It's used for HMAC-based user ID hashing to protect user privacy while enabling usage analytics. + # Installation Installation steps depends on operation system. Please look at instructions for your system: diff --git a/docker-compose.yaml b/docker-compose.yaml index b3624526..5148c746 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -30,6 +30,7 @@ services: - ./lightspeed-stack.yaml:/app-root/lightspeed-stack.yaml:Z environment: - OPENAI_API_KEY=${OPENAI_API_KEY} + - USER_ANON_PEPPER=${USER_ANON_PEPPER:-default-pepper-for-development-only} depends_on: llama-stack: condition: service_healthy From 26505ca78d7446b5dd2acf875ef73a9d62869cfa Mon Sep 17 00:00:00 2001 From: Erik M Jacobs Date: Tue, 26 Aug 2025 09:34:33 -0400 Subject: [PATCH 5/6] fix unit test failures --- src/app/endpoints/conversations.py | 13 +- src/app/endpoints/feedback.py | 10 +- .../unit/app/endpoints/test_conversations.py | 18 ++- tests/unit/app/endpoints/test_feedback.py | 8 +- .../endpoints/test_feedback_anonymization.py | 138 ++++++++++++++++++ tests/unit/app/endpoints/test_query.py | 4 +- tests/unit/utils/test_transcripts.py | 9 +- .../utils/test_transcripts_anonymization.py | 11 +- 8 files changed, 199 insertions(+), 12 deletions(-) create mode 100644 tests/unit/app/endpoints/test_feedback_anonymization.py diff --git a/src/app/endpoints/conversations.py b/src/app/endpoints/conversations.py index 98e40ee7..324b0796 100644 --- a/src/app/endpoints/conversations.py +++ b/src/app/endpoints/conversations.py @@ -190,20 +190,27 @@ def get_conversations_list_endpoint_handler( ] logger.info( - "Found %d conversations for user %s", len(conversations), user_id + "Found %d conversations for anonymous user %s", + len(conversations), + anonymous_user_id, ) return ConversationsListResponse(conversations=conversations) except Exception as e: logger.exception( - "Error retrieving conversations for user %s: %s", user_id, e + "Error retrieving conversations for anonymous user %s: %s", + anonymous_user_id, + e, ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail={ "response": "Unknown error", - "cause": f"Unknown error while getting conversations for user {user_id}", + "cause": ( + f"Unknown error while getting conversations for " + f"anonymous user {anonymous_user_id}" + ), }, ) from e diff --git a/src/app/endpoints/feedback.py b/src/app/endpoints/feedback.py index 5d82267e..c9436f25 100644 --- a/src/app/endpoints/feedback.py +++ b/src/app/endpoints/feedback.py @@ -19,6 +19,7 @@ ) from models.requests import FeedbackRequest from utils.suid import get_suid +from utils.user_anonymization import get_anonymous_user_id logger = logging.getLogger(__name__) router = APIRouter(prefix="/feedback", tags=["feedback"]) @@ -131,7 +132,8 @@ def store_feedback(user_id: str, feedback: dict) -> None: user_id (str): Unique identifier of the user submitting feedback. feedback (dict): Feedback data to be stored, merged with user ID and timestamp. """ - logger.debug("Storing feedback for user %s", user_id) + anonymous_user_id = get_anonymous_user_id(user_id) + logger.debug("Storing feedback for anonymous user %s", anonymous_user_id) # Creates storage path only if it doesn't exist. The `exist_ok=True` prevents # race conditions in case of multiple server instances trying to set up storage # at the same location. @@ -141,7 +143,11 @@ def store_feedback(user_id: str, feedback: dict) -> None: storage_path.mkdir(parents=True, exist_ok=True) current_time = str(datetime.now(UTC)) - data_to_store = {"user_id": user_id, "timestamp": current_time, **feedback} + data_to_store = { + "anonymous_user_id": anonymous_user_id, + "timestamp": current_time, + **feedback, + } # stores feedback in a file under unique uuid feedback_file_path = storage_path / f"{get_suid()}.json" diff --git a/tests/unit/app/endpoints/test_conversations.py b/tests/unit/app/endpoints/test_conversations.py index aed6fdc5..d612d5ee 100644 --- a/tests/unit/app/endpoints/test_conversations.py +++ b/tests/unit/app/endpoints/test_conversations.py @@ -46,6 +46,10 @@ def create_mock_conversation( def mock_database_session(mocker, query_result=None): """Helper function to mock get_session with proper context manager support.""" + # Mock database initialization + mocker.patch("app.database.engine", mocker.Mock()) + mocker.patch("app.database.SessionLocal", mocker.Mock()) + mock_session = mocker.Mock() if query_result is not None: mock_session.query.return_value.filter_by.return_value.all.return_value = ( @@ -530,6 +534,10 @@ def test_configuration_not_loaded(self, mocker): def test_successful_conversations_list_retrieval(self, mocker, setup_configuration): """Test successful retrieval of conversations list.""" mocker.patch("app.endpoints.conversations.configuration", setup_configuration) + mocker.patch( + "app.endpoints.conversations.get_anonymous_user_id", + return_value="anon-test-user", + ) # Mock database session and query results mock_conversations = [ @@ -570,6 +578,10 @@ def test_successful_conversations_list_retrieval(self, mocker, setup_configurati def test_empty_conversations_list(self, mocker, setup_configuration): """Test when user has no conversations.""" mocker.patch("app.endpoints.conversations.configuration", setup_configuration) + mocker.patch( + "app.endpoints.conversations.get_anonymous_user_id", + return_value="anon-test-user", + ) # Mock database session with no results mock_database_session(mocker, []) @@ -583,6 +595,10 @@ def test_empty_conversations_list(self, mocker, setup_configuration): def test_database_exception(self, mocker, setup_configuration): """Test when database query raises an exception.""" mocker.patch("app.endpoints.conversations.configuration", setup_configuration) + mocker.patch( + "app.endpoints.conversations.get_anonymous_user_id", + return_value="anon-test-user", + ) # Mock database session to raise exception mock_session = mock_database_session(mocker) @@ -594,6 +610,6 @@ def test_database_exception(self, mocker, setup_configuration): assert exc_info.value.status_code == status.HTTP_500_INTERNAL_SERVER_ERROR assert "Unknown error" in exc_info.value.detail["response"] assert ( - "Unknown error while getting conversations for user" + "Unknown error while getting conversations for anonymous user" in exc_info.value.detail["cause"] ) diff --git a/tests/unit/app/endpoints/test_feedback.py b/tests/unit/app/endpoints/test_feedback.py index 337552c0..763f3b81 100644 --- a/tests/unit/app/endpoints/test_feedback.py +++ b/tests/unit/app/endpoints/test_feedback.py @@ -137,6 +137,9 @@ def test_store_feedback(mocker, feedback_request_data): mocker.patch("builtins.open", mocker.mock_open()) mocker.patch("app.endpoints.feedback.Path", return_value=mocker.MagicMock()) mocker.patch("app.endpoints.feedback.get_suid", return_value="fake-uuid") + mocker.patch( + "app.endpoints.feedback.get_anonymous_user_id", return_value="anon-test-user" + ) # Patch json to inspect stored data mock_json = mocker.patch("app.endpoints.feedback.json") @@ -146,7 +149,7 @@ def test_store_feedback(mocker, feedback_request_data): store_feedback(user_id, feedback_request_data) expected_data = { - "user_id": user_id, + "anonymous_user_id": "anon-test-user", "timestamp": mocker.ANY, **feedback_request_data, } @@ -182,6 +185,9 @@ def test_store_feedback_on_io_error(mocker, feedback_request_data): configuration.user_data_collection_configuration.feedback_storage = "fake-path" mocker.patch("app.endpoints.feedback.Path", return_value=mocker.MagicMock()) mocker.patch("builtins.open", side_effect=PermissionError("EACCES")) + mocker.patch( + "app.endpoints.feedback.get_anonymous_user_id", return_value="anon-test-user" + ) user_id = "test_user_id" diff --git a/tests/unit/app/endpoints/test_feedback_anonymization.py b/tests/unit/app/endpoints/test_feedback_anonymization.py new file mode 100644 index 00000000..26e2b4af --- /dev/null +++ b/tests/unit/app/endpoints/test_feedback_anonymization.py @@ -0,0 +1,138 @@ +"""Tests for feedback endpoint anonymization functionality.""" + +import os +from unittest.mock import patch, MagicMock + +import pytest + +from app.endpoints.feedback import store_feedback + + +# Set up test environment variable before importing the module +@pytest.fixture(autouse=True) +def setup_test_pepper(): + """Set up test pepper environment variable for all tests.""" + test_pepper = "test-pepper-for-feedback-tests" + with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}): + yield + + +class TestFeedbackAnonymization: + """Test feedback storage with user anonymization.""" + + @patch("app.endpoints.feedback.get_anonymous_user_id") + @patch("app.endpoints.feedback.get_suid") + @patch("app.endpoints.feedback.json") + @patch("app.endpoints.feedback.Path") + def test_store_feedback_anonymizes_user_id( + self, mock_path, mock_json, mock_get_suid, mock_get_anonymous + ): + """Test that store_feedback uses anonymous user ID.""" + # Setup mocks + mock_get_anonymous.return_value = "anon-feedback-123" + mock_get_suid.return_value = "feedback-uuid" + mock_path.return_value = MagicMock() + + # Mock configuration + with ( + patch("app.endpoints.feedback.configuration") as mock_config, + patch("builtins.open"), + ): + mock_config.user_data_collection_configuration.feedback_storage = ( + "/tmp/feedback" + ) + + # Call store_feedback + store_feedback( + user_id="original_user@example.com", + feedback={ + "feedback": "This is test feedback", + "sentiment": 1, + "categories": ["helpful"], + }, + ) + + # Verify anonymous user ID was used + mock_get_anonymous.assert_called_once_with("original_user@example.com") + + # Verify stored data uses anonymous ID + stored_data = mock_json.dump.call_args[0][0] + assert stored_data["anonymous_user_id"] == "anon-feedback-123" + assert "user_id" not in stored_data + assert stored_data["feedback"] == "This is test feedback" + assert stored_data["sentiment"] == 1 + assert stored_data["categories"] == ["helpful"] + + @patch("app.endpoints.feedback.get_anonymous_user_id") + def test_store_feedback_different_users_get_different_anonymous_ids( + self, mock_get_anonymous + ): + """Test that different users get different anonymous IDs for feedback.""" + + def mock_anonymous_side_effect(user_id): + if user_id == "user1@example.com": + return "anon-feedback-user1" + if user_id == "user2@example.com": + return "anon-feedback-user2" + return "anon-unknown" + + mock_get_anonymous.side_effect = mock_anonymous_side_effect + + with ( + patch("app.endpoints.feedback.json") as mock_json, + patch("app.endpoints.feedback.Path") as mock_path, + patch("builtins.open"), + patch("app.endpoints.feedback.get_suid", return_value="uuid"), + patch("app.endpoints.feedback.configuration") as mock_config, + ): + + mock_config.user_data_collection_configuration.feedback_storage = ( + "/tmp/feedback" + ) + mock_path.return_value = MagicMock() + + # Store feedback for user 1 + store_feedback("user1@example.com", {"feedback": "Test 1"}) + first_call_data = mock_json.dump.call_args[0][0] + + # Reset mock for second call + mock_json.reset_mock() + + # Store feedback for user 2 + store_feedback("user2@example.com", {"feedback": "Test 2"}) + second_call_data = mock_json.dump.call_args[0][0] + + # Verify different anonymous IDs were used + assert first_call_data["anonymous_user_id"] == "anon-feedback-user1" + assert second_call_data["anonymous_user_id"] == "anon-feedback-user2" + assert ( + first_call_data["anonymous_user_id"] + != second_call_data["anonymous_user_id"] + ) + + @patch("app.endpoints.feedback.get_anonymous_user_id") + @patch("app.endpoints.feedback.logger") + def test_feedback_logging_uses_anonymous_id(self, mock_logger, mock_get_anonymous): + """Test that feedback logging uses anonymous user ID.""" + mock_get_anonymous.return_value = "anon-feedback-logging" + + with ( + patch("app.endpoints.feedback.json"), + patch("app.endpoints.feedback.Path") as mock_path, + patch("builtins.open"), + patch("app.endpoints.feedback.get_suid", return_value="uuid"), + patch("app.endpoints.feedback.configuration") as mock_config, + ): + + mock_config.user_data_collection_configuration.feedback_storage = ( + "/tmp/feedback" + ) + mock_path.return_value = MagicMock() + + # Store feedback + store_feedback("user@example.com", {"feedback": "Test feedback"}) + + # Verify logging uses anonymous ID + mock_logger.debug.assert_called_once_with( + "Storing feedback for anonymous user %s", "anon-feedback-logging" + ) diff --git a/tests/unit/app/endpoints/test_query.py b/tests/unit/app/endpoints/test_query.py index c6acb888..30f6dc8a 100644 --- a/tests/unit/app/endpoints/test_query.py +++ b/tests/unit/app/endpoints/test_query.py @@ -169,7 +169,7 @@ async def _test_query_endpoint_handler(mocker, store_transcript_to_file=False): # Assert the store_transcript function is called if transcripts are enabled if store_transcript_to_file: mock_transcript.assert_called_once_with( - anonymous_user_id="mock_user_id", + user_id="mock_user_id", conversation_id=conversation_id, model_id="fake_model_id", provider_id="fake_provider_id", @@ -177,9 +177,9 @@ async def _test_query_endpoint_handler(mocker, store_transcript_to_file=False): query=query, query_request=query_request, summary=summary, - attachments=[], rag_chunks=[], truncated=False, + attachments=[], ) else: mock_transcript.assert_not_called() diff --git a/tests/unit/utils/test_transcripts.py b/tests/unit/utils/test_transcripts.py index b30b430a..4e2eaeb9 100644 --- a/tests/unit/utils/test_transcripts.py +++ b/tests/unit/utils/test_transcripts.py @@ -50,11 +50,18 @@ def test_construct_transcripts_path(mocker): def test_store_transcript(mocker): """Test the store_transcript function.""" + # Mock database initialization to prevent the error + mocker.patch("app.database.engine", mocker.Mock()) + mocker.patch("app.database.SessionLocal", mocker.Mock()) + mocker.patch("builtins.open", mocker.mock_open()) mocker.patch( "utils.transcripts.construct_transcripts_path", return_value=mocker.MagicMock(), ) + mocker.patch( + "utils.transcripts.get_anonymous_user_id", return_value="anon-user-123" + ) # Mock the JSON to assert the data is stored correctly mock_json = mocker.patch("utils.transcripts.json") @@ -104,7 +111,7 @@ def test_store_transcript(mocker): "model": "fake-model", "query_provider": query_request.provider, "query_model": query_request.model, - "user_id": user_id, + "anonymous_user_id": "anon-user-123", "conversation_id": conversation_id, "timestamp": mocker.ANY, }, diff --git a/tests/unit/utils/test_transcripts_anonymization.py b/tests/unit/utils/test_transcripts_anonymization.py index abadc4e5..c564eea0 100644 --- a/tests/unit/utils/test_transcripts_anonymization.py +++ b/tests/unit/utils/test_transcripts_anonymization.py @@ -213,8 +213,14 @@ def test_store_transcript_with_attachments( assert data["attachments"][0]["attachment_type"] == "text" assert data["attachments"][0]["content"] == "Test attachment content" - def test_path_sanitization_with_anonymous_ids(self): + @patch("utils.transcripts.configuration") + def test_path_sanitization_with_anonymous_ids(self, mock_config): """Test that path sanitization works correctly with anonymous UUIDs.""" + # Setup mock configuration + mock_config.user_data_collection_configuration.transcripts_storage = ( + "/tmp/transcripts" + ) + # Test with various UUID formats and potential path injection test_cases = [ ("anon-uuid-123", "conv-456"), @@ -229,7 +235,8 @@ def test_path_sanitization_with_anonymous_ids(self): # Should not contain path traversal sequences assert "../" not in result_str - assert not result_str.startswith("/") + # Paths should be absolute (start with /) since we use /tmp/transcripts as base + assert result_str.startswith("/tmp/transcripts/") @patch("utils.transcripts.get_anonymous_user_id") def test_logging_shows_anonymization(self, mock_get_anonymous, caplog): From d641687493dfde84e7191520c7a629bf25983319 Mon Sep 17 00:00:00 2001 From: Erik M Jacobs Date: Tue, 26 Aug 2025 09:57:52 -0400 Subject: [PATCH 6/6] fix bad merge --- src/app/endpoints/query.py | 14 ---------- src/app/endpoints/streaming_query.py | 14 ---------- src/utils/endpoints.py | 38 ++++++++++++++++++++++------ 3 files changed, 30 insertions(+), 36 deletions(-) diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py index a06f6ee0..395f7e3e 100644 --- a/src/app/endpoints/query.py +++ b/src/app/endpoints/query.py @@ -195,20 +195,6 @@ async def query_endpoint_handler( ), ) - if user_conversation is None: - logger.warning( - "User %s attempted to query conversation %s they don't own", - user_id, - query_request.conversation_id, - ) - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, - detail={ - "response": "Access denied", - "cause": "You do not have permission to access this conversation", - }, - ) - try: # try to get Llama Stack client client = AsyncLlamaStackClientHolder().get_client() diff --git a/src/app/endpoints/streaming_query.py b/src/app/endpoints/streaming_query.py index a7fe6788..2a36282f 100644 --- a/src/app/endpoints/streaming_query.py +++ b/src/app/endpoints/streaming_query.py @@ -552,20 +552,6 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals user_id=user_id, conversation_id=query_request.conversation_id ) - if user_conversation is None: - logger.warning( - "User %s attempted to query conversation %s they don't own", - user_id, - query_request.conversation_id, - ) - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, - detail={ - "response": "Access denied", - "cause": "You do not have permission to access this conversation", - }, - ) - try: # try to get Llama Stack client client = AsyncLlamaStackClientHolder().get_client() diff --git a/src/utils/endpoints.py b/src/utils/endpoints.py index 3f201b3a..4523ba19 100644 --- a/src/utils/endpoints.py +++ b/src/utils/endpoints.py @@ -29,22 +29,44 @@ def validate_conversation_ownership( If `others_allowed` is True, it allows conversations that do not belong to the user, which is useful for admin access. - Returns the conversation object if valid. + Returns the conversation object if valid, raises HTTPException if not. """ # Get anonymous user ID for database lookup anonymous_user_id = get_anonymous_user_id(user_id) with get_session() as session: - conversation = session.query(UserConversation) - - filtered_conversation_query = ( - conversation_query.filter_by(id=conversation_id, anonymous_user_id=anonymous_user_id) - if others_allowed - else conversation_query.filter_by(id=conversation_id, anonymous_user_id=anonymous_user_id) - ) + conversation_query = session.query(UserConversation) + + if others_allowed: + # If others_allowed is True, we can access any conversation by ID + filtered_conversation_query = conversation_query.filter_by( + id=conversation_id + ) + else: + # If others_allowed is False, we can only access conversations belonging to this user + filtered_conversation_query = conversation_query.filter_by( + id=conversation_id, anonymous_user_id=anonymous_user_id + ) conversation: UserConversation | None = filtered_conversation_query.first() + if conversation is None: + logger.warning( + "User %s attempted to access conversation %s they don't own", + user_id, + conversation_id, + ) + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail={ + "response": "Forbidden: conversation does not belong to user", + "cause": ( + f"User {user_id} does not have access to " + f"conversation {conversation_id}" + ), + }, + ) + return conversation