diff --git a/.env.example b/.env.example new file mode 100644 index 00000000..c268c75b --- /dev/null +++ b/.env.example @@ -0,0 +1,11 @@ +# Example Environment Variables for Lightspeed Stack +# Copy this file to .env and set appropriate values + +# Required: User anonymization pepper (set to a secure random value) +# This is used for HMAC-based user ID hashing to protect user privacy +USER_ANON_PEPPER=your-secure-random-string-here + +# Optional: OpenAI API Key (if using OpenAI models) +OPENAI_API_KEY=your-openai-api-key + +# Optional: Other environment variables as needed for your configuration \ No newline at end of file diff --git a/README.md b/README.md index f206ea17..59e89968 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,17 @@ Lightspeed Core Stack is based on the FastAPI framework (Uvicorn). The service i - please note that currently Python 3.14 is not officially supported - all sources are made (backward) compatible with Python 3.12; it is checked on CI +## Environment Variables + +The following environment variable is required for the service to start: + +* `USER_ANON_PEPPER` - A secure random string used for user anonymization. Set this to a cryptographically secure random value: + ```bash + export USER_ANON_PEPPER="your-secure-random-string-here" + ``` + + **Security Note**: This value should be treated as a secret and kept secure. It's used for HMAC-based user ID hashing to protect user privacy while enabling usage analytics. + # Installation Installation steps depends on operation system. Please look at instructions for your system: diff --git a/docker-compose.yaml b/docker-compose.yaml index b3624526..5148c746 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -30,6 +30,7 @@ services: - ./lightspeed-stack.yaml:/app-root/lightspeed-stack.yaml:Z environment: - OPENAI_API_KEY=${OPENAI_API_KEY} + - USER_ANON_PEPPER=${USER_ANON_PEPPER:-default-pepper-for-development-only} depends_on: llama-stack: condition: service_healthy diff --git a/src/app/endpoints/conversations.py b/src/app/endpoints/conversations.py index ba844085..f1fe9ddc 100644 --- a/src/app/endpoints/conversations.py +++ b/src/app/endpoints/conversations.py @@ -22,6 +22,7 @@ ) from utils.endpoints import check_configuration_loaded, validate_conversation_ownership from utils.suid import check_suid +from utils.user_anonymization import get_anonymous_user_id logger = logging.getLogger("app.endpoints.handlers") router = APIRouter(tags=["conversations"]) @@ -158,7 +159,13 @@ async def get_conversations_list_endpoint_handler( user_id, _, _ = auth - logger.info("Retrieving conversations for user %s", user_id) + # Get anonymous user ID for database lookup + anonymous_user_id = get_anonymous_user_id(user_id) + + logger.info( + "Retrieving conversations for anonymous user %s", + anonymous_user_id, + ) with get_session() as session: try: @@ -167,7 +174,7 @@ async def get_conversations_list_endpoint_handler( filtered_query = ( query if Action.LIST_OTHERS_CONVERSATIONS in request.state.authorized_actions - else query.filter_by(user_id=user_id) + else query.filter_by(anonymous_user_id=anonymous_user_id) ) user_conversations = filtered_query.all() @@ -190,20 +197,27 @@ async def get_conversations_list_endpoint_handler( ] logger.info( - "Found %d conversations for user %s", len(conversations), user_id + "Found %d conversations for anonymous user %s", + len(conversations), + anonymous_user_id, ) return ConversationsListResponse(conversations=conversations) except Exception as e: logger.exception( - "Error retrieving conversations for user %s: %s", user_id, e + "Error retrieving conversations for anonymous user %s: %s", + anonymous_user_id, + e, ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail={ "response": "Unknown error", - "cause": f"Unknown error while getting conversations for user {user_id}", + "cause": ( + f"Unknown error while getting conversations for " + f"anonymous user {anonymous_user_id}" + ), }, ) from e diff --git a/src/app/endpoints/feedback.py b/src/app/endpoints/feedback.py index 39d2a269..37553785 100644 --- a/src/app/endpoints/feedback.py +++ b/src/app/endpoints/feedback.py @@ -21,6 +21,7 @@ ForbiddenResponse, ) from utils.suid import get_suid +from utils.user_anonymization import get_anonymous_user_id logger = logging.getLogger(__name__) router = APIRouter(prefix="/feedback", tags=["feedback"]) @@ -134,7 +135,8 @@ def store_feedback(user_id: str, feedback: dict) -> None: user_id (str): Unique identifier of the user submitting feedback. feedback (dict): Feedback data to be stored, merged with user ID and timestamp. """ - logger.debug("Storing feedback for user %s", user_id) + anonymous_user_id = get_anonymous_user_id(user_id) + logger.debug("Storing feedback for anonymous user %s", anonymous_user_id) # Creates storage path only if it doesn't exist. The `exist_ok=True` prevents # race conditions in case of multiple server instances trying to set up storage # at the same location. @@ -144,7 +146,11 @@ def store_feedback(user_id: str, feedback: dict) -> None: storage_path.mkdir(parents=True, exist_ok=True) current_time = str(datetime.now(UTC)) - data_to_store = {"user_id": user_id, "timestamp": current_time, **feedback} + data_to_store = { + "anonymous_user_id": anonymous_user_id, + "timestamp": current_time, + **feedback, + } # stores feedback in a file under unique uuid feedback_file_path = storage_path / f"{get_suid()}.json" diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py index 7c5343b7..395f7e3e 100644 --- a/src/app/endpoints/query.py +++ b/src/app/endpoints/query.py @@ -36,6 +36,7 @@ get_system_prompt, validate_conversation_ownership, ) +from utils.user_anonymization import get_anonymous_user_id from utils.mcp_headers import mcp_headers_dependency, handle_mcp_headers_with_toolgroups from utils.transcripts import store_transcript from utils.types import TurnSummary @@ -78,25 +79,30 @@ def is_transcripts_enabled() -> bool: def persist_user_conversation_details( user_id: str, conversation_id: str, model: str, provider_id: str ) -> None: - """Associate conversation to user in the database.""" + """Associate conversation to user in the database using anonymous user ID.""" + # Get anonymous user ID for database storage + anonymous_user_id = get_anonymous_user_id(user_id) + with get_session() as session: existing_conversation = ( session.query(UserConversation) - .filter_by(id=conversation_id, user_id=user_id) + .filter_by(id=conversation_id, anonymous_user_id=anonymous_user_id) .first() ) if not existing_conversation: conversation = UserConversation( id=conversation_id, - user_id=user_id, + anonymous_user_id=anonymous_user_id, last_used_model=model, last_used_provider=provider_id, message_count=1, ) session.add(conversation) logger.debug( - "Associated conversation %s to user %s", conversation_id, user_id + "Associated conversation %s to anonymous user %s", + conversation_id, + anonymous_user_id, ) else: existing_conversation.last_used_model = model @@ -189,20 +195,6 @@ async def query_endpoint_handler( ), ) - if user_conversation is None: - logger.warning( - "User %s attempted to query conversation %s they don't own", - user_id, - query_request.conversation_id, - ) - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, - detail={ - "response": "Access denied", - "cause": "You do not have permission to access this conversation", - }, - ) - try: # try to get Llama Stack client client = AsyncLlamaStackClientHolder().get_client() diff --git a/src/app/endpoints/streaming_query.py b/src/app/endpoints/streaming_query.py index a7fe6788..2a36282f 100644 --- a/src/app/endpoints/streaming_query.py +++ b/src/app/endpoints/streaming_query.py @@ -552,20 +552,6 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals user_id=user_id, conversation_id=query_request.conversation_id ) - if user_conversation is None: - logger.warning( - "User %s attempted to query conversation %s they don't own", - user_id, - query_request.conversation_id, - ) - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, - detail={ - "response": "Access denied", - "cause": "You do not have permission to access this conversation", - }, - ) - try: # try to get Llama Stack client client = AsyncLlamaStackClientHolder().get_client() diff --git a/src/models/database/conversations.py b/src/models/database/conversations.py index 1cce8a64..73d65ac4 100644 --- a/src/models/database/conversations.py +++ b/src/models/database/conversations.py @@ -3,7 +3,7 @@ from datetime import datetime from sqlalchemy.orm import Mapped, mapped_column -from sqlalchemy import DateTime, func +from sqlalchemy import DateTime, func, ForeignKey from models.database.base import Base @@ -16,8 +16,10 @@ class UserConversation(Base): # pylint: disable=too-few-public-methods # The conversation ID id: Mapped[str] = mapped_column(primary_key=True) - # The user ID associated with the conversation - user_id: Mapped[str] = mapped_column(index=True) + # The anonymous user ID associated with the conversation + anonymous_user_id: Mapped[str] = mapped_column( + ForeignKey("user_mapping.anonymous_id"), index=True, nullable=False + ) # The last provider/model used in the conversation last_used_model: Mapped[str] = mapped_column() diff --git a/src/models/database/user_mapping.py b/src/models/database/user_mapping.py new file mode 100644 index 00000000..b01f2408 --- /dev/null +++ b/src/models/database/user_mapping.py @@ -0,0 +1,32 @@ +"""User ID anonymization mapping model.""" + +from datetime import datetime + +from sqlalchemy.orm import Mapped, mapped_column +from sqlalchemy import DateTime, func, Index, String + +from models.database.base import Base + + +class UserMapping(Base): # pylint: disable=too-few-public-methods + """Model for mapping real user IDs to anonymous UUIDs.""" + + __tablename__ = "user_mapping" + + # Anonymous UUID used for all storage/analytics (primary key) + anonymous_id: Mapped[str] = mapped_column( + String(36), primary_key=True, nullable=False + ) + + # Original user ID from authentication (hashed for security) + user_id_hash: Mapped[str] = mapped_column( + String(64), index=True, unique=True, nullable=False + ) + + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), + server_default=func.now(), # pylint: disable=not-callable + ) + + # Index for efficient lookups + __table_args__ = (Index("ix_user_mapping_hash_lookup", "user_id_hash"),) diff --git a/src/utils/endpoints.py b/src/utils/endpoints.py index 34c0147d..4523ba19 100644 --- a/src/utils/endpoints.py +++ b/src/utils/endpoints.py @@ -13,6 +13,7 @@ from configuration import AppConfig from utils.suid import get_suid from utils.types import GraniteToolParser +from utils.user_anonymization import get_anonymous_user_id logger = logging.getLogger("utils.endpoints") @@ -21,23 +22,51 @@ def validate_conversation_ownership( user_id: str, conversation_id: str, others_allowed: bool = False ) -> UserConversation | None: - """Validate that the conversation belongs to the user. + """ + Validate that the conversation belongs to the user using anonymous ID lookup. Validates that the conversation with the given ID belongs to the user with the given ID. If `others_allowed` is True, it allows conversations that do not belong to the user, which is useful for admin access. + + Returns the conversation object if valid, raises HTTPException if not. """ + # Get anonymous user ID for database lookup + anonymous_user_id = get_anonymous_user_id(user_id) + with get_session() as session: conversation_query = session.query(UserConversation) - filtered_conversation_query = ( - conversation_query.filter_by(id=conversation_id) - if others_allowed - else conversation_query.filter_by(id=conversation_id, user_id=user_id) - ) + if others_allowed: + # If others_allowed is True, we can access any conversation by ID + filtered_conversation_query = conversation_query.filter_by( + id=conversation_id + ) + else: + # If others_allowed is False, we can only access conversations belonging to this user + filtered_conversation_query = conversation_query.filter_by( + id=conversation_id, anonymous_user_id=anonymous_user_id + ) conversation: UserConversation | None = filtered_conversation_query.first() + if conversation is None: + logger.warning( + "User %s attempted to access conversation %s they don't own", + user_id, + conversation_id, + ) + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail={ + "response": "Forbidden: conversation does not belong to user", + "cause": ( + f"User {user_id} does not have access to " + f"conversation {conversation_id}" + ), + }, + ) + return conversation diff --git a/src/utils/transcripts.py b/src/utils/transcripts.py index e29d4319..dd28e51c 100644 --- a/src/utils/transcripts.py +++ b/src/utils/transcripts.py @@ -14,15 +14,16 @@ from models.requests import Attachment, QueryRequest from utils.suid import get_suid from utils.types import TurnSummary +from utils.user_anonymization import get_anonymous_user_id logger = logging.getLogger("utils.transcripts") -def construct_transcripts_path(user_id: str, conversation_id: str) -> Path: - """Construct path to transcripts.""" +def construct_transcripts_path(anonymous_user_id: str, conversation_id: str) -> Path: + """Construct path to transcripts using anonymous user ID.""" # these two normalizations are required by Snyk as it detects # this Path sanitization pattern - uid = os.path.normpath("/" + user_id).lstrip("/") + uid = os.path.normpath("/" + anonymous_user_id).lstrip("/") cid = os.path.normpath("/" + conversation_id).lstrip("/") file_path = ( configuration.user_data_collection_configuration.transcripts_storage or "" @@ -46,7 +47,7 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional- """Store transcript in the local filesystem. Args: - user_id: The user ID (UUID). + user_id: The original user ID from authentication (will be anonymized). conversation_id: The conversation ID (UUID). query_is_valid: The result of the query validation. query: The query (without attachments). @@ -56,7 +57,14 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional- truncated: The flag indicating if the history was truncated. attachments: The list of `Attachment` objects. """ - transcripts_path = construct_transcripts_path(user_id, conversation_id) + # Get anonymous user ID for storage + anonymous_user_id = get_anonymous_user_id(user_id) + logger.debug( + "Storing transcript for anonymous user %s", + anonymous_user_id, + ) + + transcripts_path = construct_transcripts_path(anonymous_user_id, conversation_id) transcripts_path.mkdir(parents=True, exist_ok=True) data_to_store = { @@ -65,7 +73,7 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional- "model": model_id, "query_provider": query_request.provider, "query_model": query_request.model, - "user_id": user_id, + "anonymous_user_id": anonymous_user_id, # Store anonymous ID only "conversation_id": conversation_id, "timestamp": datetime.now(UTC).isoformat(), }, diff --git a/src/utils/user_anonymization.py b/src/utils/user_anonymization.py new file mode 100644 index 00000000..0e63467d --- /dev/null +++ b/src/utils/user_anonymization.py @@ -0,0 +1,136 @@ +"""User ID anonymization utilities.""" + +import hashlib +import hmac +import logging +import os +from typing import Optional + +from sqlalchemy.exc import IntegrityError + +from models.database.user_mapping import UserMapping +from app.database import get_session +from utils.suid import get_suid + +logger = logging.getLogger("utils.user_anonymization") + +# Load HMAC pepper from environment - fail at startup if missing +try: + _USER_ANON_PEPPER = os.environ["USER_ANON_PEPPER"].encode("utf-8") +except KeyError as e: + raise RuntimeError( + "USER_ANON_PEPPER environment variable is required for user anonymization. " + "Set a secure random value (e.g., 'export USER_ANON_PEPPER=')" + ) from e + + +def _hash_user_id(user_id: str) -> str: + """ + Create a consistent hash of the user ID for mapping purposes. + + Uses HMAC-SHA256 with a server-secret pepper to ensure consistent hashing + while preventing rainbow table attacks and providing cryptographic security. + """ + # Normalize user ID: trim whitespace and lowercase + normalized_user_id = user_id.strip().lower() + + # Use HMAC-SHA256 with secret pepper + return hmac.new( + _USER_ANON_PEPPER, normalized_user_id.encode("utf-8"), hashlib.sha256 + ).hexdigest() + + +def get_anonymous_user_id(auth_user_id: str) -> str: + """ + Get or create an anonymous UUID for a user ID from authentication. + + This function: + 1. Hashes the original user ID for secure storage + 2. Looks up existing anonymous mapping + 3. Creates new anonymous UUID if none exists + 4. Returns the anonymous UUID for use in storage/analytics + + Args: + auth_user_id: The original user ID from authentication + + Returns: + Anonymous UUID string for this user + """ + user_id_hash = _hash_user_id(auth_user_id) + + with get_session() as session: + # Try to find existing mapping + existing_mapping = ( + session.query(UserMapping).filter_by(user_id_hash=user_id_hash).first() + ) + + if existing_mapping: + logger.debug( + "Found existing anonymous ID for user hash %s", user_id_hash[:8] + "..." + ) + return existing_mapping.anonymous_id + + # Create new anonymous mapping + anonymous_id = get_suid() + new_mapping = UserMapping(anonymous_id=anonymous_id, user_id_hash=user_id_hash) + + try: + session.add(new_mapping) + session.commit() + logger.info( + "Created new anonymous ID %s for user hash %s", + anonymous_id, + user_id_hash[:8] + "...", + ) + return anonymous_id + + except IntegrityError as e: + session.rollback() + # Race condition - another thread created the mapping + logger.warning("Race condition creating user mapping: %s", e) + + # Try to fetch the mapping created by the other thread + existing_mapping = ( + session.query(UserMapping).filter_by(user_id_hash=user_id_hash).first() + ) + + if existing_mapping: + return existing_mapping.anonymous_id + + # If we still can't find it, something is wrong + logger.error( + "Failed to create or retrieve user mapping for hash %s", + user_id_hash[:8] + "...", + ) + raise RuntimeError("Unable to create or retrieve anonymous user ID") from e + + +def get_user_count() -> int: + """ + Get the total number of unique users in the system. + + Returns: + Total count of unique anonymous users + """ + with get_session() as session: + return session.query(UserMapping).count() + + +def find_anonymous_user_id(auth_user_id: str) -> Optional[str]: + """ + Find existing anonymous ID for a user without creating a new one. + + Args: + auth_user_id: The original user ID from authentication + + Returns: + Anonymous UUID if found, None otherwise + """ + user_id_hash = _hash_user_id(auth_user_id) + + with get_session() as session: + existing_mapping = ( + session.query(UserMapping).filter_by(user_id_hash=user_id_hash).first() + ) + + return existing_mapping.anonymous_id if existing_mapping else None diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..4be2f7fe --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,7 @@ +"""Global configuration for all tests.""" + +import os + +# Set environment variable for all tests globally +# This ensures the anonymization module can be imported in CI environments +os.environ.setdefault("USER_ANON_PEPPER", "test-pepper-for-all-tests") diff --git a/tests/unit/app/endpoints/test_conversations.py b/tests/unit/app/endpoints/test_conversations.py index f4fcd4c8..0ce8b5a5 100644 --- a/tests/unit/app/endpoints/test_conversations.py +++ b/tests/unit/app/endpoints/test_conversations.py @@ -64,6 +64,10 @@ def create_mock_conversation( def mock_database_session(mocker, query_result=None): """Helper function to mock get_session with proper context manager support.""" + # Mock database initialization + mocker.patch("app.database.engine", mocker.Mock()) + mocker.patch("app.database.SessionLocal", mocker.Mock()) + mock_session = mocker.Mock() if query_result is not None: # Mock both the filtered and unfiltered query paths @@ -613,6 +617,10 @@ async def test_successful_conversations_list_retrieval( """Test successful retrieval of conversations list.""" mock_authorization_resolvers(mocker) mocker.patch("app.endpoints.conversations.configuration", setup_configuration) + mocker.patch( + "app.endpoints.conversations.get_anonymous_user_id", + return_value="anon-test-user", + ) # Mock database session and query results mock_conversations = [ @@ -659,6 +667,10 @@ async def test_empty_conversations_list( """Test when user has no conversations.""" mock_authorization_resolvers(mocker) mocker.patch("app.endpoints.conversations.configuration", setup_configuration) + mocker.patch( + "app.endpoints.conversations.get_anonymous_user_id", + return_value="anon-test-user", + ) # Mock database session with no results mock_database_session(mocker, []) @@ -676,6 +688,10 @@ async def test_database_exception(self, mocker, setup_configuration, dummy_reque """Test when database query raises an exception.""" mock_authorization_resolvers(mocker) mocker.patch("app.endpoints.conversations.configuration", setup_configuration) + mocker.patch( + "app.endpoints.conversations.get_anonymous_user_id", + return_value="anon-test-user", + ) # Mock database session to raise exception mock_session = mock_database_session(mocker) @@ -689,6 +705,6 @@ async def test_database_exception(self, mocker, setup_configuration, dummy_reque assert exc_info.value.status_code == status.HTTP_500_INTERNAL_SERVER_ERROR assert "Unknown error" in exc_info.value.detail["response"] assert ( - "Unknown error while getting conversations for user" + "Unknown error while getting conversations for anonymous user" in exc_info.value.detail["cause"] ) diff --git a/tests/unit/app/endpoints/test_feedback.py b/tests/unit/app/endpoints/test_feedback.py index 4a155ef6..bce5cdf1 100644 --- a/tests/unit/app/endpoints/test_feedback.py +++ b/tests/unit/app/endpoints/test_feedback.py @@ -144,6 +144,9 @@ def test_store_feedback(mocker, feedback_request_data): mocker.patch("builtins.open", mocker.mock_open()) mocker.patch("app.endpoints.feedback.Path", return_value=mocker.MagicMock()) mocker.patch("app.endpoints.feedback.get_suid", return_value="fake-uuid") + mocker.patch( + "app.endpoints.feedback.get_anonymous_user_id", return_value="anon-test-user" + ) # Patch json to inspect stored data mock_json = mocker.patch("app.endpoints.feedback.json") @@ -153,7 +156,7 @@ def test_store_feedback(mocker, feedback_request_data): store_feedback(user_id, feedback_request_data) expected_data = { - "user_id": user_id, + "anonymous_user_id": "anon-test-user", "timestamp": mocker.ANY, **feedback_request_data, } @@ -189,6 +192,9 @@ def test_store_feedback_on_io_error(mocker, feedback_request_data): configuration.user_data_collection_configuration.feedback_storage = "fake-path" mocker.patch("app.endpoints.feedback.Path", return_value=mocker.MagicMock()) mocker.patch("builtins.open", side_effect=PermissionError("EACCES")) + mocker.patch( + "app.endpoints.feedback.get_anonymous_user_id", return_value="anon-test-user" + ) user_id = "test_user_id" diff --git a/tests/unit/app/endpoints/test_feedback_anonymization.py b/tests/unit/app/endpoints/test_feedback_anonymization.py new file mode 100644 index 00000000..26e2b4af --- /dev/null +++ b/tests/unit/app/endpoints/test_feedback_anonymization.py @@ -0,0 +1,138 @@ +"""Tests for feedback endpoint anonymization functionality.""" + +import os +from unittest.mock import patch, MagicMock + +import pytest + +from app.endpoints.feedback import store_feedback + + +# Set up test environment variable before importing the module +@pytest.fixture(autouse=True) +def setup_test_pepper(): + """Set up test pepper environment variable for all tests.""" + test_pepper = "test-pepper-for-feedback-tests" + with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}): + yield + + +class TestFeedbackAnonymization: + """Test feedback storage with user anonymization.""" + + @patch("app.endpoints.feedback.get_anonymous_user_id") + @patch("app.endpoints.feedback.get_suid") + @patch("app.endpoints.feedback.json") + @patch("app.endpoints.feedback.Path") + def test_store_feedback_anonymizes_user_id( + self, mock_path, mock_json, mock_get_suid, mock_get_anonymous + ): + """Test that store_feedback uses anonymous user ID.""" + # Setup mocks + mock_get_anonymous.return_value = "anon-feedback-123" + mock_get_suid.return_value = "feedback-uuid" + mock_path.return_value = MagicMock() + + # Mock configuration + with ( + patch("app.endpoints.feedback.configuration") as mock_config, + patch("builtins.open"), + ): + mock_config.user_data_collection_configuration.feedback_storage = ( + "/tmp/feedback" + ) + + # Call store_feedback + store_feedback( + user_id="original_user@example.com", + feedback={ + "feedback": "This is test feedback", + "sentiment": 1, + "categories": ["helpful"], + }, + ) + + # Verify anonymous user ID was used + mock_get_anonymous.assert_called_once_with("original_user@example.com") + + # Verify stored data uses anonymous ID + stored_data = mock_json.dump.call_args[0][0] + assert stored_data["anonymous_user_id"] == "anon-feedback-123" + assert "user_id" not in stored_data + assert stored_data["feedback"] == "This is test feedback" + assert stored_data["sentiment"] == 1 + assert stored_data["categories"] == ["helpful"] + + @patch("app.endpoints.feedback.get_anonymous_user_id") + def test_store_feedback_different_users_get_different_anonymous_ids( + self, mock_get_anonymous + ): + """Test that different users get different anonymous IDs for feedback.""" + + def mock_anonymous_side_effect(user_id): + if user_id == "user1@example.com": + return "anon-feedback-user1" + if user_id == "user2@example.com": + return "anon-feedback-user2" + return "anon-unknown" + + mock_get_anonymous.side_effect = mock_anonymous_side_effect + + with ( + patch("app.endpoints.feedback.json") as mock_json, + patch("app.endpoints.feedback.Path") as mock_path, + patch("builtins.open"), + patch("app.endpoints.feedback.get_suid", return_value="uuid"), + patch("app.endpoints.feedback.configuration") as mock_config, + ): + + mock_config.user_data_collection_configuration.feedback_storage = ( + "/tmp/feedback" + ) + mock_path.return_value = MagicMock() + + # Store feedback for user 1 + store_feedback("user1@example.com", {"feedback": "Test 1"}) + first_call_data = mock_json.dump.call_args[0][0] + + # Reset mock for second call + mock_json.reset_mock() + + # Store feedback for user 2 + store_feedback("user2@example.com", {"feedback": "Test 2"}) + second_call_data = mock_json.dump.call_args[0][0] + + # Verify different anonymous IDs were used + assert first_call_data["anonymous_user_id"] == "anon-feedback-user1" + assert second_call_data["anonymous_user_id"] == "anon-feedback-user2" + assert ( + first_call_data["anonymous_user_id"] + != second_call_data["anonymous_user_id"] + ) + + @patch("app.endpoints.feedback.get_anonymous_user_id") + @patch("app.endpoints.feedback.logger") + def test_feedback_logging_uses_anonymous_id(self, mock_logger, mock_get_anonymous): + """Test that feedback logging uses anonymous user ID.""" + mock_get_anonymous.return_value = "anon-feedback-logging" + + with ( + patch("app.endpoints.feedback.json"), + patch("app.endpoints.feedback.Path") as mock_path, + patch("builtins.open"), + patch("app.endpoints.feedback.get_suid", return_value="uuid"), + patch("app.endpoints.feedback.configuration") as mock_config, + ): + + mock_config.user_data_collection_configuration.feedback_storage = ( + "/tmp/feedback" + ) + mock_path.return_value = MagicMock() + + # Store feedback + store_feedback("user@example.com", {"feedback": "Test feedback"}) + + # Verify logging uses anonymous ID + mock_logger.debug.assert_called_once_with( + "Storing feedback for anonymous user %s", "anon-feedback-logging" + ) diff --git a/tests/unit/app/endpoints/test_query.py b/tests/unit/app/endpoints/test_query.py index b12101b4..f5c1b8d9 100644 --- a/tests/unit/app/endpoints/test_query.py +++ b/tests/unit/app/endpoints/test_query.py @@ -200,9 +200,9 @@ async def _test_query_endpoint_handler( query=query, query_request=query_request, summary=summary, - attachments=[], rag_chunks=[], truncated=False, + attachments=[], ) else: mock_transcript.assert_not_called() @@ -1445,7 +1445,7 @@ def test_no_tools_parameter_backward_compatibility(): ( UserConversation( id="conv1", - user_id="user1", + anonymous_user_id="user1", last_used_provider="foo", last_used_model="bar", message_count=1, @@ -1464,7 +1464,7 @@ def test_no_tools_parameter_backward_compatibility(): ( UserConversation( id="conv1", - user_id="user1", + anonymous_user_id="user1", last_used_provider="foo", last_used_model="bar", message_count=1, diff --git a/tests/unit/app/endpoints/test_query_anonymization.py b/tests/unit/app/endpoints/test_query_anonymization.py new file mode 100644 index 00000000..808b807f --- /dev/null +++ b/tests/unit/app/endpoints/test_query_anonymization.py @@ -0,0 +1,210 @@ +"""Tests for query endpoint anonymization functionality.""" + +import logging +import os +from unittest.mock import patch, MagicMock + +import pytest +from sqlalchemy import inspect + +from app.endpoints.query import persist_user_conversation_details +from models.database.conversations import UserConversation + + +# Set up test environment variable before importing the module +@pytest.fixture(autouse=True) +def setup_test_pepper(): + """Set up test pepper environment variable for all tests.""" + test_pepper = "test-pepper-for-query-tests" + with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}): + yield + + +class TestQueryAnonymization: + """Test query endpoint user anonymization.""" + + @patch("app.endpoints.query.get_anonymous_user_id") + @patch("app.endpoints.query.get_session") + def test_persist_user_conversation_details_new_conversation( + self, mock_get_session, mock_get_anonymous + ): + """Test persisting new conversation uses anonymous user ID.""" + # Setup mocks + mock_get_anonymous.return_value = "anon-new-conv-123" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = None + + # Call function + persist_user_conversation_details( + user_id="new_user@example.com", + conversation_id="new-conv-456", + model="test-model", + provider_id="test-provider", + ) + + # Verify anonymous user ID was obtained + mock_get_anonymous.assert_called_once_with("new_user@example.com") + + # Verify database query used anonymous ID + mock_session.query.assert_called_once_with(UserConversation) + filter_call = mock_session.query.return_value.filter_by.call_args + assert filter_call[1]["id"] == "new-conv-456" + assert filter_call[1]["anonymous_user_id"] == "anon-new-conv-123" + + # Verify new conversation was created with anonymous ID + mock_session.add.assert_called_once() + added_conversation = mock_session.add.call_args[0][0] + assert isinstance(added_conversation, UserConversation) + assert added_conversation.id == "new-conv-456" + assert added_conversation.anonymous_user_id == "anon-new-conv-123" + assert added_conversation.last_used_model == "test-model" + assert added_conversation.last_used_provider == "test-provider" + assert added_conversation.message_count == 1 + + mock_session.commit.assert_called_once() + + @patch("app.endpoints.query.get_anonymous_user_id") + @patch("app.endpoints.query.get_session") + def test_persist_user_conversation_details_existing_conversation( + self, mock_get_session, mock_get_anonymous + ): + """Test updating existing conversation uses anonymous user ID.""" + # Setup existing conversation + existing_conv = MagicMock(spec=UserConversation) + existing_conv.last_used_model = "old-model" + existing_conv.last_used_provider = "old-provider" + existing_conv.message_count = 5 + + mock_get_anonymous.return_value = "anon-existing-789" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = ( + existing_conv + ) + + # Call function + persist_user_conversation_details( + user_id="existing_user@example.com", + conversation_id="existing-conv-123", + model="new-model", + provider_id="new-provider", + ) + + # Verify anonymous user ID was used for lookup + mock_get_anonymous.assert_called_once_with("existing_user@example.com") + filter_call = mock_session.query.return_value.filter_by.call_args + assert filter_call[1]["anonymous_user_id"] == "anon-existing-789" + + # Verify existing conversation was updated + assert existing_conv.last_used_model == "new-model" + assert existing_conv.last_used_provider == "new-provider" + assert existing_conv.message_count == 6 # Incremented + + # Verify no new conversation was added + mock_session.add.assert_not_called() + mock_session.commit.assert_called_once() + + @patch("app.endpoints.query.get_anonymous_user_id") + def test_persist_user_conversation_logs_anonymization( + self, mock_get_anonymous, caplog + ): + """Test that conversation persistence logs anonymization.""" + + mock_get_anonymous.return_value = "anon-logging-456" + + with caplog.at_level(logging.DEBUG): + with patch("app.endpoints.query.get_session") as mock_get_session: + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = ( + None + ) + + persist_user_conversation_details( + user_id="logging_test_user@example.com", + conversation_id="logging-conv-789", + model="logging-model", + provider_id="logging-provider", + ) + + # Check that anonymization is logged + log_messages = [record.message for record in caplog.records] + anonymization_logs = [ + msg + for msg in log_messages + if "Associated conversation" in msg and "anonymous user" in msg + ] + assert len(anonymization_logs) > 0 + + # Verify log contains anonymous user and conversation ID (but no original user ID) + log_msg = anonymization_logs[0] + assert "anon-logging-456" in log_msg + assert "logging-conv-789" in log_msg + # Should NOT contain any reference to original user ID + assert "logging_test_user" not in log_msg + assert "logging_..." not in log_msg + + def test_conversation_model_uses_anonymous_field(self): + """Test that UserConversation model uses anonymous_user_id field.""" + # Verify the model has the correct field + conversation = UserConversation() + assert hasattr(conversation, "anonymous_user_id") + + # Verify we can set the anonymous user ID + conversation.anonymous_user_id = "test-anon-uuid" + assert conversation.anonymous_user_id == "test-anon-uuid" + + # Verify the field is mapped correctly (this tests the database column) + mapper = inspect(UserConversation) + assert "anonymous_user_id" in mapper.columns + + @patch("app.endpoints.query.get_anonymous_user_id") + @patch("app.endpoints.query.get_session") + def test_persist_user_conversation_different_users_same_conversation_id( + self, mock_get_session, mock_get_anonymous + ): + """Test that different users can't access each other's conversations.""" + + # Simulate two different users with same conversation ID (edge case) + def mock_anonymous_side_effect(user_id): + if user_id == "user1@example.com": + return "anon-user1-123" + if user_id == "user2@example.com": + return "anon-user2-456" + return "anon-unknown" + + mock_get_anonymous.side_effect = mock_anonymous_side_effect + + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = None + + # User 1 creates conversation + persist_user_conversation_details( + user_id="user1@example.com", + conversation_id="shared-conv-id", + model="model1", + provider_id="provider1", + ) + + # Verify first user's anonymous ID was used + first_call = mock_session.query.return_value.filter_by.call_args_list[0] + assert first_call[1]["anonymous_user_id"] == "anon-user1-123" + + # Reset mock for second call + mock_session.reset_mock() + mock_session.query.return_value.filter_by.return_value.first.return_value = None + + # User 2 tries to create conversation with same ID + persist_user_conversation_details( + user_id="user2@example.com", + conversation_id="shared-conv-id", # Same conversation ID + model="model2", + provider_id="provider2", + ) + + # Verify second user's anonymous ID was used (different from first) + second_call = mock_session.query.return_value.filter_by.call_args_list[0] + assert second_call[1]["anonymous_user_id"] == "anon-user2-456" + assert second_call[1]["anonymous_user_id"] != "anon-user1-123" diff --git a/tests/unit/utils/conftest.py b/tests/unit/utils/conftest.py new file mode 100644 index 00000000..d39526e9 --- /dev/null +++ b/tests/unit/utils/conftest.py @@ -0,0 +1,6 @@ +"""Configuration for utils unit tests.""" + +import os + +# Set environment variable for all tests in this directory +os.environ.setdefault("USER_ANON_PEPPER", "test-pepper-for-utils-tests") diff --git a/tests/unit/utils/test_endpoints_anonymization.py b/tests/unit/utils/test_endpoints_anonymization.py new file mode 100644 index 00000000..dd5b23c1 --- /dev/null +++ b/tests/unit/utils/test_endpoints_anonymization.py @@ -0,0 +1,189 @@ +"""Tests for endpoint utilities anonymization functionality.""" + +import os +from unittest.mock import patch, MagicMock + +import pytest +from fastapi import HTTPException + +from models.database.conversations import UserConversation +from utils.endpoints import validate_conversation_ownership + + +# Set up test environment variable before importing the module +@pytest.fixture(autouse=True) +def setup_test_pepper(): + """Set up test pepper environment variable for all tests.""" + test_pepper = "test-pepper-for-endpoints-tests" + with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}): + yield + + +class TestEndpointsAnonymization: + """Test endpoint utilities user anonymization.""" + + @patch("utils.endpoints.get_anonymous_user_id") + @patch("utils.endpoints.get_session") + def test_validate_conversation_ownership_uses_anonymous_id( + self, mock_get_session, mock_get_anonymous + ): + """Test that conversation ownership validation uses anonymous user ID.""" + # Setup mocks + conversation = MagicMock(spec=UserConversation) + conversation.id = "test-conv-123" + conversation.anonymous_user_id = "anon-owner-456" + + mock_get_anonymous.return_value = "anon-owner-456" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = ( + conversation + ) + + # Call validation + result = validate_conversation_ownership( + user_id="owner@example.com", conversation_id="test-conv-123" + ) + + # Verify anonymous user ID was obtained + mock_get_anonymous.assert_called_once_with("owner@example.com") + + # Verify database query used anonymous ID + mock_session.query.assert_called_once_with(UserConversation) + filter_call = mock_session.query.return_value.filter_by.call_args + assert filter_call[1]["id"] == "test-conv-123" + assert filter_call[1]["anonymous_user_id"] == "anon-owner-456" + + # Verify correct conversation returned + assert result == conversation + + @patch("utils.endpoints.get_anonymous_user_id") + @patch("utils.endpoints.get_session") + def test_validate_conversation_ownership_not_owned( + self, mock_get_session, mock_get_anonymous + ): + """Test validation raises 403 when user doesn't own conversation.""" + mock_get_anonymous.return_value = "anon-not-owner-789" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = None + + # Call validation should raise HTTPException + with pytest.raises(HTTPException) as exc_info: + validate_conversation_ownership( + user_id="notowner@example.com", conversation_id="not-owned-conv" + ) + + # Verify anonymous user ID was used for lookup + mock_get_anonymous.assert_called_once_with("notowner@example.com") + filter_call = mock_session.query.return_value.filter_by.call_args + assert filter_call[1]["anonymous_user_id"] == "anon-not-owner-789" + + # Verify 403 error raised for non-owned conversation + assert exc_info.value.status_code == 403 + assert "Forbidden: conversation does not belong to user" in str( + exc_info.value.detail + ) + + @patch("utils.endpoints.get_anonymous_user_id") + @patch("utils.endpoints.get_session") + def test_validate_conversation_ownership_different_users( + self, mock_get_session, mock_get_anonymous + ): + """Test that different users get different anonymous IDs for validation.""" + + def mock_anonymous_side_effect(user_id): + if user_id == "user1@example.com": + return "anon-user1-validation" + if user_id == "user2@example.com": + return "anon-user2-validation" + return "anon-unknown" + + mock_get_anonymous.side_effect = mock_anonymous_side_effect + + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = None + + # Validate for user 1 - should raise 403 + with pytest.raises(HTTPException) as exc_info1: + validate_conversation_ownership( + user_id="user1@example.com", conversation_id="shared-conv-test" + ) + + first_call = mock_session.query.return_value.filter_by.call_args + assert first_call[1]["anonymous_user_id"] == "anon-user1-validation" + + # Reset mock for second call + mock_session.reset_mock() + mock_session.query.return_value.filter_by.return_value.first.return_value = None + + # Validate for user 2 - should also raise 403 + with pytest.raises(HTTPException) as exc_info2: + validate_conversation_ownership( + user_id="user2@example.com", + conversation_id="shared-conv-test", # Same conversation ID + ) + + second_call = mock_session.query.return_value.filter_by.call_args + assert second_call[1]["anonymous_user_id"] == "anon-user2-validation" + + # Both should raise 403 since neither owns the conversation + assert exc_info1.value.status_code == 403 + assert exc_info2.value.status_code == 403 + + # But they used different anonymous IDs (checking the calls, not constant comparison) + assert first_call[1]["anonymous_user_id"] != second_call[1]["anonymous_user_id"] + + @patch("utils.endpoints.get_anonymous_user_id") + @patch("utils.endpoints.get_session") + def test_validate_conversation_ownership_preserves_conversation_data( + self, mock_get_session, mock_get_anonymous + ): + """Test that validation preserves all conversation data.""" + # Create conversation with full data + conversation = UserConversation() + conversation.id = "full-data-conv" + conversation.anonymous_user_id = "anon-full-data" + conversation.last_used_model = "test-model" + conversation.last_used_provider = "test-provider" + conversation.message_count = 42 + + mock_get_anonymous.return_value = "anon-full-data" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = ( + conversation + ) + + # Validate ownership + result = validate_conversation_ownership( + user_id="datauser@example.com", conversation_id="full-data-conv" + ) + + # Verify all conversation data is preserved + assert result.id == "full-data-conv" + assert result.anonymous_user_id == "anon-full-data" + assert result.last_used_model == "test-model" + assert result.last_used_provider == "test-provider" + assert result.message_count == 42 + + @patch("utils.endpoints.get_anonymous_user_id") + def test_validate_conversation_ownership_error_handling(self, mock_get_anonymous): + """Test that validation handles errors gracefully.""" + mock_get_anonymous.return_value = "anon-error-test" + + # Simulate database error + with patch("utils.endpoints.get_session") as mock_get_session: + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.side_effect = Exception("Database error") + + # Should propagate the exception + with pytest.raises(Exception, match="Database error"): + validate_conversation_ownership( + user_id="erroruser@example.com", conversation_id="error-conv" + ) + + # Verify anonymous ID was still obtained before error + mock_get_anonymous.assert_called_once_with("erroruser@example.com") diff --git a/tests/unit/utils/test_transcripts.py b/tests/unit/utils/test_transcripts.py index b30b430a..4e2eaeb9 100644 --- a/tests/unit/utils/test_transcripts.py +++ b/tests/unit/utils/test_transcripts.py @@ -50,11 +50,18 @@ def test_construct_transcripts_path(mocker): def test_store_transcript(mocker): """Test the store_transcript function.""" + # Mock database initialization to prevent the error + mocker.patch("app.database.engine", mocker.Mock()) + mocker.patch("app.database.SessionLocal", mocker.Mock()) + mocker.patch("builtins.open", mocker.mock_open()) mocker.patch( "utils.transcripts.construct_transcripts_path", return_value=mocker.MagicMock(), ) + mocker.patch( + "utils.transcripts.get_anonymous_user_id", return_value="anon-user-123" + ) # Mock the JSON to assert the data is stored correctly mock_json = mocker.patch("utils.transcripts.json") @@ -104,7 +111,7 @@ def test_store_transcript(mocker): "model": "fake-model", "query_provider": query_request.provider, "query_model": query_request.model, - "user_id": user_id, + "anonymous_user_id": "anon-user-123", "conversation_id": conversation_id, "timestamp": mocker.ANY, }, diff --git a/tests/unit/utils/test_transcripts_anonymization.py b/tests/unit/utils/test_transcripts_anonymization.py new file mode 100644 index 00000000..c564eea0 --- /dev/null +++ b/tests/unit/utils/test_transcripts_anonymization.py @@ -0,0 +1,295 @@ +"""Tests for transcript anonymization functionality.""" + +import json +import os +import tempfile +from pathlib import Path +from unittest.mock import patch + +import pytest + +from models.requests import QueryRequest, Attachment +from utils.transcripts import store_transcript, construct_transcripts_path +from utils.types import TurnSummary + + +# Set up test environment variable before importing the module +@pytest.fixture(autouse=True) +def setup_test_pepper(): + """Set up test pepper environment variable for all tests.""" + test_pepper = "test-pepper-for-transcript-tests" + with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}): + yield + + +class TestTranscriptAnonymization: + """Test transcript storage with user anonymization.""" + + @patch("utils.transcripts.get_anonymous_user_id") + @patch("utils.transcripts.get_suid") + @patch("utils.transcripts.configuration") + def test_store_transcript_anonymizes_user_id( + self, mock_config, mock_get_suid, mock_get_anonymous + ): + """Test that store_transcript uses anonymous user ID.""" + # Setup mocks + mock_get_anonymous.return_value = "anon-uuid-123" + mock_get_suid.return_value = "transcript-uuid" + + with tempfile.TemporaryDirectory() as temp_dir: + mock_config.user_data_collection_configuration.transcripts_storage = ( + temp_dir + ) + + # Create test data + query_request = QueryRequest( + query="Test query", model="test-model", provider="test-provider" + ) + + summary = TurnSummary(llm_response="Test response", tool_calls=[]) + + # Call store_transcript + store_transcript( + user_id="original_user@example.com", + conversation_id="conv-123", + model_id="test-model", + provider_id="test-provider", + query_is_valid=True, + query="Test query", + query_request=query_request, + summary=summary, + rag_chunks=[], + truncated=False, + attachments=[], + ) + + # Verify anonymous user ID was used + mock_get_anonymous.assert_called_once_with("original_user@example.com") + + # Verify file was created with anonymous path + expected_path = ( + Path(temp_dir) / "anon-uuid-123" / "conv-123" / "transcript-uuid.json" + ) + assert expected_path.exists() + + # Verify file content contains anonymous ID, not original + with open(expected_path, "r", encoding="utf-8") as f: + data = json.load(f) + + assert data["metadata"]["anonymous_user_id"] == "anon-uuid-123" + assert "user_id" not in data["metadata"] + assert data["metadata"]["conversation_id"] == "conv-123" + + def test_construct_transcripts_path_with_anonymous_id(self): + """Test that transcript path construction uses anonymous ID.""" + # This test verifies the path construction function directly + result = construct_transcripts_path("anon-uuid-456", "conv-789") + + # Path should contain the anonymous ID, not original user ID + assert "anon-uuid-456" in str(result) + assert "conv-789" in str(result) + + @patch("utils.transcripts.get_anonymous_user_id") + @patch("utils.transcripts.get_suid") + @patch("utils.transcripts.configuration") + def test_store_transcript_preserves_other_metadata( + self, mock_config, mock_get_suid, mock_get_anonymous + ): + """Test that anonymization preserves all other metadata.""" + mock_get_anonymous.return_value = "anon-preserved-test" + mock_get_suid.return_value = "metadata-test-uuid" + + with tempfile.TemporaryDirectory() as temp_dir: + mock_config.user_data_collection_configuration.transcripts_storage = ( + temp_dir + ) + + query_request = QueryRequest( + query="Metadata test", + model="metadata-model", + provider="metadata-provider", + ) + + summary = TurnSummary(llm_response="Metadata response", tool_calls=[]) + + # Store transcript with rich metadata + store_transcript( + user_id="metadata_user@example.com", + conversation_id="metadata-conv", + model_id="actual-model", + provider_id="actual-provider", + query_is_valid=True, + query="Metadata test query", + query_request=query_request, + summary=summary, + rag_chunks=["chunk1", "chunk2"], + truncated=True, + attachments=[], + ) + + # Read and verify all metadata is preserved + transcript_file = ( + Path(temp_dir) + / "anon-preserved-test" + / "metadata-conv" + / "metadata-test-uuid.json" + ) + with open(transcript_file, "r", encoding="utf-8") as f: + data = json.load(f) + + metadata = data["metadata"] + assert metadata["provider"] == "actual-provider" + assert metadata["model"] == "actual-model" + assert metadata["query_provider"] == "metadata-provider" + assert metadata["query_model"] == "metadata-model" + assert metadata["conversation_id"] == "metadata-conv" + assert "timestamp" in metadata + + # Verify other data preserved + assert data["redacted_query"] == "Metadata test query" + assert data["query_is_valid"] is True + assert data["llm_response"] == "Metadata response" + assert data["rag_chunks"] == ["chunk1", "chunk2"] + assert data["truncated"] is True + + @patch("utils.transcripts.get_anonymous_user_id") + @patch("utils.transcripts.get_suid") + @patch("utils.transcripts.configuration") + def test_store_transcript_with_attachments( + self, mock_config, mock_get_suid, mock_get_anonymous + ): + """Test transcript storage with attachments preserves anonymization.""" + mock_get_anonymous.return_value = "anon-attachments-test" + mock_get_suid.return_value = "attachments-uuid" + + with tempfile.TemporaryDirectory() as temp_dir: + mock_config.user_data_collection_configuration.transcripts_storage = ( + temp_dir + ) + + # Create attachment + attachment = Attachment( + attachment_type="text", + content_type="text/plain", + content="Test attachment content", + ) + + query_request = QueryRequest( + query="Query with attachment", + model="attachment-model", + provider="attachment-provider", + ) + + summary = TurnSummary( + llm_response="Response with attachment", tool_calls=[] + ) + + store_transcript( + user_id="attachment_user@example.com", + conversation_id="attachment-conv", + model_id="attachment-actual-model", + provider_id="attachment-actual-provider", + query_is_valid=True, + query="Query with attachment", + query_request=query_request, + summary=summary, + rag_chunks=[], + truncated=False, + attachments=[attachment], + ) + + # Verify attachment data preserved with anonymization + transcript_file = ( + Path(temp_dir) + / "anon-attachments-test" + / "attachment-conv" + / "attachments-uuid.json" + ) + with open(transcript_file, "r", encoding="utf-8") as f: + data = json.load(f) + + assert data["metadata"]["anonymous_user_id"] == "anon-attachments-test" + assert len(data["attachments"]) == 1 + assert data["attachments"][0]["attachment_type"] == "text" + assert data["attachments"][0]["content"] == "Test attachment content" + + @patch("utils.transcripts.configuration") + def test_path_sanitization_with_anonymous_ids(self, mock_config): + """Test that path sanitization works correctly with anonymous UUIDs.""" + # Setup mock configuration + mock_config.user_data_collection_configuration.transcripts_storage = ( + "/tmp/transcripts" + ) + + # Test with various UUID formats and potential path injection + test_cases = [ + ("anon-uuid-123", "conv-456"), + ("../anon-malicious", "conv-normal"), # Path traversal attempt + ("anon/with/slashes", "conv-789"), # Embedded slashes + ("anon-normal", "../conv-malicious"), # Conversation ID traversal attempt + ] + + for anon_id, conv_id in test_cases: + result = construct_transcripts_path(anon_id, conv_id) + result_str = str(result) + + # Should not contain path traversal sequences + assert "../" not in result_str + # Paths should be absolute (start with /) since we use /tmp/transcripts as base + assert result_str.startswith("/tmp/transcripts/") + + @patch("utils.transcripts.get_anonymous_user_id") + def test_logging_shows_anonymization(self, mock_get_anonymous, caplog): + """Test that logging shows anonymization is happening.""" + import logging # pylint: disable=import-outside-toplevel + + mock_get_anonymous.return_value = "anon-logging-test" + + with caplog.at_level(logging.DEBUG): + # Create minimal test setup + with tempfile.TemporaryDirectory() as temp_dir: + with patch("utils.transcripts.configuration") as mock_config: + mock_config.user_data_collection_configuration.transcripts_storage = ( + temp_dir + ) + + with patch( + "utils.transcripts.get_suid", return_value="log-test-uuid" + ): + query_request = QueryRequest( + query="Log test", model="log-model", provider="log-provider" + ) + + summary = TurnSummary( + llm_response="Log response", tool_calls=[] + ) + + store_transcript( + user_id="log_test_user@example.com", + conversation_id="log-conv", + model_id="log-model", + provider_id="log-provider", + query_is_valid=True, + query="Log test query", + query_request=query_request, + summary=summary, + rag_chunks=[], + truncated=False, + attachments=[], + ) + + # Check that transcript storage is logged + log_messages = [record.message for record in caplog.records] + storage_logs = [ + msg + for msg in log_messages + if "Storing transcript for anonymous user" in msg + ] + assert len(storage_logs) > 0 + + # Verify the log shows only anonymous user ID (no original user ID) + storage_log = storage_logs[0] + assert "anon-logging-test" in storage_log + # Should NOT contain any reference to original user ID + assert "log_test_user" not in storage_log + assert "log_test..." not in storage_log diff --git a/tests/unit/utils/test_user_anonymization.py b/tests/unit/utils/test_user_anonymization.py new file mode 100644 index 00000000..a0a559ea --- /dev/null +++ b/tests/unit/utils/test_user_anonymization.py @@ -0,0 +1,292 @@ +"""Tests for user anonymization utilities.""" + +import hashlib +import hmac +import importlib +import os +from unittest.mock import patch, MagicMock + +import pytest +from sqlalchemy.exc import IntegrityError + +from models.database.user_mapping import UserMapping + +# Functions under test are accessed via a module fixture (see `ua` below) + + +# Set up test environment variable for each test +@pytest.fixture(autouse=True) +def setup_test_pepper(): + """Set up test pepper environment variable for all tests.""" + test_pepper = "test-pepper-for-anonymization-tests" + with patch.dict(os.environ, {"USER_ANON_PEPPER": test_pepper}): + # Reload the module to pick up the new environment variable + import utils.user_anonymization # pylint: disable=import-outside-toplevel + + importlib.reload(utils.user_anonymization) + yield + + +@pytest.fixture +def user_anon_module(): + """Return a fresh reference to the reloaded utils.user_anonymization module.""" + import utils.user_anonymization as _ua # pylint: disable=import-outside-toplevel + + importlib.reload(_ua) + return _ua + + +class TestUserAnonymization: # pylint: disable=redefined-outer-name,protected-access + """Test user anonymization functionality.""" + + def test_hash_user_id_consistency(self, user_anon_module): + """Test that user ID hashing is consistent.""" + user_id = "test@example.com" + hash1 = user_anon_module._hash_user_id(user_id) + hash2 = user_anon_module._hash_user_id(user_id) + + assert hash1 == hash2 + assert len(hash1) == 64 # SHA-256 hex length + assert hash1 != user_id # Should be different from original + + def test_hash_user_id_different_for_different_users(self, user_anon_module): + """Test that different user IDs produce different hashes.""" + user1_hash = user_anon_module._hash_user_id("user1@example.com") + user2_hash = user_anon_module._hash_user_id("user2@example.com") + + assert user1_hash != user2_hash + + @patch("utils.user_anonymization.get_session") + @patch("utils.user_anonymization.get_suid") + def test_get_anonymous_user_id_new_user( + self, mock_get_suid, mock_get_session, user_anon_module + ): + """Test creating new anonymous ID for first-time user.""" + # Setup mocks + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = None + mock_get_suid.return_value = "anon-123-456" + + user_id = "new_user@example.com" + result = user_anon_module.get_anonymous_user_id(user_id) + + # Verify result + assert result == "anon-123-456" + + # Verify database interactions + mock_session.add.assert_called_once() + mock_session.commit.assert_called_once() + + # Verify the UserMapping was created correctly + added_mapping = mock_session.add.call_args[0][0] + assert isinstance(added_mapping, UserMapping) + assert added_mapping.anonymous_id == "anon-123-456" + assert added_mapping.user_id_hash == user_anon_module._hash_user_id(user_id) + + @patch("utils.user_anonymization.get_session") + def test_get_anonymous_user_id_existing_user( + self, mock_get_session, user_anon_module + ): + """Test retrieving existing anonymous ID for returning user.""" + # Setup existing mapping + existing_mapping = UserMapping() + existing_mapping.anonymous_id = "existing-anon-789" + existing_mapping.user_id_hash = user_anon_module._hash_user_id( + "existing@example.com" + ) + + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = ( + existing_mapping + ) + + result = user_anon_module.get_anonymous_user_id("existing@example.com") + + # Verify result + assert result == "existing-anon-789" + + # Verify no new mapping was created + mock_session.add.assert_not_called() + mock_session.commit.assert_not_called() + + @patch("utils.user_anonymization.get_session") + @patch("utils.user_anonymization.get_suid") + def test_get_anonymous_user_id_race_condition( + self, mock_get_suid, mock_get_session, user_anon_module + ): + """Test handling race condition when creating user mapping.""" + # Setup mocks for race condition scenario + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + + # First query returns None (no existing mapping) + # Add raises IntegrityError (race condition) + # Second query returns existing mapping (created by other thread) + existing_mapping = UserMapping() + existing_mapping.anonymous_id = "race-condition-uuid" + + mock_session.query.return_value.filter_by.return_value.first.side_effect = [ + None, # First check - no existing mapping + existing_mapping, # Second check after race condition + ] + mock_session.add.side_effect = IntegrityError("Duplicate key", None, None) + mock_get_suid.return_value = "new-uuid" + + result = user_anon_module.get_anonymous_user_id("race_user@example.com") + + # Should return the mapping created by the other thread + assert result == "race-condition-uuid" + mock_session.rollback.assert_called_once() + + @patch("utils.user_anonymization.get_session") + @patch("utils.user_anonymization.get_suid") + def test_get_anonymous_user_id_race_condition_failure( + self, mock_get_suid, mock_get_session, user_anon_module + ): + """Test handling race condition where retrieval also fails.""" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + + # First query returns None, add fails, second query also returns None + mock_session.query.return_value.filter_by.return_value.first.side_effect = [ + None, + None, + ] + mock_session.add.side_effect = IntegrityError("Duplicate key", None, None) + mock_get_suid.return_value = "new-uuid" + + with pytest.raises( + RuntimeError, match="Unable to create or retrieve anonymous user ID" + ): + user_anon_module.get_anonymous_user_id("problematic_user@example.com") + + @patch("utils.user_anonymization.get_session") + def test_get_user_count(self, mock_get_session, user_anon_module): + """Test getting total user count.""" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.count.return_value = 42 + + result = user_anon_module.get_user_count() + + assert result == 42 + mock_session.query.assert_called_once_with(UserMapping) + + @patch("utils.user_anonymization.get_session") + def test_find_anonymous_user_id_existing(self, mock_get_session, user_anon_module): + """Test finding existing anonymous ID without creating new one.""" + existing_mapping = UserMapping() + existing_mapping.anonymous_id = "found-uuid" + + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = ( + existing_mapping + ) + + result = user_anon_module.find_anonymous_user_id("existing@example.com") + + assert result == "found-uuid" + + @patch("utils.user_anonymization.get_session") + def test_find_anonymous_user_id_not_found(self, mock_get_session, user_anon_module): + """Test finding non-existing anonymous ID returns None.""" + mock_session = MagicMock() + mock_get_session.return_value.__enter__.return_value = mock_session + mock_session.query.return_value.filter_by.return_value.first.return_value = None + + result = user_anon_module.find_anonymous_user_id("nonexistent@example.com") + + assert result is None + + def test_hmac_prevents_rainbow_attacks(self, user_anon_module): + """Test that HMAC makes rainbow table attacks impractical.""" + # Common passwords/emails that might be in rainbow tables + common_ids = ["admin", "test@test.com", "user123", "john.doe@company.com"] + + for user_id in common_ids: + hash_result = user_anon_module._hash_user_id(user_id) + # Hash should not match simple SHA-256 of just the user ID + simple_hash = hashlib.sha256(user_id.encode()).hexdigest() + assert hash_result != simple_hash + + # Hash should not match HMAC without pepper + hmac_without_pepper = hmac.new( + b"", user_id.strip().lower().encode(), hashlib.sha256 + ).hexdigest() + assert hash_result != hmac_without_pepper + + def test_anonymization_preserves_uniqueness(self, user_anon_module): + """Test that different users get different anonymous IDs.""" + users = [ + "user1@example.com", + "user2@example.com", + "admin@company.com", + "test.user@domain.org", + ] + + hashes = [user_anon_module._hash_user_id(user) for user in users] + + # All hashes should be unique + assert len(set(hashes)) == len(hashes) + + def test_user_id_normalization(self, user_anon_module): + """Test that user ID normalization works correctly.""" + # Test case variations should produce the same hash + variations = [ + "User@Example.Com", + "user@example.com", + " user@example.com ", + "USER@EXAMPLE.COM", + " USER@EXAMPLE.COM ", + ] + + hashes = [user_anon_module._hash_user_id(variation) for variation in variations] + + # All variations should produce the same hash + assert ( + len(set(hashes)) == 1 + ), "All case/whitespace variations should hash the same" + + # But different actual users should still be different + different_user = user_anon_module._hash_user_id("different@example.com") + assert different_user not in hashes + + def test_missing_pepper_env_var(self): + """Test that missing pepper environment variable raises clear error.""" + import utils.user_anonymization # pylint: disable=import-outside-toplevel + + # Test that module import fails with missing env var + with patch.dict(os.environ, {}, clear=True): + with pytest.raises( + RuntimeError, match="USER_ANON_PEPPER environment variable is required" + ): + # Force reimport to trigger the env var check + importlib.reload(utils.user_anonymization) + + +class TestUserMappingModel: # pylint: disable=redefined-outer-name + """Test the UserMapping database model.""" + + def test_user_mapping_attributes(self): + """Test UserMapping model has correct attributes.""" + mapping = UserMapping() + + # Check that required attributes exist + assert hasattr(mapping, "anonymous_id") + assert hasattr(mapping, "user_id_hash") + assert hasattr(mapping, "created_at") + + # Check table name + assert UserMapping.__tablename__ == "user_mapping" + + def test_user_mapping_creation(self): + """Test creating UserMapping instance.""" + mapping = UserMapping( + anonymous_id="test-uuid-123", user_id_hash="hashed_user_id" + ) + + assert mapping.anonymous_id == "test-uuid-123" + assert mapping.user_id_hash == "hashed_user_id"