From 5f656dec587d3ed549d99e164525f8d7ce5a678a Mon Sep 17 00:00:00 2001 From: onmete Date: Wed, 23 Jul 2025 15:48:09 +0200 Subject: [PATCH 1/4] Define central storage path config option for user data collection --- README.md | 9 +-- lightspeed-stack.yaml | 3 +- src/app/endpoints/feedback.py | 15 +++-- src/app/endpoints/query.py | 4 +- src/models/config.py | 22 ++++---- src/services/data_collector.py | 24 ++++---- tests/configuration/lightspeed-stack.yaml | 2 +- tests/integration/test_configuration.py | 2 +- tests/unit/app/endpoints/test_feedback.py | 2 +- tests/unit/app/endpoints/test_query.py | 4 +- tests/unit/models/test_config.py | 66 +++++++--------------- tests/unit/services/test_data_collector.py | 11 ---- 12 files changed, 58 insertions(+), 106 deletions(-) diff --git a/README.md b/README.md index 3729d6ae..12ad159d 100644 --- a/README.md +++ b/README.md @@ -107,9 +107,8 @@ llama_stack: url: http://localhost:8321 user_data_collection: feedback_enabled: true - feedback_storage: "/tmp/data/feedback" transcripts_enabled: true - transcripts_storage: "/tmp/data/transcripts" + user_data_dir: "/tmp/data" ``` ### Llama Stack project and configuration @@ -186,9 +185,8 @@ llama_stack: library_client_config_path: user_data_collection: feedback_enabled: true - feedback_storage: "/tmp/data/feedback" transcripts_enabled: true - transcripts_storage: "/tmp/data/transcripts" + user_data_dir: "/tmp/data" ``` ## System prompt @@ -438,9 +436,8 @@ The data collector service is configured through the `user_data_collection.data_ ```yaml user_data_collection: feedback_enabled: true - feedback_storage: "/tmp/data/feedback" transcripts_enabled: true - transcripts_storage: "/tmp/data/transcripts" + user_data_dir: "/tmp/data" data_collector: enabled: true ingress_server_url: "https://your-ingress-server.com" diff --git a/lightspeed-stack.yaml b/lightspeed-stack.yaml index 01a36205..c8e91b5e 100644 --- a/lightspeed-stack.yaml +++ b/lightspeed-stack.yaml @@ -17,9 +17,8 @@ llama_stack: api_key: xyzzy user_data_collection: feedback_enabled: true - feedback_storage: "/tmp/data/feedback" transcripts_enabled: true - transcripts_storage: "/tmp/data/transcripts" + user_data_dir: "/tmp/data" data_collector: enabled: false ingress_server_url: null diff --git a/src/app/endpoints/feedback.py b/src/app/endpoints/feedback.py index 1fcbe656..dd386f8b 100644 --- a/src/app/endpoints/feedback.py +++ b/src/app/endpoints/feedback.py @@ -109,19 +109,18 @@ def store_feedback(user_id: str, feedback: dict) -> None: feedback: The feedback to store. """ logger.debug("Storing feedback for user %s", user_id) - # Creates storage path only if it doesn't exist. The `exist_ok=True` prevents - # race conditions in case of multiple server instances trying to set up storage - # at the same location. - storage_path = Path( - configuration.user_data_collection_configuration.feedback_storage or "" - ) - storage_path.mkdir(parents=True, exist_ok=True) + + # Create feedback directory if it doesn't exist + feedback_dir = Path( + configuration.user_data_collection_configuration.feedback_storage + ) or Path("") + feedback_dir.mkdir(parents=True, exist_ok=True) current_time = str(datetime.now(UTC)) data_to_store = {"user_id": user_id, "timestamp": current_time, **feedback} # stores feedback in a file under unique uuid - feedback_file_path = storage_path / f"{get_suid()}.json" + feedback_file_path = feedback_dir / f"{get_suid()}.json" with open(feedback_file_path, "w", encoding="utf-8") as feedback_file: json.dump(data_to_store, feedback_file) diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py index d7629a02..c9fb7b4e 100644 --- a/src/app/endpoints/query.py +++ b/src/app/endpoints/query.py @@ -350,9 +350,7 @@ def construct_transcripts_path(user_id: str, conversation_id: str) -> Path: # this Path sanitization pattern uid = os.path.normpath("/" + user_id).lstrip("/") cid = os.path.normpath("/" + conversation_id).lstrip("/") - file_path = ( - configuration.user_data_collection_configuration.transcripts_storage or "" - ) + file_path = configuration.user_data_collection_configuration.transcripts_storage return Path(file_path, uid, cid) diff --git a/src/models/config.py b/src/models/config.py index 90a53275..5c8c8e6d 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -130,21 +130,19 @@ class UserDataCollection(BaseModel): """User data collection configuration.""" feedback_enabled: bool = False - feedback_storage: Optional[str] = None transcripts_enabled: bool = False - transcripts_storage: Optional[str] = None + user_data_dir: str = "user_data" data_collector: DataCollectorConfiguration = DataCollectorConfiguration() - @model_validator(mode="after") - def check_storage_location_is_set_when_needed(self) -> Self: - """Check that storage_location is set when enabled.""" - if self.feedback_enabled and self.feedback_storage is None: - raise ValueError("feedback_storage is required when feedback is enabled") - if self.transcripts_enabled and self.transcripts_storage is None: - raise ValueError( - "transcripts_storage is required when transcripts is enabled" - ) - return self + @property + def feedback_storage(self) -> str: + """Feedback storage directory path.""" + return str(Path(self.user_data_dir) / "feedback") + + @property + def transcripts_storage(self) -> str: + """Transcripts storage directory path.""" + return str(Path(self.user_data_dir) / "transcripts") class AuthenticationConfiguration(BaseModel): diff --git a/src/services/data_collector.py b/src/services/data_collector.py index f87d2503..c5fc3e79 100644 --- a/src/services/data_collector.py +++ b/src/services/data_collector.py @@ -71,18 +71,16 @@ def _perform_collection(self) -> None: try: if feedback_files: udc_config = configuration.user_data_collection_configuration - if udc_config.feedback_storage: - feedback_base = Path(udc_config.feedback_storage) - collections_sent += self._create_and_send_tarball( - feedback_files, "feedback", feedback_base - ) + feedback_base = Path(udc_config.feedback_storage) + collections_sent += self._create_and_send_tarball( + feedback_files, "feedback", feedback_base + ) if transcript_files: udc_config = configuration.user_data_collection_configuration - if udc_config.transcripts_storage: - transcript_base = Path(udc_config.transcripts_storage) - collections_sent += self._create_and_send_tarball( - transcript_files, "transcripts", transcript_base - ) + transcript_base = Path(udc_config.transcripts_storage) + collections_sent += self._create_and_send_tarball( + transcript_files, "transcripts", transcript_base + ) logger.info( "Successfully sent %s collections to ingress server", collections_sent @@ -95,7 +93,7 @@ def _collect_feedback_files(self) -> List[Path]: """Collect all feedback files that need to be collected.""" udc_config = configuration.user_data_collection_configuration - if not udc_config.feedback_enabled or not udc_config.feedback_storage: + if not udc_config.feedback_enabled: return [] feedback_dir = Path(udc_config.feedback_storage) @@ -108,7 +106,7 @@ def _collect_transcript_files(self) -> List[Path]: """Collect all transcript files that need to be collected.""" udc_config = configuration.user_data_collection_configuration - if not udc_config.transcripts_enabled or not udc_config.transcripts_storage: + if not udc_config.transcripts_enabled: return [] transcripts_dir = Path(udc_config.transcripts_storage) @@ -223,7 +221,7 @@ def _cleanup_empty_directories(self) -> None: """Remove empty directories from transcript storage.""" udc_config = configuration.user_data_collection_configuration - if not udc_config.transcripts_enabled or not udc_config.transcripts_storage: + if not udc_config.transcripts_enabled: return transcripts_dir = Path(udc_config.transcripts_storage) diff --git a/tests/configuration/lightspeed-stack.yaml b/tests/configuration/lightspeed-stack.yaml index 1cf9565c..8f21cf5e 100644 --- a/tests/configuration/lightspeed-stack.yaml +++ b/tests/configuration/lightspeed-stack.yaml @@ -17,7 +17,7 @@ llama_stack: api_key: xyzzy user_data_collection: feedback_enabled: true - feedback_storage: "/tmp/data/feedback" + user_data_dir: "/tmp/ls_user_data" mcp_servers: - name: "server1" provider_id: "provider1" diff --git a/tests/integration/test_configuration.py b/tests/integration/test_configuration.py index 54b1f0cd..e58b972e 100644 --- a/tests/integration/test_configuration.py +++ b/tests/integration/test_configuration.py @@ -56,7 +56,7 @@ def test_loading_proper_configuration(configuration_filename: str) -> None: # check 'user_data_collection' section udc_config = cfg.user_data_collection_configuration assert udc_config.feedback_enabled is True - assert udc_config.feedback_storage == "/tmp/data/feedback" + assert udc_config.feedback_storage == "/tmp/ls_user_data/feedback" # check MCP servers section mcp_servers = cfg.mcp_servers diff --git a/tests/unit/app/endpoints/test_feedback.py b/tests/unit/app/endpoints/test_feedback.py index 1e2e5df2..bf1325e3 100644 --- a/tests/unit/app/endpoints/test_feedback.py +++ b/tests/unit/app/endpoints/test_feedback.py @@ -96,7 +96,7 @@ def test_feedback_endpoint_handler_error(mocker): def test_store_feedback(mocker): """Test that store_feedback calls the correct storage function.""" - configuration.user_data_collection_configuration.feedback_storage = "fake-path" + configuration.user_data_collection_configuration.user_data_dir = "fake" mocker.patch("builtins.open", mocker.mock_open()) mocker.patch("app.endpoints.feedback.Path", return_value=mocker.MagicMock()) diff --git a/tests/unit/app/endpoints/test_query.py b/tests/unit/app/endpoints/test_query.py index 8ace106a..ed97e0a0 100644 --- a/tests/unit/app/endpoints/test_query.py +++ b/tests/unit/app/endpoints/test_query.py @@ -911,9 +911,7 @@ def test_retrieve_response_shield_violation(prepare_agent_mocks, mocker): def test_construct_transcripts_path(setup_configuration, mocker): """Test the construct_transcripts_path function.""" # Update configuration for this test - setup_configuration.user_data_collection_configuration.transcripts_storage = ( - "/tmp/transcripts" - ) + setup_configuration.user_data_collection_configuration.user_data_dir = "/tmp" mocker.patch("app.endpoints.query.configuration", setup_configuration) user_id = "user123" diff --git a/tests/unit/models/test_config.py b/tests/unit/models/test_config.py index 293f48a1..60826990 100644 --- a/tests/unit/models/test_config.py +++ b/tests/unit/models/test_config.py @@ -134,37 +134,28 @@ def test_llama_stack_wrong_configuration_no_config_file() -> None: def test_user_data_collection_feedback_enabled() -> None: """Test the UserDataCollection constructor for feedback.""" # correct configuration - cfg = UserDataCollection(feedback_enabled=False, feedback_storage=None) + cfg = UserDataCollection(feedback_enabled=False) assert cfg is not None assert cfg.feedback_enabled is False - assert cfg.feedback_storage is None - - -def test_user_data_collection_feedback_disabled() -> None: - """Test the UserDataCollection constructor for feedback.""" - # incorrect configuration - with pytest.raises( - ValueError, - match="feedback_storage is required when feedback is enabled", - ): - UserDataCollection(feedback_enabled=True, feedback_storage=None) + assert cfg.user_data_dir == "user_data" + assert cfg.feedback_storage == "user_data/feedback" def test_user_data_collection_transcripts_enabled() -> None: """Test the UserDataCollection constructor for transcripts.""" # correct configuration - cfg = UserDataCollection(transcripts_enabled=False, transcripts_storage=None) + cfg = UserDataCollection(transcripts_enabled=False) assert cfg is not None + assert cfg.transcripts_enabled is False + assert cfg.user_data_dir == "user_data" + assert cfg.transcripts_storage == "user_data/transcripts" -def test_user_data_collection_transcripts_disabled() -> None: - """Test the UserDataCollection constructor for transcripts.""" - # incorrect configuration - with pytest.raises( - ValueError, - match="transcripts_storage is required when transcripts is enabled", - ): - UserDataCollection(transcripts_enabled=True, transcripts_storage=None) +def test_user_data_collection_custom_dir() -> None: + """Test the UserDataCollection constructor with custom directory.""" + cfg = UserDataCollection(user_data_dir="/custom/path") + assert cfg.feedback_storage == "/custom/path/feedback" + assert cfg.transcripts_storage == "/custom/path/transcripts" def test_user_data_collection_data_collector_enabled() -> None: @@ -337,9 +328,7 @@ def test_configuration_empty_mcp_servers() -> None: use_as_library_client=True, library_client_config_path="tests/configuration/run.yaml", ), - user_data_collection=UserDataCollection( - feedback_enabled=False, feedback_storage=None - ), + user_data_collection=UserDataCollection(feedback_enabled=False), mcp_servers=[], customization=None, ) @@ -362,9 +351,7 @@ def test_configuration_single_mcp_server() -> None: use_as_library_client=True, library_client_config_path="tests/configuration/run.yaml", ), - user_data_collection=UserDataCollection( - feedback_enabled=False, feedback_storage=None - ), + user_data_collection=UserDataCollection(feedback_enabled=False), mcp_servers=[mcp_server], customization=None, ) @@ -394,9 +381,7 @@ def test_configuration_multiple_mcp_servers() -> None: use_as_library_client=True, library_client_config_path="tests/configuration/run.yaml", ), - user_data_collection=UserDataCollection( - feedback_enabled=False, feedback_storage=None - ), + user_data_collection=UserDataCollection(feedback_enabled=False), mcp_servers=mcp_servers, customization=None, ) @@ -421,9 +406,7 @@ def test_dump_configuration(tmp_path) -> None: use_as_library_client=True, library_client_config_path="tests/configuration/run.yaml", ), - user_data_collection=UserDataCollection( - feedback_enabled=False, feedback_storage=None - ), + user_data_collection=UserDataCollection(feedback_enabled=False), mcp_servers=[], customization=None, ) @@ -468,9 +451,8 @@ def test_dump_configuration(tmp_path) -> None: }, "user_data_collection": { "feedback_enabled": False, - "feedback_storage": None, "transcripts_enabled": False, - "transcripts_storage": None, + "user_data_dir": "user_data", "data_collector": { "enabled": False, "ingress_server_url": None, @@ -511,9 +493,7 @@ def test_dump_configuration_with_one_mcp_server(tmp_path) -> None: use_as_library_client=True, library_client_config_path="tests/configuration/run.yaml", ), - user_data_collection=UserDataCollection( - feedback_enabled=False, feedback_storage=None - ), + user_data_collection=UserDataCollection(feedback_enabled=False), mcp_servers=mcp_servers, customization=None, ) @@ -553,9 +533,8 @@ def test_dump_configuration_with_one_mcp_server(tmp_path) -> None: }, "user_data_collection": { "feedback_enabled": False, - "feedback_storage": None, "transcripts_enabled": False, - "transcripts_storage": None, + "user_data_dir": "user_data", "data_collector": { "enabled": False, "ingress_server_url": None, @@ -605,9 +584,7 @@ def test_dump_configuration_with_more_mcp_servers(tmp_path) -> None: use_as_library_client=True, library_client_config_path="tests/configuration/run.yaml", ), - user_data_collection=UserDataCollection( - feedback_enabled=False, feedback_storage=None - ), + user_data_collection=UserDataCollection(feedback_enabled=False), mcp_servers=mcp_servers, customization=None, ) @@ -653,9 +630,8 @@ def test_dump_configuration_with_more_mcp_servers(tmp_path) -> None: }, "user_data_collection": { "feedback_enabled": False, - "feedback_storage": None, "transcripts_enabled": False, - "transcripts_storage": None, + "user_data_dir": "user_data", "data_collector": { "enabled": False, "ingress_server_url": None, diff --git a/tests/unit/services/test_data_collector.py b/tests/unit/services/test_data_collector.py index b2103ad5..90456923 100644 --- a/tests/unit/services/test_data_collector.py +++ b/tests/unit/services/test_data_collector.py @@ -57,17 +57,6 @@ def test_collect_feedback_files_disabled(mock_config) -> None: assert result == [] -@patch("services.data_collector.configuration") -def test_collect_feedback_files_no_storage(mock_config) -> None: - """Test collecting feedback files when no storage configured.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.feedback_enabled = True - mock_config.user_data_collection_configuration.feedback_storage = None - - result = service._collect_feedback_files() - assert result == [] - - @patch("services.data_collector.configuration") def test_collect_feedback_files_directory_not_exists(mock_config) -> None: """Test collecting feedback files when directory doesn't exist.""" From 10ded634ec8205c4738e74e09dde834360f91d59 Mon Sep 17 00:00:00 2001 From: onmete Date: Wed, 23 Jul 2025 16:27:45 +0200 Subject: [PATCH 2/4] Remove redundant "or" if feedback storage is not existant --- src/app/endpoints/feedback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/app/endpoints/feedback.py b/src/app/endpoints/feedback.py index dd386f8b..e5e18ba0 100644 --- a/src/app/endpoints/feedback.py +++ b/src/app/endpoints/feedback.py @@ -113,7 +113,7 @@ def store_feedback(user_id: str, feedback: dict) -> None: # Create feedback directory if it doesn't exist feedback_dir = Path( configuration.user_data_collection_configuration.feedback_storage - ) or Path("") + ) feedback_dir.mkdir(parents=True, exist_ok=True) current_time = str(datetime.now(UTC)) From 3d944cbb2469d7b9aeec4a1d583e3fbd713dcee1 Mon Sep 17 00:00:00 2001 From: onmete Date: Wed, 23 Jul 2025 19:48:25 +0200 Subject: [PATCH 3/4] Change UserDataCollection config values to be pathlib.Path --- src/app/endpoints/feedback.py | 5 +- src/models/config.py | 10 +-- src/services/data_collector.py | 10 +-- tests/integration/test_configuration.py | 4 +- tests/unit/app/endpoints/test_feedback.py | 7 +- tests/unit/app/endpoints/test_query.py | 4 +- tests/unit/models/test_config.py | 12 ++-- tests/unit/services/test_data_collector.py | 74 +++++++++++----------- 8 files changed, 65 insertions(+), 61 deletions(-) diff --git a/src/app/endpoints/feedback.py b/src/app/endpoints/feedback.py index e5e18ba0..ae053be9 100644 --- a/src/app/endpoints/feedback.py +++ b/src/app/endpoints/feedback.py @@ -2,7 +2,6 @@ import logging from typing import Any -from pathlib import Path import json from datetime import datetime, UTC @@ -111,9 +110,7 @@ def store_feedback(user_id: str, feedback: dict) -> None: logger.debug("Storing feedback for user %s", user_id) # Create feedback directory if it doesn't exist - feedback_dir = Path( - configuration.user_data_collection_configuration.feedback_storage - ) + feedback_dir = configuration.user_data_collection_configuration.feedback_storage feedback_dir.mkdir(parents=True, exist_ok=True) current_time = str(datetime.now(UTC)) diff --git a/src/models/config.py b/src/models/config.py index 5c8c8e6d..7b67e450 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -131,18 +131,18 @@ class UserDataCollection(BaseModel): feedback_enabled: bool = False transcripts_enabled: bool = False - user_data_dir: str = "user_data" + user_data_dir: Path = Path("user_data") data_collector: DataCollectorConfiguration = DataCollectorConfiguration() @property - def feedback_storage(self) -> str: + def feedback_storage(self) -> Path: """Feedback storage directory path.""" - return str(Path(self.user_data_dir) / "feedback") + return self.user_data_dir / "feedback" @property - def transcripts_storage(self) -> str: + def transcripts_storage(self) -> Path: """Transcripts storage directory path.""" - return str(Path(self.user_data_dir) / "transcripts") + return self.user_data_dir / "transcripts" class AuthenticationConfiguration(BaseModel): diff --git a/src/services/data_collector.py b/src/services/data_collector.py index c5fc3e79..f76efaed 100644 --- a/src/services/data_collector.py +++ b/src/services/data_collector.py @@ -71,13 +71,13 @@ def _perform_collection(self) -> None: try: if feedback_files: udc_config = configuration.user_data_collection_configuration - feedback_base = Path(udc_config.feedback_storage) + feedback_base = udc_config.feedback_storage collections_sent += self._create_and_send_tarball( feedback_files, "feedback", feedback_base ) if transcript_files: udc_config = configuration.user_data_collection_configuration - transcript_base = Path(udc_config.transcripts_storage) + transcript_base = udc_config.transcripts_storage collections_sent += self._create_and_send_tarball( transcript_files, "transcripts", transcript_base ) @@ -96,7 +96,7 @@ def _collect_feedback_files(self) -> List[Path]: if not udc_config.feedback_enabled: return [] - feedback_dir = Path(udc_config.feedback_storage) + feedback_dir = udc_config.feedback_storage if not feedback_dir.exists(): return [] @@ -109,7 +109,7 @@ def _collect_transcript_files(self) -> List[Path]: if not udc_config.transcripts_enabled: return [] - transcripts_dir = Path(udc_config.transcripts_storage) + transcripts_dir = udc_config.transcripts_storage if not transcripts_dir.exists(): return [] @@ -224,7 +224,7 @@ def _cleanup_empty_directories(self) -> None: if not udc_config.transcripts_enabled: return - transcripts_dir = Path(udc_config.transcripts_storage) + transcripts_dir = udc_config.transcripts_storage if not transcripts_dir.exists(): return diff --git a/tests/integration/test_configuration.py b/tests/integration/test_configuration.py index e58b972e..8ebad025 100644 --- a/tests/integration/test_configuration.py +++ b/tests/integration/test_configuration.py @@ -1,5 +1,7 @@ """Integration tests for configuration loading and handling.""" +from pathlib import Path + import pytest from configuration import configuration @@ -56,7 +58,7 @@ def test_loading_proper_configuration(configuration_filename: str) -> None: # check 'user_data_collection' section udc_config = cfg.user_data_collection_configuration assert udc_config.feedback_enabled is True - assert udc_config.feedback_storage == "/tmp/ls_user_data/feedback" + assert udc_config.feedback_storage == Path("/tmp/ls_user_data/feedback") # check MCP servers section mcp_servers = cfg.mcp_servers diff --git a/tests/unit/app/endpoints/test_feedback.py b/tests/unit/app/endpoints/test_feedback.py index bf1325e3..2696266e 100644 --- a/tests/unit/app/endpoints/test_feedback.py +++ b/tests/unit/app/endpoints/test_feedback.py @@ -1,5 +1,7 @@ """Unit tests for the /feedback REST API endpoint.""" +from pathlib import Path + from fastapi import HTTPException, status import pytest @@ -96,10 +98,11 @@ def test_feedback_endpoint_handler_error(mocker): def test_store_feedback(mocker): """Test that store_feedback calls the correct storage function.""" - configuration.user_data_collection_configuration.user_data_dir = "fake" + configuration.user_data_collection_configuration.user_data_dir = Path("fake") mocker.patch("builtins.open", mocker.mock_open()) - mocker.patch("app.endpoints.feedback.Path", return_value=mocker.MagicMock()) + # Mock mkdir to prevent actual directory creation + mocker.patch.object(Path, "mkdir") mocker.patch("app.endpoints.feedback.get_suid", return_value="fake-uuid") # Mock the JSON to assert the data is stored correctly diff --git a/tests/unit/app/endpoints/test_query.py b/tests/unit/app/endpoints/test_query.py index ed97e0a0..db7711f6 100644 --- a/tests/unit/app/endpoints/test_query.py +++ b/tests/unit/app/endpoints/test_query.py @@ -2,6 +2,8 @@ # pylint: disable=too-many-lines +from pathlib import Path + import json from fastapi import HTTPException, status import pytest @@ -911,7 +913,7 @@ def test_retrieve_response_shield_violation(prepare_agent_mocks, mocker): def test_construct_transcripts_path(setup_configuration, mocker): """Test the construct_transcripts_path function.""" # Update configuration for this test - setup_configuration.user_data_collection_configuration.user_data_dir = "/tmp" + setup_configuration.user_data_collection_configuration.user_data_dir = Path("/tmp") mocker.patch("app.endpoints.query.configuration", setup_configuration) user_id = "user123" diff --git a/tests/unit/models/test_config.py b/tests/unit/models/test_config.py index 60826990..d2bb50d0 100644 --- a/tests/unit/models/test_config.py +++ b/tests/unit/models/test_config.py @@ -137,8 +137,8 @@ def test_user_data_collection_feedback_enabled() -> None: cfg = UserDataCollection(feedback_enabled=False) assert cfg is not None assert cfg.feedback_enabled is False - assert cfg.user_data_dir == "user_data" - assert cfg.feedback_storage == "user_data/feedback" + assert cfg.user_data_dir == Path("user_data") + assert cfg.feedback_storage == Path("user_data/feedback") def test_user_data_collection_transcripts_enabled() -> None: @@ -147,15 +147,15 @@ def test_user_data_collection_transcripts_enabled() -> None: cfg = UserDataCollection(transcripts_enabled=False) assert cfg is not None assert cfg.transcripts_enabled is False - assert cfg.user_data_dir == "user_data" - assert cfg.transcripts_storage == "user_data/transcripts" + assert cfg.user_data_dir == Path("user_data") + assert cfg.transcripts_storage == Path("user_data/transcripts") def test_user_data_collection_custom_dir() -> None: """Test the UserDataCollection constructor with custom directory.""" cfg = UserDataCollection(user_data_dir="/custom/path") - assert cfg.feedback_storage == "/custom/path/feedback" - assert cfg.transcripts_storage == "/custom/path/transcripts" + assert cfg.feedback_storage == Path("/custom/path/feedback") + assert cfg.transcripts_storage == Path("/custom/path/transcripts") def test_user_data_collection_data_collector_enabled() -> None: diff --git a/tests/unit/services/test_data_collector.py b/tests/unit/services/test_data_collector.py index 90456923..0db7d4c9 100644 --- a/tests/unit/services/test_data_collector.py +++ b/tests/unit/services/test_data_collector.py @@ -4,7 +4,6 @@ from unittest.mock import patch, MagicMock import requests import tarfile - from services.data_collector import DataCollectorService @@ -62,7 +61,9 @@ def test_collect_feedback_files_directory_not_exists(mock_config) -> None: """Test collecting feedback files when directory doesn't exist.""" service = DataCollectorService() mock_config.user_data_collection_configuration.feedback_enabled = True - mock_config.user_data_collection_configuration.feedback_storage = "/tmp/feedback" + mock_config.user_data_collection_configuration.feedback_storage = Path( + "/tmp/feedback" + ) with patch("services.data_collector.Path") as mock_path: mock_path.return_value.exists.return_value = False @@ -76,16 +77,17 @@ def test_collect_feedback_files_success(mock_config) -> None: """Test collecting feedback files successfully.""" service = DataCollectorService() mock_config.user_data_collection_configuration.feedback_enabled = True - mock_config.user_data_collection_configuration.feedback_storage = "/tmp/feedback" + # Create a mock Path object with the required methods + mock_feedback_dir = MagicMock() + mock_feedback_dir.exists.return_value = True mock_files = [Path("/tmp/feedback/file1.json")] + mock_feedback_dir.glob.return_value = mock_files - with patch("services.data_collector.Path") as mock_path: - mock_path.return_value.exists.return_value = True - mock_path.return_value.glob.return_value = mock_files + mock_config.user_data_collection_configuration.feedback_storage = mock_feedback_dir - result = service._collect_feedback_files() - assert result == mock_files + result = service._collect_feedback_files() + assert result == mock_files @patch("services.data_collector.configuration") @@ -103,7 +105,7 @@ def test_collect_transcript_files_directory_not_exists(mock_config) -> None: """Test collecting transcript files when directory doesn't exist.""" service = DataCollectorService() mock_config.user_data_collection_configuration.transcripts_enabled = True - mock_config.user_data_collection_configuration.transcripts_storage = ( + mock_config.user_data_collection_configuration.transcripts_storage = Path( "/tmp/transcripts" ) @@ -119,18 +121,19 @@ def test_collect_transcript_files_success(mock_config) -> None: """Test collecting transcript files successfully.""" service = DataCollectorService() mock_config.user_data_collection_configuration.transcripts_enabled = True - mock_config.user_data_collection_configuration.transcripts_storage = ( - "/tmp/transcripts" - ) + # Create a mock Path object with required methods + mock_transcripts_dir = MagicMock() + mock_transcripts_dir.exists.return_value = True mock_files = [Path("/tmp/transcripts/user1/conv1/file1.json")] + mock_transcripts_dir.rglob.return_value = mock_files - with patch("services.data_collector.Path") as mock_path: - mock_path.return_value.exists.return_value = True - mock_path.return_value.rglob.return_value = mock_files + mock_config.user_data_collection_configuration.transcripts_storage = ( + mock_transcripts_dir + ) - result = service._collect_transcript_files() - assert result == mock_files + result = service._collect_transcript_files() + assert result == mock_files @patch("services.data_collector.configuration") @@ -487,9 +490,6 @@ def test_cleanup_empty_directories_success(mock_config) -> None: """Test successful directory cleanup.""" service = DataCollectorService() mock_config.user_data_collection_configuration.transcripts_enabled = True - mock_config.user_data_collection_configuration.transcripts_storage = ( - "/tmp/transcripts" - ) transcripts_dir = MagicMock() user_dir = MagicMock() @@ -505,11 +505,12 @@ def test_cleanup_empty_directories_success(mock_config) -> None: conv_dir.is_dir.return_value = True conv_dir.iterdir.return_value = [] # Empty directory - with patch("services.data_collector.Path", return_value=transcripts_dir): - service._cleanup_empty_directories() + mock_config.user_data_collection_configuration.transcripts_storage = transcripts_dir + + service._cleanup_empty_directories() - conv_dir.rmdir.assert_called_once() - user_dir.rmdir.assert_called_once() + conv_dir.rmdir.assert_called_once() + user_dir.rmdir.assert_called_once() @patch("services.data_collector.configuration") @@ -517,9 +518,6 @@ def test_cleanup_empty_directories_with_errors(mock_config) -> None: """Test directory cleanup when rmdir operations fail.""" service = DataCollectorService() mock_config.user_data_collection_configuration.transcripts_enabled = True - mock_config.user_data_collection_configuration.transcripts_storage = ( - "/tmp/transcripts" - ) transcripts_dir = MagicMock() user_dir = MagicMock() @@ -536,12 +534,13 @@ def test_cleanup_empty_directories_with_errors(mock_config) -> None: conv_dir.rmdir.side_effect = OSError("Permission denied") user_dir.rmdir.side_effect = OSError("Permission denied") - with patch("services.data_collector.Path", return_value=transcripts_dir): - # Should not raise exception - service._cleanup_empty_directories() + mock_config.user_data_collection_configuration.transcripts_storage = transcripts_dir - conv_dir.rmdir.assert_called_once() - user_dir.rmdir.assert_called_once() + # Should not raise exception + service._cleanup_empty_directories() + + conv_dir.rmdir.assert_called_once() + user_dir.rmdir.assert_called_once() @patch("services.data_collector.configuration") @@ -549,14 +548,15 @@ def test_cleanup_empty_directories_directory_not_exists(mock_config) -> None: """Test directory cleanup when transcripts directory doesn't exist.""" service = DataCollectorService() mock_config.user_data_collection_configuration.transcripts_enabled = True + + # Create a mock Path object that doesn't exist + mock_transcripts_dir = MagicMock() + mock_transcripts_dir.exists.return_value = False mock_config.user_data_collection_configuration.transcripts_storage = ( - "/tmp/transcripts" + mock_transcripts_dir ) - with patch("services.data_collector.Path") as mock_path: - mock_path.return_value.exists.return_value = False - - service._cleanup_empty_directories() + service._cleanup_empty_directories() @patch("services.data_collector.configuration") From 1007cb039942cd2c6961d47db25539d6a32cd430 Mon Sep 17 00:00:00 2001 From: onmete Date: Thu, 24 Jul 2025 09:28:24 +0200 Subject: [PATCH 4/4] Flatten data collection config and use it to initialize data export --- src/lightspeed_stack.py | 4 +- src/models/config.py | 55 +- src/runners/data_collector.py | 17 +- src/services/data_collector.py | 97 ++- tests/unit/models/test_config.py | 100 ++- .../runners/test_data_collector_runner.py | 17 +- tests/unit/services/test_data_collector.py | 646 ++++++++---------- 7 files changed, 436 insertions(+), 500 deletions(-) diff --git a/src/lightspeed_stack.py b/src/lightspeed_stack.py index 4479e308..2a348df7 100644 --- a/src/lightspeed_stack.py +++ b/src/lightspeed_stack.py @@ -79,9 +79,7 @@ def main() -> None: if args.dump_configuration: configuration.configuration.dump() elif args.start_data_collector: - start_data_collector( - configuration.user_data_collection_configuration.data_collector - ) + start_data_collector(configuration.user_data_collection_configuration) else: start_uvicorn(configuration.service_configuration) logger.info("Lightspeed stack finished") diff --git a/src/models/config.py b/src/models/config.py index 7b67e450..4d9ff7b7 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -101,38 +101,24 @@ def check_llama_stack_model(self) -> Self: return self -class DataCollectorConfiguration(BaseModel): - """Data collector configuration for sending data to ingress server.""" - - enabled: bool = False - ingress_server_url: Optional[str] = None - ingress_server_auth_token: Optional[str] = None - ingress_content_service_name: Optional[str] = None - collection_interval: PositiveInt = constants.DATA_COLLECTOR_COLLECTION_INTERVAL - cleanup_after_send: bool = True # Remove local files after successful send - connection_timeout: PositiveInt = constants.DATA_COLLECTOR_CONNECTION_TIMEOUT - - @model_validator(mode="after") - def check_data_collector_configuration(self) -> Self: - """Check data collector configuration.""" - if self.enabled and self.ingress_server_url is None: - raise ValueError( - "ingress_server_url is required when data collector is enabled" - ) - if self.enabled and self.ingress_content_service_name is None: - raise ValueError( - "ingress_content_service_name is required when data collector is enabled" - ) - return self - - class UserDataCollection(BaseModel): """User data collection configuration.""" + # data collection feedback_enabled: bool = False transcripts_enabled: bool = False user_data_dir: Path = Path("user_data") - data_collector: DataCollectorConfiguration = DataCollectorConfiguration() + + # data export + export_enabled: bool = False + ingress_server_url: str = "" + ingress_server_auth_token: str = "" + ingress_content_service_name: str = "" + export_collection_interval: PositiveInt = ( + constants.DATA_COLLECTOR_COLLECTION_INTERVAL + ) + export_connection_timeout: PositiveInt = constants.DATA_COLLECTOR_CONNECTION_TIMEOUT + cleanup_after_send: bool = True # Remove local files after successful export @property def feedback_storage(self) -> Path: @@ -144,6 +130,23 @@ def transcripts_storage(self) -> Path: """Transcripts storage directory path.""" return self.user_data_dir / "transcripts" + @model_validator(mode="after") + def check_data_collector_configuration(self) -> Self: + """Check data collector configuration.""" + if self.export_enabled and self.ingress_server_url == "": + raise ValueError( + "ingress_server_url is required when data export is enabled" + ) + if self.export_enabled and self.ingress_server_auth_token == "": + raise ValueError( + "ingress_server_auth_token is required when data export is enabled" + ) + if self.export_enabled and self.ingress_content_service_name == "": + raise ValueError( + "ingress_content_service_name is required when data export is enabled" + ) + return self + class AuthenticationConfiguration(BaseModel): """Authentication configuration.""" diff --git a/src/runners/data_collector.py b/src/runners/data_collector.py index 7bf05e8f..346eaeca 100644 --- a/src/runners/data_collector.py +++ b/src/runners/data_collector.py @@ -2,22 +2,31 @@ import logging -from models.config import DataCollectorConfiguration +from models.config import UserDataCollection from services.data_collector import DataCollectorService logger: logging.Logger = logging.getLogger(__name__) -def start_data_collector(configuration: DataCollectorConfiguration) -> None: +def start_data_collector(configuration: UserDataCollection) -> None: """Start the data collector service as a standalone process.""" logger.info("Starting data collector runner") - if not configuration.enabled: + if not configuration.export_enabled: logger.info("Data collection is disabled") return try: - service = DataCollectorService() + service = DataCollectorService( + feedback_dir=configuration.feedback_storage, + transcripts_dir=configuration.transcripts_storage, + collection_interval=configuration.export_collection_interval, + cleanup_after_send=configuration.cleanup_after_send, + ingress_server_url=configuration.ingress_server_url, + ingress_server_auth_token=configuration.ingress_server_auth_token, + ingress_content_service_name=configuration.ingress_content_service_name, + connection_timeout=configuration.export_connection_timeout, + ) service.run() except Exception as e: logger.error( diff --git a/src/services/data_collector.py b/src/services/data_collector.py index f76efaed..ed6bb3d1 100644 --- a/src/services/data_collector.py +++ b/src/services/data_collector.py @@ -9,12 +9,12 @@ import requests import constants -from configuration import configuration from log import get_logger logger = get_logger(__name__) +# pylint: disable=too-many-instance-attributes class DataCollectorService: # pylint: disable=too-few-public-methods """Service for collecting and sending user data to ingress server. @@ -22,12 +22,30 @@ class DataCollectorService: # pylint: disable=too-few-public-methods including feedback and transcripts to the configured ingress server. """ + # pylint: disable=too-many-arguments,too-many-positional-arguments + def __init__( + self, + feedback_dir: Path, + transcripts_dir: Path, + collection_interval: int, + ingress_server_url: str, + ingress_server_auth_token: str, + ingress_content_service_name: str, + connection_timeout: int, + cleanup_after_send: bool, + ) -> None: + """Initialize the data collector service.""" + self.feedback_dir = feedback_dir + self.transcripts_dir = transcripts_dir + self.collection_interval = collection_interval + self.ingress_server_url = ingress_server_url + self.ingress_server_auth_token = ingress_server_auth_token + self.ingress_content_service_name = ingress_content_service_name + self.connection_timeout = connection_timeout + self.cleanup_after_send = cleanup_after_send + def run(self) -> None: """Run the periodic data collection loop.""" - collector_config = ( - configuration.user_data_collection_configuration.data_collector - ) - logger.info("Starting data collection service") while True: @@ -35,10 +53,10 @@ def run(self) -> None: self._perform_collection() logger.info( "Next collection scheduled in %s seconds", - collector_config.collection_interval, + self.collection_interval, ) - if collector_config.collection_interval is not None: - time.sleep(collector_config.collection_interval) + if self.collection_interval is not None: + time.sleep(self.collection_interval) except KeyboardInterrupt: logger.info("Data collection service stopped by user") break @@ -70,14 +88,12 @@ def _perform_collection(self) -> None: collections_sent = 0 try: if feedback_files: - udc_config = configuration.user_data_collection_configuration - feedback_base = udc_config.feedback_storage + feedback_base = self.feedback_dir collections_sent += self._create_and_send_tarball( feedback_files, "feedback", feedback_base ) if transcript_files: - udc_config = configuration.user_data_collection_configuration - transcript_base = udc_config.transcripts_storage + transcript_base = self.transcripts_dir collections_sent += self._create_and_send_tarball( transcript_files, "transcripts", transcript_base ) @@ -91,30 +107,18 @@ def _perform_collection(self) -> None: def _collect_feedback_files(self) -> List[Path]: """Collect all feedback files that need to be collected.""" - udc_config = configuration.user_data_collection_configuration - - if not udc_config.feedback_enabled: + if not self.feedback_dir.exists(): return [] - feedback_dir = udc_config.feedback_storage - if not feedback_dir.exists(): - return [] - - return list(feedback_dir.glob("*.json")) + return list(self.feedback_dir.glob("*.json")) def _collect_transcript_files(self) -> List[Path]: """Collect all transcript files that need to be collected.""" - udc_config = configuration.user_data_collection_configuration - - if not udc_config.transcripts_enabled: - return [] - - transcripts_dir = udc_config.transcripts_storage - if not transcripts_dir.exists(): + if not self.transcripts_dir.exists(): return [] # Recursively find all JSON files in the transcript directory structure - return list(transcripts_dir.rglob("*.json")) + return list(self.transcripts_dir.rglob("*.json")) def _create_and_send_tarball( self, files: List[Path], data_type: str, base_directory: Path @@ -123,15 +127,11 @@ def _create_and_send_tarball( if not files: return 0 - collector_config = ( - configuration.user_data_collection_configuration.data_collector - ) - # Create one tarball with all files tarball_path = self._create_tarball(files, data_type, base_directory) try: self._send_tarball(tarball_path) - if collector_config.cleanup_after_send: + if self.cleanup_after_send: self._cleanup_files(files) self._cleanup_empty_directories() return 1 @@ -167,22 +167,13 @@ def _create_tarball( def _send_tarball(self, tarball_path: Path) -> None: """Send the tarball to the ingress server.""" - collector_config = ( - configuration.user_data_collection_configuration.data_collector - ) - - if collector_config.ingress_server_url is None: - raise ValueError("Ingress server URL is not configured") - # pylint: disable=line-too-long headers = { - "Content-Type": f"application/vnd.redhat.{collector_config.ingress_content_service_name}.periodic+tar", + "Content-Type": f"application/vnd.redhat.{self.ingress_content_service_name}.periodic+tar", } - if collector_config.ingress_server_auth_token: - headers["Authorization"] = ( - f"Bearer {collector_config.ingress_server_auth_token}" - ) + if self.ingress_server_auth_token: + headers["Authorization"] = f"Bearer {self.ingress_server_auth_token}" with open(tarball_path, "rb") as f: data = f.read() @@ -190,14 +181,14 @@ def _send_tarball(self, tarball_path: Path) -> None: logger.info( "Sending tarball %s to %s", tarball_path.name, - collector_config.ingress_server_url, + self.ingress_server_url, ) response = requests.post( - collector_config.ingress_server_url, + self.ingress_server_url, data=data, headers=headers, - timeout=collector_config.connection_timeout, + timeout=self.connection_timeout, ) if response.status_code >= 400: @@ -219,17 +210,11 @@ def _cleanup_files(self, files: List[Path]) -> None: def _cleanup_empty_directories(self) -> None: """Remove empty directories from transcript storage.""" - udc_config = configuration.user_data_collection_configuration - - if not udc_config.transcripts_enabled: - return - - transcripts_dir = udc_config.transcripts_storage - if not transcripts_dir.exists(): + if not self.transcripts_dir.exists(): return # Remove empty directories (conversation and user directories) - for user_dir in transcripts_dir.iterdir(): + for user_dir in self.transcripts_dir.iterdir(): if user_dir.is_dir(): for conv_dir in user_dir.iterdir(): if conv_dir.is_dir() and not any(conv_dir.iterdir()): diff --git a/tests/unit/models/test_config.py b/tests/unit/models/test_config.py index d2bb50d0..36399677 100644 --- a/tests/unit/models/test_config.py +++ b/tests/unit/models/test_config.py @@ -22,7 +22,6 @@ UserDataCollection, TLSConfiguration, ModelContextProtocolServer, - DataCollectorConfiguration, ) from utils.checks import InvalidConfigurationError @@ -153,7 +152,7 @@ def test_user_data_collection_transcripts_enabled() -> None: def test_user_data_collection_custom_dir() -> None: """Test the UserDataCollection constructor with custom directory.""" - cfg = UserDataCollection(user_data_dir="/custom/path") + cfg = UserDataCollection(user_data_dir=Path("/custom/path")) assert cfg.feedback_storage == Path("/custom/path/feedback") assert cfg.transcripts_storage == Path("/custom/path/transcripts") @@ -162,16 +161,13 @@ def test_user_data_collection_data_collector_enabled() -> None: """Test the UserDataCollection constructor for data collector.""" # correct configuration cfg = UserDataCollection( - data_collector=DataCollectorConfiguration( - enabled=True, - ingress_server_url="http://localhost:8080", - ingress_server_auth_token="xyzzy", - ingress_content_service_name="lightspeed-core", - collection_interval=60, - ) + export_enabled=True, + ingress_server_url="http://localhost:8080", + ingress_server_auth_token="xyzzy", + ingress_content_service_name="lightspeed-core", ) assert cfg is not None - assert cfg.data_collector.enabled is True + assert cfg.export_enabled is True def test_user_data_collection_data_collector_wrong_configuration() -> None: @@ -179,29 +175,33 @@ def test_user_data_collection_data_collector_wrong_configuration() -> None: # incorrect configuration with pytest.raises( ValueError, - match="ingress_server_url is required when data collector is enabled", + match="ingress_server_url is required when data export is enabled", + ): + UserDataCollection( + export_enabled=True, + ingress_server_url="", + ingress_server_auth_token="xyzzy", + ingress_content_service_name="lightspeed-core", + ) + with pytest.raises( + ValueError, + match="ingress_content_service_name is required when data export is enabled", ): UserDataCollection( - data_collector=DataCollectorConfiguration( - enabled=True, - ingress_server_url=None, - ingress_server_auth_token="xyzzy", - ingress_content_service_name="lightspeed-core", - collection_interval=60, - ) + export_enabled=True, + ingress_server_url="http://localhost:8080", + ingress_server_auth_token="xyzzy", + ingress_content_service_name="", ) with pytest.raises( ValueError, - match="ingress_content_service_name is required when data collector is enabled", + match="ingress_server_auth_token is required when data export is enabled", ): UserDataCollection( - data_collector=DataCollectorConfiguration( - enabled=True, - ingress_server_url="http://localhost:8080", - ingress_server_auth_token="xyzzy", - ingress_content_service_name=None, - collection_interval=60, - ) + export_enabled=True, + ingress_server_url="http://localhost:8080", + ingress_server_auth_token="", + ingress_content_service_name="lightspeed-core", ) @@ -453,15 +453,13 @@ def test_dump_configuration(tmp_path) -> None: "feedback_enabled": False, "transcripts_enabled": False, "user_data_dir": "user_data", - "data_collector": { - "enabled": False, - "ingress_server_url": None, - "ingress_server_auth_token": None, - "ingress_content_service_name": None, - "collection_interval": DATA_COLLECTOR_COLLECTION_INTERVAL, - "cleanup_after_send": True, - "connection_timeout": DATA_COLLECTOR_CONNECTION_TIMEOUT, - }, + "export_enabled": False, + "ingress_server_url": "", + "ingress_server_auth_token": "", + "ingress_content_service_name": "", + "export_collection_interval": DATA_COLLECTOR_COLLECTION_INTERVAL, + "export_connection_timeout": DATA_COLLECTOR_CONNECTION_TIMEOUT, + "cleanup_after_send": True, }, "mcp_servers": [], "authentication": { @@ -535,15 +533,13 @@ def test_dump_configuration_with_one_mcp_server(tmp_path) -> None: "feedback_enabled": False, "transcripts_enabled": False, "user_data_dir": "user_data", - "data_collector": { - "enabled": False, - "ingress_server_url": None, - "ingress_server_auth_token": None, - "ingress_content_service_name": None, - "collection_interval": DATA_COLLECTOR_COLLECTION_INTERVAL, - "cleanup_after_send": True, - "connection_timeout": DATA_COLLECTOR_CONNECTION_TIMEOUT, - }, + "export_enabled": False, + "ingress_server_url": "", + "ingress_server_auth_token": "", + "ingress_content_service_name": "", + "export_collection_interval": DATA_COLLECTOR_COLLECTION_INTERVAL, + "export_connection_timeout": DATA_COLLECTOR_CONNECTION_TIMEOUT, + "cleanup_after_send": True, }, "mcp_servers": [ { @@ -632,15 +628,13 @@ def test_dump_configuration_with_more_mcp_servers(tmp_path) -> None: "feedback_enabled": False, "transcripts_enabled": False, "user_data_dir": "user_data", - "data_collector": { - "enabled": False, - "ingress_server_url": None, - "ingress_server_auth_token": None, - "ingress_content_service_name": None, - "collection_interval": DATA_COLLECTOR_COLLECTION_INTERVAL, - "cleanup_after_send": True, - "connection_timeout": DATA_COLLECTOR_CONNECTION_TIMEOUT, - }, + "export_enabled": False, + "ingress_server_url": "", + "ingress_server_auth_token": "", + "ingress_content_service_name": "", + "export_collection_interval": DATA_COLLECTOR_COLLECTION_INTERVAL, + "export_connection_timeout": DATA_COLLECTOR_CONNECTION_TIMEOUT, + "cleanup_after_send": True, }, "mcp_servers": [ { diff --git a/tests/unit/runners/test_data_collector_runner.py b/tests/unit/runners/test_data_collector_runner.py index 4384b8ab..14903304 100644 --- a/tests/unit/runners/test_data_collector_runner.py +++ b/tests/unit/runners/test_data_collector_runner.py @@ -2,18 +2,17 @@ from unittest.mock import patch -from models.config import DataCollectorConfiguration +from models.config import UserDataCollection from runners.data_collector import start_data_collector def test_start_data_collector() -> None: """Test the function to start data collector service.""" - configuration = DataCollectorConfiguration( - enabled=True, + configuration = UserDataCollection( + export_enabled=True, ingress_server_url="http://localhost:8080", ingress_server_auth_token="xyzzy", ingress_content_service_name="lightspeed-core", - collection_interval=60, ) # don't start real data collector service @@ -24,12 +23,11 @@ def test_start_data_collector() -> None: def test_start_data_collector_disabled() -> None: """Test the function to start data collector service.""" - configuration = DataCollectorConfiguration( - enabled=False, + configuration = UserDataCollection( + export_enabled=False, ingress_server_url="http://localhost:8080", ingress_server_auth_token="xyzzy", ingress_content_service_name="lightspeed-core", - collection_interval=60, ) # don't start real data collector service @@ -40,12 +38,11 @@ def test_start_data_collector_disabled() -> None: def test_start_data_collector_exception() -> None: """Test the function to start data collector service when an exception occurs.""" - configuration = DataCollectorConfiguration( - enabled=True, + configuration = UserDataCollection( + export_enabled=True, ingress_server_url="http://localhost:8080", ingress_server_auth_token="xyzzy", ingress_content_service_name="lightspeed-core", - collection_interval=60, ) # Mock the DataCollectorService to raise an exception diff --git a/tests/unit/services/test_data_collector.py b/tests/unit/services/test_data_collector.py index 0db7d4c9..15c157b2 100644 --- a/tests/unit/services/test_data_collector.py +++ b/tests/unit/services/test_data_collector.py @@ -7,20 +7,35 @@ from services.data_collector import DataCollectorService +def _create_test_service(**kwargs) -> DataCollectorService: + """Create a DataCollectorService instance with default test parameters.""" + defaults = { + "feedback_dir": Path("/tmp/feedback"), + "transcripts_dir": Path("/tmp/transcripts"), + "collection_interval": 60, + "ingress_server_url": "http://test-server.com", + "ingress_server_auth_token": "test-token", + "ingress_content_service_name": "test-service", + "connection_timeout": 30, + "cleanup_after_send": True, + } + defaults.update(kwargs) + return DataCollectorService(**defaults) + + def test_data_collector_service_creation() -> None: """Test that DataCollectorService can be created.""" - service = DataCollectorService() + service = _create_test_service() assert service is not None + assert service.feedback_dir == Path("/tmp/feedback") + assert service.transcripts_dir == Path("/tmp/transcripts") + assert service.collection_interval == 60 @patch("services.data_collector.time.sleep") -@patch("services.data_collector.configuration") -def test_run_normal_operation(mock_config, mock_sleep) -> None: +def test_run_normal_operation(mock_sleep) -> None: """Test normal operation of the run method.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.data_collector.collection_interval = ( - 60 - ) + service = _create_test_service() with patch.object(service, "_perform_collection") as mock_perform: mock_perform.side_effect = [None, KeyboardInterrupt()] @@ -32,10 +47,9 @@ def test_run_normal_operation(mock_config, mock_sleep) -> None: @patch("services.data_collector.time.sleep") -@patch("services.data_collector.configuration") -def test_run_with_exception(mock_config, mock_sleep) -> None: +def test_run_with_exception(mock_sleep) -> None: """Test run method with exception handling.""" - service = DataCollectorService() + service = _create_test_service() with patch.object(service, "_perform_collection") as mock_perform: mock_perform.side_effect = [OSError("Test error"), KeyboardInterrupt()] @@ -43,534 +57,470 @@ def test_run_with_exception(mock_config, mock_sleep) -> None: service.run() assert mock_perform.call_count == 2 - mock_sleep.assert_called_once_with(300) + mock_sleep.assert_called_once_with( + 300 + ) # constants.DATA_COLLECTOR_RETRY_INTERVAL -@patch("services.data_collector.configuration") -def test_collect_feedback_files_disabled(mock_config) -> None: - """Test collecting feedback files when disabled.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.feedback_enabled = False +def test_collect_feedback_files_directory_not_exists() -> None: + """Test collecting feedback files when directory doesn't exist.""" + service = _create_test_service(feedback_dir=Path("/nonexistent/feedback")) result = service._collect_feedback_files() assert result == [] -@patch("services.data_collector.configuration") -def test_collect_feedback_files_directory_not_exists(mock_config) -> None: - """Test collecting feedback files when directory doesn't exist.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.feedback_enabled = True - mock_config.user_data_collection_configuration.feedback_storage = Path( - "/tmp/feedback" - ) - - with patch("services.data_collector.Path") as mock_path: - mock_path.return_value.exists.return_value = False - - result = service._collect_feedback_files() - assert result == [] - - -@patch("services.data_collector.configuration") -def test_collect_feedback_files_success(mock_config) -> None: +def test_collect_feedback_files_success() -> None: """Test collecting feedback files successfully.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.feedback_enabled = True - - # Create a mock Path object with the required methods - mock_feedback_dir = MagicMock() - mock_feedback_dir.exists.return_value = True - mock_files = [Path("/tmp/feedback/file1.json")] - mock_feedback_dir.glob.return_value = mock_files + service = _create_test_service() + mock_files = [Path("/tmp/feedback/file1.json"), Path("/tmp/feedback/file2.json")] - mock_config.user_data_collection_configuration.feedback_storage = mock_feedback_dir + with ( + patch("pathlib.Path.exists", return_value=True), + patch("pathlib.Path.glob", return_value=mock_files) as mock_glob, # noqa: F841 + ): - result = service._collect_feedback_files() - assert result == mock_files + result = service._collect_feedback_files() + assert result == mock_files -@patch("services.data_collector.configuration") -def test_collect_transcript_files_disabled(mock_config) -> None: - """Test collecting transcript files when disabled.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.transcripts_enabled = False +def test_collect_transcript_files_directory_not_exists() -> None: + """Test collecting transcript files when directory doesn't exist.""" + service = _create_test_service(transcripts_dir=Path("/nonexistent/transcripts")) result = service._collect_transcript_files() assert result == [] -@patch("services.data_collector.configuration") -def test_collect_transcript_files_directory_not_exists(mock_config) -> None: - """Test collecting transcript files when directory doesn't exist.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.transcripts_enabled = True - mock_config.user_data_collection_configuration.transcripts_storage = Path( - "/tmp/transcripts" - ) - - with patch("services.data_collector.Path") as mock_path: - mock_path.return_value.exists.return_value = False - - result = service._collect_transcript_files() - assert result == [] - - -@patch("services.data_collector.configuration") -def test_collect_transcript_files_success(mock_config) -> None: +def test_collect_transcript_files_success() -> None: """Test collecting transcript files successfully.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.transcripts_enabled = True - - # Create a mock Path object with required methods - mock_transcripts_dir = MagicMock() - mock_transcripts_dir.exists.return_value = True + service = _create_test_service() mock_files = [Path("/tmp/transcripts/user1/conv1/file1.json")] - mock_transcripts_dir.rglob.return_value = mock_files - mock_config.user_data_collection_configuration.transcripts_storage = ( - mock_transcripts_dir - ) + with ( + patch("pathlib.Path.exists", return_value=True), + patch( + "pathlib.Path.rglob", return_value=mock_files + ) as mock_rglob, # noqa: F841 + ): - result = service._collect_transcript_files() - assert result == mock_files + result = service._collect_transcript_files() + assert result == mock_files -@patch("services.data_collector.configuration") -def test_perform_collection_no_files(mock_config) -> None: - """Test _perform_collection when no files are found.""" - service = DataCollectorService() +def test_perform_collection_no_files() -> None: + """Test perform collection when no files are found.""" + service = _create_test_service() with ( patch.object(service, "_collect_feedback_files", return_value=[]), patch.object(service, "_collect_transcript_files", return_value=[]), ): - service._perform_collection() + # Should not raise any exceptions and should return early + service._perform_collection() -@patch("services.data_collector.configuration") -def test_perform_collection_with_files(mock_config) -> None: - """Test _perform_collection when files are found.""" - service = DataCollectorService() +def test_perform_collection_with_files() -> None: + """Test perform collection with files.""" + service = _create_test_service() feedback_files = [Path("/tmp/feedback/file1.json")] + transcript_files = [Path("/tmp/transcripts/user1/conv1/file1.json")] with ( patch.object(service, "_collect_feedback_files", return_value=feedback_files), - patch.object(service, "_collect_transcript_files", return_value=[]), - patch.object(service, "_create_and_send_tarball", return_value=1), + patch.object( + service, "_collect_transcript_files", return_value=transcript_files + ), + patch.object( + service, "_create_and_send_tarball", return_value=1 + ) as mock_create_send, ): + service._perform_collection() + # Should be called once for feedback and once for transcripts + assert mock_create_send.call_count == 2 + mock_create_send.assert_any_call( + feedback_files, "feedback", service.feedback_dir + ) + mock_create_send.assert_any_call( + transcript_files, "transcripts", service.transcripts_dir + ) + -@patch("services.data_collector.configuration") -def test_perform_collection_with_exception(mock_config) -> None: - """Test _perform_collection when an exception occurs.""" - service = DataCollectorService() +def test_perform_collection_with_exception() -> None: + """Test perform collection with exception.""" + service = _create_test_service() + feedback_files = [Path("/tmp/feedback/file1.json")] with ( - patch.object( - service, "_collect_feedback_files", return_value=[Path("/tmp/test.json")] - ), + patch.object(service, "_collect_feedback_files", return_value=feedback_files), patch.object(service, "_collect_transcript_files", return_value=[]), patch.object( - service, "_create_and_send_tarball", side_effect=Exception("Test error") + service, "_create_and_send_tarball", side_effect=OSError("Test error") ), ): + + # Should re-raise the exception try: service._perform_collection() - assert False, "Expected exception" - except Exception as e: + assert False, "Expected OSError to be raised" + except OSError as e: assert str(e) == "Test error" -@patch("services.data_collector.configuration") -def test_create_and_send_tarball_no_files(mock_config) -> None: - """Test creating tarball with no files.""" - service = DataCollectorService() +def test_create_and_send_tarball_no_files() -> None: + """Test create and send tarball with no files.""" + service = _create_test_service() - result = service._create_and_send_tarball([], "test", Path("/tmp")) + result = service._create_and_send_tarball([], "feedback", service.feedback_dir) assert result == 0 -@patch("services.data_collector.configuration") -def test_create_and_send_tarball_success(mock_config) -> None: - """Test creating and sending tarball successfully.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.data_collector.cleanup_after_send = ( - True - ) - - files = [Path("/tmp/test/file1.json")] - tarball_path = Path("/tmp/test_tarball.tar.gz") +def test_create_and_send_tarball_success() -> None: + """Test create and send tarball successfully.""" + service = _create_test_service() + files = [Path("/tmp/feedback/file1.json")] with ( - patch.object(service, "_create_tarball", return_value=tarball_path), - patch.object(service, "_send_tarball"), - patch.object(service, "_cleanup_files"), - patch.object(service, "_cleanup_empty_directories"), - patch.object(service, "_cleanup_tarball"), + patch.object( + service, "_create_tarball", return_value=Path("/tmp/test.tar.gz") + ) as mock_create, + patch.object(service, "_send_tarball") as mock_send, + patch.object(service, "_cleanup_files") as mock_cleanup_files, + patch.object(service, "_cleanup_empty_directories") as mock_cleanup_dirs, + patch.object(service, "_cleanup_tarball") as mock_cleanup_tarball, ): - result = service._create_and_send_tarball(files, "test", Path("/tmp")) - assert result == 1 + result = service._create_and_send_tarball( + files, "feedback", service.feedback_dir + ) + + assert result == 1 + mock_create.assert_called_once_with(files, "feedback", service.feedback_dir) + mock_send.assert_called_once_with(Path("/tmp/test.tar.gz")) + mock_cleanup_files.assert_called_once_with(files) + mock_cleanup_dirs.assert_called_once() + mock_cleanup_tarball.assert_called_once_with(Path("/tmp/test.tar.gz")) -@patch("services.data_collector.configuration") -def test_create_and_send_tarball_no_cleanup(mock_config) -> None: - """Test creating and sending tarball without cleanup.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.data_collector.cleanup_after_send = ( - False - ) - files = [Path("/tmp/test/file1.json")] +def test_create_and_send_tarball_no_cleanup() -> None: + """Test create and send tarball without cleanup.""" + service = _create_test_service(cleanup_after_send=False) + files = [Path("/tmp/feedback/file1.json")] with ( - patch.object(service, "_create_tarball", return_value=Path("/tmp/test.tar.gz")), - patch.object(service, "_send_tarball"), - patch.object(service, "_cleanup_tarball"), + patch.object( + service, "_create_tarball", return_value=Path("/tmp/test.tar.gz") + ) as mock_create, + patch.object(service, "_send_tarball") as mock_send, + patch.object(service, "_cleanup_files") as mock_cleanup_files, + patch.object(service, "_cleanup_empty_directories") as mock_cleanup_dirs, + patch.object(service, "_cleanup_tarball") as mock_cleanup_tarball, ): - result = service._create_and_send_tarball(files, "test", Path("/tmp")) + + result = service._create_and_send_tarball( + files, "feedback", service.feedback_dir + ) + assert result == 1 + mock_create.assert_called_once_with(files, "feedback", service.feedback_dir) + mock_send.assert_called_once_with(Path("/tmp/test.tar.gz")) + # Cleanup should not be called when cleanup_after_send is False + mock_cleanup_files.assert_not_called() + mock_cleanup_dirs.assert_not_called() + # But tarball cleanup should still happen + mock_cleanup_tarball.assert_called_once_with(Path("/tmp/test.tar.gz")) -@patch("services.data_collector.datetime") -@patch("services.data_collector.tempfile.gettempdir") @patch("services.data_collector.tarfile.open") -def test_create_tarball_success(mock_tarfile, mock_gettempdir, mock_datetime) -> None: +@patch("services.data_collector.tempfile.gettempdir", return_value="/tmp") +@patch("services.data_collector.datetime") +def test_create_tarball_success( + mock_datetime, mock_gettempdir, mock_tarfile_open +) -> None: """Test creating tarball successfully.""" - service = DataCollectorService() - mock_datetime.now.return_value.strftime.return_value = "20230101_120000" - mock_gettempdir.return_value = "/tmp" + service = _create_test_service() + files = [Path("/tmp/feedback/file1.json")] - mock_tar = MagicMock() - mock_tarfile.return_value.__enter__.return_value = mock_tar + # Mock datetime to return predictable timestamp + mock_datetime.now.return_value.strftime.return_value = "20231201_120000" - files = [Path("/data/test/file1.json")] + # Mock tarfile + mock_tar = MagicMock() + mock_tarfile_open.return_value.__enter__.return_value = mock_tar + # Mock Path.stat() for the created tarball with patch.object(Path, "stat") as mock_stat: mock_stat.return_value.st_size = 1024 - result = service._create_tarball(files, "test", Path("/data")) + result = service._create_tarball(files, "feedback", service.feedback_dir) - expected_path = Path("/tmp/test_20230101_120000.tar.gz") + expected_path = Path("/tmp/feedback_20231201_120000.tar.gz") assert result == expected_path + mock_tarfile_open.assert_called_once_with(expected_path, "w:gz") mock_tar.add.assert_called_once() -@patch("services.data_collector.datetime") -@patch("services.data_collector.tempfile.gettempdir") @patch("services.data_collector.tarfile.open") -def test_create_tarball_file_add_error( - mock_tarfile, mock_gettempdir, mock_datetime -) -> None: +def test_create_tarball_file_add_error(mock_tarfile_open) -> None: """Test creating tarball with file add error.""" - service = DataCollectorService() - mock_datetime.now.return_value.strftime.return_value = "20230101_120000" - mock_gettempdir.return_value = "/tmp" + service = _create_test_service() + files = [Path("/tmp/feedback/file1.json")] + # Mock tarfile to raise error on add mock_tar = MagicMock() - mock_tar.add.side_effect = OSError("File error") - mock_tarfile.return_value.__enter__.return_value = mock_tar - - files = [Path("/data/test/file1.json")] + mock_tar.add.side_effect = OSError("Permission denied") + mock_tarfile_open.return_value.__enter__.return_value = mock_tar with patch.object(Path, "stat") as mock_stat: mock_stat.return_value.st_size = 1024 - result = service._create_tarball(files, "test", Path("/data")) - - expected_path = Path("/tmp/test_20230101_120000.tar.gz") - assert result == expected_path + # Should not raise exception, just log warning + result = service._create_tarball(files, "feedback", service.feedback_dir) + assert isinstance(result, Path) -@patch("services.data_collector.configuration") @patch("services.data_collector.requests.post") -def test_send_tarball_success(mock_post, mock_config) -> None: - """Test successful tarball sending.""" - service = DataCollectorService() - - mock_config.user_data_collection_configuration.data_collector.ingress_server_url = ( - "http://test.com" - ) - mock_config.user_data_collection_configuration.data_collector.ingress_server_auth_token = ( - "token" - ) - mock_config.user_data_collection_configuration.data_collector.connection_timeout = ( - 30 - ) +def test_send_tarball_success(mock_post) -> None: + """Test sending tarball successfully.""" + service = _create_test_service() + tarball_path = Path("/tmp/test.tar.gz") + # Mock successful response mock_response = MagicMock() mock_response.status_code = 200 mock_post.return_value = mock_response - with patch("builtins.open", create=True) as mock_open: + with patch("builtins.open", mock_data=b"test data") as mock_open: mock_open.return_value.__enter__.return_value.read.return_value = b"test data" - service._send_tarball(Path("/tmp/test.tar.gz")) + + service._send_tarball(tarball_path) mock_post.assert_called_once() + call_args = mock_post.call_args + assert call_args[1]["data"] == b"test data" + assert "Authorization" in call_args[1]["headers"] + assert call_args[1]["headers"]["Authorization"] == "Bearer test-token" -@patch("services.data_collector.configuration") @patch("services.data_collector.requests.post") -def test_send_tarball_no_auth_token(mock_post, mock_config) -> None: +def test_send_tarball_no_auth_token(mock_post) -> None: """Test sending tarball without auth token.""" - service = DataCollectorService() - - mock_config.user_data_collection_configuration.data_collector.ingress_server_url = ( - "http://test.com" - ) - mock_config.user_data_collection_configuration.data_collector.ingress_server_auth_token = ( - None - ) - mock_config.user_data_collection_configuration.data_collector.connection_timeout = ( - 30 - ) + service = _create_test_service(ingress_server_auth_token="") + tarball_path = Path("/tmp/test.tar.gz") + # Mock successful response mock_response = MagicMock() mock_response.status_code = 200 mock_post.return_value = mock_response - with patch("builtins.open", create=True): - service._send_tarball(Path("/tmp/test.tar.gz")) + with patch("builtins.open", mock_data=b"test data") as mock_open: + mock_open.return_value.__enter__.return_value.read.return_value = b"test data" + + service._send_tarball(tarball_path) + mock_post.assert_called_once() + call_args = mock_post.call_args + assert "Authorization" not in call_args[1]["headers"] -@patch("services.data_collector.configuration") @patch("services.data_collector.requests.post") -def test_send_tarball_http_error(mock_post, mock_config) -> None: - """Test tarball sending with HTTP error.""" - service = DataCollectorService() - - mock_config.user_data_collection_configuration.data_collector.ingress_server_url = ( - "http://test.com" - ) - mock_config.user_data_collection_configuration.data_collector.connection_timeout = ( - 30 - ) +def test_send_tarball_http_error(mock_post) -> None: + """Test sending tarball with HTTP error.""" + service = _create_test_service() + tarball_path = Path("/tmp/test.tar.gz") + # Mock error response mock_response = MagicMock() mock_response.status_code = 500 - mock_response.text = "Server Error" + mock_response.text = "Internal Server Error" mock_post.return_value = mock_response - with patch("builtins.open", create=True): - try: - service._send_tarball(Path("/tmp/test.tar.gz")) - assert False, "Expected exception" - except Exception as e: - assert "Failed to send tarball" in str(e) + with patch("builtins.open", mock_data=b"test data") as mock_open: + mock_open.return_value.__enter__.return_value.read.return_value = b"test data" + try: + service._send_tarball(tarball_path) + assert False, "Expected HTTPError to be raised" + except requests.HTTPError as e: + assert "500" in str(e) -@patch("services.data_collector.configuration") -def test_send_tarball_missing_url(mock_config) -> None: - """Test tarball sending when ingress server URL is None.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.data_collector.ingress_server_url = ( - None - ) +def test_send_tarball_missing_url() -> None: + """Test sending tarball with missing URL.""" + service = _create_test_service(ingress_server_url="") + tarball_path = Path("/tmp/test.tar.gz") - try: - service._send_tarball(Path("/tmp/test.tar.gz")) - assert False, "Expected ValueError" - except ValueError as e: - assert "Ingress server URL is not configured" in str(e) + with patch("builtins.open", mock_data=b"test data") as mock_open: + mock_open.return_value.__enter__.return_value.read.return_value = b"test data" + # This should raise an exception when requests.post is called with empty URL + try: + service._send_tarball(tarball_path) + except Exception: + pass # Expected to fail with empty URL -@patch("services.data_collector.configuration") -def test_perform_collection_with_specific_exceptions(mock_config) -> None: - """Test _perform_collection with specific exception types that should be caught.""" - service = DataCollectorService() - # Test with OSError - with ( - patch.object( - service, "_collect_feedback_files", return_value=[Path("/tmp/test.json")] - ), - patch.object(service, "_collect_transcript_files", return_value=[]), - patch.object( - service, "_create_and_send_tarball", side_effect=OSError("OS Error") - ), - ): - try: - service._perform_collection() - assert False, "Expected OSError" - except OSError as e: - assert str(e) == "OS Error" +def test_perform_collection_with_specific_exceptions() -> None: + """Test perform collection with specific exception types.""" + service = _create_test_service() + feedback_files = [Path("/tmp/feedback/file1.json")] # Test with requests.RequestException with ( - patch.object( - service, "_collect_feedback_files", return_value=[Path("/tmp/test.json")] - ), + patch.object(service, "_collect_feedback_files", return_value=feedback_files), patch.object(service, "_collect_transcript_files", return_value=[]), patch.object( service, "_create_and_send_tarball", - side_effect=requests.RequestException("Request Error"), + side_effect=requests.RequestException("Network error"), ), ): + try: service._perform_collection() - assert False, "Expected RequestException" - except requests.RequestException as e: - assert str(e) == "Request Error" + assert False, "Expected RequestException to be raised" + except requests.RequestException: + pass # Test with tarfile.TarError with ( - patch.object( - service, "_collect_feedback_files", return_value=[Path("/tmp/test.json")] - ), + patch.object(service, "_collect_feedback_files", return_value=feedback_files), patch.object(service, "_collect_transcript_files", return_value=[]), patch.object( service, "_create_and_send_tarball", - side_effect=tarfile.TarError("Tar Error"), + side_effect=tarfile.TarError("Tar error"), ), ): + try: service._perform_collection() - assert False, "Expected TarError" - except tarfile.TarError as e: - assert str(e) == "Tar Error" + assert False, "Expected TarError to be raised" + except tarfile.TarError: + pass def test_cleanup_files_success() -> None: - """Test successful file cleanup.""" - service = DataCollectorService() - files = [Path("/tmp/test1.json"), Path("/tmp/test2.json")] + """Test cleaning up files successfully.""" + service = _create_test_service() + files = [Path("/tmp/file1.json"), Path("/tmp/file2.json")] with patch.object(Path, "unlink") as mock_unlink: service._cleanup_files(files) + assert mock_unlink.call_count == 2 def test_cleanup_files_with_error() -> None: - """Test file cleanup with error.""" - service = DataCollectorService() - files = [Path("/tmp/test1.json")] + """Test cleaning up files with error.""" + service = _create_test_service() + files = [Path("/tmp/file1.json")] - with patch.object(Path, "unlink") as mock_unlink: - mock_unlink.side_effect = OSError("Permission denied") + with patch.object(Path, "unlink", side_effect=OSError("Permission denied")): + # Should not raise exception, just log warning service._cleanup_files(files) - mock_unlink.assert_called_once() def test_cleanup_tarball_success() -> None: - """Test successful tarball cleanup.""" - service = DataCollectorService() + """Test cleaning up tarball successfully.""" + service = _create_test_service() + tarball_path = Path("/tmp/test.tar.gz") with patch.object(Path, "unlink") as mock_unlink: - service._cleanup_tarball(Path("/tmp/test.tar.gz")) + service._cleanup_tarball(tarball_path) mock_unlink.assert_called_once() def test_cleanup_tarball_with_error() -> None: - """Test tarball cleanup with error.""" - service = DataCollectorService() - - with patch.object(Path, "unlink") as mock_unlink: - mock_unlink.side_effect = OSError("Permission denied") - service._cleanup_tarball(Path("/tmp/test.tar.gz")) - mock_unlink.assert_called_once() - - -@patch("services.data_collector.configuration") -def test_cleanup_empty_directories_disabled(mock_config) -> None: - """Test directory cleanup when transcripts disabled.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.transcripts_enabled = False - - service._cleanup_empty_directories() - + """Test cleaning up tarball with error.""" + service = _create_test_service() + tarball_path = Path("/tmp/test.tar.gz") -@patch("services.data_collector.configuration") -def test_cleanup_empty_directories_success(mock_config) -> None: - """Test successful directory cleanup.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.transcripts_enabled = True + with patch.object(Path, "unlink", side_effect=OSError("Permission denied")): + # Should not raise exception, just log warning + service._cleanup_tarball(tarball_path) - transcripts_dir = MagicMock() - user_dir = MagicMock() - conv_dir = MagicMock() - transcripts_dir.exists.return_value = True - transcripts_dir.iterdir.return_value = [user_dir] - user_dir.is_dir.return_value = True - user_dir.iterdir.side_effect = [ - [conv_dir], - [], - ] # First call returns conv_dir, second call empty - conv_dir.is_dir.return_value = True - conv_dir.iterdir.return_value = [] # Empty directory - - mock_config.user_data_collection_configuration.transcripts_storage = transcripts_dir +def test_cleanup_empty_directories_directory_not_exists() -> None: + """Test cleanup empty directories when directory doesn't exist.""" + service = _create_test_service(transcripts_dir=Path("/nonexistent")) + # Should not raise exception service._cleanup_empty_directories() - conv_dir.rmdir.assert_called_once() - user_dir.rmdir.assert_called_once() - - -@patch("services.data_collector.configuration") -def test_cleanup_empty_directories_with_errors(mock_config) -> None: - """Test directory cleanup when rmdir operations fail.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.transcripts_enabled = True - transcripts_dir = MagicMock() - user_dir = MagicMock() - conv_dir = MagicMock() +def test_cleanup_empty_directories_success() -> None: + """Test cleaning up empty directories successfully.""" + service = _create_test_service() - transcripts_dir.exists.return_value = True - transcripts_dir.iterdir.return_value = [user_dir] - user_dir.is_dir.return_value = True - user_dir.iterdir.side_effect = [[conv_dir], []] - conv_dir.is_dir.return_value = True - conv_dir.iterdir.return_value = [] + # Mock directory structure + mock_user_dir = MagicMock() + mock_conv_dir = MagicMock() + mock_conv_dir.is_dir.return_value = True + mock_conv_dir.iterdir.return_value = [] # Empty directory + mock_user_dir.is_dir.return_value = True + mock_user_dir.iterdir.return_value = [mock_conv_dir] - # Both rmdir operations fail - conv_dir.rmdir.side_effect = OSError("Permission denied") - user_dir.rmdir.side_effect = OSError("Permission denied") + with ( + patch("pathlib.Path.exists", return_value=True), + patch("pathlib.Path.iterdir", return_value=[mock_user_dir]), + ): - mock_config.user_data_collection_configuration.transcripts_storage = transcripts_dir + # After removing conv_dir, user_dir should be empty + mock_user_dir.iterdir.side_effect = [ + [mock_conv_dir], + [], + ] # First call returns conv_dir, second call empty - # Should not raise exception - service._cleanup_empty_directories() + service._cleanup_empty_directories() - conv_dir.rmdir.assert_called_once() - user_dir.rmdir.assert_called_once() + mock_conv_dir.rmdir.assert_called_once() + mock_user_dir.rmdir.assert_called_once() -@patch("services.data_collector.configuration") -def test_cleanup_empty_directories_directory_not_exists(mock_config) -> None: - """Test directory cleanup when transcripts directory doesn't exist.""" - service = DataCollectorService() - mock_config.user_data_collection_configuration.transcripts_enabled = True +def test_cleanup_empty_directories_with_errors() -> None: + """Test cleaning up empty directories with errors.""" + service = _create_test_service() - # Create a mock Path object that doesn't exist - mock_transcripts_dir = MagicMock() - mock_transcripts_dir.exists.return_value = False - mock_config.user_data_collection_configuration.transcripts_storage = ( - mock_transcripts_dir - ) + # Mock directory structure + mock_user_dir = MagicMock() + mock_conv_dir = MagicMock() + mock_conv_dir.is_dir.return_value = True + mock_conv_dir.iterdir.return_value = [] # Empty directory + mock_conv_dir.rmdir.side_effect = OSError("Permission denied") + mock_user_dir.is_dir.return_value = True + mock_user_dir.iterdir.return_value = [mock_conv_dir] - service._cleanup_empty_directories() + with ( + patch("pathlib.Path.exists", return_value=True), + patch("pathlib.Path.iterdir", return_value=[mock_user_dir]), + ): + # Should not raise exception even with rmdir errors + service._cleanup_empty_directories() -@patch("services.data_collector.configuration") -def test_perform_collection_with_transcript_files(mock_config) -> None: - """Test _perform_collection with transcript files only.""" - service = DataCollectorService() - transcript_files = [Path("/tmp/transcripts/file1.json")] +def test_perform_collection_with_transcript_files() -> None: + """Test perform collection with only transcript files.""" + service = _create_test_service() + transcript_files = [Path("/tmp/transcripts/user1/conv1/file1.json")] with ( patch.object(service, "_collect_feedback_files", return_value=[]), patch.object( service, "_collect_transcript_files", return_value=transcript_files ), - patch.object(service, "_create_and_send_tarball", return_value=1), + patch.object( + service, "_create_and_send_tarball", return_value=1 + ) as mock_create_send, ): + service._perform_collection() + + # Should be called once for transcripts only + mock_create_send.assert_called_once_with( + transcript_files, "transcripts", service.transcripts_dir + )