Merged
Commits
18 commits
bfec4f3
Add data collector configuration and integrate service startup/shutdown
jrobertboos Jul 3, 2025
c717127
Add data collector service and update Makefile
jrobertboos Jul 7, 2025
dba0c08
Enhance data collector configuration and update related services
jrobertboos Jul 7, 2025
da036c0
Add types-requests to development dependencies
jrobertboos Jul 7, 2025
95ab581
Enhance unit tests for DataCollectorService
jrobertboos Jul 7, 2025
acdaa3b
Refactor DataCollectorConfiguration and enhance error handling
jrobertboos Jul 9, 2025
751d103
Add data collector constants and refactor configuration model
jrobertboos Jul 10, 2025
98fed29
Add ingress content service name to data collector configuration
jrobertboos Jul 14, 2025
4887e2a
Add data collector configuration and integrate service startup/shutdown
jrobertboos Jul 3, 2025
0c7c878
Add data collector service and update Makefile
jrobertboos Jul 7, 2025
cdf7907
Enhance data collector configuration and update related services
jrobertboos Jul 7, 2025
773374c
Add types-requests to development dependencies
jrobertboos Jul 7, 2025
e33a51b
Enhance unit tests for DataCollectorService
jrobertboos Jul 7, 2025
51ef7f4
Refactor DataCollectorConfiguration and enhance error handling
jrobertboos Jul 9, 2025
b866386
Add data collector constants and refactor configuration model
jrobertboos Jul 10, 2025
2e8d59a
Add ingress content service name to data collector configuration
jrobertboos Jul 14, 2025
636b6c9
Merge branch 'lcore-302' of github.com:jrobertboos/lightspeed-stack i…
jrobertboos Jul 14, 2025
f8becbf
Update package versions in lock file and clean up config imports
jrobertboos Jul 14, 2025
3 changes: 3 additions & 0 deletions Makefile
@@ -8,6 +8,9 @@ PYTHON_REGISTRY = pypi
run: ## Run the service locally
uv run src/lightspeed_stack.py

run-data-collector: ## Run the data collector service locally
uv run src/lightspeed_stack.py --data-collector

test-unit: ## Run the unit tests
@echo "Running unit tests..."
@echo "Reports will be written to ${ARTIFACT_DIR}"
44 changes: 44 additions & 0 deletions README.md
@@ -157,6 +157,7 @@ Usage: make <OPTIONS> ... <TARGETS>
Available targets are:

run Run the service locally
run-data-collector Run the data collector service
test-unit Run the unit tests
test-integration Run integration tests
test-e2e Run BDD tests for the service
@@ -308,3 +309,46 @@ This script re-generates the OpenAPI schema for the Lightspeed Service REST API.
make schema
```

## Data Collector Service

The data collector service is a standalone service that runs separately from the main web service. It collects user data, such as feedback and transcripts, and sends it to an ingress server for analysis and archival.

### Features

- **Periodic Collection**: Runs at configurable intervals
- **Data Packaging**: Packages feedback and transcript files into compressed tar.gz archives
- **Secure Transmission**: Sends data to a configured ingress server with optional authentication
- **File Cleanup**: Optionally removes local files after successful transmission
- **Error Handling**: Includes retry logic and comprehensive error handling
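The packaging step can be sketched in a few lines of standard-library Python. This is an illustrative helper only — `package_files` and the archive name are assumptions, not the service's actual code:

```python
import tarfile
import tempfile
from pathlib import Path


def package_files(paths: list[Path], archive_dir: Path) -> Path:
    """Bundle collected files into a compressed tar.gz archive."""
    archive_path = archive_dir / "collected_data.tar.gz"
    with tarfile.open(archive_path, "w:gz") as tar:
        for path in paths:
            # Store files under their bare names so the archive does not
            # leak the local directory layout.
            tar.add(path, arcname=path.name)
    return archive_path
```

The `w:gz` mode writes a gzip-compressed tarball in one pass, which matches the tar.gz format described above.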

### Configuration

The data collector service is configured through the `user_data_collection.data_collector` section in your configuration file:

```yaml
user_data_collection:
feedback_disabled: false
feedback_storage: "/tmp/data/feedback"
transcripts_disabled: false
transcripts_storage: "/tmp/data/transcripts"
data_collector:
enabled: true
ingress_server_url: "https://your-ingress-server.com"
ingress_server_auth_token: "your-auth-token"
ingress_content_service_name: "lightspeed-team"
collection_interval: 7200 # 2 hours in seconds
cleanup_after_send: true
connection_timeout: 30
```

A reviewer (Contributor) asked whether this should be a separate root-level configuration, like:

```yaml
user_data_collection:
  feedback_disabled: false
  feedback_storage: "/tmp/data/feedback"
  transcripts_disabled: false
  transcripts_storage: "/tmp/data/transcripts"
user_data_export:
  enabled: true
  ingress_server_url: "https://your-ingress-server.com"
  ingress_server_auth_token: "your-auth-token"
  ingress_content_service_name: "lightspeed-team"
  collection_interval: 7200  # 2 hours in seconds
  cleanup_after_send: true
  connection_timeout: 30
```

As those are two independent pieces.
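Wherever the section ends up living, the enable/require relationship is simple: when `enabled` is true, `ingress_server_url` and `ingress_content_service_name` must be set. A plain-Python mirror of those checks (the function name is an assumption for illustration; the project enforces this via a Pydantic model):

```python
from typing import Any


def validate_data_collector_config(section: dict[str, Any]) -> dict[str, Any]:
    """Mirror the checks performed on the data_collector config section."""
    if section.get("enabled"):
        if not section.get("ingress_server_url"):
            raise ValueError(
                "ingress_server_url is required when data collector is enabled"
            )
        if not section.get("ingress_content_service_name"):
            raise ValueError(
                "ingress_content_service_name is required when data collector is enabled"
            )
    # Fall back to the documented defaults for optional keys.
    section.setdefault("collection_interval", 7200)
    section.setdefault("connection_timeout", 30)
    return section
```

A disabled section passes with no required keys, which is why `enabled: false` is a safe default.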

### Running the Service

To run the data collector service:

```bash
# Using Python directly
uv run src/lightspeed_stack.py --data-collector

# Using Make target
make run-data-collector
```
8 changes: 8 additions & 0 deletions lightspeed-stack.yaml
@@ -20,5 +20,13 @@ user_data_collection:
feedback_storage: "/tmp/data/feedback"
transcripts_disabled: false
transcripts_storage: "/tmp/data/transcripts"
data_collector:
enabled: false
ingress_server_url: null
ingress_server_auth_token: null
ingress_content_service_name: null
collection_interval: 7200 # 2 hours in seconds
cleanup_after_send: true
connection_timeout_seconds: 30
Comment on lines +23 to +30

Configuration section is well-structured with appropriate defaults.

The new data_collector section includes all necessary parameters with sensible default values. The service is disabled by default, which is appropriate for a new feature.

However, there's a potential field name inconsistency that should be verified.

Verify the field name consistency between the YAML configuration and the model definition:

The YAML uses connection_timeout_seconds while the model likely uses connection_timeout. This inconsistency could cause configuration parsing issues.


🏁 Script executed:

#!/bin/bash
# Check if the field name in YAML matches the model definition
echo "Checking field name in YAML configuration:"
grep -n "connection_timeout" lightspeed-stack.yaml

echo -e "\nChecking field name in model definition:"
ast-grep --pattern 'connection_timeout$_: $_' src/models/config.py

Length of output: 349


🏁 Script executed:

#!/bin/bash
# Search for 'connection_timeout_seconds' in model definitions
echo "Searching for 'connection_timeout_seconds' in model definitions:"
grep -R "connection_timeout_seconds" -n src/models

echo -e "\nSearching for DataCollector class definition and its fields:"
grep -R "class DataCollector" -n src/models

Length of output: 447


🏁 Script executed:

#!/bin/bash
# Locate the DataCollectorConfiguration class and list its fields
grep -n "class DataCollectorConfiguration" src/models/config.py
grep -A20 "class DataCollectorConfiguration" src/models/config.py

# Search specifically for timeout-related fields within the class
sed -n '88,120p' src/models/config.py | grep -n "timeout"

Length of output: 1387


Rename connection_timeout_seconds to connection_timeout in YAML

The Pydantic model DataCollectorConfiguration defines its timeout field as connection_timeout, so the YAML key connection_timeout_seconds will not be picked up (and may raise an “extra field” error or be ignored).

Please update lightspeed-stack.yaml at line 30:

   cleanup_after_send: true
-  connection_timeout_seconds: 30
+  connection_timeout: 30

This ensures the configuration maps correctly to DataCollectorConfiguration.connection_timeout.


authentication:
module: "noop"
1 change: 1 addition & 0 deletions pyproject.toml
@@ -47,6 +47,7 @@ dev = [
"pydocstyle>=6.3.0",
"mypy>=1.16.0",
"types-PyYAML>=6.0.2",
"types-requests>=2.28.0",
"ruff>=0.11.13",
"aiosqlite",
"behave>=1.2.6",
5 changes: 5 additions & 0 deletions src/constants.py
@@ -42,3 +42,8 @@
}
)
DEFAULT_AUTHENTICATION_MODULE = AUTH_MOD_NOOP

# Data collector constants
DATA_COLLECTOR_COLLECTION_INTERVAL = 7200 # 2 hours in seconds
DATA_COLLECTOR_CONNECTION_TIMEOUT = 30
DATA_COLLECTOR_RETRY_INTERVAL = 300 # 5 minutes in seconds
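These constants suggest a loop that collects on the long interval and retries sooner after a failure. A minimal sketch under that assumption — `run_collection_loop` is a hypothetical name, and the real service's loop may differ:

```python
import time
from typing import Callable

DATA_COLLECTOR_COLLECTION_INTERVAL = 7200  # 2 hours in seconds
DATA_COLLECTOR_RETRY_INTERVAL = 300        # 5 minutes in seconds


def run_collection_loop(
    collect_and_send: Callable[[], None],
    max_cycles: int,
    sleep: Callable[[float], None] = time.sleep,
) -> None:
    """Run collection periodically; after a failure, retry on the shorter interval."""
    for _ in range(max_cycles):
        try:
            collect_and_send()
        except Exception:  # broad on purpose: any transient failure triggers a retry
            sleep(DATA_COLLECTOR_RETRY_INTERVAL)
        else:
            sleep(DATA_COLLECTOR_COLLECTION_INTERVAL)
```

Injecting `sleep` keeps the loop testable without real waits.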
12 changes: 12 additions & 0 deletions src/lightspeed_stack.py
@@ -10,6 +10,7 @@
from rich.logging import RichHandler

from runners.uvicorn import start_uvicorn
from runners.data_collector import start_data_collector
from configuration import configuration
from client import LlamaStackClientHolder, AsyncLlamaStackClientHolder

@@ -47,6 +48,13 @@ def create_argument_parser() -> ArgumentParser:
help="path to configuration file (default: lightspeed-stack.yaml)",
default="lightspeed-stack.yaml",
)
parser.add_argument(
"--data-collector",
dest="start_data_collector",
help="start data collector service instead of web service",
action="store_true",
default=False,
)
return parser


@@ -70,6 +78,10 @@ def main() -> None:

if args.dump_configuration:
configuration.configuration.dump()
elif args.start_data_collector:
start_data_collector(
configuration.user_data_collection_configuration.data_collector
)
else:
start_uvicorn(configuration.service_configuration)
logger.info("Lightspeed stack finished")
28 changes: 27 additions & 1 deletion src/models/config.py
@@ -2,7 +2,7 @@

from typing import Optional

from pydantic import BaseModel, model_validator, FilePath, AnyHttpUrl
from pydantic import BaseModel, model_validator, FilePath, AnyHttpUrl, PositiveInt
from typing_extensions import Self

import constants
@@ -85,13 +85,39 @@ def check_llama_stack_model(self) -> Self:
return self


class DataCollectorConfiguration(BaseModel):
"""Data collector configuration for sending data to ingress server."""

enabled: bool = False
ingress_server_url: Optional[str] = None
ingress_server_auth_token: Optional[str] = None
ingress_content_service_name: Optional[str] = None
collection_interval: PositiveInt = constants.DATA_COLLECTOR_COLLECTION_INTERVAL
cleanup_after_send: bool = True # Remove local files after successful send
connection_timeout: PositiveInt = constants.DATA_COLLECTOR_CONNECTION_TIMEOUT

@model_validator(mode="after")
def check_data_collector_configuration(self) -> Self:
"""Check data collector configuration."""
if self.enabled and self.ingress_server_url is None:
raise ValueError(
"ingress_server_url is required when data collector is enabled"
)
if self.enabled and self.ingress_content_service_name is None:
raise ValueError(
"ingress_content_service_name is required when data collector is enabled"
)
return self


class UserDataCollection(BaseModel):
"""User data collection configuration."""

feedback_disabled: bool = True
feedback_storage: Optional[str] = None
transcripts_disabled: bool = True
transcripts_storage: Optional[str] = None
data_collector: DataCollectorConfiguration = DataCollectorConfiguration()

@model_validator(mode="after")
def check_storage_location_is_set_when_needed(self) -> Self:
26 changes: 26 additions & 0 deletions src/runners/data_collector.py
@@ -0,0 +1,26 @@
"""Data collector runner."""

import logging

from models.config import DataCollectorConfiguration
from services.data_collector import DataCollectorService

logger: logging.Logger = logging.getLogger(__name__)


def start_data_collector(configuration: DataCollectorConfiguration) -> None:
"""Start the data collector service as a standalone process."""
logger.info("Starting data collector runner")

if not configuration.enabled:
logger.info("Data collection is disabled")
return

try:
service = DataCollectorService()
service.run()
except Exception as e:
logger.error(
"Data collector service encountered an exception: %s", e, exc_info=True
)
raise
Comment on lines +19 to +26

Verify configuration consistency between runner and service

The runner receives a DataCollectorConfiguration parameter but from the relevant code snippets, the DataCollectorService appears to access configuration from a global configuration object (configuration.user_data_collection_configuration.data_collector). This could lead to inconsistency where the runner thinks data collection is enabled but the service has different configuration.

Consider either:

  1. Passing the configuration to the service constructor
  2. Using the same global configuration source in both places
  3. Adding validation to ensure consistency

🏁 Script executed:

#!/bin/bash
# Check how DataCollectorService accesses configuration
ast-grep --pattern 'class DataCollectorService {
  $$$
}'

# Check for configuration access patterns in the service
rg -A 10 "configuration\." src/services/data_collector.py

Length of output: 3498


I’d like to confirm how the runner receives and passes the configuration. Let’s pull the full runner code:


🏁 Script executed:

#!/bin/bash
sed -n '1,200p' src/runners/data_collector.py

Length of output: 783


Continuing verification of how configuration is passed into the service:


🏁 Script executed:

#!/bin/bash
# Show imports and top of DataCollectorService
sed -n '1,50p' src/services/data_collector.py

# Locate DataCollectorConfiguration definition
rg -n "class DataCollectorConfiguration" -A5 src/models/config.py

Length of output: 2088


Synchronize runner and service configuration sources

The start_data_collector runner takes a DataCollectorConfiguration parameter but never passes it into DataCollectorService, which always reads from the global configuration.user_data_collection_configuration.data_collector. This disconnect can cause the service to run with different settings than the runner expects.

Please update the implementation so both use the same config:

  • In src/runners/data_collector.py, pass the runner’s configuration into the service.
  • In src/services/data_collector.py, add a constructor (or method parameter) to accept and store that config instead of pulling from the global.
  • Replace all occurrences of configuration.user_data_collection_configuration.data_collector in the service with the injected config object.

Example diff:

# src/runners/data_collector.py
- service = DataCollectorService()
+ service = DataCollectorService(configuration)

# src/services/data_collector.py
-class DataCollectorService:  # pylint: disable=too-few-public-methods
+class DataCollectorService:  # pylint: disable=too-few-public-methods
+    def __init__(self, config: DataCollectorConfiguration):
+        self._collector_config = config

     def run(self) -> None:
-        collector_config = configuration.user_data_collection_configuration.data_collector
+        collector_config = self._collector_config
         logger.info("Starting data collection service")
         …
📝 Committable suggestion (review carefully before committing; ensure it accurately replaces the highlighted code and test thoroughly):

--- src/runners/data_collector.py
@@ lines 19-26
 try:
-    service = DataCollectorService()
+    service = DataCollectorService(configuration)
     service.run()
 except Exception as e:
     logger.error(
         "Data collector service encountered an exception: %s", e, exc_info=True
     )
     raise

--- src/services/data_collector.py
@@
 class DataCollectorService:  # pylint: disable=too-few-public-methods
+    def __init__(self, config: DataCollectorConfiguration):
+        self._collector_config = config
+
     def run(self) -> None:
-        collector_config = configuration.user_data_collection_configuration.data_collector
+        collector_config = self._collector_config
         logger.info("Starting data collection service")
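The constructor-injection pattern the reviewer proposes can be sketched as a standalone example (hypothetical types and return values for illustration; the real service's interface differs):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class DataCollectorConfig:
    enabled: bool = False
    ingress_server_url: Optional[str] = None


class DataCollectorService:
    """Service that receives its configuration explicitly (dependency injection)."""

    def __init__(self, config: DataCollectorConfig) -> None:
        self._config = config

    def run(self) -> str:
        # With injection, the runner and the service are guaranteed to see the
        # same settings -- there is no second, global source of truth to drift.
        if not self._config.enabled:
            return "disabled"
        return f"collecting for {self._config.ingress_server_url}"
```

Besides eliminating the drift the review flags, injection makes the service unit-testable without patching a global configuration object.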

1 change: 1 addition & 0 deletions src/services/__init__.py
@@ -0,0 +1 @@
"""Services package."""