diff --git a/docs/index.rst b/docs/index.rst index a196d98..c692b83 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,7 +11,7 @@ Jupyter Telemetry Telemetry provides a configurable traitlets object, EventLog, for structured event-logging in Python. It leverages Python's standard logging library for filtering, handling, and recording events. All events are validated (using jsonschema) against registered JSON schemas. -If you're looking for telemetry in Jupyter frontend applications (like JupyterLab), checkout the work happening in jupyterlab-telemetry_! +If you're looking for telemetry in Jupyter frontend applications (like JupyterLab), checkout the work happening in jupyterlab-telemetry_! .. _jupyterlab-telemetry: https://github.com/jupyterlab/jupyterlab-telemetry @@ -98,8 +98,8 @@ Schemas can be registered from a Python dict object, a file, or a URL. This exam :maxdepth: 2 :caption: Table of Contents: - pages/schemas.rst - + pages/schemas + pages/sensitive-data Indices and tables ------------------ diff --git a/docs/pages/schemas.rst b/docs/pages/schemas.rst index bf2ded8..180fc0d 100644 --- a/docs/pages/schemas.rst +++ b/docs/pages/schemas.rst @@ -2,7 +2,7 @@ Writing a Schema ================ -Schemas should follow valid `JSON schema`_. These schemas can be written in valid YAML or JSON. +Schemas should follow valid `JSON schema`_. These schemas can be written in valid YAML or JSON. At a minimum, valid schemas should have the following keys: @@ -16,7 +16,13 @@ At a minimum, valid schemas should have the following keys: + ``title`` : name of the property + ``description``: documentation for this property. - + ``pii``: (optional) boolean for whether this property is personally identifiable information or not. + + ``level``: the level of sensitivity of this property. + + Jupyter Telemetry provides three levels of sensitivity. 
The list of sensitivity levels, in increasing order: + + + ``'unrestricted'`` + + ``'user-identifier'`` + + ``'user-identifiable-information'`` - ``required``: list of required properties. @@ -33,11 +39,15 @@ Here is a minimal example of a valid JSON schema for an event. properties: name: title: Name - description: | - Name of event - type: string + level: unrestricted + description: Name of event + user: + title: User name + level: user-identifier + description: Name of user who initiated event required: - name + - user .. _JSON schema: https://json-schema.org/ \ No newline at end of file diff --git a/docs/pages/sensitive-data.rst b/docs/pages/sensitive-data.rst new file mode 100644 index 0000000..6f41822 --- /dev/null +++ b/docs/pages/sensitive-data.rst @@ -0,0 +1,42 @@ +Logging sensitive data +====================== + +Jupyter Telemetry requires that all fields in an event schema explicitly state when personally-identifiable information is being collected. Each property in an event schema must have a ``sensitivity_level`` attribute. + +Since events may include data with varying degrees of sensitivity, Jupyter Telemetry uses a multi-level security approach. It exposes three levels of sensitivity: + + + ``'unrestricted'`` + + ``'personal-identifier'`` + + ``'personally-identifiable-information'`` + +Each event property can be given one of these three levels: + +.. code-block:: yaml + + $id: example.schema + ... + sensitivity_level: + properties: + name: + title: Name + sensitivity_level: unrestricted + description: Name of event + type: string + user: + title: User name + sensitivity_level: personal-identifier + description: Name of the user who initiated the event. + affiliation: + title: I + +Jupyter Telemetry uses ``sensitivity_level`` to drop sensitive data when emitting events. By default, properties with a level greater than "unrestricted" are dropped from recorded event data. + +Each logging handler can increase the level of sensitive data it emits. 
This can be configured by changing its ``.event_level`` attribute. + +.. code-block:: python + + import logging + + handler = logging.FileHandler('events.log') + handler.event_level = 'secret' + diff --git a/jupyter_telemetry/eventlog.py b/jupyter_telemetry/eventlog.py index f0cb448..47f2eaf 100644 --- a/jupyter_telemetry/eventlog.py +++ b/jupyter_telemetry/eventlog.py @@ -6,13 +6,13 @@ from datetime import datetime import jsonschema -from pythonjsonlogger import jsonlogger from ruamel.yaml import YAML from traitlets import List from traitlets.config import Configurable, Config from .traits import Handlers from . import TELEMETRY_METADATA_VERSION +from .formatter import JsonEventFormatter yaml = YAML(typ='safe') @@ -64,19 +64,23 @@ def __init__(self, *args, **kwargs): self.log.propagate = False # We will use log.info to emit self.log.setLevel(logging.INFO) + self.schemas = {} if self.handlers: - formatter = jsonlogger.JsonFormatter(json_serializer=_skip_message) for handler in self.handlers: + # Create a formatter for this handler. + formatter = JsonEventFormatter( + logger=self, + handler=handler, + json_serializer=_skip_message + ) + # Set formatted for handler. handler.setFormatter(formatter) self.log.addHandler(handler) - self.schemas = {} - def _load_config(self, cfg, section_names=None, traits=None): """Load EventLog traits from a Config object, patching the handlers trait in the Config object to avoid deepcopy errors. 
- """ my_cfg = self._find_my_config(cfg) handlers = my_cfg.pop("handlers", []) @@ -112,7 +116,7 @@ def register_schema(self, schema): jsonschema.validators.validator_for(schema).check_schema(schema) # Check that the properties we require are present - required_schema_fields = {'$id', 'version'} + required_schema_fields = {'$id', 'version', 'properties'} for rsf in required_schema_fields: if rsf not in schema: raise ValueError( diff --git a/jupyter_telemetry/formatter.py b/jupyter_telemetry/formatter.py new file mode 100644 index 0000000..f4c895d --- /dev/null +++ b/jupyter_telemetry/formatter.py @@ -0,0 +1,58 @@ +from traitlets import HasTraits, validate, Set +from pythonjsonlogger import jsonlogger + + +class JsonEventFormatter(jsonlogger.JsonFormatter): + """Patch the jsonlogger formatter to include levels for telemetry. + + Properties in a logged event that has a level less than + the handler's event_level will be dropped from the emitted event. + """ + def __init__(self, logger, handler, *args, **kwargs): + self.logger = logger + self.handler = handler + super(JsonEventFormatter, self).__init__(*args, **kwargs) + + @property + def allowed_tags(self): + return getattr(self.handler, 'allowed_tags', {}) + + @property + def hashed_tags(self): + return getattr(self.handler, 'hashed_tags', {}) + + def process_log_record(self, log_record): + log_record = super(JsonEventFormatter, self).process_log_record(log_record) + return self.process_tags(log_record) + + def drop_property(self, key, record): + del record[key] + return record + + def hash_property(self, key, record): + hash_function = lambda x: x + record[key] = hash_function(record[key]) + return record + + def process_tags(self, log_record): + """ + """ + # Registered schemas are identified by their name and version. 
+ key = (log_record['__schema__'], log_record['__schema_version__']) + schema = self.logger.schemas[key]['properties'] + props = [key for key in log_record.keys() + if not key.startswith('__') and key != 'message'] + + # Walk through the recorded event and handle each key + # based on its tag/category. + for key in props: + tag = schema[key]['tag'] + if tag in self.allowed_tags or tag == "unrestricted": + # If the tag is found in the allowed_tags trait, do nothing. + if tag in self.hashed_tags: + log_record = self.hash_property(key, log_record) + # Drop tags not listed in allowed_tags + else: + log_record = self.drop_property(key, log_record) + + return log_record diff --git a/tests/test_eventlog.py b/tests/test_eventlog.py index f706353..1a0409b 100644 --- a/tests/test_eventlog.py +++ b/tests/test_eventlog.py @@ -40,7 +40,6 @@ def test_good_config_file(tmp_path): # Pass config to EventLog e = EventLog(config=cfg) - # Assert the assert len(e.handlers) > 0 assert isinstance(e.handlers[0], logging.Handler) @@ -49,4 +48,4 @@ def test_bad_config_file(tmp_path): cfg = get_config_from_file(tmp_path, BAD_CONFIG) with pytest.raises(TraitError): - e = EventLog(config=cfg) + EventLog(config=cfg) diff --git a/tests/test_register_schema.py b/tests/test_register_schema.py index 90d75f4..9155d3f 100644 --- a/tests/test_register_schema.py +++ b/tests/test_register_schema.py @@ -55,7 +55,8 @@ def test_reserved_properties(): 'version': 1, 'properties': { '__fail__': { - 'type': 'string' + 'type': 'string', + 'level': 'unclassified' }, }, }) @@ -100,7 +101,8 @@ def test_record_event(): 'version': 1, 'properties': { 'something': { - 'type': 'string' + 'type': 'string', + 'level': 'unclassified' }, }, } @@ -138,7 +140,8 @@ def test_register_schema_file(): 'version': 1, 'properties': { 'something': { - 'type': 'string' + 'type': 'string', + 'level': 'unclassified' }, }, } @@ -166,7 +169,8 @@ def test_allowed_schemas(): 'version': 1, 'properties': { 'something': { - 'type': 'string' + 
'type': 'string', + 'level': 'unclassified' }, }, } @@ -194,10 +198,12 @@ def test_record_event_badschema(): 'version': 1, 'properties': { 'something': { - 'type': 'string' + 'type': 'string', + 'level': 'unclassified' }, 'status': { - 'enum': ['success', 'failure'] + 'enum': ['success', 'failure'], + 'level': 'unclassified' } } } diff --git a/tests/test_tags.py b/tests/test_tags.py new file mode 100644 index 0000000..c50e027 --- /dev/null +++ b/tests/test_tags.py @@ -0,0 +1,79 @@ +import io +import json +import logging + +from jupyter_telemetry.eventlog import EventLog + +import pytest + + +@pytest.fixture +def version(): return 1 + + +@pytest.fixture() +def schema_id(): return 'test.event' + + +@pytest.fixture +def schema(schema_id, version): + return { + '$id': schema_id, + 'title': 'Test Event', + 'version': version, + 'description': 'Test Event.', + 'type': 'object', + 'properties': { + 'nothing-exciting': { + 'description': 'a property with nothing exciting happening', + 'tag': 'unrestricted', + 'type': 'string' + }, + 'id': { + 'description': 'user ID', + 'tag': 'user-identifier', + 'type': 'string' + }, + 'email': { + 'description': 'email address', + 'tag': 'user-identifiable-information', + 'type': 'string' + }, + } + } + + +@pytest.mark.parametrize( + 'tags,expected_props', + [ + ({'unrestricted'}, {'nothing-exciting'}), + ({'user-identifier'}, {'nothing-exciting', 'id'}), + ({'user-identifiable-information'}, {'nothing-exciting', 'email'}) + ] +) +def test_properties_tags(schema, schema_id, version, tags, expected_props): + sink = io.StringIO() + + # Create a handler that captures+records events with allowed tags. 
+ handler = logging.StreamHandler(sink) + handler.allowed_tags = tags + + e = EventLog( + handlers=[handler], + allowed_schemas=[schema_id] + ) + e.register_schema(schema) + + event = { + 'nothing-exciting': 'hello, world', + 'id': 'test id', + 'email': 'test@testemail.com', + } + + # Record event and read output + e.record_event(schema_id, version, event) + recorded_event = json.loads(sink.getvalue()) + recorded_props = set([key for key in recorded_event if not key.startswith('__')]) + + assert expected_props == recorded_props +