Skip to content

Commit

Permalink
Merge pull request #59 from kiendang/categories-filter
Browse files Browse the repository at this point in the history
Categories filtering for nested properties
  • Loading branch information
Zsailer committed Jun 2, 2021
2 parents e053682 + 27b21e6 commit d2ded1d
Show file tree
Hide file tree
Showing 7 changed files with 737 additions and 44 deletions.
184 changes: 184 additions & 0 deletions jupyter_telemetry/_categories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
from collections import deque
from copy import deepcopy

from jsonschema import Draft7Validator, validators
from jsonschema.exceptions import ValidationError


class ExtractCategories(ValidationError):
    """
    A special `jsonschema.ValidationError` that carries information about the
    `categories` keyword, intended to be yielded whenever a `categories` keyword
    is encountered during `jsonschema` JSON validation.

    The primary use case for this class is to make use of the JSON validation
    mechanism implemented by `jsonschema` to extract all categories associated
    with each property in a JSON instance based on a JSON schema. It is not
    intended to be used as an actual validation error.

    Parameters
    ----------
    property
        Key of the property (inside a schema `properties` mapping) that
        declares the `categories` keyword
    categories
        Value of the `categories` keyword declared for that property
    *args, **kwargs
        Forwarded unchanged to the `ValidationError` constructor
    """

    def __init__(self, property, categories, *args, **kwargs):
        # Zero-argument super() starts the lookup right after *this* class in
        # the MRO, so an initializer defined on `ValidationError` itself is
        # never skipped (the explicit `super(ValidationError, self)` form
        # started one class too late).
        super().__init__(*args, **kwargs)
        self.property = property
        self.categories = categories


def extend_with_categories(validator_class):
    """
    Extend a `jsonschema.IValidator` class so that it yields an `ExtractCategories`
    whenever a `categories` keyword is encountered during JSON validation

    Parameters
    ----------
    validator_class : jsonschema.IValidator
        an existing validator class

    Returns
    -------
    jsonschema.IValidator
        a new `jsonschema.IValidator` class extending the one provided

    Examples
    --------
    from jsonschema import Draft7Validator

    CategoryExtractor = extend_with_categories(Draft7Validator)
    """
    validate_properties = validator_class.VALIDATORS["properties"]

    def get_categories(validator, properties, instance, schema):
        # The `properties` keyword only applies to JSON objects; for any other
        # instance type there is no property to attach categories to.
        if validator.is_type(instance, "object"):
            for name, subschema in properties.items():
                # A subschema may legally be a boolean (`true` / `false`), in
                # which case a membership test on it would raise TypeError.
                if validator.is_type(subschema, "object") and "categories" in subschema:
                    yield ExtractCategories(name, subschema["categories"], message=None)

        # Then run the regular `properties` validation. A validator function
        # may return None instead of an iterable, hence the `or ()`.
        yield from (validate_properties(validator, properties, instance, schema) or ())

    return validators.extend(
        validator_class, {"properties": get_categories},
    )


# The JSON Schema validator class used by this package (currently Draft 7).
JSONSchemaValidator = Draft7Validator
# Same validator, extended so that its `properties` keyword also yields an
# `ExtractCategories` for every property declaring the `categories` keyword.
CategoryExtractor = extend_with_categories(JSONSchemaValidator)


# Categories declared anywhere beneath one of these jsonschema keywords
# are ignored
IGNORE_CATEGORIES_SCHEMA_KEYWORDS = {
    'if',
    'then',
    'else',
    'not',
    'anyOf',
    'oneOf',
}


# Schema keywords whose direct children are keyed by user-chosen names
# (property names, patterns) rather than by other schema keywords.
_NAME_KEYED_SCHEMA_KEYWORDS = frozenset({
    'properties', 'patternProperties', 'dependencies',
})


def _is_under_ignored_keyword(schema_path):
    """
    Tell whether a schema path goes through one of the keywords listed in
    `IGNORE_CATEGORIES_SCHEMA_KEYWORDS`.

    Path elements that are user-chosen names (the element right after
    `properties`, `patternProperties` or `dependencies`) are skipped, so a
    property that merely happens to be called e.g. `not` or `else` does not
    count as the schema keyword of the same name.
    """
    elements = iter(schema_path)
    for element in elements:
        if element in _NAME_KEYED_SCHEMA_KEYWORDS:
            # The following element is a name, not a keyword: consume it.
            next(elements, None)
        elif element in IGNORE_CATEGORIES_SCHEMA_KEYWORDS:
            return True
    return False


def extract_categories_from_errors(errors):
    """
    Recursively yield every `ExtractCategories` found in `errors`.

    Parameters
    ----------
    errors : iterable
        Errors produced by a validator (e.g. `CategoryExtractor.iter_errors`)

    Yields
    ------
    ExtractCategories
        Each `ExtractCategories` whose absolute schema path does not go
        through a keyword in `IGNORE_CATEGORIES_SCHEMA_KEYWORDS`. Any other
        error is not yielded itself; its `context` (sub-errors) is searched
        instead.
    """
    for error in errors:
        if (
            isinstance(error, ExtractCategories) and
            not _is_under_ignored_keyword(error.absolute_schema_path)
        ):
            yield error
        else:
            yield from extract_categories_from_errors(error.context)


def extract_categories_from_event(event, schema):
    """
    Build a `dict` of `ExtractCategories` keyed by pointers to properties

    Parameters
    ----------
    event : dict
        A telemetry event
    schema : dict
        A JSON schema

    Returns
    -------
    dict
        One entry per `ExtractCategories` produced while running
        `CategoryExtractor` over the event. The key is a tuple made of the
        absolute path of the enclosing instance followed by the property
        name; the value is the `ExtractCategories` carrying the categories
        associated with that property.
    """
    errors = CategoryExtractor(schema).iter_errors(event)

    pointers = {}
    for found in extract_categories_from_errors(errors):
        # Pointer = path to the enclosing object + the property key itself
        pointer = (*found.absolute_path, found.property)
        pointers[pointer] = found
    return pointers


def filter_categories_from_event(event, schema, allowed_categories, allowed_properties):
    """
    Filter properties from an event based on their categories.

    Only whitelisted properties and properties whose categories are allowed are kept;
    every other property present in the event has its value replaced by `None`.
    The input event is left untouched: filtering is done on a deep copy.

    Parameters
    ----------
    event : dict
        The input telemetry event
    schema : dict
        A JSON schema that makes use of the `categories` keyword to
        specify what categories are associated with a certain property.
    allowed_categories : set
        Specify which categories are allowed
    allowed_properties : set
        Whitelist certain top level properties.
        These properties are included in the output event even if not all of
        their categories are allowed.

    Returns
    -------
    dict
        The output event after category filtering
    """
    # Work on a private copy so that the caller's event is never modified.
    event = deepcopy(event)
    categories = extract_categories_from_event(event, schema)

    # Top-level properties without declared categories are set to null
    for name in event:
        if (name,) not in categories:
            event[name] = None

    # Keep only properties whose categories are all included in
    # allowed_categories or whose top-level parent is in allowed_properties
    not_allowed = [
        c for p, c in categories.items()
        if not (set(c.categories).issubset(allowed_categories) or
                p[0] in allowed_properties)
    ]

    for c in not_allowed:
        # In case both a sub property and its parent, e.g. ['user', 'name'] and
        # ['user'], do not have all the allowed categories and are to be removed,
        # if the parent is removed first then attempting to access
        # the descendent would either return None or raise an IndexError or
        # KeyError. Just skip it.
        try:
            parent = deep_get(event, c.absolute_path)
        except (IndexError, KeyError):
            continue

        # Only null out properties that actually exist in the event: this is a
        # filter, it must not add keys for properties that were never sent,
        # and a non-object container has no property to overwrite.
        if isinstance(parent, dict) and c.property in parent:
            parent[c.property] = None

    return event


def deep_get(instance, path):
    """
    Follow `path` through nested containers starting at `instance`.

    Parameters
    ----------
    instance
        The container (e.g. nested dicts / lists) to look into
    path : iterable
        Successive keys or indices to follow. It is only read, never
        consumed, so any iterable (deque, list, tuple) can be passed and
        reused afterwards.

    Returns
    -------
    The value found at the end of `path`, or `None` as soon as a `None`
    is met along the way. An empty path returns `instance` itself.

    Raises
    ------
    KeyError, IndexError
        If a key or index of `path` does not exist in the container reached
    """
    result = instance
    for key in path:
        # Nothing to descend into once a None has been reached
        if result is None:
            break
        result = result[key]
    return result
1 change: 1 addition & 0 deletions jupyter_telemetry/categories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from ._categories import JSONSchemaValidator, filter_categories_from_event # noqa
26 changes: 8 additions & 18 deletions jupyter_telemetry/eventlog.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import logging
from datetime import datetime

import jsonschema
from pythonjsonlogger import jsonlogger
try:
from ruamel.yaml import YAML
Expand All @@ -29,6 +28,8 @@
from .traits import Handlers, SchemaOptions
from . import TELEMETRY_METADATA_VERSION

from .categories import JSONSchemaValidator, filter_categories_from_event

yaml = YAML(typ='safe')


Expand Down Expand Up @@ -131,7 +132,7 @@ def register_schema(self, schema):
"""
# Check if our schema itself is valid
# This throws an exception if it isn't valid
jsonschema.validators.validator_for(schema).check_schema(schema)
JSONSchemaValidator.check_schema(schema)

# Check that the properties we require are present
required_schema_fields = {'$id', 'version', 'properties'}
Expand Down Expand Up @@ -225,7 +226,7 @@ def record_event(self, schema_name, version, event, timestamp_override=None):
schema = self.schemas[(schema_name, version)]

# Validate the event data.
jsonschema.validate(event, schema)
JSONSchemaValidator(schema).validate(event)

# Generate the empty event capsule.
if timestamp_override is None:
Expand All @@ -244,21 +245,10 @@ def record_event(self, schema_name, version, event, timestamp_override=None):
allowed_categories = self.get_allowed_categories(schema_name)
allowed_properties = self.get_allowed_properties(schema_name)

# Iterate through the event properties, and only record the
# properties labelled with allowed_categories
for property_name, data in event.items():
prop_categories = schema["properties"][property_name]["categories"]
# If the property is explicitly listed in
# the allowed_properties, then include it in the capsule
if property_name in allowed_properties:
capsule[property_name] = data
# All of the property categories must be listed in the the allowed
# categories for this property to be recorded.
elif any([cat in allowed_categories for cat in prop_categories]):
capsule[property_name] = data
# Else return that property with a value of null
else:
capsule[property_name] = None
filtered_event = filter_categories_from_event(
event, schema, allowed_categories, allowed_properties
)
capsule.update(filtered_event)

self.log.info(capsule)
return capsule
Empty file added tests/__init__.py
Empty file.
37 changes: 11 additions & 26 deletions tests/test_allowed_schemas.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
import io
import json
import logging
from textwrap import dedent as _
from ruamel.yaml import YAML

from jupyter_telemetry.eventlog import EventLog

import pytest

from .utils import get_event_data


SCHEMA_ID = "test.event"
VERSION = 1


@pytest.fixture
def schema():
return {
return {
'$id': SCHEMA_ID,
'title': 'Test Event',
'version': VERSION,
Expand Down Expand Up @@ -103,13 +103,13 @@ def test_missing_categories_label():
assert 'All properties must have a "categories"' in str(err.value)



EVENT_DATA = {
'nothing-exciting': 'hello, world',
'id': 'test id',
'email': 'test@testemail.com',
}


@pytest.mark.parametrize(
'allowed_schemas,expected_output',
[
Expand Down Expand Up @@ -198,28 +198,13 @@ def test_missing_categories_label():
]
)
def test_allowed_schemas(schema, allowed_schemas, expected_output):
sink = io.StringIO()

# Create a handler that captures+records events with allowed tags.
handler = logging.StreamHandler(sink)

e = EventLog(
handlers=[handler],
allowed_schemas=allowed_schemas
event_data = get_event_data(
EVENT_DATA,
schema,
SCHEMA_ID,
VERSION,
allowed_schemas
)
e.register_schema(schema)

event = {
'nothing-exciting': 'hello, world',
'id': 'test id',
'email': 'test@testemail.com',
}

# Record event and read output
e.record_event(SCHEMA_ID, VERSION, EVENT_DATA)
recorded_event = json.loads(sink.getvalue())
event_data = {key: value for key, value in recorded_event.items() if not key.startswith('__')}

# Verify that *exactly* the right properties are recorded.
assert expected_output == event_data

0 comments on commit d2ded1d

Please sign in to comment.