## Introduction

### Imports

In [57]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [58]:
import json
from pprint import pprint

## Data exploration - Tree

In [None]:
from json_ld_semantics.semantics import Tree

### First, we load the raw JSON data

In [59]:
# Input file for this walkthrough (relative path, resolved against the working directory).
filename = "../inputs/2022_MARCH.json"

# Pass the encoding explicitly so parsing does not depend on the platform's
# default locale encoding (JSON interchange files are UTF-8).
with open(filename, "r", encoding="utf-8") as jsonfile:
    json_data = json.load(jsonfile)

# Preview only the first 512 characters of the repr to keep the output short.
print(str(json_data)[:512] + "...")

{'timelineObjects': [{'placeVisit': {'location': {'latitudeE7': 465196535, 'longitudeE7': 66322734, 'placeId': 'ChIJ5aeJzT4pjEcRXu7iysk_F-s', 'address': 'Lausanne\nSuisse', 'name': 'Lausanne', 'locationConfidence': 100.0, 'calibratedProbability': 100.0}, 'duration': {'startTimestamp': '2022-03-07T13:08:43.398Z', 'endTimestamp': '2022-03-07T17:13:38.828Z'}, 'placeConfidence': 'USER_CONFIRMED', 'visitConfidence': 100, 'otherCandidateLocations': [{'latitudeE7': 465177565, 'longitudeE7': 66284570, 'placeId': 'C...


### Then, we create a tree with the data

In [60]:
# Wrap the parsed JSON in a Tree so it can be explored node by node.
tree = Tree(json_data)

### We can explore whatever is inside

In [61]:
# Bare expression: the notebook displays the root node's summary.
tree

Tree '$' with 1 children
- fieldName: $
- data: Length of 290595
- foundType: Root
- descriptiveType: None
- unique: None
- default: None
- description: None
- example: None
- regex: None
- parent: None
- path: $

In [62]:
# The root reports a single child; index 0 is the `timelineObjects` list node.
tree.children[0]

NodeDict 'timelineObjects' with 77 childrens
- fieldName: timelineObjects
- data: Length of 290574
- foundType: list
- descriptiveType: None
- unique: None
- default: None
- description: None
- example: None
- regex: None
- parent: $
- path: $.timelineObjects

In [63]:
# Collect the paths once and reuse the result for both the count and the
# display, instead of calling get_paths() a second time.
paths = tree.get_paths()
print(len(paths))
paths

149


{'$',
 '$.timelineObjects',
 '$.timelineObjects[*]',
 '$.timelineObjects[*].activitySegment',
 '$.timelineObjects[*].activitySegment.activities',
 '$.timelineObjects[*].activitySegment.activities[*]',
 '$.timelineObjects[*].activitySegment.activities[*].activityType',
 '$.timelineObjects[*].activitySegment.activities[*].probability',
 '$.timelineObjects[*].activitySegment.activityType',
 '$.timelineObjects[*].activitySegment.confidence',
 '$.timelineObjects[*].activitySegment.distance',
 '$.timelineObjects[*].activitySegment.duration',
 '$.timelineObjects[*].activitySegment.duration.endTimestamp',
 '$.timelineObjects[*].activitySegment.duration.startTimestamp',
 '$.timelineObjects[*].activitySegment.editActionMetadata',
 '$.timelineObjects[*].activitySegment.editActionMetadata.activitySegment',
 '$.timelineObjects[*].activitySegment.editActionMetadata.activitySegment.activityConfidence',
 '$.timelineObjects[*].activitySegment.editActionMetadata.activitySegment.activityType',
 '$.timeli

In [64]:
# Fully indexed path (no wildcards); the result comes back as a list holding the matching node.
tree.get_children_from_path("$.timelineObjects[4].activitySegment.activities[0].activityType")

[NodeDict 'activityType'
 - fieldName: activityType
 - data: Length of 9
 - foundType: str
 - descriptiveType: None
 - unique: None
 - default: None
 - description: None
 - example: None
 - regex: None
 - parent: [0]
 - path: $.timelineObjects[4].activitySegment.activities[0].activityType]

#### `get_children_from_path` supports wildcards and returns every match

In [65]:
# `[*]` matches every element of `activities`; the trailing `.*` matches every field of each element.
tree.get_children_from_path("$.timelineObjects[4].activitySegment.activities[*].*")

[NodeDict 'activityType'
 - fieldName: activityType
 - data: Length of 9
 - foundType: str
 - descriptiveType: None
 - unique: None
 - default: None
 - description: None
 - example: None
 - regex: None
 - parent: [0]
 - path: $.timelineObjects[4].activitySegment.activities[0].activityType,
 NodeDict 'probability'
 - fieldName: probability
 - data: Length of 17
 - foundType: float
 - descriptiveType: None
 - unique: None
 - default: None
 - description: None
 - example: None
 - regex: None
 - parent: [0]
 - path: $.timelineObjects[4].activitySegment.activities[0].probability,
 NodeDict 'activityType'
 - fieldName: activityType
 - data: Length of 8
 - foundType: str
 - descriptiveType: None
 - unique: None
 - default: None
 - description: None
 - example: None
 - regex: None
 - parent: [1]
 - path: $.timelineObjects[4].activitySegment.activities[1].activityType,
 NodeDict 'probability'
 - fieldName: probability
 - data: Length of 18
 - foundType: float
 - descriptiveType: None
 - unique:

## Data semantics - Model

In [None]:
from json_ld_semantics.model import Model

### We create a new model

In [87]:
# Create a model identified only by its name; no traversal is passed at this point.
model = Model(name="Google Geolocation – YEAR_MONTH.json")

### We can add files to a model and process them

In [86]:
# Chain: register the input file on the model, then call process_files().
# NOTE(review): the saved output lists the same file three times, which suggests
# add_files() accumulates across re-runs of this cell — confirm, and re-create
# the model before re-running if duplicates are unwanted. The execution count
# (86) also precedes the model-creation cell (87): re-run top to bottom.
model.add_files(filename).process_files()

[('../inputs/2022_MARCH.json', {}),
 ('../inputs/2022_MARCH.json', {}),
 ('../inputs/2022_MARCH.json', {})]

### Or, if we already have a traversal, we can create the model directly from it

In [68]:
# Rebinds `model`: build it from the traversal exported by the tree, instead of adding files.
model = Model(name="Google Geolocation – YEAR_MONTH.json", traversal=tree.export_traversal())

### Now, we can see what paths are available in our model

In [69]:
# Same call as on the tree; displays the set of wildcard paths known to the model.
model.get_paths()

{'$',
 '$.timelineObjects',
 '$.timelineObjects[*]',
 '$.timelineObjects[*].activitySegment',
 '$.timelineObjects[*].activitySegment.activities',
 '$.timelineObjects[*].activitySegment.activities[*]',
 '$.timelineObjects[*].activitySegment.activities[*].activityType',
 '$.timelineObjects[*].activitySegment.activities[*].probability',
 '$.timelineObjects[*].activitySegment.activityType',
 '$.timelineObjects[*].activitySegment.confidence',
 '$.timelineObjects[*].activitySegment.distance',
 '$.timelineObjects[*].activitySegment.duration',
 '$.timelineObjects[*].activitySegment.duration.endTimestamp',
 '$.timelineObjects[*].activitySegment.duration.startTimestamp',
 '$.timelineObjects[*].activitySegment.editActionMetadata',
 '$.timelineObjects[*].activitySegment.editActionMetadata.activitySegment',
 '$.timelineObjects[*].activitySegment.editActionMetadata.activitySegment.activityConfidence',
 '$.timelineObjects[*].activitySegment.editActionMetadata.activitySegment.activityType',
 '$.timeli

### But more importantly, we can see what attributes are linked to these paths

In [70]:
# Nested dict keyed by path: each entry holds the node's attributes plus a nested
# 'traversal' dict keyed by the paths one level deeper.
model.traversal

{'$': {'foundType': json_ld_semantics.semantics.Root,
  'descriptiveType': None,
  'unique': None,
  'default': None,
  'description': None,
  'example': None,
  'regex': None,
  'traversal': {'$.timelineObjects': {'foundType': list,
    'descriptiveType': None,
    'unique': None,
    'default': None,
    'description': None,
    'example': None,
    'regex': None,
    'traversal': {'$.timelineObjects[*]': {'foundType': dict,
      'descriptiveType': None,
      'unique': None,
      'default': None,
      'description': None,
      'example': None,
      'regex': None,
      'traversal': {'$.timelineObjects[*].placeVisit': {'foundType': dict,
        'descriptiveType': None,
        'unique': None,
        'default': None,
        'description': None,
        'example': None,
        'regex': None,
        'traversal': {'$.timelineObjects[*].placeVisit.location': {'foundType': dict,
          'descriptiveType': None,
          'unique': None,
          'default': None,
          'des

In [71]:
import pandas as pd

In [72]:
# Flatten the model into rows; the first row holds the column names and the
# remaining rows hold one path each.
liste = model.to_list()
header, rows = liste[0], liste[1:]
df = pd.DataFrame(rows, columns=header)
df.head()

Unnamed: 0,path,foundType,descriptiveType,unique,default,description,example,regex
0,$,<class 'json_ld_semantics.semantics.Root'>,,,,,,
1,$.timelineObjects,<class 'list'>,,,,,,
2,$.timelineObjects[*],<class 'dict'>,,,,,,
3,$.timelineObjects[*].placeVisit,<class 'dict'>,,,,,,
4,$.timelineObjects[*].placeVisit.location,<class 'dict'>,,,,,,


### It's a bit empty for now... Let's add some info.

In [73]:
# Attach `unique` and `description` to the top-level list path; the call returns True here.
model.set_attribute(
    "$.timelineObjects",
    unique=True,
    description="List of semantic locations determined by Google. Can be either `placeVisit` or `activitySegment`."
)

True

In [74]:
# Annotate the wildcard path, i.e. every element of the `timelineObjects` list.
model.set_attribute(
    "$.timelineObjects[*]",
    unique=False,
    description="One of the semantic locations. Can be either `placeVisit` or `activitySegment`."
)

True

In [75]:
# Besides `unique` and `description`, this call also sets `descriptiveType`,
# here a schema.org URL for the visited location.
model.set_attribute(
    "$.timelineObjects[*].placeVisit.location",
    descriptiveType="https://schema.org/location",
    unique=False,
    description="Location that Google thinks you visited (latitude, longitude, id, address, name and confidence)."
)

True

In [76]:
# Rebuild the table from the model so the attributes set in the previous cells
# show up; the first row of the list is the header.
liste = model.to_list()
header, rows = liste[0], liste[1:]
df = pd.DataFrame(rows, columns=header)
df.head()

Unnamed: 0,path,foundType,descriptiveType,unique,default,description,example,regex
0,$,<class 'json_ld_semantics.semantics.Root'>,,,,,,
1,$.timelineObjects,<class 'list'>,,True,,List of semantic locations determined by Googl...,,
2,$.timelineObjects[*],<class 'dict'>,,False,,One of the semantic locations. Can be either `...,,
3,$.timelineObjects[*].placeVisit,<class 'dict'>,,,,,,
4,$.timelineObjects[*].placeVisit.location,<class 'dict'>,https://schema.org/location,False,,Location that Google thinks you visited (latit...,,


## Data semantics - Apply the model

### Now that we have a working model, let's apply it back to our existing tree

In [88]:
# Look up the first timeline object's `location` node; [0] takes the first match.
# NOTE(review): this cell's execution count (88) is later than the
# `tree.apply(model)` cell that follows (81), and its saved output already shows
# the model's attributes — the "before" state shown here is stale. Re-run the
# notebook top to bottom to regenerate it.
tree.get_children_from_path("$.timelineObjects[0].placeVisit.location")[0]

NodeDict 'location' with 7 childrens
- fieldName: location
- data: Length of 204
- foundType: dict
- descriptiveType: https://schema.org/location
- unique: False
- default: None
- description: Location that Google thinks you visited (latitude, longitude, id, address, name and confidence).
- example: None
- regex: None
- parent: placeVisit
- path: $.timelineObjects[0].placeVisit.location

In [81]:
# Apply the model's attributes to the tree; the call returns True here.
tree.apply(model)

True

In [82]:
# Same lookup as before applying the model, to compare the node's attributes.
# NOTE(review): the saved output shows the `timelineObjects` node rather than
# the requested `location` node — the output looks stale or mismatched; re-run
# to confirm what this lookup really returns after apply().
tree.get_children_from_path("$.timelineObjects[0].placeVisit.location")[0]

NodeDict 'timelineObjects' with 77 childrens
- fieldName: timelineObjects
- data: Length of 290574
- foundType: list
- descriptiveType: None
- unique: True
- default: None
- description: List of semantic locations determined by Google. Can be either `placeVisit` or `activitySegment`.
- example: None
- regex: None
- parent: $
- path: $.timelineObjects

## Data filtering - Add filters to the model

In [90]:
from json_ld_semantics.filters import Filter

### First, we need to create a Filter object. Either directly...

In [104]:
# Build a filter from the model; presumably `path__startswith` keeps only the
# paths beginning with the placeVisit prefix — TODO confirm against the Filter API.
keep_placeVisit_only = Filter(model, path__startswith="$.timelineObjects[*].placeVisit")

### ... Or with an existing, external filter.

In [105]:
# Sketch of two candidate shapes for an external filter definition.
# NOTE(review): all values are placeholders; confirm the accepted schema
# against json_ld_semantics.filters before relying on either form.

# Form 1: a mapping with the targeted paths and a list of (left, op, right) triples.
{
    "paths": [
        "bla",
        "bla"
    ],  # comma required between the two entries, otherwise the cell does not parse
    "filters": [
        ("left", "op", "right"),
        ("left", "op", "right"),
        ("left", "op", "right"),
    ]
}

# or

# Form 2: just the list of (left, op, right) triples.
[
    ("left", "op", "right"),
    ("left", "op", "right"),
    ("left", "op", "right"),
]

[('path',
  <function json_ld_semantics.filters.<lambda>(a, b)>,
  '$.timelineObjects[*].placeVisit')]

In [None]:
# NOTE(review): this binds the Filter class itself, not a Filter instance — the
# cell looks unfinished. TODO: call Filter(...) with the intended confidence criterion.
keep_HIGH_CONFIDENCE_only = Filter

### Once the filters are available, you can apply them to existing data.

In [None]:
# Show the tree before calling apply_model, as the baseline for comparison.
tree

In [None]:
# NOTE(review): the earlier, executed cell calls `tree.apply(model)`, while
# `apply_model` has never been run in this notebook — confirm which method name
# the Tree API actually provides.
tree.apply_model(model)

In [None]:
# Show the tree again after apply_model, for comparison with the baseline.
tree

## Data concierge - Export and import

### If you have an existing model, you can export it.

In [None]:
# Write the model's traversal to the given file (relative to the working directory).
model.dump_traversal(filename="my_model.pickle")

### Afterwards it can be imported as well.

In [None]:
# Read a previously dumped traversal back into the model.
# NOTE(review): the .pickle extension suggests pickle serialization — if so,
# only load files from trusted sources (unpickling can execute arbitrary code).
model.load_traversal(filename="my_model.pickle")