In [1]:
import json
import os
import typing

# Directory one level above the kernel's working directory; the pipeline's
# input files are looked up relative to it.
_working_dir = os.getcwd()
PARENT_DIR = os.path.dirname(_working_dir)

import apache_beam as beam
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.runners.interactive import interactive_runner
from apache_beam.transforms.combiners import CountCombineFn

In [2]:
class EventLog(typing.NamedTuple):
    """Schema of one parsed log record.

    Instances are created by ``parse_json`` via ``EventLog(**row)``, so the
    field names here must match the keys of the decoded JSON object exactly.
    """
    # NOTE(review): the meaning of most fields is only implied by their names;
    # confirm against whatever produces the input files.
    ip: str
    id: str  # key used by the "Aggregate by user" GroupBy step
    lat: float  # parse_json substitutes the sentinel -1 when lat or lng is empty
    lng: float  # same sentinel handling as lat
    user_agent: str
    age_bracket: str
    opted_into_marketing: bool
    http_request: str
    http_response: int
    file_size_bytes: int  # summed / max'ed / min'ed per id by the pipeline
    event_datetime: str
    event_ts: int  # presumably an epoch timestamp — TODO confirm unit


class PerUserAggregate(typing.NamedTuple):
    """Output schema of the "Aggregate by user" step, keyed by ``id``.

    Field names mirror the destination names passed to ``aggregate_field``
    in the pipeline definition.
    """
    id: str  # the GroupBy key
    page_views: int  # CountCombineFn applied to the records of each id
    total_bytes: int  # sum of file_size_bytes
    max_bytes: int  # max of file_size_bytes
    min_bytes: int  # min of file_size_bytes

# Both NamedTuple schemas are encoded with Beam's RowCoder.
for _schema_type in (EventLog, PerUserAggregate):
    beam.coders.registry.register_coder(_schema_type, beam.coders.RowCoder)


def parse_json(element: str):
    """Decode one JSON-encoded log line into an ``EventLog``.

    The input sometimes carries an empty string for ``lat``/``lng``. When
    either coordinate is missing (empty string or JSON ``null``), both are
    replaced by the sentinel ``-1.0`` so the record still fits the ``float``
    fields of ``EventLog``.

    Args:
        element: A line of text holding one JSON object whose keys match the
            ``EventLog`` field names.

    Returns:
        The ``EventLog`` built from the decoded object.

    Raises:
        json.JSONDecodeError: If ``element`` is not valid JSON.
        KeyError: If the object has no ``lat`` or ``lng`` key.
        TypeError: If the object's keys do not match the ``EventLog`` fields.
    """
    row = json.loads(element)
    # Test for "missing" explicitly instead of by truthiness: a coordinate of
    # 0 (equator / prime meridian) is falsy but valid and must be preserved.
    if any(row[key] in ("", None) for key in ("lat", "lng")):
        # Float sentinel keeps the value consistent with the declared field type.
        row = {**row, "lat": -1.0, "lng": -1.0}
    return EventLog(**row)

In [3]:
# Glob matching the raw log files; each line is parsed as one JSON record.
input_glob = os.path.join(PARENT_DIR, "inputs", "*.out")

# Per-user roll-up: record count plus sum / max / min of file_size_bytes,
# emitted with the PerUserAggregate schema.
aggregate_by_user = (
    beam.GroupBy("id")
    .aggregate_field("id", CountCombineFn(), "page_views")
    .aggregate_field("file_size_bytes", sum, "total_bytes")
    .aggregate_field("file_size_bytes", max, "max_bytes")
    .aggregate_field("file_size_bytes", min, "min_bytes")
    .with_output_types(PerUserAggregate)
)

p = beam.Pipeline(interactive_runner.InteractiveRunner())
# NOTE: despite its name, `events` holds the aggregated per-user rows.
events = (
    p
    | "Read from files" >> beam.io.ReadFromText(file_pattern=input_glob)
    | "Parse elements" >> beam.Map(parse_json).with_output_types(EventLog)
    | "Aggregate by user" >> aggregate_by_user
)

# Materialise and render the result in the notebook.
ib.show(events)