In [1]:
import json
import os
import typing

import apache_beam as beam
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.runners.interactive import interactive_runner

In [2]:
class EventLog(typing.NamedTuple):
    """Schema of a single event record.

    ``parse_json`` builds instances with ``EventLog(**row)``, so a decoded
    JSON object must carry exactly these field names as its keys. Field
    meanings noted below are inferred from the names unless a comment says
    where the usage is visible.
    """
    ip: str
    # Used as the grouping key by the pipeline's "Form key value pair" step.
    id: str
    # parse_json substitutes -1 for both coordinates when one of them is
    # missing from the input (e.g. an empty string).
    lat: float
    lng: float
    user_agent: str
    age_bracket: str
    opted_into_marketing: bool
    http_request: str
    http_response: int  # presumably the HTTP status code -- TODO confirm
    # The value that the pipeline aggregates per id.
    file_size_bytes: int
    event_datetime: str
    event_ts: int  # presumably an epoch timestamp; unit not visible here -- TODO confirm


class UserTraffic(typing.NamedTuple):
    """Per-id traffic summary emitted by ``Aggregate.process``.

    Each instance summarises the values that were grouped under one key.
    """
    id: str  # the grouping key
    page_views: int  # number of values grouped under the key
    total_bytes: int  # sum of the grouped values
    max_bytes: int  # largest grouped value
    min_bytes: int  # smallest grouped value


# Register RowCoder as the coder for each NamedTuple schema declared above.
for _schema_type in (EventLog, UserTraffic):
    beam.coders.registry.register_coder(_schema_type, beam.coders.RowCoder)


def parse_json(element: str):
    """Parse one JSON-encoded line into an ``EventLog``.

    Args:
        element: A JSON object serialized as text whose keys match the
            ``EventLog`` field names.

    Returns:
        An ``EventLog`` built from the decoded object. When either ``lat`` or
        ``lng`` is an empty string or ``None``, both are replaced with -1.

    Raises:
        json.JSONDecodeError: If ``element`` is not valid JSON.
        KeyError: If the ``"lat"`` or ``"lng"`` key is absent.
        TypeError: If the decoded keys do not match the ``EventLog`` fields.
    """
    row = json.loads(element)
    # lat/lng are sometimes an empty string. Compare against the "missing"
    # markers explicitly: a plain truthiness test would also throw away a
    # genuine coordinate of 0 (equator / prime meridian).
    missing_markers = (None, "")
    if row["lat"] in missing_markers or row["lng"] in missing_markers:
        row = {**row, "lat": -1, "lng": -1}
    return EventLog(**row)


class Aggregate(beam.DoFn):
    """Summarise the values grouped under one key into a ``UserTraffic`` row."""

    def process(self, element: typing.Tuple[str, typing.Iterable[int]]):
        """Fold one ``(key, values)` pair into a single ``UserTraffic``.

        The values are consumed in a single pass without calling ``len()``,
        so any iterable works: a list, a one-shot iterator, or a lazily
        evaluated grouped-values object that has no length.

        Args:
            element: ``(key, values)`` where ``values`` are the byte counts
                grouped under ``key``.

        Yields:
            One ``UserTraffic`` carrying the count, sum, maximum and minimum
            of ``values``. Nothing is yielded when ``values`` is empty.
        """
        key, values = element
        page_views = 0
        total_bytes = 0
        max_bytes = None
        min_bytes = None
        for size in values:
            page_views += 1
            total_bytes += size
            if max_bytes is None or size > max_bytes:
                max_bytes = size
            if min_bytes is None or size < min_bytes:
                min_bytes = size
        if page_views == 0:
            # No values means there is no max/min to report; emit nothing.
            return
        yield UserTraffic(
            id=key,
            page_views=page_views,
            total_bytes=total_bytes,
            max_bytes=max_bytes,
            min_bytes=min_bytes,
        )


# Directory one level above the current working directory; the pipeline
# resolves its "inputs" folder relative to this path.
PARENT_DIR = os.path.split(os.getcwd())[0]

In [3]:
# Build the pipeline on Interactive Beam's runner.
p = beam.Pipeline(interactive_runner.InteractiveRunner())

# Glob matching every "*.out" file inside <PARENT_DIR>/inputs.
input_glob = os.path.join(PARENT_DIR, "inputs", "*.out")

# Read lines -> parse into EventLog -> key file sizes by id -> group ->
# collapse each group into one UserTraffic row.
events = (
    p
    | "Read from files" >> beam.io.ReadFromText(file_pattern=input_glob)
    | "Parse elements" >> beam.Map(parse_json).with_output_types(EventLog)
    | "Form key value pair" >> beam.Map(lambda log: (log.id, log.file_size_bytes))
    | "Group by key" >> beam.GroupByKey()
    | "Aggregate by id" >> beam.ParDo(Aggregate()).with_output_types(UserTraffic)
)

# Display the resulting PCollection through Interactive Beam.
ib.show(events)