In [1]:
import json
import os
import typing
import datetime

import apache_beam as beam
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.runners.interactive import interactive_runner
from apache_beam.transforms.combiners import CountCombineFn

In [2]:
class EventLog(typing.NamedTuple):
    """Typed record for a single event log entry.

    ``parse_json`` builds instances by unpacking a decoded JSON object into
    these fields, so the field names have to match the keys of the input
    records.
    """

    ip: str
    id: str
    # Coordinates. parse_json fills both with -1 when it considers either one missing.
    lat: float
    lng: float
    user_agent: str
    age_bracket: str
    opted_into_marketing: bool
    http_request: str
    http_response: int
    file_size_bytes: int
    # Parsed by add_timestamp using the format "%Y-%m-%dT%H:%M:%S.%f".
    event_datetime: str
    # Presumably a numeric timestamp for the same event; units are not visible here -- TODO confirm.
    event_ts: int

# Register RowCoder as the coder Beam uses for EventLog elements.
beam.coders.registry.register_coder(EventLog, beam.coders.RowCoder)

def parse_json(element: str):
    """Decode one JSON-encoded log line into an ``EventLog``.

    Args:
        element: A JSON object serialized as text whose keys match the
            ``EventLog`` field names.

    Returns:
        An ``EventLog`` built from the decoded object. When either coordinate
        is absent, ``None`` or a blank string, both ``lat`` and ``lng`` are
        set to the sentinel ``-1.0``.

    Raises:
        json.JSONDecodeError: If ``element`` is not valid JSON.
        TypeError: If the decoded keys do not match the ``EventLog`` fields.
    """
    row = json.loads(element)

    def is_missing(value) -> bool:
        # Only None and blank strings count as missing. A plain truthiness
        # test would also discard 0 / 0.0, which are legitimate coordinates
        # (the equator and the prime meridian).
        return value is None or (isinstance(value, str) and not value.strip())

    # lat/lng are sometimes an empty string in the input; a location is only
    # usable when both halves are present, so both are replaced together.
    if is_missing(row.get("lat")) or is_missing(row.get("lng")):
        # Float sentinel (equal to the previous -1) to match the declared
        # ``float`` type of the EventLog coordinate fields.
        row = {**row, "lat": -1.0, "lng": -1.0}
    return EventLog(**row)

def add_timestamp(element: EventLog):
    """Attach an event-time timestamp to a parsed log record.

    ``event_datetime`` carries no UTC offset, so it is interpreted as UTC.
    This keeps the result independent of the time zone of the machine running
    the notebook and consistent with ``AddWindowTS``, which renders window
    boundaries as UTC.

    Args:
        element: Record whose ``event_datetime`` is formatted as
            ``%Y-%m-%dT%H:%M:%S.%f``.

    Returns:
        A ``beam.window.TimestampedValue`` wrapping ``element`` with its event
        time expressed in seconds since the Unix epoch.

    Raises:
        ValueError: If ``event_datetime`` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(element.event_datetime, "%Y-%m-%dT%H:%M:%S.%f")
    # Calling .timestamp() on a naive datetime resolves it against the local
    # time zone, which would shift every window label on a non-UTC machine.
    # NOTE(review): assumes the source timestamps are UTC -- confirm with the data producer.
    ts = parsed.replace(tzinfo=datetime.timezone.utc).timestamp()
    return beam.window.TimestampedValue(element, ts)

class AddWindowTS(beam.DoFn):
    """Wrap a per-window count in a dict that also names the window bounds."""

    def process(self, element: int, window=beam.DoFn.WindowParam):
        """Yield one record describing the window that ``element`` belongs to.

        Args:
            element: The value to report under ``page_views``.
            window: The element's window, supplied through ``beam.DoFn.WindowParam``.

        Yields:
            A dict with ``window_start`` and ``window_end`` (UTC ISO-8601
            strings at second precision) followed by ``page_views``.
        """
        edges = (("window_start", window.start), ("window_end", window.end))
        record = {
            label: edge.to_utc_datetime().isoformat(timespec="seconds")
            for label, edge in edges
        }
        record["page_views"] = element
        yield record

# Directory one level above the current working directory (the head of the cwd path).
PARENT_DIR = os.path.split(os.getcwd())[0]

In [3]:
# Run on the interactive runner so the result can be rendered inline below.
p = beam.Pipeline(interactive_runner.InteractiveRunner())

# Glob matching every "*.out" file in the "inputs" folder under PARENT_DIR.
input_file_pattern = os.path.join(PARENT_DIR, "inputs", "*.out")

# Fixed (tumbling) windows of 60 seconds, i.e. one window per minute.
per_minute_windows = beam.window.FixedWindows(60)

# Read lines -> parse into EventLog -> stamp with event time -> window per
# minute -> count the elements in each window -> label each count with its window.
events = (
    p
    | "Read from files" >> beam.io.ReadFromText(file_pattern=input_file_pattern)
    | "Parse elements" >> beam.Map(parse_json).with_output_types(EventLog)
    | "Add event timestamp" >> beam.Map(add_timestamp)
    | "Tumble window per minute" >> beam.WindowInto(per_minute_windows)
    # without_defaults(): emit nothing for windows that received no elements.
    | "Count per minute" >> beam.CombineGlobally(CountCombineFn()).without_defaults()
    | "Add window timestamp" >> beam.ParDo(AddWindowTS())
)

ib.show(events)