In [1]:
import json
import os
import typing
import datetime

import apache_beam as beam
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.runners.interactive import interactive_runner

In [2]:
class EventLog(typing.NamedTuple):
    """Schema of one parsed event-log record.

    ``parse_json`` builds instances with ``EventLog(**row)``, so the field
    names here must match the keys of the incoming JSON objects exactly.
    """

    ip: str
    id: str
    lat: float  # parse_json substitutes a -1 sentinel when the source value is empty
    lng: float  # parse_json substitutes a -1 sentinel when the source value is empty
    user_agent: str
    age_bracket: str
    opted_into_marketing: bool
    http_request: str
    http_response: int
    file_size_bytes: int
    event_datetime: str  # format_timestamp rewrites this as "%Y-%m-%d %H:%M:%S"
    event_ts: int  # NOTE(review): presumably an epoch timestamp; unit is not visible here -- confirm

# Register RowCoder for EventLog so PCollections typed as EventLog are encoded
# as rows. NOTE(review): the beam_sql query over `events` presumably relies on
# this schema -- confirm against the Beam schema documentation.
beam.coders.registry.register_coder(EventLog, beam.coders.RowCoder)

def parse_json(element: str):
    """Parse one JSON-encoded log line into an ``EventLog``.

    Args:
        element: A JSON object serialized as a string; its keys must match the
            ``EventLog`` field names.

    Returns:
        An ``EventLog`` built from the decoded object. If either ``lat`` or
        ``lng`` is absent, null, or an empty string, both coordinates are set
        to the sentinel ``-1.0``.

    Raises:
        json.JSONDecodeError: If ``element`` is not valid JSON.
        TypeError: If the decoded keys do not match the ``EventLog`` fields.
    """
    row = json.loads(element)
    # lat/lng are sometimes an empty string in the source data. Test for the
    # "missing" values explicitly rather than by truthiness: a coordinate of
    # 0 / 0.0 (equator or prime meridian) is falsy but perfectly valid.
    if any(row.get(key) in ("", None) for key in ("lat", "lng")):
        # Float sentinels keep the values consistent with the float-typed fields.
        row = {**row, "lat": -1.0, "lng": -1.0}
    return EventLog(**row)


def format_timestamp(element: EventLog):
    """Return a copy of ``element`` with ``event_datetime`` reformatted.

    The incoming ``event_datetime`` is parsed with
    ``datetime.datetime.fromisoformat`` and written back as
    ``"%Y-%m-%d %H:%M:%S"``. The target format has no sub-second or UTC-offset
    component, so any such parts of the input are not carried over.

    Args:
        element: The record whose ``event_datetime`` holds an ISO-format string.

    Returns:
        A new ``EventLog`` identical to ``element`` except for ``event_datetime``.

    Raises:
        ValueError: If ``event_datetime`` is not a string ``fromisoformat`` accepts.
    """
    parsed = datetime.datetime.fromisoformat(element.event_datetime)
    # Named tuples are immutable, so rebuild the record with one field overridden.
    fields = {
        **element._asdict(),
        "event_datetime": parsed.strftime("%Y-%m-%d %H:%M:%S"),
    }
    return EventLog(**fields)

# Parent of the kernel's current working directory; the pipeline below reads
# its input files from <PARENT_DIR>/inputs. The value depends on where the
# notebook kernel was started.
PARENT_DIR = os.path.dirname(os.getcwd())

In [3]:
# Build the pipeline on the interactive runner so that PCollections can be
# inspected from the notebook with ib.show().
p = beam.Pipeline(interactive_runner.InteractiveRunner())

# Read every "*.out" file under <PARENT_DIR>/inputs, parse each JSON line into
# an EventLog, then normalise its event_datetime string. The output type hints
# declare both mapped PCollections as EventLog.
events = (
    p
    | "Read from files"
    >> beam.io.ReadFromText(
        file_pattern=os.path.join(PARENT_DIR, "inputs", "*.out")
    )
    | "Parse elements" >> beam.Map(parse_json).with_output_types(EventLog)
    | "Format timestamp" >> beam.Map(format_timestamp).with_output_types(EventLog)
)

# Preview a handful of parsed records.
ib.show(events, n=5)

In [4]:
# Load the extension that provides the %%beam_sql cell magic used in the next cell.
%load_ext apache_beam.runners.interactive.sql.beam_sql_magics

In [5]:
%%beam_sql -o output
WITH cte AS (
    SELECT CAST(event_datetime AS TIMESTAMP) AS ts
    FROM events
)
SELECT
    CAST(TUMBLE_START(ts, INTERVAL '1' MINUTE) AS VARCHAR) AS window_start,
    CAST(TUMBLE_END(ts, INTERVAL '1' MINUTE) AS VARCHAR) AS window_end,
    COUNT(*) AS page_view
FROM cte
GROUP BY
    TUMBLE(ts, INTERVAL '1' MINUTE)

2.53.0: Pulling from apache/beam_java11_sdk
Digest: sha256:4f90eceef156cdab47136c978b873c6dc84bb9812a65fbbc515e9cf0071ffd5e
Status: Image is up to date for apache/beam_java11_sdk:2.53.0
docker.io/apache/beam_java11_sdk:2.53.0
ef5e168e3cb71778e9bb65678b9798803a33f46df9a913929cd6e176c175165e


<PCollection[SqlTransform(beam:external:java:sql:v1).output] at 0x7fe4283fa740>