In [0]:
%flink.pyflink
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
# -*- coding: utf-8 -*-

"""
sliding-windows.py
~~~~~~~~~~~~~~~~~~~
This module:
    1. Creates a table environment
    2. Creates a source table from a Kinesis Data Stream
    3. Creates a sink table writing to a Kinesis Data Stream
    4. Queries from the Source Table and
       creates a 1-minute sliding window (advancing every 30 seconds) to count the cases per community area.
    5. These sliding window results are inserted into the Sink table.
"""

from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes
from pyflink.table.window import Slide
from pyflink.table.udf import udf
import os
import json

# 1. Creates a Table Environment
# Streaming-mode settings are used to build the TableEnvironment that the
# functions and statements below reach through the notebook-level name `table_env`.
env_settings = EnvironmentSettings.in_streaming_mode()
table_env = TableEnvironment.create(env_settings)
# Unused alternative, kept for reference only (StreamExecutionEnvironment and
# StreamTableEnvironment are not imported by the import lines above):
# env = StreamExecutionEnvironment.get_execution_environment()
# table_env = StreamTableEnvironment.create(env)


def create_input_table(table_name, stream_name, region, stream_initpos):
    """Build the CREATE TABLE statement for the Kinesis-backed source table.

    The statement declares the crime-event columns, a watermark on
    ``event_time`` that trails it by 5 seconds, partitioning by
    ``community_area``, and a JSON-formatted ``kinesis`` connector.

    Args:
        table_name: Name given to the table in the DDL.
        stream_name: Value placed in the connector's 'stream' option.
        region: Value placed in the connector's 'aws.region' option.
        stream_initpos: Value placed in the 'scan.stream.initpos' option.

    Returns:
        The DDL text as a string; nothing is executed here.
    """
    # Positional placeholders: {0}=table, {1}=stream, {2}=region, {3}=initial position.
    ddl_template = """ CREATE TABLE {0} (
                event_time TIMESTAMP(3),
                case_number STRING,
                block STRING,
                primary_type STRING,
                location_description STRING,
                arrest STRING,
                district STRING,
                ward STRING,
                community_area STRING,
                WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND
              )
              PARTITIONED BY (community_area)
              WITH (
                'connector' = 'kinesis',
                'stream' = '{1}',
                'aws.region' = '{2}',
                'scan.stream.initpos' = '{3}',
                'format' = 'json',
                'json.timestamp-format.standard' = 'ISO-8601'
              ) """
    placeholder_values = (table_name, stream_name, region, stream_initpos)
    return ddl_template.format(*placeholder_values)

def create_output_table(table_name, stream_name, region):
    """Build the CREATE TABLE statement for the Kinesis-backed sink table.

    The statement declares the three result columns (``community_area``,
    ``case_count``, ``event_time``), partitioning by ``community_area``, and
    a JSON-formatted ``kinesis`` connector.

    Args:
        table_name: Name given to the table in the DDL.
        stream_name: Value placed in the connector's 'stream' option.
        region: Value placed in the connector's 'aws.region' option.

    Returns:
        The DDL text as a string; nothing is executed here.
    """
    # Positional placeholders: {0}=table, {1}=stream, {2}=region.
    ddl_template = """ CREATE TABLE {0} (
                community_area STRING,
                case_count BIGINT,
                event_time VARCHAR(64)
              )
              PARTITIONED BY (community_area)
              WITH (
                'connector' = 'kinesis',
                'stream' = '{1}',
                'aws.region' = '{2}',
                'format' = 'json',
                'json.timestamp-format.standard' = 'ISO-8601'
              ) """
    placeholder_values = (table_name, stream_name, region)
    return ddl_template.format(*placeholder_values)


def perform_sliding_window_aggregation(input_table_name):
    """Count cases per community area over a sliding event-time window.

    Looks up ``input_table_name`` in the notebook-level ``table_env``, applies
    a 1-minute window that slides every 30 seconds on ``event_time``, groups
    by ``community_area`` and the window, and keeps only the groups whose
    ``case_count`` is greater than 1000.

    Args:
        input_table_name: Path of the table to read, as known to ``table_env``.

    Returns:
        A Table with the columns ``community_area``, ``case_count`` and
        ``event_time`` (the window end passed through ``to_string``).
    """
    # use SQL Table in the Table API
    source = table_env.from_path(input_table_name)

    # 1-minute window, advancing every 30 seconds, keyed on the event-time column.
    window_spec = (
        Slide.over("1.minutes")
        .every("30.seconds")
        .on("event_time")
        .alias("one_minute_window")
    )

    windowed = source.window(window_spec)
    grouped = windowed.group_by("community_area, one_minute_window")
    # `to_string` is resolved by name inside the expression string.
    counted = grouped.select(
        "community_area, case_number.count as case_count, to_string(one_minute_window.end) as event_time"
    )
    return counted.where("case_count > 1000")


@udf(input_types=[DataTypes.TIMESTAMP(3)], result_type=DataTypes.STRING())
def to_string(i):
    """Return ``str(i)``; declared to take a TIMESTAMP(3) and yield a STRING."""
    rendered = str(i)
    return rendered


# Expose the UDF under the name "to_string" so expression strings can call it.
table_env.create_temporary_system_function("to_string", to_string)




In [1]:
%flink.pyflink

# tables
input_table_name = "input_table"
output_table_name = "output_table"

# 2. Creates a source table from a Kinesis Data Stream
input_table_ddl = create_input_table(
    input_table_name, "big-crime-stream-crimes", "us-east-1", "LATEST"
)
table_env.execute_sql(input_table_ddl)

# 3. Creates a sink table writing to a Kinesis Data Stream
output_table_ddl = create_output_table(
    output_table_name, "big-crime-stream-summary", "us-east-1"
)
table_env.execute_sql(output_table_ddl)


In [2]:
%flink.pyflink

# 4. Queries the source table through the sliding-window aggregation (1-minute window,
# advancing every 30 seconds, counting cases per community area) and registers the
# result as a temporary view so that SQL statements can select from it by name.
sliding_window_view_name = "sliding_window_table"
sliding_window_table = perform_sliding_window_aggregation(input_table_name)
table_env.create_temporary_view(sliding_window_view_name, sliding_window_table)


In [3]:
%flink.pyflink

# 5. These sliding windows are inserted into the sink table
insert_statement = "INSERT INTO {0} SELECT community_area, case_count, event_time FROM {1}".format(
    output_table_name, "sliding_window_table"
)
table_result1 = table_env.execute_sql(insert_statement)

# Keep a handle on the submitted job. NOTE(review): per the PyFlink API docs,
# get_job_status() returns a future for the status request, not the status value itself.
job_client = table_result1.get_job_client()
job_status = job_client.get_job_status()

In [4]:
%flink.pyflink

# Stop the streaming INSERT job started from `table_result1`.
# `job_status` is the future returned by JobClient.get_job_status(); calling
# cancel() on it only cancels that status request and leaves the Flink job
# running. The job itself is cancelled through its JobClient.
job_client = table_result1.get_job_client()
if job_client is not None:
    # cancel() is asynchronous; result() waits until the cancellation request
    # has completed and raises if it failed.
    job_client.cancel().result()



In [5]:
%flink.ssql
