In [1]:
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

# Batch table environment backed by a DataSet execution environment that runs
# with a parallelism of 1.
t_config = TableConfig()
exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_env = BatchTableEnvironment.create(exec_env, t_config)

# Source table 'mySource': the file 'input.csv', read through the OldCsv format
# as a single STRING column named 'word'.
(
    t_env
    .connect(FileSystem().path('input.csv'))
    .with_format(
        OldCsv()
        .field('word', DataTypes.STRING())
    )
    .with_schema(
        Schema()
        .field('word', DataTypes.STRING())
    )
    .create_temporary_table('mySource')
)

# Sink table 'mySink': tab-delimited output at path 'output' with the columns
# (word STRING, count BIGINT).
(
    t_env
    .connect(FileSystem().path('output'))
    .with_format(
        OldCsv()
        .field_delimiter('\t')
        .field('word', DataTypes.STRING())
        .field('count', DataTypes.BIGINT())
    )
    .with_schema(
        Schema()
        .field('word', DataTypes.STRING())
        .field('count', DataTypes.BIGINT())
    )
    .create_temporary_table('mySink')
)

# Word count: group the source rows by 'word', count the rows of each group,
# and route the result into 'mySink'.
(
    t_env
    .from_path('mySource')
    .group_by('word')
    .select('word, count(1)')
    .insert_into('mySink')
)

# Run the job; the returned execution result is the cell's displayed value.
t_env.execute("tutorial_job")

<pyflink.common.job_execution_result.JobExecutionResult at 0x7f7de058b450>

In [2]:
from pyflink.table.descriptors import OldCsv, FileSystem

# Re-register the sink table 'mySink' on the current `t_env`, this time with a
# single STRING column 'word' (the BIGINT 'count' column is left out).
#
# 'mySink' may already be registered on this table environment, and registering
# a temporary table under a name that is already taken is rejected rather than
# replaced. Drop the previous registration first so that this cell can be
# re-run and can follow an earlier definition of 'mySink'.
# drop_temporary_table() reports via its boolean return value whether a table
# was removed, so it is safe to call when nothing is registered yet.
t_env.drop_temporary_table('mySink')

# NOTE(review): any query inserted into 'mySink' after this cell has to produce
# exactly one STRING column to match the schema declared below.
(
    t_env
    .connect(FileSystem().path('output'))
    .with_format(
        OldCsv()
        .field_delimiter('\t')
        .field('word', DataTypes.STRING())
    )
    .with_schema(
        Schema()
        .field('word', DataTypes.STRING())
    )
    .create_temporary_table('mySink')
)

<pyflink.table.descriptors.StreamTableDescriptor at 0x7f6f08609ed0>

In [None]:
# Count the rows per distinct 'word' of 'mySource'.
word_counts = t_env.from_path("mySource").group_by("word").select("word, count(1)")

# NOTE(review): this query yields two columns (word, count) - confirm that the
# 'mySink' table registered at this point declares both of them.
word_counts.insert_into("mySink")

# Run the job that carries the insert defined above.
t_env.execute("tutorial_job")

In [3]:
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, StreamTableEnvironment
from pyflink.table.descriptors import Kafka, Schema, Csv

# Location of the Kafka SQL connector jar handed to Flink below.
# NOTE(review): absolute, machine-specific path - presumably has to be adapted
# on any other host.
KAFKA_CONNECTOR_JAR = "file:///workdir/boris/local/lib/flink-sql-connector-kafka_2.11-1.11.2.jar"

# Table configuration with the connector jar set under the "pipeline.jars" key.
t_config = TableConfig()
t_config.get_configuration().set_string("pipeline.jars", KAFKA_CONNECTOR_JAR)

# Streaming table environment running with a parallelism of 1. This rebinds
# `exec_env`, `t_config` and `t_env` to their streaming versions.
exec_env = StreamExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_env = StreamTableEnvironment.create(exec_env, t_config)

# Kafka topic "main_topic" on the broker at localhost:9092.
kafka_source = (
    Kafka()
    .version("universal")
    .topic("main_topic")
    .property("bootstrap.servers", "localhost:9092")
)

# Source table "mySource": CSV records from the topic above, exposed as a
# single STRING column "word"; the CSV format derives its schema from the
# table schema.
(
    t_env
    .connect(kafka_source)
    .with_format(Csv().derive_schema())
    .with_schema(Schema().field("word", DataTypes.STRING()))
    .create_temporary_table("mySource")
)

<pyflink.table.descriptors.StreamTableDescriptor at 0x7f4598505bd0>

In [4]:
# Kafka topic "result_topic" on the broker at localhost:9092.
kafka_sink = (
    Kafka()
    .version("universal")
    .topic("result_topic")
    .property("bootstrap.servers", "localhost:9092")
)

# Sink table "mySink": CSV records written to the topic above. Only the STRING
# column "word" is declared; no "count" column is part of this sink.
(
    t_env
    .connect(kafka_sink)
    .with_format(Csv().derive_schema())
    .with_schema(Schema().field("word", DataTypes.STRING()))
    .create_temporary_table("mySink")
)

<pyflink.table.descriptors.StreamTableDescriptor at 0x7f4598501ed0>

In [None]:
# Pass-through pipeline: forward the "word" column of "mySource" to "mySink"
# unchanged (no grouping or counting is applied here).
words = t_env.from_path("mySource").select("word")
words.insert_into("mySink")

# Run the job that carries the insert defined above.
# NOTE(review): the source is a Kafka topic, so this call presumably keeps
# running until the job is cancelled - confirm before using Run All.
t_env.execute("tutorial_job")