In [1]:
import os

from pyflink.table import DataTypes, TableEnvironment, EnvironmentSettings
from pyflink.table.expressions import lit, call
from pyflink.table.udf import ScalarFunction
from pyflink.table.udf import udf
from pyflink.table.catalog import JdbcCatalog, CatalogBaseTable, ObjectPath
from pyflink.table import *

In [2]:
import re

def cleaning(string):
    """Normalise raw text into lower-case, space-separated alphanumeric tokens.

    The steps run in this order:

    1. Drop every whitespace-separated token that contains ``@``.
    2. Replace the literal text ``(dot)`` with ``.``.
    3. If the first ``<a ...>`` tag carries an ``href``, remove that tag's
       attribute text (the tag name itself is left in place).
    4. Replace URLs (``scheme://host/path``) with a space.
    5. Replace every run of characters other than ASCII letters, digits and
       spaces with a single space, collapse repeated spaces, trim, lower-case.

    Args:
        string: The text to clean.

    Returns:
        The cleaned, lower-cased text (possibly empty).
    """
    string = ' '.join(w for w in string.split() if '@' not in w)
    string = string.replace('(dot)', '.')

    # Inspect and remove the SAME match. The attribute text is removed as a
    # literal: it usually contains regex metacharacters ('?', '(', '+', ...)
    # that would otherwise make the removal silently fail or raise re.error.
    anchors = re.findall(r'<a (.*?)>', string)
    if anchors and 'href' in anchors[0]:
        string = string.replace(' ' + anchors[0], '')

    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    string = re.sub('[^A-Za-z0-9 ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string.lower()

In [3]:
# Build a streaming-mode table environment. The Blink planner is already the
# default, and `use_blink_planner()` is deprecated (and removed in later PyFlink
# releases), so it is no longer requested explicitly.
settings = EnvironmentSettings.new_instance().in_streaming_mode().build()
t_env = TableEnvironment.create(settings)

In [4]:
# Connection settings for the PostgreSQL-backed JDBC catalog.
# Credentials and URL can be overridden through environment variables so real
# secrets never have to be saved in the notebook; the fallbacks are the
# original demo values. The catalog and database names stay fixed because the
# SQL in later cells refers to `my_catalog.postgres.*` literally.
name = "my_catalog"
default_database = "postgres"
username = os.environ.get("PG_CATALOG_USER", "postgres")
password = os.environ.get("PG_CATALOG_PASSWORD", "postgres")
base_url = os.environ.get("PG_CATALOG_URL", "jdbc:postgresql://postgresql:5432")

catalog = JdbcCatalog(name, default_database, username, password, base_url)
t_env.register_catalog(name, catalog)

In [5]:
# List the tables the JDBC catalog reports for the 'postgres' database.
catalog.list_tables('postgres')

['public.employee',
 'public.employee_salary',
 'public.employee_test',
 'public.salary']

In [6]:
# Query the `employee` table through the registered catalog and print its schema.
table_employee = t_env.sql_query('SELECT * FROM my_catalog.postgres.employee')
table_employee.print_schema()

(
  `id` INT NOT NULL,
  `name` STRING,
  `last_update` TIMESTAMP(6),
  `last_comment` STRING,
  `time_created` TIMESTAMP(6)
)


In [7]:
# Query the `salary` table through the registered catalog and print its schema.
table_salary = t_env.sql_query('SELECT * FROM my_catalog.postgres.salary')
table_salary.print_schema()

(
  `id` INT NOT NULL,
  `salary` DOUBLE
)


In [8]:
# table_salary.to_pandas()

In [9]:
# DDL for a Kafka-backed table over the `employee.public.employee` topic, read
# from the earliest offset and decoded with the `debezium-json` format.
# A plain string is used (no f-prefix) because nothing is interpolated.
source_ddl = """
    create table kafka_employee (
        id INT,
        name VARCHAR,
        last_update BIGINT,
        last_comment STRING,
        time_created BIGINT,
        PRIMARY KEY (id) NOT ENFORCED
    )
    with (
        'connector' = 'kafka',
        'topic' = 'employee.public.employee',
        'properties.bootstrap.servers' = 'broker:9092',
        'properties.group.id' = 'testGroup9',
        'scan.startup.mode' = 'earliest-offset',
        'format' = 'debezium-json',
        'debezium-json.schema-include' = 'true'
    )
"""

t_env.execute_sql(source_ddl)

<pyflink.table.table_result.TableResult at 0x7f06f47e8ee0>

In [10]:
# DDL for a Kafka-backed table over the `employee.public.salary` topic, read
# from the earliest offset and decoded with the `debezium-json` format.
# A plain string is used (no f-prefix) because nothing is interpolated.
source_ddl = """
    create table kafka_salary (
        id INT,
        salary FLOAT,
        PRIMARY KEY (id) NOT ENFORCED
    )
    with (
        'connector' = 'kafka',
        'topic' = 'employee.public.salary',
        'properties.bootstrap.servers' = 'broker:9092',
        'properties.group.id' = 'testGroup9',
        'scan.startup.mode' = 'earliest-offset',
        'format' = 'debezium-json',
        'debezium-json.schema-include' = 'true'
    )
"""

t_env.execute_sql(source_ddl)

<pyflink.table.table_result.TableResult at 0x7f06f47e8cd0>

In [11]:
# tab = t_env.from_path('kafka_employee')
# tab

In [12]:
# tab.select('id, name').execute_insert('my_catalog.postgres.employee_test').print()

In [13]:
class SentimentClassifier(ScalarFunction):
    """Scalar UDF that labels a text with the prediction of a pickled model.

    The model is read once, at construction time, from ``/user/<filename>`` on
    HDFS and kept on the instance as ``self.model``.
    """

    def __init__(self, filename='tfidf-nb-malay-sentiment.pkl',
                 host='hdfs', port=9000):
        """Load the pickled model from HDFS.

        Args:
            filename: Name of the pickle file stored under ``/user`` on HDFS.
            host: HDFS host to connect to.
            port: HDFS port to connect to.
        """
        # Imported locally, as in the original cell, so these modules are only
        # needed where the classifier is actually constructed.
        import pydoop.hdfs
        import os
        import pickle

        fs = pydoop.hdfs.hdfs(host=host, port=port)
        try:
            with fs.open_file(os.path.join('/user', filename), 'rb') as fopen:
                # SECURITY: unpickling executes arbitrary code embedded in the
                # file. Only load model files from a trusted HDFS location.
                self.model = pickle.loads(fopen.read())
        finally:
            # Release the HDFS handle even if reading or unpickling fails.
            fs.close()

    def eval(self, string):
        """Return the model's predicted label for ``string`` as a ``str``.

        ``None`` (a SQL NULL from a nullable column) is passed through as
        ``None`` instead of failing inside ``cleaning``.
        """
        if string is None:
            return None
        return str(self.model.predict([cleaning(string)])[0])

In [14]:
# Instantiate the UDF with its default model file; construction reads the pickle from HDFS.
classifier = SentimentClassifier()

2022-03-04 07:27:51,160 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [15]:
# Sanity-check the classifier locally by calling eval() directly on a sample sentence.
classifier.eval('saya busuk')

'Negative'

In [16]:
# Wrap the classifier instance as a PyFlink UDF whose declared result type is STRING.
sentiment_classifier = udf(classifier, result_type=DataTypes.STRING())

In [17]:
# Register the UDF under the name 'sentiment_classifier' so SQL queries can call it.
# NOTE(review): newer PyFlink releases deprecate register_function in favour of
# create_temporary_system_function — confirm against the installed version
# (and its behaviour on re-registration) before switching.
t_env.register_function('sentiment_classifier', sentiment_classifier)

In [18]:
# """
# INSERT INTO my_catalog.postgres.employee_salary
# """

In [19]:
# tab = t_env.from_path('kafka_salary')

In [20]:
# """
# CREATE TABLE IF NOT EXISTS public.employee_salary (
#   id INT,
#   name text,
#   last_comment text,
#   sentiment text,
#   salary FLOAT,
#   last_update TIMESTAMP,
#   time_created TIMESTAMP,
#   PRIMARY KEY (id)
# );

# CREATE TABLE IF NOT EXISTS public.employee_test (id INT, name text, PRIMARY KEY (id));
# """

In [21]:
# Join the Kafka salary table with the Postgres `employee_salary` table on id and
# add a `sentiment` column computed by the registered UDF from `last_comment`.
# NOTE(review): the query reads from my_catalog.postgres.employee_salary, which is
# also the table the result is inserted into further down; the intended source
# may be the `employee` table — confirm.
sql_join = """
SELECT s.id, e.name, e.last_comment, sentiment_classifier(e.last_comment) as sentiment,
s.salary, e.last_update, e.time_created FROM kafka_salary s, my_catalog.postgres.employee_salary e
WHERE s.id = e.id
"""
tab_join = t_env.sql_query(sql_join)

In [22]:
# Print the schema of the joined result.
tab_join.print_schema()

(
  `id` INT NOT NULL,
  `name` STRING,
  `last_comment` STRING,
  `sentiment` STRING,
  `salary` FLOAT,
  `last_update` TIMESTAMP(6),
  `time_created` TIMESTAMP(6)
)


In [23]:
# Submit the joined table as an INSERT into the Postgres `employee_salary` table
# and keep the returned result handle.
insert = tab_join.execute_insert('my_catalog.postgres.employee_salary')

In [24]:
# Exploratory: list the attributes and methods exposed by the returned result object.
dir(insert)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_j_table_result',
 'collect',
 'get_job_client',
 'get_result_kind',
 'get_table_schema',
 'print',
 'wait']

In [None]:
# Print the result of the insert statement.
# NOTE(review): this cell carries no execution count in the saved notebook, so it
# was not recorded as having completed — confirm the expected behaviour.
insert.print()