In [1]:
from pyflink.table import DataTypes, TableEnvironment, EnvironmentSettings
from pyflink.table.expressions import lit, call
from pyflink.table.udf import ScalarFunction
from pyflink.table.udf import udf
from pyflink.table.catalog import JdbcCatalog, CatalogBaseTable, ObjectPath
from pyflink.table import *

In [2]:
import re

def cleaning(string):
    """Normalise free text into lowercase alphanumeric tokens for the classifier.

    Steps, in order:
      1. drop whitespace-separated tokens that contain ``@``;
      2. rewrite the obfuscation ``(dot)`` as ``.``;
      3. strip the attributes of the first ``<a ...>`` tag when they mention ``href``;
      4. blank out ``scheme://host/path`` style URLs;
      5. turn every run of non-alphanumeric characters into a single space,
         collapse repeated spaces, trim and lowercase.

    Args:
        string: raw text, or ``None``.

    Returns:
        The cleaned, lower-cased string; ``''`` for ``None`` or empty input.
    """
    if string is None:
        return ''

    string = ' '.join(w for w in string.split() if '@' not in w)
    string = string.replace('(dot)', '.')

    # Detect and remove using the SAME match, and remove it as literal text:
    # tag attributes routinely contain regex metacharacters ('?', '(', '[', ...)
    # which must not be interpreted as a pattern.
    anchor = re.search(r'<a (.*?)>', string)
    if anchor and 'href' in anchor.group(1):
        string = string.replace(' ' + anchor.group(1), '')

    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    string = re.sub(r'[^A-Za-z0-9 ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string.lower()

In [3]:
# Build the Table API entry point in streaming mode.
# `use_blink_planner()` is deprecated (Flink 1.14: a planner declaration is no
# longer required), so the execution mode is requested explicitly instead.
# NOTE(review): confirm against the Flink version this notebook targets.
settings = EnvironmentSettings.new_instance().in_streaming_mode().build()
t_env = TableEnvironment.create(settings)

In [4]:
import os

# Register the PostgreSQL database as a Flink catalog named `my_catalog`, so
# its tables can be addressed as my_catalog.<database>.<table> in SQL.
name = "my_catalog"
default_database = "postgres"
# Credentials are taken from the environment when provided; the fallbacks keep
# the previous hard-coded values so the notebook still runs unchanged.
username = os.environ.get("POSTGRES_USER", "postgres")
password = os.environ.get("POSTGRES_PASSWORD", "postgres")
base_url = "jdbc:postgresql://postgresql:5432"

catalog = JdbcCatalog(name, default_database, username, password, base_url)
t_env.register_catalog(name, catalog)

In [5]:
# List the tables the JDBC catalog exposes for the `postgres` database.
catalog.list_tables('postgres')

['public.employee',
 'public.employee_salary',
 'public.employee_test',
 'public.salary']

In [6]:
# Read the employee table through the JDBC catalog and show the schema Flink
# derived for it.
table_employee = t_env.sql_query('SELECT * FROM my_catalog.postgres.employee')
table_employee.print_schema()

(
  `id` INT NOT NULL,
  `name` STRING,
  `last_update` TIMESTAMP(6),
  `last_comment` STRING,
  `time_created` TIMESTAMP(6)
)


In [7]:
# Read the salary table through the JDBC catalog and show the schema Flink
# derived for it.
table_salary = t_env.sql_query('SELECT * FROM my_catalog.postgres.salary')
table_salary.print_schema()

(
  `id` INT NOT NULL,
  `salary` DOUBLE
)


In [8]:
# Execute the query and pull the result into a pandas DataFrame for display.
table_salary.to_pandas()

Unnamed: 0,id,salary
0,1,1000.0
1,2,1000.0


In [9]:
# Kafka source table over the topic `employee.public.employee`, decoded with
# the `debezium-json` format (schema included in each message).
# Plain string on purpose: nothing is interpolated, and an f-string would
# treat any future `{`/`}` inside the DDL as a placeholder.
# NOTE(review): last_update / time_created are declared BIGINT here while the
# JDBC catalog reports TIMESTAMP(6) — presumably epoch-based values in the
# change events; confirm the unit before converting.
source_ddl = """
    create table kafka_employee (
        id INT,
        name VARCHAR,
        last_update BIGINT,
        last_comment STRING,
        time_created BIGINT,
        PRIMARY KEY (id) NOT ENFORCED
    )
    with (
        'connector' = 'kafka',
        'topic' = 'employee.public.employee',
        'properties.bootstrap.servers' = 'broker:9092',
        'properties.group.id' = 'testGroup11',
        'scan.startup.mode' = 'earliest-offset',
        'format' = 'debezium-json',
        'debezium-json.schema-include' = 'true'
    )
"""

t_env.execute_sql(source_ddl)

<pyflink.table.table_result.TableResult at 0x7f602d88c790>

In [10]:
# Kafka source table over the topic `employee.public.salary`, decoded with
# the `debezium-json` format (schema included in each message).
# Plain string on purpose: nothing is interpolated here.
# `salary` is declared DOUBLE to match the type the JDBC catalog reports for
# the source column; FLOAT would narrow the value before the later
# CAST(... AS DOUBLE) and lose precision.
source_ddl = """
    create table kafka_salary (
        id INT,
        salary DOUBLE,
        PRIMARY KEY (id) NOT ENFORCED
    )
    with (
        'connector' = 'kafka',
        'topic' = 'employee.public.salary',
        'properties.bootstrap.servers' = 'broker:9092',
        'properties.group.id' = 'testGroup11',
        'scan.startup.mode' = 'earliest-offset',
        'format' = 'debezium-json',
        'debezium-json.schema-include' = 'true'
    )
"""

t_env.execute_sql(source_ddl)

<pyflink.table.table_result.TableResult at 0x7f607836a6a0>

In [11]:
# tab = t_env.from_path('kafka_employee')
# tab

In [12]:
# t_env.sql_query("select id, name, 1 from kafka_employee").execute_insert('my_catalog.postgres.employee_test').print()

In [13]:
class SentimentClassifier(ScalarFunction):
    """Scalar UDF that labels a piece of text with a pickled model read from HDFS.

    The model is loaded once, when the instance is constructed, from
    ``/user/<filename>`` on the HDFS namenode at ``hdfs:9000``.

    Args:
        filename: name of the pickle file under ``/user`` on HDFS.
    """

    def __init__(self, filename = 'tfidf-nb-malay-sentiment.pkl'):
        import pydoop.hdfs
        import os
        import pickle

        hdfs = pydoop.hdfs.hdfs(host = 'hdfs', port = 9000)
        try:
            with hdfs.open_file(os.path.join('/user', filename), 'rb') as fopen:
                # SECURITY: unpickling executes arbitrary code embedded in the
                # file — only load model files from a trusted HDFS location.
                self.model = pickle.loads(fopen.read())
        finally:
            # Release the HDFS connection even if reading or unpickling fails.
            hdfs.close()

    def eval(self, string):
        """Return the predicted label for ``string`` as a str.

        SQL NULL arrives as ``None`` and is propagated as NULL instead of
        crashing inside the text cleaning step.
        """
        if string is None:
            return None
        return str(self.model.predict([cleaning(string)])[0])

In [14]:
# Instantiate the classifier; the constructor reads the default pickled model
# from HDFS.
classifier = SentimentClassifier()

2022-03-05 07:10:01,151 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [15]:
# Smoke-test the model by calling eval() directly in Python, outside Flink.
classifier.eval('saya busuk')

'Negative'

In [16]:
# Wrap the classifier instance as a Flink UDF whose result type is STRING.
sentiment_classifier = udf(classifier, result_type=DataTypes.STRING())

In [17]:
# Make the UDF callable from SQL under the name `sentiment_classifier`.
# NOTE(review): `register_function` appears to be deprecated in recent PyFlink
# releases in favour of `create_temporary_system_function` — confirm for the
# target version (and how it behaves when this cell is re-run) before switching.
t_env.register_function('sentiment_classifier', sentiment_classifier)

In [18]:
# Join employee and salary straight from the JDBC catalog on `id`, adding the
# UDF's label for `last_comment` as a `sentiment` column, then print the
# resulting schema.
sql = """
select e.*, sentiment_classifier(e.last_comment) as sentiment, s.salary from my_catalog.postgres.employee e,
my_catalog.postgres.salary s
where e.id = s.id
"""
tab = t_env.sql_query(sql)
tab.print_schema()

(
  `id` INT NOT NULL,
  `name` STRING,
  `last_update` TIMESTAMP(6),
  `last_comment` STRING,
  `time_created` TIMESTAMP(6),
  `sentiment` STRING,
  `salary` DOUBLE
)


In [19]:
# Execute the join and pull the result into a pandas DataFrame for display.
tab.to_pandas()

Unnamed: 0,id,name,last_update,last_comment,time_created,sentiment,salary
0,1,husein,2022-03-04 05:01:44.181104,haram jadah betui,2022-03-04 05:01:44.181104,Negative,1000.0
1,2,kasim,2022-03-04 05:01:44.181104,saya suka kerja disini,2022-03-04 05:01:44.181104,Positive,1000.0


In [21]:
# Sink table backed by the `upsert-kafka` connector: rows are keyed by `id`
# and written to the topic `employee-salary` with JSON keys and values.
# NOTE(review): `SALARY` is the only upper-case column and becomes the JSON
# key "SALARY" in the emitted records; renaming it would change the payload
# seen by consumers, so confirm with them before normalising the case.
sink_ddl = """
CREATE TABLE employee_salary (
    id INT,
    name STRING,
    last_comment STRING,
    sentiment STRING,
    SALARY DOUBLE,
    last_update TIMESTAMP,
    time_created TIMESTAMP,
    PRIMARY KEY (id) NOT ENFORCED
) WITH (
  'connector' = 'upsert-kafka',
  'topic' = 'employee-salary',
  'properties.bootstrap.servers' = 'broker:9092',
  'key.format' = 'json',
  'value.format' = 'json'
)
"""
t_env.execute_sql(sink_ddl)

<pyflink.table.table_result.TableResult at 0x7f5fffacc2e0>

In [22]:
# Join the Kafka-backed salary table with the employee table from the JDBC
# catalog on `id`. The column order matches the `employee_salary` sink
# declaration, and salary is cast to DOUBLE to match the sink column type.
sql_join = """
SELECT e.id, e.name, e.last_comment, sentiment_classifier(e.last_comment) as sentiment, 
CAST(s.salary AS DOUBLE) as salary, e.last_update, e.time_created
FROM kafka_salary s, my_catalog.postgres.employee e
WHERE s.id = e.id
"""
tab_join = t_env.sql_query(sql_join)

In [23]:
# Check that the join's schema lines up with the sink table columns.
tab_join.print_schema()

(
  `id` INT NOT NULL,
  `name` STRING,
  `last_comment` STRING,
  `sentiment` STRING,
  `salary` DOUBLE,
  `last_update` TIMESTAMP(6),
  `time_created` TIMESTAMP(6)
)


In [25]:
# insert = tab_join.execute_insert('employee_salary').print()

In [27]:
# !pip3 install confluent-kafka

In [28]:
from confluent_kafka import Consumer

In [29]:
# Plain Kafka consumer used to inspect what the Flink job wrote to the sink
# topic. 'earliest' makes a consumer group with no committed offsets start
# from the beginning of the topic.
conf = {
    'bootstrap.servers': 'broker:9092',
    'group.id': 'test',
    'session.timeout.ms': 6000,
    'auto.offset.reset': 'earliest',
}
c = Consumer(conf)
c.subscribe(['employee-salary'])

In [31]:
# Wait up to 10 seconds for a record. A zero timeout returns immediately and
# yields None until the consumer has joined its group and fetched data, which
# makes the following cell fail on a fresh run.
# `message` is still None if nothing arrives within the timeout.
message = c.poll(10.0)
message

<cimpl.Message at 0x7f5ffd7239c0>

In [32]:
# Raw payload bytes of the polled record (requires that the poll above
# returned a message rather than None).
message.value()

b'{"id":1,"name":"husein","last_comment":"haram jadah betui","sentiment":"Negative","SALARY":1000.0,"last_update":"2022-03-04 05:01:44.181104","time_created":"2022-03-04 05:01:44.181104"}'