In [1]:
import os

from pyflink.table import DataTypes, TableEnvironment, EnvironmentSettings
from pyflink.table.expressions import lit, call
from pyflink.table.udf import ScalarFunction
from pyflink.table.udf import udf
from pyflink.table.catalog import JdbcCatalog, CatalogBaseTable, ObjectPath
from pyflink.table import *

In [2]:
import re

def cleaning(string):
    """Normalise a raw comment into lowercase alphanumeric tokens.

    Steps, in order:
      1. drop every whitespace-separated word that contains ``@``;
      2. turn the literal ``(dot)`` into ``.``;
      3. if the first ``<a ...>`` tag carries an ``href``, strip that tag's
         attribute text from the string;
      4. blank out ``scheme://host/path`` style URLs;
      5. replace every run of characters other than ASCII letters, digits
         and spaces with a single space;
      6. collapse repeated spaces, trim, and lowercase.

    Args:
        string: the text to clean (``str``).

    Returns:
        The cleaned, lowercased text.
    """
    string = ' '.join(w for w in string.split() if '@' not in w)
    string = re.sub(r'\(dot\)', '.', string)

    # Look the anchor tag up once, with the same pattern for the check and the
    # removal, so the text that gets removed is the text that was inspected.
    anchor_attrs = re.findall(r'<a (.*?)>', string)
    if anchor_attrs and 'href' in anchor_attrs[0]:
        # Remove the attribute text literally. Feeding it to re.sub as a
        # pattern breaks on regex metacharacters, which are common in URLs
        # ('?', '+', '(' ...): it either raises re.error or fails to match.
        string = string.replace(anchor_attrs[0], '')

    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    string = re.sub('[^A-Za-z0-9 ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string.lower()

In [3]:
# Build a streaming-mode TableEnvironment.
# `use_blink_planner()` is dropped: Blink has been the default planner since
# Flink 1.11 and the method is a deprecated no-op from Flink 1.14 onwards.
# Streaming mode is the builder's default; it is spelled out here because the
# Kafka sources defined later in the notebook are unbounded.
settings = EnvironmentSettings.new_instance().in_streaming_mode().build()
t_env = TableEnvironment.create(settings)

In [4]:
# Register a JDBC catalog so that the Postgres tables can be queried as
# `my_catalog.<database>.<table>`.
name = "my_catalog"
default_database = "postgres"
# Credentials come from the environment so real secrets never have to be
# committed with the notebook; the fallbacks are the previously hardcoded
# values, so behaviour is unchanged when the variables are not set.
username = os.environ.get("POSTGRES_USER", "postgres")
password = os.environ.get("POSTGRES_PASSWORD", "postgres")
base_url = "jdbc:postgresql://postgresql:5432"

catalog = JdbcCatalog(name, default_database, username, password, base_url)
t_env.register_catalog(name, catalog)

In [5]:
# List the tables the JDBC catalog exposes for the `postgres` database.
catalog.list_tables('postgres')

['public.employee', 'public.salary']

In [6]:
# Read the `employee` table through the JDBC catalog and print its schema.
table_employee = t_env.sql_query('SELECT * FROM my_catalog.postgres.employee')
table_employee.print_schema()

(
  `id` INT NOT NULL,
  `name` STRING,
  `last_update` TIMESTAMP(6),
  `last_comment` STRING,
  `time_created` TIMESTAMP(6)
)


In [7]:
# Read the `salary` table through the JDBC catalog and print its schema.
table_salary = t_env.sql_query('SELECT * FROM my_catalog.postgres.salary')
table_salary.print_schema()

(
  `id` INT NOT NULL,
  `salary` DOUBLE
)


In [8]:
# Materialise the salary table as a pandas DataFrame for display.
table_salary.to_pandas()

Unnamed: 0,id,salary
0,1,1000.0
1,2,1000.0


In [9]:
# Declare `kafka_employee`: a Kafka-backed table over the
# `employee.public.employee` topic, decoded with the `debezium-json` format and
# read from the earliest offset. `last_update` and `time_created` are declared
# as BIGINT here, unlike the TIMESTAMP(6) columns of the JDBC table.
# Plain string (no `f` prefix): the DDL contains no placeholders.
source_ddl = """
    create table kafka_employee (
        id INT,
        name VARCHAR,
        last_update BIGINT,
        last_comment STRING,
        time_created BIGINT,
        PRIMARY KEY (id) NOT ENFORCED
    )
    with (
        'connector' = 'kafka',
        'topic' = 'employee.public.employee',
        'properties.bootstrap.servers' = 'broker:9092',
        'properties.group.id' = 'testGroup16',
        'scan.startup.mode' = 'earliest-offset',
        'format' = 'debezium-json',
        'debezium-json.schema-include' = 'true'
    )
"""

t_env.execute_sql(source_ddl)

<pyflink.table.table_result.TableResult at 0x7feef1d0c250>

In [10]:
# Declare `kafka_salary`: a Kafka-backed table over the
# `employee.public.salary` topic, decoded with the `debezium-json` format and
# read from the earliest offset.
# NOTE(review): `salary` is FLOAT here while the JDBC table reports DOUBLE;
# confirm the narrower type is intended.
# Plain string (no `f` prefix): the DDL contains no placeholders.
source_ddl = """
    create table kafka_salary (
        id INT,
        salary FLOAT,
        PRIMARY KEY (id) NOT ENFORCED
    )
    with (
        'connector' = 'kafka',
        'topic' = 'employee.public.salary',
        'properties.bootstrap.servers' = 'broker:9092',
        'properties.group.id' = 'testGroup16',
        'scan.startup.mode' = 'earliest-offset',
        'format' = 'debezium-json',
        'debezium-json.schema-include' = 'true'
    )
"""

t_env.execute_sql(source_ddl)

<pyflink.table.table_result.TableResult at 0x7feef1d0c6a0>

In [11]:
# tab = t_env.from_path('kafka_employee')
# tab

In [12]:
# t_env.sql_query("select id, name, 1 from kafka_employee").execute_insert('my_catalog.postgres.employee_test').print()

In [13]:
class SentimentClassifier(ScalarFunction):
    """Scalar function that labels text using a pickled model stored on HDFS.

    The model is loaded in the constructor, so an instance is usable as soon
    as it has been created.
    """

    def __init__(self, filename = 'tfidf-nb-malay-sentiment.pkl'):
        """Load the pickled model ``filename`` from the HDFS ``/user`` directory.

        Args:
            filename: name of the pickle file inside ``/user`` on the HDFS
                namenode reachable at ``hdfs:9000``.
        """
        # Imported locally: these modules are only needed to load the model.
        import pydoop.hdfs
        import os
        import pickle

        hdfs = pydoop.hdfs.hdfs(host = 'hdfs', port = 9000)
        with hdfs.open_file(os.path.join('/user', filename), 'rb') as fopen:
            # SECURITY: unpickling executes arbitrary code embedded in the
            # file. Only load model files from a location you control.
            self.model = pickle.loads(fopen.read())

    def eval(self, string):
        """Return the model's predicted label for ``string`` as a ``str``.

        The text is passed through ``cleaning`` before prediction.

        Args:
            string: the text to classify, or ``None`` for a SQL NULL.

        Returns:
            The predicted label converted with ``str``, or ``None`` when
            ``string`` is ``None``.
        """
        # Nullable STRING columns reach the function as None; propagate the
        # NULL instead of failing inside `cleaning` with AttributeError.
        if string is None:
            return None
        return str(self.model.predict([cleaning(string)])[0])

In [14]:
# Instantiate the classifier with its default model file; the model is loaded
# from HDFS inside the constructor.
classifier = SentimentClassifier()

2022-04-16 16:06:08,371 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [15]:
# Smoke-test the classifier directly in Python before wrapping it as a UDF.
classifier.eval('saya busuk')

'Negative'

In [16]:
# Wrap the classifier instance as a PyFlink UDF whose result type is STRING.
sentiment_classifier = udf(classifier, result_type=DataTypes.STRING())

In [17]:
# Expose the UDF to SQL under the name `sentiment_classifier`.
# NOTE(review): `register_function` is marked deprecated in recent PyFlink
# releases in favour of `create_temporary_system_function` - confirm against
# the installed version before migrating.
t_env.register_function('sentiment_classifier', sentiment_classifier)

In [18]:
# Join employee and salary (both read through the JDBC catalog) on `id`, adding
# a `sentiment` column computed by the UDF from `last_comment`, then print the
# resulting schema.
sql = """
select e.*, sentiment_classifier(e.last_comment) as sentiment, s.salary from my_catalog.postgres.employee e,
my_catalog.postgres.salary s
where e.id = s.id
"""
tab = t_env.sql_query(sql)
tab.print_schema()

(
  `id` INT NOT NULL,
  `name` STRING,
  `last_update` TIMESTAMP(6),
  `last_comment` STRING,
  `time_created` TIMESTAMP(6),
  `sentiment` STRING,
  `salary` DOUBLE
)


In [19]:
# Run the join and show the result, including the predicted sentiment column.
tab.to_pandas()

Unnamed: 0,id,name,last_update,last_comment,time_created,sentiment,salary
0,1,husein,2022-04-16 15:43:57.759551,haram jadah betui,2022-04-16 15:43:57.759551,Negative,1000.0
1,2,kasim,2022-04-16 15:43:57.759551,saya suka kerja disini,2022-04-16 15:43:57.759551,Positive,1000.0


In [20]:
# HDFS location used as the `path` of the Hudi sink table.
# Plain string (no `f` prefix): there is nothing to interpolate.
hdfs_path = 'hdfs://hdfs:9000/user/employee_salary_acid_v4'

In [21]:
# Drop any existing `employee_salary` table definition so this cell can be
# re-run. Plain string: the statement has no placeholders.
sink_ddl = """
DROP TABLE IF EXISTS employee_salary
"""
t_env.execute_sql(sink_ddl)

# (Re)create `employee_salary` as a Hudi MERGE_ON_READ table stored at
# `hdfs_path`, with changelog mode enabled and async compaction disabled.
# This one stays an f-string because it interpolates `hdfs_path`.
sink_ddl = f"""
CREATE TABLE employee_salary (
    id INT PRIMARY KEY NOT ENFORCED,
    name STRING,
    last_comment STRING,
    sentiment STRING,
    salary DOUBLE,
    last_update TIMESTAMP(3),
    time_created TIMESTAMP(3)
) WITH (
  'connector' = 'hudi',
  'path' = '{hdfs_path}',
  'table.type' = 'MERGE_ON_READ',
  'changelog.enabled' = 'true',
  'compaction.async.enabled' = 'false'
)
"""
t_env.execute_sql(sink_ddl)

<pyflink.table.table_result.TableResult at 0x7feec50ffd60>

In [22]:
# Join the Kafka-backed `kafka_salary` table with the JDBC `employee` table on
# `id`. `salary` is cast to DOUBLE and both timestamps to TIMESTAMP(3), which
# are the column types declared on the `employee_salary` table.
sql_join = """
SELECT e.id, e.name, e.last_comment, sentiment_classifier(e.last_comment) as sentiment, 
CAST(s.salary AS DOUBLE) as salary, CAST(e.last_update AS TIMESTAMP(3)) as last_update, 
CAST(e.time_created AS TIMESTAMP(3)) as time_created
FROM kafka_salary s, my_catalog.postgres.employee e
WHERE s.id = e.id
"""
tab_join = t_env.sql_query(sql_join)

In [23]:
# Print the schema of the Kafka/JDBC join result.
tab_join.print_schema()

(
  `id` INT NOT NULL,
  `name` STRING,
  `last_comment` STRING,
  `sentiment` STRING,
  `salary` DOUBLE,
  `last_update` TIMESTAMP(3),
  `time_created` TIMESTAMP(3)
)


In [26]:
# tab_join.execute_insert('employee_salary').print()

In [25]:
# Read the `employee_salary` table back and display it as a pandas DataFrame.
# Note: `tab` is rebound here; earlier it held the JDBC-only join result.
tab = t_env.from_path('employee_salary')
tab.to_pandas()

2022-04-16 16:17:08,212 INFO  org.apache.hadoop.conf.Configuration.deprecation             [] - mapred.job.map.memory.mb is deprecated. Instead, use mapreduce.map.memory.mb


Unnamed: 0,id,name,last_comment,sentiment,salary,last_update,time_created
