In [1]:
from pyflink.table import DataTypes, TableEnvironment, EnvironmentSettings
from pyflink.table.expressions import lit, call
from pyflink.table.udf import ScalarFunction
from pyflink.table.udf import udf

In [2]:
import re

def cleaning(string):
    """Normalise raw text into lowercase alphanumeric words for the classifier.

    Steps, applied in order:
      1. drop every whitespace-separated token that contains ``@``;
      2. turn the literal ``(dot)`` into ``.``;
      3. if the first ``<a ...>`` tag carries an ``href``, delete that tag's
         attribute text (the bare ``<a>`` is left for step 5 to clean up);
      4. blank out ``scheme://host/path`` style URLs;
      5. replace every run of characters outside ``[A-Za-z0-9 ]`` with a space;
      6. collapse repeated spaces, strip, and lowercase.

    Args:
        string: The text to clean.

    Returns:
        The cleaned, lowercased text (possibly empty).
    """
    string = ' '.join(w for w in string.split() if '@' not in w)
    string = re.sub(r'\(dot\)', '.', string)

    # The same pattern is used both to test for and to remove the attributes,
    # and the removal is a literal replace: attribute text is arbitrary input
    # (URLs with '?', '(', '+', ...) and must not be interpreted as a regex.
    anchor = re.search(r'<a( .*?)>', string)
    if anchor is not None and 'href' in anchor.group(1):
        string = string.replace(anchor.group(1), '')

    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    string = re.sub('[^A-Za-z0-9 ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string.lower()

In [3]:
# Batch-mode table environment. The explicit `use_blink_planner()` call was
# dropped: Blink is already the default planner and the method is deprecated,
# so omitting it keeps the same planner while staying forward-compatible.
settings = EnvironmentSettings.new_instance().in_batch_mode().build()
t_env = TableEnvironment.create(settings)

In [4]:
class SentimentClassifier(ScalarFunction):
    """Scalar UDF that labels a text using a pickled model stored on HDFS.

    The model is read once, in the constructor, and kept on ``self.model``.
    ``eval`` cleans each input with ``cleaning`` and returns the model's
    prediction for it as a string.
    """

    def __init__(self, filename = 'tfidf-nb-malay-sentiment.pkl', host = 'hdfs', port = 9000):
        """Load the pickled model found at ``/user/<filename>`` on HDFS.

        Args:
            filename: Name of the pickle file under ``/user``.
            host: Host passed to ``pydoop.hdfs.hdfs``.
            port: Port passed to ``pydoop.hdfs.hdfs``.
        """
        import pydoop.hdfs
        import os
        import pickle

        hdfs = pydoop.hdfs.hdfs(host = host, port = port)
        try:
            with hdfs.open_file(os.path.join('/user', filename), 'rb') as fopen:
                # SECURITY: unpickling executes arbitrary code embedded in the
                # file; only point this at model files from a trusted source.
                self.model = pickle.loads(fopen.read())
        finally:
            # The handle is only needed while loading; release it either way.
            hdfs.close()

    def eval(self, string):
        """Return the predicted label for ``string`` as a ``str``.

        A ``None`` input (SQL NULL) yields ``None`` instead of failing inside
        ``cleaning``.
        """
        if string is None:
            return None
        return str(self.model.predict([cleaning(string)])[0])

In [5]:
# Constructing the UDF runs its __init__, which reads the pickled model from HDFS.
classifier = SentimentClassifier()

2022-03-04 04:30:52,915 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Smoke test: call eval() directly in the notebook, without going through Flink.
classifier.eval('saya busuk')

'Negative'

In [7]:
# Wrap the classifier instance as a Python UDF whose declared result type is STRING.
sentiment_classifier = udf(classifier, result_type=DataTypes.STRING())

In [8]:
# Expose the UDF to SQL / Table API expressions under the name 'sentiment_classifier'.
# `register_function` is deprecated; this is its documented replacement.
t_env.create_temporary_system_function('sentiment_classifier', sentiment_classifier)

In [9]:
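# Dataset used below (data.csv). Uncomment and run once to download it; kept commented out so re-runs do not re-fetch.
# !wget https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/supervised-twitter/data.csv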

In [10]:
import pandas as pd

# `filename` is reused by later cells for the HDFS upload path and the source table path.
filename = 'data.csv'
# The file is tab-separated despite its .csv extension.
df = pd.read_csv(filename, sep = '\t')
df.head()

Unnamed: 0,text,id,sentiment,annotator,annotation_id,created_at,updated_at,lead_time
0,perempuan tu apesal aku rasa mcm pernah nampak...,68381,Neutral,27,4603,2022-02-15T14:04:28.798425Z,2022-02-15T14:04:32.792755Z,21974.04
1,Polis tangkap dia.,68345,Negative,27,4567,2022-02-15T12:37:58.202450Z,2022-02-15T12:38:00.815718Z,16782.079
2,Kenapa lokasi kebakaran nya terlalu spesifik? ...,68325,Negative,27,4547,2022-02-15T11:47:23.902859Z,2022-02-15T11:47:27.849352Z,13749.111
3,@YuleumSummer Waa senang nye lah nk bersalin.....,68286,Positive,27,4508,2022-02-15T08:59:24.606503Z,2022-02-15T08:59:27.475987Z,3668.697
4,DAULAT TUANKU! Merafak sembah dan takzim KDYMM...,68247,Positive,27,4469,2022-02-15T08:34:10.363218Z,2022-02-15T08:34:13.342956Z,2154.577


In [11]:
# Column list for the source DDL: every CSV column is declared as VARCHAR, with
# the name back-quoted (some names, e.g. `Unnamed: 0`, are not plain identifiers).
columns = ', '.join(f'`{name}` VARCHAR' for name in df.columns.tolist())

In [12]:
import pydoop.hdfs
import os

hdfs = pydoop.hdfs.hdfs(host = 'hdfs', port = 9000)

# Upload the CSV to /user/<filename> on HDFS as raw bytes. Reading in binary
# mode avoids decoding with the locale's default encoding and re-encoding,
# which could alter or reject bytes. The local file is opened first so a
# missing file does not leave a truncated HDFS file behind.
with open(filename, 'rb') as fopen_csv:
    with hdfs.open_file(os.path.join('/user', filename), 'wb') as fopen:
        fopen.write(fopen_csv.read())

In [13]:
import os

hdfs_path_input = f'hdfs://hdfs:9000/user/{filename}'

# Source: the CSV uploaded to HDFS. The `\t` below sits in a non-raw Python
# string, so it is expanded by Python and the DDL contains a literal tab
# character as the field delimiter.
my_source_ddl = f"""
    create table mySource (
        {columns}
    ) with (
        'connector' = 'filesystem',
        'format' = 'csv',
        'csv.field-delimiter' = '\t',
        'path' = '{hdfs_path_input}'
    )
"""

# Presumably the Postgres-side table has to be created beforehand with:
#   create table mysink_sentiment (word varchar, label varchar);

# Sink credentials default to the values previously hard-coded here but can be
# overridden through the environment so real secrets need not live in the
# notebook. Single quotes are doubled so the values stay valid SQL literals.
pg_user = os.environ.get('POSTGRES_USER', 'postgres').replace("'", "''")
pg_password = os.environ.get('POSTGRES_PASSWORD', 'postgres').replace("'", "''")

my_sink_ddl = f"""
    create table mySink (
        word VARCHAR,
        label VARCHAR
    ) with (
        'connector' = 'jdbc',
        'url' = 'jdbc:postgresql://postgres:5432/postgres',
        'table-name' = 'mysink_sentiment',
        'username' = '{pg_user}',
        'password' = '{pg_password}'
    )
"""

t_env.execute_sql(my_source_ddl)
t_env.execute_sql(my_sink_ddl)

<pyflink.table.table_result.TableResult at 0x7f0df99df3d0>

In [14]:
# Read the source table, skipping one row.
# NOTE(review): presumably offset(1) is meant to drop the CSV header line; no
# ordering is specified here, so confirm the header is always the row skipped.
tab = t_env.from_path('mySource').offset(1)
tab

<pyflink.table.table.Table at 0x7f0e703dff70>

In [15]:
# Convert the table to a pandas DataFrame and preview its first rows.
tab.to_pandas().head()

Unnamed: 0,text,id,sentiment,annotator,annotation_id,created_at,updated_at,lead_time
0,perempuan tu apesal aku rasa mcm pernah nampak...,68381,Neutral,27,4603,2022-02-15T14:04:28.798425Z,2022-02-15T14:04:32.792755Z,21974.04
1,Polis tangkap dia.,68345,Negative,27,4567,2022-02-15T12:37:58.202450Z,2022-02-15T12:38:00.815718Z,16782.079
2,Kenapa lokasi kebakaran nya terlalu spesifik? ...,68325,Negative,27,4547,2022-02-15T11:47:23.902859Z,2022-02-15T11:47:27.849352Z,13749.111
3,@YuleumSummer Waa senang nye lah nk bersalin.....,68286,Positive,27,4508,2022-02-15T08:59:24.606503Z,2022-02-15T08:59:27.475987Z,3668.697
4,DAULAT TUANKU! Merafak sembah dan takzim KDYMM...,68247,Positive,27,4469,2022-02-15T08:34:10.363218Z,2022-02-15T08:34:13.342956Z,2154.577


In [16]:
# Label every row with the registered UDF and insert (text, label) into the
# sink; wait() blocks until the job has finished. Expression objects replace
# the deprecated string-expression form of select().
tab.select(tab.text, call('sentiment_classifier', tab.text).alias('label')) \
   .execute_insert('mySink').wait()

In [17]:
# Read back the sink table to inspect what the job wrote.
# NOTE: this rebinds `tab`, which previously referred to the source table, so
# re-running the insert cell after this one would read from the sink instead.
tab = t_env.from_path('mySink')
tab.to_pandas()

Unnamed: 0,word,label
0,Stubborn af dh la bawa tgh jalan igt jalan kau...,Negative
1,@FarisYusrey Haha betui betui. Tak balik modai,Negative
2,Bergaul dengan orang positive akan mengubah me...,Negative
3,kes apeni ??,Neutral
4,"Kalau lah aku boleh mencarut kat fb, dah lama ...",Negative
...,...,...
2003,"Tak habis2 teliti, tak habis2 guna big word. A...",Negative
2004,Masak apa hari ini? Episod 12 drama #harikitaj...,Neutral
2005,UPDATE: 3 Jalan Kota Bharu - K. Terengganu Jer...,Neutral
2006,"@AgneesLS @ghiegi_ Sehat doongs, kanes sehat j...",Positive
