In [1]:
# !pip install emot --upgrade
# !pip install regex
# !pip install pyvi

In [2]:
# !pip install kafka-python
# !pip install confluent_kafka

In [3]:
# !pip install tensorflow

In [4]:
# !pip install transformers==4.3.0

In [5]:
# !pip install torch

In [6]:
import json
import os
import numpy as np
from confluent_kafka import Consumer, KafkaError, Producer
import socket
import logging
from preprocessing import preprocessing
import tensorflow as tf
import pickle
import torch
from tensorflow.keras.preprocessing import sequence
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from transformers import BertTokenizer, BertForSequenceClassification,AutoModelForSequenceClassification

In [7]:
## Create consumer
def create_consumer(topic, group_id):
    """Build a Kafka consumer that is already subscribed to ``topic``.

    Args:
        topic: Name of the topic to subscribe to.
        group_id: Value used for the consumer's ``group.id`` setting.

    Returns:
        The subscribed ``Consumer``, or ``None`` when the client could not be
        created or the subscription failed (the error is logged).
    """
    consumer = None
    try:
        # Topic-level settings are given at the top level: the nested
        # "default.topic.config" dict is deprecated in confluent-kafka.
        consumer = Consumer({"bootstrap.servers": "localhost:9092",
                             "group.id": group_id,
                             "client.id": socket.gethostname(),
                             "isolation.level": "read_committed",
                             "auto.offset.reset": "latest",  # Only consume new messages
                             # Offsets are not committed automatically; a caller
                             # that wants them stored has to commit explicitly.
                             "enable.auto.commit": False})

        consumer.subscribe([topic])
    except Exception:
        logging.exception("Couldn't create the consumer")
        if consumer is not None:
            # The client was created but subscribing failed: release it
            # rather than dropping a half-initialised consumer.
            try:
                consumer.close()
            except Exception:
                logging.exception("Couldn't close the consumer")
            consumer = None

    return consumer
# Create a producer
def create_producer():
    """Build the Kafka producer used to publish detection results.

    Returns:
        A configured ``Producer``, or ``None`` if construction raised
        (the exception is logged).
    """
    try:
        settings = {
            "bootstrap.servers": "localhost:9092",
            "client.id": socket.gethostname(),
            # Idempotent producer (exactly-once style delivery)
            "enable.idempotence": True,
            "compression.type": "lz4",
            "batch.size": 64000,
            "linger.ms": 10,
            # Require acknowledgement from the leader and every in-sync replica
            "acks": "all",
            "retries": 5,
            # Upper bound on the total time spent delivering, retries included
            "delivery.timeout.ms": 1000,
        }
        return Producer(settings)
    except Exception:
        logging.exception("Couldn't create the producer")
        return None

In [10]:
def _parse_record(raw_value):
    """Decode one Kafka payload into ``(date, comment, author)``.

    Returns ``None`` when the payload is empty, is not valid UTF-8 JSON, or
    does not expose the ``datetime``/``comment``/``author`` keys, so the
    caller can skip the message instead of crashing.
    """
    if raw_value is None:
        return None
    try:
        record = json.loads(raw_value.decode('utf-8'))
        return record["datetime"], record["comment"], record["author"]
    except (ValueError, KeyError, TypeError):
        # ValueError covers both UnicodeDecodeError and json.JSONDecodeError;
        # TypeError covers JSON documents that are not objects.
        return None


def _report_delivery(err, msg):
    """Producer delivery callback: report the real outcome of a send."""
    if err is not None:
        logging.error("Detection delivery failed: %s", err)
    else:
        print('Detection sent!')


def main(model_dir="E:/Download/phobert-v3/", tokenizer_name="vinai/phobert-base"):
    """Consume comments from the ``hsd`` topic, classify them and publish the result.

    Each message is expected to be a UTF-8 JSON object with ``datetime``,
    ``comment`` and ``author`` keys. The predicted label is published to the
    ``detected`` topic together with the raw and cleaned comment. The loop
    runs until it is interrupted with ``KeyboardInterrupt``.

    Args:
        model_dir: Local directory holding the fine-tuned sequence
            classification model (loaded with ``local_files_only=True``).
        tokenizer_name: Name or path passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        None. Returns early (after logging) when a Kafka client is unavailable.
    """
    # create consumer
    consumer = create_consumer(topic="hsd",group_id="hsd")
    # create producer
    producer = create_producer()

    # Both factories return None on failure; stop before loading the model
    # instead of failing later with an AttributeError.
    if consumer is None or producer is None:
        logging.error("Kafka consumer/producer unavailable; nothing to do")
        if consumer is not None:
            consumer.close()
        return

    class BuildDataset(torch.utils.data.Dataset):
        """Minimal torch dataset wrapping tokenizer encodings and labels."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    # NOTE(review): nothing here commits offsets and create_consumer sets
    # enable.auto.commit=False with auto.offset.reset=latest, so presumably a
    # restart only sees messages produced afterwards - confirm this is intended.
    try:
        # Model loading is inside the try so the consumer is closed even when
        # loading fails.
        # The previously commented-out Text-CNN (PhoW2V) inference path was
        # removed from this function; only the PhoBERT path runs.
        pretrained_model = AutoModelForSequenceClassification.from_pretrained(model_dir, local_files_only=True)
        model = Trainer(model=pretrained_model)
        bert_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,use_fast=False)

        while True:
            message = consumer.poll(0.5)
            if message is None:
                continue

            error = message.error()
            if error:
                if error.code() == KafkaError._PARTITION_EOF:
                    print('End of partition reached {0}/{1}'
                          .format(message.topic(), message.partition()))
                else:
                    print('Error occured: {0}'.format(error.str()))
                continue

            print('Received')
            parsed = _parse_record(message.value())
            if parsed is None:
                # One bad payload must not stop the whole consumer loop.
                logging.warning("Skipping malformed message at %s/%s offset %s",
                                message.topic(), message.partition(), message.offset())
                continue
            date, comment, author = parsed
            processed_comment = preprocessing(comment)

            # bert
            # NOTE(review): the classifier is fed the raw comment while the
            # cleaned text is only printed/forwarded - confirm this matches
            # the data the model was fine-tuned on.
            seq_comment = bert_tokenizer([comment],truncation=True, padding=True, max_length=100)
            # Trainer.predict needs a dataset with labels; 0 is a placeholder.
            ds_comment = BuildDataset(seq_comment, [0])

            pred = model.predict(ds_comment)
            hsd_dt = np.argmax(pred.predictions,axis=-1)
            print(f"Date: {date}, Comment: {processed_comment}, HSD_detect: {hsd_dt}, Author: {author}")

            # Send message to detected topic
            dict_ = {'author':author,'date':date,'raw_comment':comment,'clean_comment':processed_comment,'label':int(hsd_dt[0])}
            sending_record = json.dumps(dict_).encode("utf-8")
            # The callback reports the actual delivery result; it is invoked
            # while flush() serves the producer's event queue.
            producer.produce(topic="detected",value=sending_record,on_delivery=_report_delivery)
            remaining = producer.flush(30)
            if remaining:
                logging.error("%d detection message(s) still undelivered after flush", remaining)
    except KeyboardInterrupt:
        print('Stop consume!')

    finally:
        consumer.close()

In [14]:
# Entry point: starts the consume/classify/publish loop, which keeps running
# until it is interrupted (KeyboardInterrupt / kernel interrupt).
if __name__ == "__main__":
    main()

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


Received


Date: 2021-10-08T22:34:59Z, Comment: nhạc hay nhảy lại đẹp nữa, HSD_detect: [0], Author: AT- Official
Detection sent!
Received
Date: 2021-10-08T10:41:21Z, Comment: hay vay noi, HSD_detect: [0], Author: anh ba sánh
Detection sent!
Received
Date: 2021-10-07T09:21:00Z, Comment: ổng ghép nhạc, HSD_detect: [0], Author: anh ba sánh
Detection sent!
Received
Date: 2021-10-01T11:23:24Z, Comment: múa hài_vậy, HSD_detect: [0], Author: Quê GL
Detection sent!
Received
Date: 2021-10-01T02:18:23Z, Comment: nhìn nhóc trẻ trâu, HSD_detect: [1], Author: MY BÒ SỮA
Detection sent!
Received
Date: 2021-09-21T17:28:22Z, Comment: anh phong nhảy vui vãi, HSD_detect: [0], Author: Trí Lê minh
Detection sent!
Received
Date: 2021-09-21T11:29:35Z, Comment: yêu lắm ️, HSD_detect: [0], Author: Mai Trần
Detection sent!
Received
Date: 2021-09-18T07:28:14Z, Comment: hay thật_sự, HSD_detect: [0], Author: tôi là ếch xanh
Detection sent!
Received
Date: 2021-09-13T13:46:49Z, Comment: anh_em nào còn nghe không kết nhất đoạn 