<a href="https://colab.research.google.com/github/jeongukjae/tfds-korean/blob/develop/examples/korean_hate_speech_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q -U tfds-korean tensorflow-datasets tensorflow-text tensorflow-addons sentencepiece

In [2]:
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
import tensorflow_text as text
import tfds_korean.korean_hate_speech

In [3]:
# Fetch all splits of the corpus together with its DatasetInfo metadata.
dataset, ds_info = tfds.load(name="korean_hate_speech", with_info=True)

# Show the dataset metadata (description, features, splits, sizes).
print(ds_info)

tfds.core.DatasetInfo(
    name='korean_hate_speech',
    full_name='korean_hate_speech/labeled/1.0.0',
    description="""
    The human-annotated Korean corpus for toxic speech detection and the large unlabeled corpus.
    The data is comments from the Korean entertainment news aggregation platform.
    """,
    config_description="""
    Korean hate speech dataset (labeled)
    """,
    homepage='https://github.com/kocohub/korean-hate-speech',
    data_path='/root/tensorflow_datasets/korean_hate_speech/labeled/1.0.0',
    download_size=1.85 MiB,
    dataset_size=2.58 MiB,
    features=FeaturesDict({
        'bias': ClassLabel(shape=(), dtype=tf.int64, num_classes=3),
        'comments': Text(shape=(), dtype=tf.string),
        'contain_gender_bias': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'hate': ClassLabel(shape=(), dtype=tf.int64, num_classes=3),
        'news_title': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=None,
    splits={
        'dev'

In [4]:
import sentencepiece as spm


def _title_and_comment(example):
    """Turn one example into a two-element dataset: its news title, then its comment."""
    return tf.data.Dataset.from_tensor_slices([example['news_title'], example['comments']])


# Train a SentencePiece model with a 5000-piece vocabulary on every title and
# comment of the training split; output files are written with the 'spm' prefix.
train_sentences = dataset['train'].flat_map(_title_and_comment)
spm.SentencePieceTrainer.train(
    sentence_iterator=train_sentences.as_numpy_iterator(),
    model_prefix='spm',
    vocab_size=5000,
)

In [5]:
# Peek at the first lines of the vocabulary file written by the SentencePiece training (one piece and its score per line).
!head spm.vocab

<unk>	0
<s>	0
</s>	0
▁	-2.61888
,	-3.96764
'	-4.12343
]	-4.31812
이	-4.42457
▁[	-4.45926
▁'	-4.52212


In [6]:
# Read the trained SentencePiece model through a context manager so the file
# handle is closed deterministically instead of being left to the garbage collector.
with open('spm.model', 'rb') as spm_model_file:
    spm_model_proto = spm_model_file.read()

# add_bos/add_eos make the tokenizer wrap every sequence with the
# beginning-of-sequence and end-of-sequence ids.
tokenizer = text.SentencepieceTokenizer(spm_model_proto, add_bos=True, add_eos=True)

def _map_model_input(ds_item):
    """Convert a dataset item into the ``(inputs, targets)`` pair used for training.

    Both text fields are tokenized into SentencePiece ids with the module-level
    ``tokenizer``, and the three integer labels are one-hot encoded to their
    class counts (3 for ``hate``, 2 for ``gender_bias``, 3 for ``bias``).

    Args:
        ds_item: A dict with the keys ``news_title``, ``comments``, ``hate``,
            ``contain_gender_bias`` and ``bias``.

    Returns:
        A tuple ``(inputs, targets)`` of dicts keyed by ``news``/``comment``
        and ``hate``/``gender_bias``/``bias`` respectively.
    """
    inputs = {
        "news": tokenizer.tokenize(ds_item['news_title']),
        "comment": tokenizer.tokenize(ds_item['comments']),
    }
    targets = {
        "hate": tf.one_hot(ds_item['hate'], 3),
        "gender_bias": tf.one_hot(ds_item['contain_gender_bias'], 2),
        "bias": tf.one_hot(ds_item['bias'], 3),
    }
    return inputs, targets

# Shuffle each split (reshuffled on every iteration), group it into batches of
# 64, then tokenize and one-hot encode a whole batch at a time.
train_ds = (
    dataset['train']
    .shuffle(10000, reshuffle_each_iteration=True)
    .batch(64)
    .map(_map_model_input)
)
dev_ds = (
    dataset['dev']
    .shuffle(500, reshuffle_each_iteration=True)
    .batch(64)
    .map(_map_model_input)
)

In [7]:
def create_model():
    """Build the multi-task classifier over a news title and a comment.

    The title and the comment are embedded with a shared table and encoded by
    separate bidirectional LSTM stacks. The comment encoding attends over the
    title encoding; the mean-pooled attended comment and the mean-pooled plain
    comment encoding are concatenated and fed to three independent heads.

    The ragged encodings are densified with ``to_tensor()``, which zero-pads
    every sequence to the longest one in the batch. Boolean masks derived from
    the ragged row lengths are passed to the attention and pooling layers so
    padded positions are neither attended to nor averaged in; without them an
    example's representation would depend on the other sequences in its batch.

    Returns:
        A ``tf.keras.Model`` taking ragged ``news`` and ``comment`` inputs and
        returning a dict of logits keyed by ``hate`` (3 classes),
        ``gender_bias`` (2 classes) and ``bias`` (3 classes).
    """

    def _classifier_head(num_classes, name):
        # Every task uses the same two-layer MLP; only the number of logits differs.
        return tf.keras.Sequential(
            [
                tf.keras.layers.Dense(256, activation='relu'),
                tf.keras.layers.Dense(num_classes),
            ],
            name=name,
        )

    input_node = {
        "news": tf.keras.Input([None], name='news', ragged=True),
        "comment": tf.keras.Input([None], name='comment', ragged=True),
    }

    embedding_table = tf.keras.layers.Embedding(tokenizer.vocab_size(), 256, name='embedding_table')
    news_encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True), name='news_encoder')
    comment_encoder = tf.keras.Sequential(
        [
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True)),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True)),
        ],
        name='comment_encoder'
    )
    hate_classifier = _classifier_head(3, 'hate_classifier')
    gender_bias_classifier = _classifier_head(2, 'gender_bias_classifier')
    bias_classifier = _classifier_head(3, 'bias_classifier')

    # Keep the encoder outputs ragged first so the true sequence lengths are still available.
    news_encoded = news_encoder(embedding_table(input_node['news']))
    comment_encoded = comment_encoder(embedding_table(input_node['comment']))

    # True for real tokens, False for the padding that to_tensor() adds below.
    # sequence_mask defaults its width to the longest row, matching to_tensor().
    news_mask = tf.sequence_mask(news_encoded.row_lengths())
    comment_mask = tf.sequence_mask(comment_encoded.row_lengths())

    news_embedding = news_encoded.to_tensor()
    comment_embedding = comment_encoded.to_tensor()

    # Query = comment, value = news title; the masks keep padded title positions
    # out of the softmax and zero the outputs at padded comment positions.
    comment_attended = tf.keras.layers.Attention()(
        [comment_embedding, news_embedding],
        mask=[comment_mask, news_mask],
    )

    # Masked mean pooling: average over real comment tokens only.
    representation = tf.concat(
        [
            tf.keras.layers.GlobalAveragePooling1D()(comment_attended, mask=comment_mask),
            tf.keras.layers.GlobalAveragePooling1D()(comment_embedding, mask=comment_mask),
        ],
        axis=-1
    )

    output_node = {
        "hate": hate_classifier(representation),
        "gender_bias": gender_bias_classifier(representation),
        "bias": bias_classifier(representation),
    }

    model = tf.keras.Model(input_node, output_node)
    return model

In [8]:
model = create_model()
model.summary()

# One macro-averaged F1 metric per output head, sized to that head's class count.
f1_per_head = {
    head: tfa.metrics.F1Score(average='macro', num_classes=class_count)
    for head, class_count in (("hate", 3), ("gender_bias", 2), ("bias", 3))
}

# The heads emit raw logits, so the shared cross-entropy loss is told to apply softmax itself.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=f1_per_head,
)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
comment (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
news (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_table (Embedding)     (None, None, 256)    1280000     news[0][0]                       
                                                                 comment[0][0]                    
__________________________________________________________________________________________________
comment_encoder (Sequential)    (None, None, 512)    2625536     embedding_table[1][0]        

In [9]:
# Train for three epochs on the training pipeline, validating on the dev pipeline.
model.fit(train_ds, epochs=3, validation_data=dev_ds)

Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fc24741a050>

In [10]:
# example data from https://www.kaggle.com/c/korean-hate-speech-detection/data?select=unlabeled_comments.txt
news_title = "[단독] 지드래곤♥이주연, 제주도 데이트…2018년 1호 커플 탄생"
comments = [
    "지드래곤은 난봉꾼이란...댓글도 달렸네 ㅋㅋ 이주연 학창시절 사진 보고 와라. 요즘 웬만한 여자 연예인하고 붙여놔도....미모가 최고였단다.ㅋ 5대 얼짱 출신.",
    "이주연은 알겠는데 지디는 뭐하는 듣보잡여",
    "부럽네요. 나도 불과 한달전까진 허니문베이비를 꿈꿨는데 이제 다 부질없네요. 당연히 순결할거라 믿었고 그래서 첫날밤까지 기다려준건데 배신감만 듭니다. 첫날밤 와이프가 피를 안흘렸어요. 처가집식구들이 일부러 절 속였단 생각에 화도나고 어제 처가집가기로 했는데 안간다고 했더니 혼자 울고 갔다와서 지금까지 한마디도 안해요. 이혼하고 싶네요",
    "이주연을 모르는 애들이 많네. 해체된 애프터스쿨 멤버로 당시는 주연이 예명. 인기나 포텐은 안터졌으나, 순수미모만으로는 애프터스쿨에서 원탑이었다. 진짜 자연미인이다.",
    "겨론했으면",
]

# Every comment is paired with the same news title, so repeat it once per comment.
logits = model({
    "news": tokenizer.tokenize([news_title] * len(comments)),
    "comment": tokenizer.tokenize(comments),
})

# Pick the highest-scoring class index for each output head.
# bias: 0="none", 1="gender", 2="others"
# gender_bias: 0="False", 1="True"
# hate: 0="none", 1="hate", 2="offensive"
result = {key: tf.argmax(value, axis=-1) for key, value in logits.items()}
for task in ("bias", "gender_bias", "hate"):
    print(task + ":", result[task])

bias: tf.Tensor([0 0 0 0 0], shape=(5,), dtype=int64)
gender_bias: tf.Tensor([0 0 0 0 0], shape=(5,), dtype=int64)
hate: tf.Tensor([2 0 0 0 0], shape=(5,), dtype=int64)
