In [1]:
import json
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import numpy as np
from tensorflow.keras import models, layers, optimizers, losses, metrics
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import csv
import pandas as pd
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras import Input, Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.python.keras.callbacks import TensorBoard
from time import time
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
# Input CSV locations (relative to the notebook's working directory).
file1 = "./comment-labeling.csv"
file2 = "./naver-ratings.csv"

# Accumulators for the first corpus: x collects texts, y collects labels.
x = []
y = []

In [3]:
# Load the first corpus: column 0 is the text, the last column is the label.
# The lists are (re)initialised here so that re-running this cell does not
# append the same rows a second time.
x, y = [], []

# `with` guarantees the handle is closed even if a row fails to parse;
# newline='' is what the csv module expects so quoted newlines survive intact.
with open(file1, 'r', encoding='utf-8', newline='') as f:
    for line in csv.reader(f):
        if not line:
            # csv.reader yields [] for blank lines; they carry no sample.
            continue
        emotion = float(line[-1])
        x.append(line[0])
        y.append(emotion)

In [4]:
# Load the second corpus: column 0 is the text, the last column is the label.
x2, y2 = [], []

# `with` guarantees the handle is closed even if a row fails to parse;
# newline='' is what the csv module expects so quoted newlines survive intact.
with open(file2, 'r', encoding='utf-8', newline='') as f:
    for line in csv.reader(f):
        if not line:
            # csv.reader yields [] for blank lines; they carry no sample.
            continue
        emotion = float(line[-1])
        y2.append(emotion)
        x2.append(line[0])

In [5]:
# Fraction of each corpus held out as the test split (passed as `test_size`).
test_percent = 0.2

In [6]:
# Split each corpus on its own so both contribute to the train and test sets
# in the same proportion. A fixed seed keeps the split (and therefore every
# downstream metric) reproducible across kernel restarts.
SPLIT_SEED = 42

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=test_percent, random_state=SPLIT_SEED
)
# a / b: train / test texts of the second corpus; c / d: matching labels.
a, b, c, d = train_test_split(
    x2, y2, test_size=test_percent, random_state=SPLIT_SEED
)

In [7]:
# Append the second corpus's splits onto the first corpus's splits in place.
# NOTE: this mutates the lists, so running the cell twice appends the second
# corpus twice; re-run the split cell first if this cell must be repeated.
for merged, extra in ((x_train, a), (x_test, b), (y_train, c), (y_test, d)):
    merged.extend(extra)

In [8]:
def build_model(train_data, max_tokens=15000, max_len=50, embedding_dim=200,
                hidden_units=8, adapt_batch_size=64):
    """Build an uncompiled Keras classifier that consumes raw strings.

    The model is: string Input -> TextVectorization -> Embedding -> Flatten
    -> Dense(relu) -> Dense(1, sigmoid).

    Args:
        train_data: Texts used to fit the TextVectorization vocabulary; anything
            accepted by `tf.data.Dataset.from_tensor_slices`.
        max_tokens: Vocabulary size cap given to TextVectorization.
        max_len: Fixed token-sequence length produced by TextVectorization.
        embedding_dim: Size of each embedding vector.
        hidden_units: Width of the hidden Dense layer.
        adapt_batch_size: Batch size used while adapting the vocabulary.

    Returns:
        A `Sequential` model; the caller is responsible for compiling it.
    """
    # Keep the caller's argument untouched; use a separate name for the Dataset.
    text_ds = tf.data.Dataset.from_tensor_slices(train_data)

    vectorize_layer = TextVectorization(
        max_tokens=max_tokens,
        output_mode="int",
        output_sequence_length=max_len,
    )
    # The vocabulary is learned from the supplied training texts only.
    vectorize_layer.adapt(text_ds.batch(adapt_batch_size))

    model = Sequential()
    model.add(Input(shape=(1,), dtype="string"))
    model.add(vectorize_layer)
    # input_dim is kept at max_tokens + 1, matching the original layer size.
    model.add(layers.Embedding(max_tokens + 1, output_dim=embedding_dim))
    model.add(Flatten())
    model.add(Dense(hidden_units, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))
    return model

In [9]:
# Enable XLA JIT compilation for TensorFlow graphs. The previous line only
# assigned a new attribute on the `tf.config` module, which TensorFlow never
# reads, so XLA was not actually switched on.
tf.config.optimizer.set_jit(True)

In [15]:
# TensorBoard logs go to a fresh, timestamp-named directory per run.
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))

# Stop once val_loss has not improved for 3 epochs, and roll the model back to
# the best-val_loss weights so the model saved afterwards is not the one from
# three epochs past the optimum.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# A checkpoint is written every epoch (save_best_only=False), one path per epoch.
# NOTE(review): TensorBoard is imported from `tensorflow.python.keras` while
# EarlyStopping/ModelCheckpoint come from `keras`; consider importing all three
# from `tensorflow.keras.callbacks` so they match the model's Keras build.
checkpoint = ModelCheckpoint(filepath='model_{epoch:02d}', save_format='tf', monitor='val_accuracy', save_best_only=False)

In [16]:
# Build the network (vocabulary is adapted on the training texts) and compile
# it for binary classification.
# NOTE(review): binary_crossentropy with a sigmoid output presumes labels lie
# in [0, 1] - confirm this holds for the label column of both CSV files.
compile_options = {
    "optimizer": "adam",
    "loss": 'binary_crossentropy',
    "metrics": ['accuracy'],
}
rnn_model = build_model(x_train)
rnn_model.compile(**compile_options)

In [17]:
# Print the layer-by-layer architecture and parameter counts of the model.
rnn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, 50)               0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 50, 200)           3000200   
                                                                 
 flatten_1 (Flatten)         (None, 10000)             0         
                                                                 
 dense_2 (Dense)             (None, 8)                 80008     
                                                                 
 dense_3 (Dense)             (None, 1)                 9         
                                                                 
Total params: 3,080,217
Trainable params: 3,080,217
Non-trainable params: 0
____________________________________________

In [18]:
# Train for up to 50 epochs; early stopping may end the run sooner.
# NOTE: the held-out test split is also used as validation data here, so the
# early-stopping decision is made on the same data later treated as "test".
fit_callbacks = [early_stop, checkpoint, tensorboard]
history = rnn_model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=50,
    validation_data=(x_test, y_test),
    callbacks=fit_callbacks,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50


In [None]:
# Persist the trained model under the "WtoE" path using the 'tf' save format.
rnn_model.save("WtoE", save_format='tf')