In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Notebook-wide pandas display settings: no cap on the number of rows/columns
# shown, cell text truncated at 50 characters, and a 1000-character line width.
# set_option accepts several (option, value) pairs in a single call.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
    'display.max_colwidth', 50,
    'display.width', 1000,
)

In [3]:
# Location of the labelled training CSV. The original machine-specific path is
# kept as the default so existing runs are unchanged; set the TRAIN_CATEGORY_CSV
# environment variable to run the notebook elsewhere without editing this cell.
TRAIN_CSV_PATH = os.environ.get(
    "TRAIN_CATEGORY_CSV",
    "/Users/zum/Dev/Dataset/train_category.csv",
)

# Later cells read the 'text' and 'label' columns of this frame.
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

Unnamed: 0,text,label
0,앵커 애틀랜타 총격 사건 으로 미국시민 들 은 또한 번 거리 로 나왔습니다 슬프고 ...,international
1,앵커 이번 사건 에서 희생 된한 인명 의 신원 이 사흘 만에 공개 됐습니다 유가족 ...,international
2,일오 전시 부터 버팀목 자금 을 통해 짝수 소 상공 인들 로부터 버팀목 자금 신청 ...,economy
3,일 오후 서울 황학동 중앙시장 중 고 가전제품 판매점 에 진열 된에서 문재인 대통령...,economy
4,실종 주 째 고양시 발달장애 인 점퍼 한 강서 발견 경기 고양시 행주산성 둘레길 에...,society


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Turn the string labels in df['label'] into one-hot rows.
#
# A Keras Tokenizer doubles as the label encoder here: `category` maps each
# label name to a 1-based id (the Tokenizer never assigns id 0), whereas the
# one-hot column of a label is that id minus one. Keep the off-by-one in mind
# when decoding an argmax back to a name via `category`.
label_encoder = Tokenizer()
label_encoder.fit_on_texts(df['label'])
category = label_encoder.word_index

label_sequences = label_encoder.texts_to_sequences(df['label'])

# The Tokenizer strips punctuation and splits on whitespace, so a label such as
# "sci-tech" or "world news" would become several tokens (and an empty label
# none at all). Fail loudly rather than build a ragged / misaligned label array.
malformed_rows = [row for row, seq in enumerate(label_sequences) if len(seq) != 1]
if malformed_rows:
    raise ValueError(
        "every label must tokenize to exactly one token; "
        f"offending row positions (first 10): {malformed_rows[:10]}"
    )

# Shape (n_samples, 1) with 0-based class ids.
label = np.array(label_sequences) - 1
# Pin the width to the number of known categories instead of inferring it
# from the largest id present.
label = to_categorical(label, num_classes=len(category))
print(category)

{'society': 1, 'politics': 2, 'international': 3, 'economy': 4, 'sport': 5, 'entertain': 6, 'it': 7, 'culture': 8}


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit a word-level tokenizer on the article texts and convert every article
# into its list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
tokens = tokenizer.texts_to_sequences(df['text'])

# One extra slot on top of the number of known words; the +1 leaves room for
# id 0, which later cells use as the padding value.
vocab = tokenizer.word_index
vocab_size = 1 + len(vocab)
print(vocab_size)

53370


In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every id sequence to the length of the longest one by appending the
# padding value after the real tokens ('post' padding).
max_len = max(map(len, tokens))
tokens_pad = pad_sequences(tokens, padding='post', maxlen=max_len)
print(tokens_pad.shape)

(2196, 1646)


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded sequences (and their one-hot labels) for
# evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    tokens_pad,
    label,
    test_size=0.2,
    random_state=0,
)
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

(1756, 1646) (440, 1646) (1756, 8) (440, 8)


In [8]:
import gensim

# Location of the pretrained Word2Vec model file. The original machine-specific
# path is kept as the default so existing runs are unchanged; set the
# KO_WORD2VEC_PATH environment variable to point at the model on another machine.
WORD2VEC_PATH = os.environ.get("KO_WORD2VEC_PATH", "/Users/zum/Dev/nlp/model/ko.bin")

# NOTE(review): gensim's load() deserializes with pickle, so only load model
# files that come from a trusted source.
pretrain_model = gensim.models.Word2Vec.load(WORD2VEC_PATH)

In [9]:
# Assemble the initial embedding table: row `index` receives the pretrained
# 200-dimensional vector of the word with that tokenizer id. Words missing
# from the pretrained model keep their all-zero row.
embedding_matrix = np.zeros(shape=(vocab_size, 200))
for word, index in vocab.items():
    if word not in pretrain_model.wv:
        continue
    embedding_matrix[index] = pretrain_model.wv[word]
print(embedding_matrix.shape)

(53370, 200)


In [10]:
# Text classifier: frozen pretrained word embeddings -> bidirectional GRU ->
# dense ReLU layer -> softmax over the news categories.
model_cls = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        vocab_size,
        # Take the vector width from the matrix itself so the two cannot drift apart.
        embedding_matrix.shape[1],
        weights=[embedding_matrix],
        input_length=max_len,
        # The inputs are post-padded with id 0 up to max_len. Without masking,
        # the forward GRU keeps stepping through the padding after the real
        # text has ended, and its final state is taken after all of it.
        # mask_zero=True makes the recurrent layer skip the padded timesteps.
        # NOTE(review): this changes what the model is trained on, so metrics
        # will differ from earlier runs; confirm masked GRUs behave as expected
        # on the accelerator backend in use.
        mask_zero=True,
        # Keep the pretrained vectors fixed during training.
        trainable=False,
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(120)),
    tf.keras.layers.Dense(128, activation='relu'),
    # One output unit per entry in `category`.
    tf.keras.layers.Dense(len(category), activation='softmax'),
])

# Labels are one-hot encoded, hence categorical (not sparse) cross-entropy.
model_cls.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model_cls.summary()

Metal device set to: Apple M1 Pro


2022-07-13 11:01:31.188329: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-07-13 11:01:31.188477: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1646, 200)         10674000  
                                                                 
 bidirectional (Bidirectiona  (None, 240)              231840    
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               30848     
                                                                 
 dense_1 (Dense)             (None, 8)                 1032      
                                                                 
Total params: 10,937,720
Trainable params: 263,720
Non-trainable params: 10,674,000
_________________________________________________________________


In [11]:
# Train for 15 epochs with mini-batches of 128 samples.
# validation_data is passed as a tuple (x_val, y_val), the form documented by
# Keras; a list is also how multi-input data is written, so it is ambiguous.
# NOTE(review): the held-out test split doubles as the validation set here, so
# the reported val_* metrics are the only evaluation this notebook produces.
history = model_cls.fit(
    X_train,
    Y_train,
    epochs=15,
    batch_size=128,
    validation_data=(X_test, Y_test),
)

Epoch 1/15


2022-07-13 11:01:31.508209: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-07-13 11:01:32.715880: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-13 11:01:32.898668: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-13 11:01:32.915741: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-13 11:01:49.189416: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-13 11:01:49.206707: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-07-13 11:02:57.012628: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-13 11:02:57.082192: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-13 11:02:57.099365: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
