# 캐글 텍스트 분류 - 합성곱 신경망 활용 접근방법

In [5]:
import sys
import os
import numpy as np
import json

from sklearn.model_selection import train_test_split
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

In [6]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
TRAIN_INPUT_DATA = 'train_input.npy'
TRAIN_LABEL_DATA = 'train_label.npy'
TEST_INPUT_DATA = 'test_input.npy'
TEST_ID_DATA = 'test_id.npy'

DATA_CONFIGS = 'data_configs.json'

train_input_data = np.load(open(DATA_IN_PATH + TRAIN_INPUT_DATA, 'rb'))
train_label_data = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA, 'rb'))
test_input_data = np.load(open(DATA_IN_PATH + TEST_INPUT_DATA, 'rb'))

with open(DATA_IN_PATH + DATA_CONFIGS, 'r') as f:
    prepro_configs = json.load(f)
    print(prepro_configs.keys())

dict_keys(['vocab', 'vocab_size'])


In [7]:
# 파라메터 변수
RNG_SEED = 1234
BATCH_SIZE = 16
NUM_EPOCHS = 3
VOCAB_SIZE = prepro_configs['vocab_size'] + 1
EMB_SIZE = 128
VALID_SPLIT = 0.2

train_input, eval_input, train_label, eval_label = train_test_split(train_input_data, train_label_data, test_size=VALID_SPLIT, random_state=RNG_SEED)

## tf.data 세팅

In [8]:
# def mapping_fn(X, Y=None):
#     input, label = {'x': X}, Y
#     return input, label

# def train_input_fn():
#     dataset = tf.data.Dataset.from_tensor_slices((train_input, train_label))
#     dataset = dataset.shuffle(buffer_size=len(train_input))
#     dataset = dataset.batch(BATCH_SIZE)
#     dataset = dataset.map(mapping_fn)
#     dataset = dataset.repeat(count=NUM_EPOCHS)

#     iterator = dataset.make_one_shot_iterator()
    
#     return iterator.get_next()

# def eval_input_fn():
#     dataset = tf.data.Dataset.from_tensor_slices((eval_input, eval_label))
#     dataset = dataset.shuffle(buffer_size=len(eval_input))
#     dataset = dataset.batch(BATCH_SIZE)
#     dataset = dataset.map(mapping_fn)

#     iterator = dataset.make_one_shot_iterator()
    
#     return iterator.get_next()

In [33]:
def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((train_input, train_label))
    dataset = dataset.shuffle(buffer_size=50000)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.repeat(count=NUM_EPOCHS)
    return dataset

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((eval_input, eval_label))
    dataset = dataset.batch(BATCH_SIZE)
    return dataset

# 데이터셋 정의
train_dataset = train_input_fn()
eval_dataset = eval_input_fn()


## 모델 세팅

In [34]:
def create_model(vocab_size, embedding_dim, dense_dim, learning_rate):
    # Input layer
    inputs = tf.keras.Input(shape=(None,), dtype=tf.int32)

    # Embedding layer
    x = tf.keras.layers.Embedding(vocab_size, embedding_dim)(inputs)
    x = tf.keras.layers.Dropout(0.5)(x)

    # Conv1D and GlobalMaxPooling layers
    conv1 = tf.keras.layers.Conv1D(filters=128, kernel_size=3, padding='valid', activation='relu')(x)
    pool1 = tf.keras.layers.GlobalMaxPool1D()(conv1)

    conv2 = tf.keras.layers.Conv1D(filters=128, kernel_size=4, padding='valid', activation='relu')(x)
    pool2 = tf.keras.layers.GlobalMaxPool1D()(conv2)

    conv3 = tf.keras.layers.Conv1D(filters=128, kernel_size=5, padding='valid', activation='relu')(x)
    pool3 = tf.keras.layers.GlobalMaxPool1D()(conv3)

    # Concatenate pooling layers
    concat = tf.keras.layers.concatenate([pool1, pool2, pool3])

    # Dense and Dropout layers
    x = tf.keras.layers.Dense(dense_dim, activation='relu')(concat)
    x = tf.keras.layers.Dropout(0.5)(x)
    logits = tf.keras.layers.Dense(1)(x)

    # Squeeze to remove dimensions of size 1
    # squeeze의 직접적으로 2.x에서 사용 불가
    # outputs = tf.squeeze(logits, axis=-1)
    outputs = tf.keras.layers.Flatten()(logits)

    # Model creation
    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    # Model compilation
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model


In [35]:
# 모델 생성
model = create_model(VOCAB_SIZE, EMB_SIZE, 250, 0.01)

In [36]:
model.summary()

In [37]:
# 모델 디렉토리 설정
model_dir = os.path.join(os.getcwd(), "data_out/checkpoint/cnn/")
os.makedirs(model_dir, exist_ok=True)

# 체크포인트 콜백 설정
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(model_dir, 'model_checkpoint.keras'),
    save_best_only=True,  # 가장 좋은 모델만 저장
    monitor='val_loss',  # 평가할 지표
    mode='min',  # 모니터링하는 지표가 최소화될 때 저장
    save_freq='epoch'  # 매 에포크마다 저장
)

# 텐서보드 콜백 설정
tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir=os.path.join(model_dir, 'logs'),
    update_freq='batch'  # 배치마다 로그 기록
)

# 조기 종료 콜백 설정 (선택사항)
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,  # 개선되지 않은 에포크 횟수 후 중단
    restore_best_weights=True
)


# 모델 훈련 (학습)
model.fit(
    train_dataset,
    epochs=NUM_EPOCHS,
    validation_data=eval_dataset,
    callbacks=[checkpoint_cb, tensorboard_cb, early_stopping_cb]
)

# 모델 평가
evaluation_results = model.evaluate(eval_dataset)
print("Evaluation results:", evaluation_results)

Epoch 1/3
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 42ms/step - accuracy: 0.4980 - loss: 8.0664 - val_accuracy: 0.5098 - val_loss: 7.9011
Epoch 2/3
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 43ms/step - accuracy: 0.5053 - loss: 7.9212 - val_accuracy: 0.4902 - val_loss: 8.1274
Epoch 3/3
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 42ms/step - accuracy: 0.5032 - loss: 7.9206 - val_accuracy: 0.4902 - val_loss: 8.1274
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.5038 - loss: 7.9972
Evaluation results: [7.901095390319824, 0.5098000168800354]


In [None]:
# # tensorflow 1.x 에 대한 부분
# # 모델에 대한 메인 부분입니다.


# def model_fn(features, labels, mode):

#     TRAIN = mode == tf.estimator.ModeKeys.TRAIN
#     EVAL = mode == tf.estimator.ModeKeys.EVAL
#     PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    
#     #embedding layer를 선언합니다.
#     embedding_layer = keras.layers.Embedding(
#                     VOCAB_SIZE,
#                     EMB_SIZE)(features['x'])
    
#     # embedding layer에 대한 output에 대해 dropout을 취합니다.
#     dropout_emb = keras.layers.Dropout(rate=0.5)(embedding_layer)

#     ## filters = 128이고 kernel_size = 3,4,5입니다.
#     ## 길이가 3,4,5인 128개의 다른 필터를 생성합니다. 3,4,5 gram의 효과처럼 다양한 각도에서 문장을 보는 효과가 있습니다.
#     ## conv1d는 (배치사이즈, 길이, 채널)로 입력값을 받는데, 배치사이즈: 문장 숫자 | 길이: 각 문장의 단어의 개수 | 채널: 임베딩 출력 차원수임
    
#     conv1 = keras.layers.Conv1D(
#          filters=128,
#          kernel_size=3,
#         padding='valid',
#          activation=tf.nn.relu)(dropout_emb)
    
#     pool1 = keras.layers.GlobalMaxPool1D()(conv1)

#     conv2 = keras.layers.Conv1D(
#          filters=128,
#          kernel_size=4,
#         padding='valid',
#          activation=tf.nn.relu)(dropout_emb)
    
#     pool2 = keras.layers.GlobalMaxPool1D()(conv2)
    
#     conv3 = keras.layers.Conv1D(
#          filters=128,
#          kernel_size=5,
#         padding='valid',
#          activation=tf.nn.relu)(dropout_emb)
#     pool3 = keras.layers.GlobalMaxPool1D()(conv3)
    
#     concat = keras.layers.concatenate([pool1, pool2, pool3]) #3,4,5gram이후 모아주기
    
#     hidden = keras.layers.Dense(250, activation=tf.nn.relu)(concat)
#     dropout_hidden = keras.layers.Dropout(rate=0.5)(hidden)
#     logits = keras.layers.Dense(1, name='logits')(dropout_hidden)
#     logits = tf.squeeze(logits, axis=-1)
    
#     #최종적으로 학습, 평가, 테스트의 단계로 나누어 활용
    
#     if PREDICT:
#         return tf.estimator.EstimatorSpec(
#             mode=mode,
#             predictions={
#                 'prob': tf.nn.sigmoid(logits)
#             }
#         )
        
#     loss = tf.losses.sigmoid_cross_entropy(labels, logits)

#     if EVAL:
#         pred = tf.nn.sigmoid(logits)
#         accuracy = tf.metrics.accuracy(labels, tf.round(pred))
#         return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops={'acc': accuracy})
        
#     if TRAIN:
#         global_step = tf.train.get_global_step()
#         train_op = tf.train.AdamOptimizer(0.001).minimize(loss, global_step)

#         return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss = loss)

In [None]:
# model_dir = os.path.join(os.getcwd(), "data_out/checkpoint/cnn/")
# os.makedirs(model_dir, exist_ok=True)

# tensorflow 1.x 에서 callback 함수를 사용할 때 적용되는 부분
# config_tf = tf.estimator.RunConfig(save_checkpoints_steps=200, keep_checkpoint_max=2,
#                                     log_step_count_steps=400)

#  #에스티메이터 객체 생성
# cnn_est = tf.estimator.Estimator(model_fn, model_dir=model_dir, config=config_tf)
# cnn_est.train(train_input_fn) #학습하기
# cnn_est.evaluate(eval_input_fn) #평가하기

In [61]:
test_input_data = np.load(open(DATA_IN_PATH + TEST_INPUT_DATA, 'rb')) 
ids = np.load(open(DATA_IN_PATH + TEST_ID_DATA, 'rb'),  allow_pickle=True)

test_dataset = tf.data.Dataset.from_tensor_slices(test_input_data)
test_dataset = test_dataset.batch(BATCH_SIZE)

predictions = model.predict(test_dataset)

[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step


In [62]:
predictions

array([[-21.4319  ],
       [-25.335705],
       [-24.490244],
       ...,
       [-23.697376],
       [-23.083796],
       [-24.48501 ]], dtype=float32)

In [63]:
# TensorFlow 2.x 코드
predictions = np.array(predictions)
predictions

array([[-21.4319  ],
       [-25.335705],
       [-24.490244],
       ...,
       [-23.697376],
       [-23.083796],
       [-24.48501 ]], dtype=float32)

In [85]:
predictions = predictions.flatten()

In [86]:
output = pd.DataFrame( data={"id": ids, "sentiment": predictions[0]} )

output.to_csv( DATA_OUT_PATH + "Bag_of_Words_model_test.csv", index=False, quoting=3 )