In [1]:
# Mount Google Drive at /content/drive (Colab only) so files stored on the
# Drive can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Directory on the mounted Drive that later cells join CSV file names onto.
path = '/content/drive/MyDrive/데이터크리에이터캠프/nlp/task1/data'

In [3]:
import tensorflow as tf
from tensorflow import keras
from collections import Counter
import pandas as pd
import re
import os

In [4]:
# Resolve the three CSV locations inside the data directory.
train_path, test_path, sub_path = (
    os.path.join(path, filename)
    for filename in ('train.csv', 'test_x.csv', 'sample_submission.csv')
)

# Load each file, using its first column as the DataFrame index.
train, test, sub = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_path, test_path, sub_path)
)

In [5]:
# Remove special characters and anything that is not an English letter
def preprocessing(text):
    """Tokenize a raw text string.

    HTML line-break tags (``<br>``, ``<br/>``, ``<br />``) and every character
    other than ASCII letters and the apostrophe are replaced by a space, then
    the result is split on whitespace.

    Args:
        text: The raw input string.

    Returns:
        A list of tokens (possibly empty). Case is preserved.
    """
    without_breaks = re.sub("<br\\s*/?>", ' ', text)
    letters_only = re.sub("[^a-zA-Z\']", ' ', without_breaks)
    return letters_only.split()

In [6]:
# Build the vocabulary
x_train = train['text']
# Convert the 'author' column to a categorical (one-hot) label matrix
y_train = tf.keras.utils.to_categorical(train['author'])

# Count how often each token occurs across all training texts
vocab = Counter()
for raw_text in x_train.tolist():
    vocab.update(preprocessing(raw_text))

# Limit the vocabulary to the 20000 most frequent tokens
vocab_size = 20000
trunc_vocab = [word for word, _ in vocab.most_common(vocab_size)]

# Build the lookup table: each kept token maps to its frequency rank, and any
# token outside the table is hashed into one of `num_oov_buckets` extra ids.
num_oov_buckets = 1000
words = tf.constant(trunc_vocab)
word_ids = tf.range(len(trunc_vocab), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [7]:
# Functions mapped over the tf.data pipeline
def tf_preprocess(X_batch, y_batch):
    """Clean and tokenize a batch of raw text tensors.

    Applies the same two regex substitutions as `preprocessing` (line-break
    tags, then everything except ASCII letters and apostrophes, each replaced
    by a space) and splits on whitespace.

    Args:
        X_batch: String tensor of raw texts.
        y_batch: Labels for the batch; passed through unchanged.

    Returns:
        A tuple of the tokenized batch as a dense string tensor, padded with
        b"<pad>" up to the longest row, and `y_batch`.
    """
    without_breaks = tf.strings.regex_replace(X_batch, b"<br\\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(letters_only)
    # The split result is ragged; densify it so downstream layers get a rectangle.
    return tokens.to_tensor(default_value=b"<pad>"), y_batch

def encode_word(X_batch, y_batch):
    """Map a batch of string tokens to integer ids via the global `table`.

    Args:
        X_batch: String tensor of tokens.
        y_batch: Labels for the batch; passed through unchanged.

    Returns:
        A tuple of the id tensor produced by `table.lookup` and `y_batch`.
    """
    token_ids = table.lookup(X_batch)
    return token_ids, y_batch

In [13]:
BATCH_SIZE = 512

# Shuffle individual examples *before* batching. Shuffling after `batch` only
# permutes the order of already-formed batches, so every epoch would train on
# batches with exactly the same members. The buffer covers the whole training
# set, so each epoch gets a full reshuffle.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(len(x_train), seed=42)
    .batch(BATCH_SIZE)
    .map(tf_preprocess)   # clean + tokenize, padded per batch
    .map(encode_word)     # tokens -> integer ids
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [14]:
embed_size = 200

# Embedding -> three Conv1D layers (batch norm after the first two) ->
# global average pooling -> 5-way softmax.
# NOTE(review): the embedding does not mask the b"<pad>" tokens added during
# batching, so they take part in the convolutions and the average pooling.
# The Conv1D layers also use the default padding — confirm every batch is long
# enough for the stacked kernels.
model = keras.models.Sequential([
    keras.layers.Embedding(
        input_dim=vocab_size + num_oov_buckets,  # vocabulary ids + OOV bucket ids
        output_dim=embed_size,
        input_shape=[None],                      # variable sequence length
    ),
    keras.layers.Conv1D(filters=128, kernel_size=9, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Conv1D(filters=128, kernel_size=9, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Conv1D(filters=256, kernel_size=5, activation='relu'),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(units=5, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [16]:
# Train for 10 passes over the training pipeline; `hist` keeps the returned
# History object. No validation data is supplied.
hist = model.fit(x=train_dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [33]:
# Test-time variants (inputs only, no labels)
def test_tf_preprocess(X):
    """Clean and tokenize a batch of raw test texts.

    Delegates to `tf_preprocess` so that inference always uses exactly the
    same cleaning, splitting and padding as training, instead of keeping a
    second copy of the regexes that could drift out of sync.

    Args:
        X: String tensor of raw texts.

    Returns:
        The dense, b"<pad>"-padded string tensor of tokens.
    """
    # There are no labels at test time; pass a placeholder and drop it again.
    tokens, _ = tf_preprocess(X, None)
    return tokens

def test_encode_word(X):
    """Map a batch of string tokens to integer ids via the global `table`.

    Args:
        X: String tensor of tokens.

    Returns:
        The id tensor produced by `table.lookup`.
    """
    token_ids = table.lookup(X)
    return token_ids

In [34]:
# Inference pipeline: batch the raw test texts, then tokenize and encode them.
# Order is preserved (no shuffling) so predictions line up with the test rows.
test_dataset = (
    tf.data.Dataset.from_tensor_slices(test['text'].tolist())
    .batch(512)
    .map(test_tf_preprocess)
    .map(test_encode_word)
)

In [36]:
# Predict class probabilities for every test row.
pred = model.predict(test_dataset)

# Build the submission as a new frame that reuses the sample file's index and
# column labels. Writing floats into the loaded frame with `sub.loc[:, :] = pred`
# relies on silent dtype upcasting when the placeholder columns are integers
# (deprecated in recent pandas). The constructor also raises if the prediction
# shape does not match the expected rows/columns.
sub = pd.DataFrame(pred, index=sub.index, columns=sub.columns)
sub.to_csv('cnn.csv')