<a href="https://colab.research.google.com/github/hwyum/deeplearning_nlp/blob/master/Implementing_a_CNN_for_Text_Classification_in_TensorFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1.   Paper: 대상 논문: Convolutional Neural Network for Sentence Classification(2014) by Yoon Kim
2.   Reference: http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/
3. Task: sentence classification
4. Data: [네이버 뮤비 리뷰](https://github.com/e9t/nsmc)




### Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
import os
os.chdir("drive/My Drive/Colab Notebooks/MODU_flip_NLP_implementation(2018_2019)")

In [3]:
!pip install mxnet-cu92mkl
!apt-get install g++ openjdk-7-jdk python-dev python3-dev
!pip3 install konlpy 
!apt-get install curl
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)
!pip install gluonnlp matplotlib tqdm

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Package openjdk-7-jdk is not available, but is referred to by another package.
This may mean that the package is missing, has been obsoleted, or
is only available from another source

E: Package 'openjdk-7-jdk' has no installation candidate
Reading package lists... Done
Building dependency tree       
Reading state information... Done
curl is already the newest version (7.58.0-2ubuntu3.5).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.
Installing MeCab-ko
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1381k  100 1381k    0     0  1731k      0 --:--:-- --:--:-- --:--:-- 1731k
mecab-0.996-ko-0.9.2/
mecab-0.996-ko-0.9.2/example/
mecab-0.996-ko-0.9.2/example/example.cpp
mecab-0.996-ko-0.9

In [32]:
import numpy as np
import pandas as pd
import gluonnlp as nlp
import tensorflow as tf
from tensorflow import keras
from sklearn import datasets
import itertools
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import backend as K
from tqdm import tqdm

print(tf.__version__)


1.12.0


### Data Loading

In [5]:
data_train = pd.read_csv('./data/ratings_train.txt', sep='\t')[['document', 'label']]
data_test = pd.read_csv('./data/ratings_test.txt', sep='\t')[['document', 'label']]
print(len(data_train))
print(len(data_test))


150000
50000


In [6]:
# NA 확인
print(sum(data_train.document.isna()))
print(sum(data_test.document.isna()))

# NA를 빈 문자열로 대체
data_train.at[data_train.document.isna(),'document'] = ''
data_test.at[data_test.document.isna(), 'document'] = ''

print(sum(data_train.document.isna()))
print(sum(data_test.document.isna()))

5
3
0
0


In [7]:
# training / validation set split
val_ratio = 0.1
val_indices = np.random.choice(range(data_train.shape[0]), size = int(len(data_train) * val_ratio), replace=False)
tr_indices = np.delete(range(data_train.shape[0]), obj=val_indices, axis=0)

data_val = data_train.iloc[val_indices]
data_tr = data_train.iloc[tr_indices]

print(data_tr.shape, data_val.shape, data_test.shape)

(135000, 2) (15000, 2) (50000, 2)


### Data Preprocessing

In [0]:
# tokenizer load
import konlpy
mecab = konlpy.tag.Mecab()

In [9]:
# 형태소 단위로 토크나이징
%%time

X_tr = data_tr.document.map(mecab.morphs).tolist()
y_tr = data_tr.label.tolist()

X_val = data_val.document.map(mecab.morphs).tolist()
y_val = data_val.label.tolist()

X_tst = data_test.document.map(mecab.morphs).tolist()
y_tst = data_test.label.tolist()

CPU times: user 19.9 s, sys: 216 ms, total: 20.1 s
Wall time: 20.1 s


#### Building vocabulary and connecting vocabulary with fasttext embedding


In [0]:
# training data에 대한 vocab 생성
counter = nlp.data.count_tokens(itertools.chain.from_iterable(X_tr))
vocab = nlp.Vocab(counter, bos_token=None, eos_token=None, min_freq=15)

In [0]:
# loading fasttext embedding
fasttext = nlp.embedding.create(embedding_name='fasttext', source='wiki.simple', load_ngrams=True)

In [0]:
# vocab에 fasttext vector 연결
vocab.set_embedding(fasttext)

In [13]:
vocab.idx_to_token[1]

'<pad>'

In [14]:
%%time

# token to index
X_tr = list(map(lambda sent: [vocab.token_to_idx[token] for token in sent], X_tr))
X_tr = pad_sequences(sequences=X_tr, maxlen=30, padding='post', value=1.)

X_val = list(map(lambda sent: [vocab.token_to_idx[token] for token in sent], X_val))
X_val = pad_sequences(sequences=X_val, maxlen=30, padding='post', value=1.)

X_tst = list(map(lambda sent: [vocab.token_to_idx[token] for token in sent], X_tst))
X_tst = pad_sequences(sequences=X_tst, maxlen=30, padding='post', value=1.)



CPU times: user 4.05 s, sys: 2.57 ms, total: 4.05 s
Wall time: 4.05 s


### Modeling : morphConv class 정의

![대체 텍스트](http://www.wildml.com/wp-content/uploads/2015/11/Screen-Shot-2015-11-06-at-8.03.47-AM-1024x413.png)

In [0]:
class MorphConv:
    def __init__(self, X, y, n_of_classes, embedding):
        
        with tf.variable_scope('input_layer'):
            self.__X = X
            self.__y = y
            self.is_training = tf.placeholder(dtype=tf.bool)
            
        with tf.variable_scope('embedding_layer'):
            static_embed = tf.get_variable(name='static', initializer=embedding, trainable=False)
            non_static_embed = tf.get_variable(name='non_static', initializer=embedding, trainable=True)
            
            static_batch = tf.nn.embedding_lookup(params = static_embed, ids = self.__X)
            non_static_batch = tf.nn.embedding_lookup(params = non_static_embed, ids = self.__X)
            
        with tf.variable_scope('convolution_layer'):
            with tf.variable_scope('tri_gram'):
                tri_gram = keras.layers.Conv1D(filters = 100, kernel_size = 3, activation = keras.activations.relu,
                                               kernel_initializer = 'he_uniform', padding = 'valid')
                static_3 = tri_gram(static_batch)
                non_static_3 = tri_gram(non_static_batch)
                
            with tf.variable_scope('tetra_gram'):
                tetra_gram = keras.layers.Conv1D(filters = 100, kernel_size = 4, activation = keras.activations.relu,
                                               kernel_initializer = 'he_uniform', padding = 'valid')
                static_4 = tetra_gram(static_batch)
                non_static_4 = tetra_gram(non_static_batch)
                
            with tf.variable_scope('penta_gram'):
                penta_gram = keras.layers.Conv1D(filters = 100, kernel_size = 5, activation = keras.activations.relu,
                                               kernel_initializer = 'he_uniform', padding = 'valid')
                static_5 = penta_gram(static_batch)
                non_static_5 = penta_gram(non_static_batch)
                
            fmap_3 = tf.reduce_max(static_3 + non_static_3, axis = 1)
            fmap_4 = tf.reduce_max(static_4 + non_static_4, axis = 1)
            fmap_5 = tf.reduce_max(static_5 + non_static_5, axis = 1)
            
        with tf.variable_scope('output_layer'):
            flattened = tf.concat([fmap_3, fmap_4, fmap_5], axis = -1)
            score = keras.layers.Dense(units = n_of_classes, kernel_constraint=keras.constraints.max_norm(3.))(flattened)
            
            self.__score = keras.layers.Dropout(rate = 0.5)(score, training = self.is_training)
            
        with tf.variable_scope('loss'):
            ce_loss = tf.losses.sparse_softmax_cross_entropy(labels = self.__y, logits = self.__score)
            self.total_loss = ce_loss
            
        with tf.variable_scope('prediction'):
            self.prediction = tf.argmax(self.__score, axis = -1)
        
    # predict instance method for small dataset
    def predict(self, sess, x_data, is_training = False):
        feed_prediction = {self.__X : x_data, self.is_training : is_training}
        return sess.run(self.prediction, feeddict = feed_prediction)
   

### Modeling : model 구현

In [34]:
# hyper-parameter 정의
lr = 0.003
epochs = 30
batch_size = 100
total_step = X_tr.shape[0] // batch_size
print(total_step)

1350


In [0]:
# train
tr_dataset = tf.data.Dataset.from_tensor_slices((X_tr, y_tr))
tr_dataset = tr_dataset.shuffle(buffer_size = 1000000)
tr_dataset = tr_dataset.batch(batch_size = batch_size)
tr_iterator = tr_dataset.make_initializable_iterator()

In [0]:
# val
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_dataset = val_dataset.shuffle(buffer_size = 10000000)
val_dataset = val_dataset.batch(batch_size = batch_size)
val_iterator = val_dataset.make_initializable_iterator()

In [19]:
# anonymous iterator
handle = tf.placeholder(dtype = tf.string)
iterator = tf.data.Iterator.from_string_handle(string_handle = handle,
                                               output_types = tr_iterator.output_types,
                                               output_shapes = tr_iterator.output_shapes)
x_data, y_data = iterator.get_next()
print(x_data, y_data)

Tensor("IteratorGetNext:0", shape=(?, 30), dtype=int32) Tensor("IteratorGetNext:1", shape=(?,), dtype=int32)


In [20]:
embedding = vocab.embedding.idx_to_vec.asnumpy()
print(embedding.dtype)
print(embedding.shape)

float32
(7120, 300)


In [0]:
morph_conv = MorphConv(X = x_data, y = y_data, n_of_classes=2, embedding=embedding)

In [0]:
# create training op
opt = tf.train.AdadeltaOptimizer(learning_rate = lr)
training_op = opt.minimize(loss = morph_conv.total_loss)

### Training

In [0]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config = config)
sess.run(tf.global_variables_initializer())
tr_handle, val_handle = sess.run(fetches = [tr_iterator.string_handle(), val_iterator.string_handle()])

In [36]:


tr_loss_hist = []
val_loss_hist = []

for epoch in tqdm(range(epochs)): 
    avg_tr_loss = 0
    avg_val_loss = 0
    tr_step = 0
    val_step = 0
    
    # for mini-batch training
    sess.run(tr_iterator.initializer)
    try:
        while True:
            _, tr_loss = sess.run(fetches = [training_op, morph_conv.total_loss],
                                  feed_dict = {handle : tr_handle, morph_conv.is_training : True})
            
            avg_tr_loss += tr_loss
            tr_step += 1
            
    except tf.errors.OutOfRangeError:
        pass
        
    # for validation
    sess.run(val_iterator.initializer)
    try:
        while True:
            _, val_loss = sess.run(fetches = [training_op, morph_conv.total_loss],
                                  feed_dict = {handle : val_handle, morph_conv.is_training : False})
            
            avg_val_loss += val_loss
            val_step += 1
            
    except tf.errors.OutOfRangeError:
        pass
    
    avg_tr_loss /= tr_step
    avg_val_loss /= val_step
    tr_loss_hist.append(avg_tr_loss)
    val_loss_hist.append(avg_val_loss)
    
    print('epoch : {:3}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, avg_tr_loss, avg_val_loss))

  3%|▎         | 1/30 [00:26<12:49, 26.55s/it]

epoch :   1, tr_loss : 1.049, val_loss : 0.805


  7%|▋         | 2/30 [00:52<12:17, 26.35s/it]

epoch :   2, tr_loss : 0.778, val_loss : 0.716


 10%|█         | 3/30 [01:18<11:47, 26.21s/it]

epoch :   3, tr_loss : 0.721, val_loss : 0.686


 13%|█▎        | 4/30 [01:44<11:18, 26.10s/it]

epoch :   4, tr_loss : 0.698, val_loss : 0.672


 17%|█▋        | 5/30 [02:10<10:50, 26.03s/it]

epoch :   5, tr_loss : 0.686, val_loss : 0.663


 20%|██        | 6/30 [02:35<10:23, 25.98s/it]

epoch :   6, tr_loss : 0.679, val_loss : 0.657


 23%|██▎       | 7/30 [03:01<09:56, 25.95s/it]

epoch :   7, tr_loss : 0.674, val_loss : 0.652


 27%|██▋       | 8/30 [03:27<09:30, 25.92s/it]

epoch :   8, tr_loss : 0.670, val_loss : 0.648


 30%|███       | 9/30 [03:53<09:03, 25.88s/it]

epoch :   9, tr_loss : 0.666, val_loss : 0.645


 33%|███▎      | 10/30 [04:19<08:37, 25.86s/it]

epoch :  10, tr_loss : 0.664, val_loss : 0.642


 37%|███▋      | 11/30 [04:45<08:11, 25.86s/it]

epoch :  11, tr_loss : 0.660, val_loss : 0.639


KeyboardInterrupt: ignored