## Convolutional Neural Networks for Sentence Classification
* Convolutional neural network(CNN)는 주로 이미지 학습에서 많이 쓰이는 딥러닝 모델이며, NLP에서도 좋은 성능을 보입니다.
* Test Accuracy : 87.1% 
* 학습소요시간 : 약 60분 

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.datasets import fetch_20newsgroups
import re
from collections import Counter
import os
#from text_cnn import TextCNN
from tensorflow.contrib import learn
from sklearn.model_selection import train_test_split

In [2]:
# Data: load the training split of the 20 Newsgroups corpus.
news = fetch_20newsgroups(subset='train')

# Features: keep only the first MAX_DOC_CHARS characters of every document,
# because loading the full texts caused memory errors further down the pipeline.
MAX_DOC_CHARS = 10000
news['data'] = [doc[:MAX_DOC_CHARS] for doc in news['data']]


In [3]:
# Clean the raw text with regular expressions, then convert it to lower case.
def clean_str(s):
    """Normalise a raw document into space-separated, lower-case tokens.

    Steps, in order:
      1. Every character outside ``A-Za-z0-9(),!?'`` and the backtick is
         replaced by a space (this also removes all non-ASCII characters).
      2. English clitics (``'s``, ``'ve``, ``n't``, ``'re``, ``'d``, ``'ll``)
         are split off from the preceding word.
      3. ``, ! ( ) ?`` are surrounded by spaces so they become separate tokens.
      4. Runs of whitespace are collapsed to a single space.
      5. Any token containing two or more consecutive ``x``/``X`` is replaced
         by the placeholder ``xxx``.

    Args:
        s: The text to clean.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace and
        lower-cased.
    """
    s = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", s)
    s = re.sub(r"'s", " 's", s)
    s = re.sub(r"'ve", " 've", s)
    s = re.sub(r"n't", " n't", s)
    s = re.sub(r"'re", " 're", s)
    s = re.sub(r"'d", " 'd", s)
    s = re.sub(r"'ll", " 'll", s)
    # Pad punctuation with spaces. The replacement uses a back-reference so
    # that only the punctuation character itself is emitted; writing e.g.
    # " \( " as the replacement would leave a literal backslash in the text.
    s = re.sub(r"([,!()?])", r" \1 ", s)
    s = re.sub(r"\s{2,}", " ", s)
    s = re.sub(r"\S*(x{2,}|X{2,})\S*", "xxx", s)
    # No separate non-ASCII strip is needed: the first substitution already
    # restricted the text to a small ASCII alphabet.
    return s.strip().lower()

# Build x_data and y_data
def load_data_and_labels():
    """Turn the module-level ``news`` dataset into cleaned texts and one-hot labels.

    Reads ``news['target']`` and ``news['data']``, drops rows with missing
    values, shuffles the rows randomly and cleans each document with
    ``clean_str``.

    Returns:
        A 4-tuple ``(x_raw, y_raw, df, labels)``:
          * ``x_raw``  - list of cleaned document strings,
          * ``y_raw``  - list of one-hot label vectors (rows of an identity matrix),
          * ``df``     - the shuffled DataFrame the lists were built from,
          * ``labels`` - the sorted list of distinct target values.
    """
    target_col, text_col = 'target', 'data'

    frame = pd.DataFrame({"target": news['target'], "data": news['data']})
    # Drop rows where either column is null.
    frame = frame.dropna(axis=0, how='any', subset=[target_col, text_col])
    # Shuffle the rows.
    frame = frame.reindex(np.random.permutation(frame.index))

    # One-hot encoding: the i-th sorted label maps to the i-th identity row.
    labels = sorted(set(frame[target_col].tolist()))
    identity = np.eye(len(labels), dtype=int)
    label_dict = {label: row for label, row in zip(labels, identity)}

    x_raw = [clean_str(text) for text in frame[text_col]]
    y_raw = [label_dict[target] for target in frame[target_col]]
    return x_raw, y_raw, frame, labels


In [4]:
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Generate mini-batches over ``data`` for a number of epochs.

    Args:
        data: Sequence of samples; it is converted to a NumPy array once.
        batch_size: Maximum number of samples per batch.
        num_epochs: Number of full passes over ``data``.
        shuffle: If true, the samples are re-shuffled at the start of every
            epoch; otherwise they are yielded in their original order.

    Yields:
        NumPy array slices of at most ``batch_size`` samples. The last batch
        of an epoch may be smaller, but a batch is never empty.
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling division: "int(size / batch) + 1" would add an extra, empty
    # batch whenever data_size is an exact multiple of batch_size.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size

    for _ in range(num_epochs):
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            epoch_data = data[shuffle_indices]
        else:
            epoch_data = data

        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield epoch_data[start_index:end_index]


In [5]:
# Convolution, sub-sampling (max-pooling), fully connected
class TextCNN(object):
    """TensorFlow 1.x graph for a convolutional text classifier.

    The graph consists of an embedding lookup, one convolution + max-pool
    branch per entry in ``filter_sizes``, dropout on the concatenated pooled
    features, and a final linear layer producing class scores.

    Attributes created on the instance:
        input_x: int32 placeholder ``[None, sequence_length]`` of word ids.
        input_y: float32 placeholder ``[None, num_classes]`` of one-hot labels.
        dropout_keep_prob: float32 placeholder, keep probability for dropout.
        scores: unnormalised class scores (logits).
        predictions: arg-max class index of ``scores``.
        loss: mean softmax cross-entropy plus ``l2_reg_lambda`` times the L2
            loss of the output layer's weights and bias.
        accuracy: fraction of correct predictions in the batch.
        num_correct: number of correct predictions in the batch.
    """

    def __init__(self, sequence_length, num_classes, vocab_size, embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):
        """Build the graph in the current default TensorFlow graph.

        Args:
            sequence_length: Number of word ids per (padded) input document.
            num_classes: Number of output classes.
            vocab_size: Number of rows in the embedding matrix.
            embedding_size: Dimensionality of each word embedding.
            filter_sizes: Iterable of filter heights (words covered per filter).
            num_filters: Number of filters per filter size.
            l2_reg_lambda: Weight of the L2 regularisation term in the loss.
        """
        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name='input_y')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

        # Keeping track of l2 regularization loss
        l2_loss = tf.constant(0.0)

        # Embedding layer: word ids -> dense vectors, initialised uniformly in [-1, 1).
        W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name='W')
        self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
        # Append a trailing channel dimension of size 1, as conv2d expects 4-D input.
        self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            # Convolution layer: each filter spans `filter_size` words and the
            # full embedding width.
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name='b')
            conv = tf.nn.conv2d(
                self.embedded_chars_expanded,
                W,
                strides=[1, 1, 1, 1],   # step size of the sliding window
                padding='VALID',        # no zero-padding ('SAME' would zero-pad)
                name='conv')

            # Apply the activation function
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')

            # Max-pool over all window positions, leaving one value per filter
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, sequence_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name='pool')
            pooled_outputs.append(pooled)

        # Combine all the pooled features along the filter axis and flatten
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        W = tf.get_variable(
            'W',
            shape=[num_filters_total, num_classes],
            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name='b')
        # Only the output layer's parameters are L2-regularised.
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)

        self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name='scores')
        self.predictions = tf.argmax(self.scores, 1, name='predictions')

        # Calculate mean cross-entropy loss (this op accepts named arguments only)
        losses = tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y, logits=self.scores)
        self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy and number of correct predictions share one comparison op
        # instead of building the same comparison twice.
        correct_predictions = tf.cast(
            tf.equal(self.predictions, tf.argmax(self.input_y, 1)), 'float')
        self.accuracy = tf.reduce_mean(correct_predictions, name='accuracy')
        self.num_correct = tf.reduce_sum(correct_predictions, name='num_correct')

In [6]:
def train_cnn(batch_size=64, num_epochs=45, evaluate_every=500):
    """Train a ``TextCNN`` on the newsgroup data and print dev/test accuracy.

    The data from ``load_data_and_labels`` is mapped to word-id sequences,
    split 90/10 into (train + dev) / test with a fixed seed, and the first
    part is split 90/10 again into train / dev. The model is trained with
    Adam; every ``evaluate_every`` optimisation steps the dev accuracy is
    printed, and after training the test accuracy is printed.

    Args:
        batch_size: Number of samples per training/evaluation batch.
        num_epochs: Number of passes over the training set.
        evaluate_every: Evaluate on the dev set every this many steps.
    """
    # Load sentences and labels
    x_raw, y_raw, _, _ = load_data_and_labels()

    # Pad every document to the same length and map each word to an integer id
    max_document_length = max(len(x.split(' ')) for x in x_raw)
    print('The maximum length of all sentences: {}'.format(max_document_length))
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_raw)))
    y = np.array(y_raw)

    # Split the dataset into (train + dev) and test data
    x_, x_test, y_, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

    # Shuffle randomly, then split into train and dev data
    shuffle_indices = np.random.permutation(np.arange(len(y_)))
    x_shuffled = x_[shuffle_indices]
    y_shuffled = y_[shuffle_indices]
    x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled, y_shuffled, test_size=0.1)

    print('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
    print('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

    # Build the graph
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        # Using the session itself as the context manager makes it the default
        # session and also closes it (releasing its resources) on exit.
        with tf.Session(config=session_conf) as sess:
            cnn = TextCNN(
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                vocab_size=len(vocab_processor.vocabulary_),
                embedding_size=50,
                filter_sizes=[2, 3, 4, 5],
                num_filters=32,
                l2_reg_lambda=0.01)

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Run one optimisation step on a single batch (dropout enabled)
            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 0.5}
                sess.run(train_op, feed_dict)

            # Evaluate the model on one batch (dropout disabled) and return
            # the number of correctly classified samples
            def dev_step(x_batch, y_batch):
                feed_dict = {cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0}
                return sess.run(cnn.num_correct, feed_dict)

            # Accuracy over a whole dataset, evaluated batch by batch
            def evaluate(x_data, y_data):
                total_correct = 0
                for batch in batch_iter(list(zip(x_data, y_data)), batch_size, 1):
                    if len(batch) == 0:
                        # Nothing to unpack in an empty batch.
                        continue
                    x_batch, y_batch = zip(*batch)
                    total_correct += dev_step(x_batch, y_batch)
                return float(total_correct) / len(y_data)

            # Training starts here
            sess.run(tf.global_variables_initializer())
            train_batches = batch_iter(list(zip(x_train, y_train)), batch_size, num_epochs)
            best_accuracy, best_at_step = 0, 0

            # Train the cnn model with x_train and y_train (batch by batch)
            for train_batch in train_batches:
                if len(train_batch) == 0:
                    # zip(*batch) cannot be unpacked for an empty batch.
                    continue
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev (batch by batch)
                if current_step % evaluate_every == 0:
                    dev_accuracy = evaluate(x_dev, y_dev)
                    print('Accuracy on dev set: {}'.format(dev_accuracy))

                    if dev_accuracy >= best_accuracy:
                        best_accuracy, best_at_step = dev_accuracy, current_step
                        print('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))

            # Predict x_test (batch by batch)
            test_accuracy = evaluate(x_test, y_test)
            print('Accuracy on test set : {} '.format(test_accuracy))
            print('The training is complete')


# Notebook entry point: run the training and evaluation pipeline defined above.
train_cnn()

The maximum length of all sentences: 3679
x_train: 9163, x_dev: 1019, x_test: 1132
y_train: 9163, y_dev: 1019, y_test: 1132
Accuracy on dev set: 0.45436702649656524
Best accuracy 0.45436702649656524 at step 500
Accuracy on dev set: 0.6035328753680078
Best accuracy 0.6035328753680078 at step 1000
Accuracy on dev set: 0.7124631992149166
Best accuracy 0.7124631992149166 at step 1500
Accuracy on dev set: 0.7634936211972522
Best accuracy 0.7634936211972522 at step 2000
Accuracy on dev set: 0.8076545632973503
Best accuracy 0.8076545632973503 at step 2500
Accuracy on dev set: 0.8263002944062807
Best accuracy 0.8263002944062807 at step 3000
Accuracy on dev set: 0.8292443572129539
Best accuracy 0.8292443572129539 at step 3500
Accuracy on dev set: 0.8420019627085378
Best accuracy 0.8420019627085378 at step 4000
Accuracy on dev set: 0.8429833169774289
Best accuracy 0.8429833169774289 at step 4500
Accuracy on dev set: 0.8518155053974484
Best accuracy 0.8518155053974484 at step 5000
Accuracy on dev