<a href="https://colab.research.google.com/github/hiya906/my-machine-learning/blob/master/%EC%98%81%ED%99%94%ED%8F%89%EC%A0%90%EC%98%88%EC%B8%A1_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SDS NLP Challenge

- 이 Challenge는 자연어 처리를 통해 영화 감상평을 보고 영화 평점을 예측하는 것을 목적으로 합니다.
- 데이터는 2 종류가 있습니다: IMDB (영어), NSMC (한국어)
- 전처리, 학습 및 평가 템플릿은 아래와 같이 제공됩니다. 원하시는 대로 Parameter를 조절하고 'MyModel'에 해당하는 본인의 모델을 구현하여 실험하시면 됩니다.
- Baseline 성능은 XXX와 같습니다.
- Challenge 설명 PPT에 더 자세한 설명이 들어있습니다.

In [0]:
import os
# Create the working directories this notebook expects.
# os.makedirs(..., exist_ok=True) keeps the cell re-runnable: os.mkdir raises
# FileExistsError when the directory is already present (e.g. on a second run).
for _dir_name in ('data', 'models', 'util'):
    os.makedirs(_dir_name, exist_ok=True)

In [0]:
import tensorflow as tf
# Report the installed TensorFlow version, then the names exposed by
# tf.feature_column.
tf_version = tf.__version__
print(tf_version)
feature_column_names = dir(tf.feature_column)
print(feature_column_names)

In [0]:
# IMPORT PACKAGE
import time
import argparse
import numpy as np
import tensorflow as tf

from util.Data_loader import load_data
from util.Dataset import Dataset
from util.Parser import vocab_dictionary, parse_data

from models.Baseline_LSTM import LSTM
from models.Baseline_Char_CNN import Char_CNN

In [0]:
# Dataset selector: "imdb" (English) or "nsmc" (Korean).
DATA_NAME = "imdb"

# Preprocessing / tokenisation scheme:
#   imdb (English) -> "word"
#   nsmc (Korean)  -> "eumjeol" (syllable) or "char" (phoneme)
PARSER = "word"

# Fixed sequence length used when batches are padded / truncated.
MAXLEN = 300

# Mini-batch size and learning-rate setting.
# NOTE: the model-selection cell further down passes literal learning rates to
# the model constructors rather than reading LEARNING_RATE.
BATCH_SIZE, LEARNING_RATE = 256, 0.01

In [0]:
# File names are derived from the selected dataset name.
train_file, test_file = (DATA_NAME + suffix for suffix in ('_train.txt', '_test.txt'))

# Load comments and ratings for both splits from the 'data' directory.
(train_comments, train_ratings,
 test_comments, test_ratings) = load_data('data', train_file, test_file)

# Vocabulary is built from the training comments only; idx2str is its inverse.
str2idx = vocab_dictionary(train_comments, PARSER)
idx2str = dict((index, token) for token, index in str2idx.items())

# Vocabulary size; later passed to the models as `char_size`.
CHARSIZE = len(str2idx)

In [0]:
# Preprocessing: run both comment sets through parse_data with the training
# vocabulary; the ratings are carried over unchanged as targets.
train_x, test_x = (
    parse_data(comments, str2idx, PARSER)
    for comments in (train_comments, test_comments)
)
train_y, test_y = train_ratings, test_ratings

In [0]:
# Datasets: the training split is built with shuffle=True, the test split
# with shuffle=False. Both are iterated batch-by-batch in the training cell.
train_dataset, test_dataset = (
    Dataset(inputs, targets, MAXLEN, BATCH_SIZE, shuffle=do_shuffle)
    for inputs, targets, do_shuffle in ((train_x, train_y, True),
                                        (test_x, test_y, False))
)

In [0]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np

class MyModel():
    """Stacked-convolution regressor built as a TensorFlow 1.x graph.

    Token ids are embedded, passed through the convolution layers described by
    ``conv_layers``, flattened, put through dropout and mapped to one scalar
    per example by a linear layer. The loss is the mean squared error against
    ``y`` and is minimised with Adam.

    Graph handles exposed as attributes:
        x: int64 placeholder of shape [None, maxlen] (token ids).
        y: float32 placeholder of shape [None] (targets).
        output_keep_prob: float32 placeholder passed to tf.nn.dropout.
        pred: prediction tensor (output of the linear layer, squeezed on axis 1).
        loss: scalar mean-squared-error tensor.
        train_step: Adam minimisation op for ``loss``.
    """
    def __init__(self, embedding, conv_layers, fc_layers, maxlen, char_size, lr):
        """Build the whole graph.

        Args:
            embedding: width of each embedding vector (second dimension of the
                embedding matrix).
            conv_layers: sequence of per-layer specs, indexed below as
                [0] = number of filters, [1] = filter height and
                [-1] = max-pool size along axis 1 (-1 disables pooling).
            fc_layers: stored on the instance; not otherwise used in this class.
            maxlen: fixed length of every input sequence.
            char_size: number of rows of the embedding matrix.
            lr: learning rate handed to AdamOptimizer.
        """
        self.embedding = embedding
        self.input_len = maxlen
        self.char_size = char_size
        self.conv_layers = conv_layers
        self.fc_layers = fc_layers
        self.lr = lr
        
        # Fed by the caller; used as the second argument of tf.nn.dropout below.
        self.output_keep_prob = tf.placeholder(tf.float32, name='output_keep_prob')

        with tf.name_scope("Input-Layer"):
            # Input placeholders. Note the target placeholder's graph name is
            # "output_x" even though the attribute is `y`.
            self.x = tf.placeholder(tf.int64, shape=[None, self.input_len], name="input_x")
            self.y = tf.placeholder(tf.float32, shape=[None], name="output_x")
            embedding_matrix = tf.Variable(tf.random_normal([self.char_size, self.embedding], stddev=0.01), name='Embedding_matrix')
            print("Embedding Matrix: ", embedding_matrix.shape)

        # EMBEDDING LAYERS
        with tf.name_scope("Embedding-Layer"):
            cnn_x = tf.nn.embedding_lookup(embedding_matrix, self.x)
            # Append a trailing axis of size 1 so the tensor is 4-D for conv2d.
            cnn_x = tf.expand_dims(cnn_x, -1)
        
        
        # ================================ Version 2 =================================
        # CONVOLUTION LAYERS
        for i, conv_info in enumerate(self.conv_layers):
            print("CNN Input: ", cnn_x.shape)
            with tf.name_scope("Conv-Layer" + str(i)):
                # The filter spans the full size of axis 2 of the current input,
                # so the "VALID" convolution leaves that axis with size 1.
                filter_width = cnn_x.get_shape()[2].value
                # [filter height, filter width, input channels, number of filters]
                filter_shape = [conv_info[1], filter_width, 1, conv_info[0]]

                W = tf.Variable(tf.random_normal(filter_shape,  mean=0.0, stddev=0.01), dtype=tf.float32,
                                name='Conv_W')  # large = 0.02 , small = 0.05
                b = tf.Variable(tf.random_normal(shape=[conv_info[0]],  mean=0.0, stddev=0.01), dtype=tf.float32,
                                name='Conv_b')

                conv = tf.nn.conv2d(cnn_x, W, [1, 1, 1, 1], "VALID", name="conv")
                cnn_x = tf.nn.bias_add(conv, b)

            with tf.name_scope("Non-Linear"):
                cnn_x = tf.nn.relu(cnn_x)
                # Alternative activation, left disabled:
                # cnn_x = tf.nn.tanh(cnn_x)
            print("CNN Output: ", cnn_x.shape)
            if conv_info[-1] != -1:
                print("Pooling Input: ", cnn_x.shape)
                with tf.name_scope("Max-Polling"):
                    # Window and stride are both conv_info[-1] along axis 1.
                    pool_shape = [1, conv_info[-1], 1, 1]
                    pool = tf.nn.max_pool(cnn_x, ksize=pool_shape, strides=pool_shape, padding="VALID")
                    # Swap the last two axes so the filter outputs sit on axis 2
                    # and the size-1 axis is last, ready for the next layer.
                    cnn_x = tf.transpose(pool, [0, 1, 3, 2])
                print("Pooling Output: ", cnn_x.shape)
            else:
                # No pooling for this layer: only swap the last two axes.
                cnn_x = tf.transpose(cnn_x, [0, 1, 3, 2])
        # Drop the trailing axis (axis 3) left by the final transpose.
        cnn_output = tf.squeeze(cnn_x, axis=3)

        # Flatten cnn_output: (batch, height, width)  --> (batch, height * width)
        out_shape = cnn_output.get_shape()
        d = out_shape[1].value * out_shape[2].value
        print('out_shape[1].value: ', out_shape[1].value)
        print('out_shape[2].value: ', out_shape[2].value)
        fc_input = tf.reshape(cnn_output, [-1, d])
        print('fc_input: ', fc_input)

        # Add dropout, driven by the output_keep_prob placeholder created above.
        h_drop = tf.nn.dropout(fc_input, self.output_keep_prob)
        
        # Regression layer: linear map from d features to one value per example.
        print("Output Layer input: ", h_drop.shape)
        with tf.name_scope("Output-Layer"):
            W = tf.Variable(tf.random_normal([d, 1], stddev=0.01), name="Output_W")
            b = tf.Variable(tf.random_normal([1], stddev=0.01), name="Output_b")

            # Squeeze axis 1 so the prediction has the same rank as self.y.
            output = tf.squeeze(tf.matmul(h_drop, W) + b, 1)
            self.pred = output
        print("Output Layer output: ", output.shape)

        with tf.name_scope("Loss"):
            # Mean squared error between targets and predictions.
            squared_error = tf.square(self.y - output)
            self.loss = tf.reduce_mean(squared_error)

        optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        self.train_step = optimizer.minimize(self.loss)

In [0]:
## Parallel convolution branches (MyModel2) ##

# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np

class MyModel2():
    """Parallel-branch CNN regressor built as a TensorFlow 1.x graph.

    Token ids are embedded, then convolved by several branches in parallel
    (one per kernel height in ``[1, 2, 3]``, 32 filters each, every kernel
    spanning the full embedding width). Each branch is max-pooled over its
    whole remaining height, the branches are concatenated and flattened, and
    the result goes through dropout -> fully connected (1280, ReLU) ->
    dropout -> linear layer with a single output. The loss is the mean squared
    error against ``y`` and is minimised with Adam.

    Graph handles exposed as attributes:
        x: int64 placeholder of shape [None, maxlen] (token ids).
        y: float32 placeholder of shape [None] (targets).
        output_keep_prob: float32 placeholder passed to tf.nn.dropout.
        pred: prediction tensor of shape [batch].
        loss: scalar mean-squared-error tensor.
        train_step: Adam minimisation op for ``loss``.
    """

    def __init__(self, embedding, conv_layers, fc_layers, maxlen, char_size, lr):
        """Build the whole graph.

        Args:
            embedding: width of each embedding vector; also used as the width
                of every convolution kernel.
            conv_layers: stored on the instance; not otherwise used in this class.
            fc_layers: stored on the instance; not otherwise used in this class.
            maxlen: fixed length of every input sequence.
            char_size: number of rows of the embedding matrix.
            lr: learning rate handed to AdamOptimizer.
        """
        self.embedding = embedding
        self.input_len = maxlen
        self.char_size = char_size
        self.conv_layers = conv_layers
        self.fc_layers = fc_layers
        self.lr = lr

        self.output_keep_prob = tf.placeholder(tf.float32, name='output_keep_prob')

        sec_filter_sizes = [1, 2, 3]   # kernel heights, one parallel branch each
        num_filters = 32               # filters per branch
        hidden_layer_size = 1280

        with tf.name_scope("Input-Layer"):
            # Input placeholders. Note the target placeholder's graph name is
            # "output_x" even though the attribute is `y`.
            self.x = tf.placeholder(tf.int64, shape=[None, self.input_len], name="input_x")
            self.y = tf.placeholder(tf.float32, shape=[None], name="output_x")
            embedding_matrix = tf.Variable(
                tf.random_normal([self.char_size, self.embedding], stddev=0.01),
                name='Embedding_matrix')
            print("Embedding Matrix: ", embedding_matrix.shape)

        # EMBEDDING LAYERS
        with tf.name_scope("Embedding-Layer"):
            cnn_x = tf.nn.embedding_lookup(embedding_matrix, self.x)
            # Append a trailing axis of size 1 so the tensor is 4-D for conv2d.
            cnn_x = tf.expand_dims(cnn_x, -1)
        print('embedeed_cnn_x: ', cnn_x)

        pooled_outputs = []

        for sec_filter_size in sec_filter_sizes:
            with tf.name_scope("conv2-maxpool-%s" % sec_filter_size):
                # Convolution layer. The kernel width follows the embedding
                # width (previously hard-coded to 32), so the convolution
                # always collapses axis 2 to size 1 whatever `embedding` is.
                conv2 = tf.layers.conv2d(cnn_x,
                                         filters=num_filters,
                                         kernel_size=[sec_filter_size, self.embedding],
                                         strides=[1, 1],
                                         activation=tf.nn.relu)
                # Max-pool over the branch's entire height: one value per filter.
                pooled2 = tf.layers.max_pooling2d(conv2,
                                                  [conv2.shape[1], 1],
                                                  strides=[1, 1])
                pooled_outputs.append(pooled2)
                print('pooled2 shape: ', pooled2.shape)
                print('pooled_outputs shape: ', pooled_outputs)
                print('conv2.shape[1]: ', conv2.shape[1])

        # Combine all the pooled features. Every branch contributes
        # `num_filters` values, so the flattened width is filters * branches.
        num_filters_total = num_filters * len(sec_filter_sizes)
        print('num_filters_total : ', num_filters_total)
        h_pool = tf.concat(pooled_outputs, 1)
        h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
        print("h_pool shape: ", h_pool.shape)

        # Add dropout
        with tf.name_scope("dropout"):
            h_drop = tf.nn.dropout(h_pool_flat, self.output_keep_prob)

        fc_1 = tf.contrib.layers.fully_connected(h_drop, int(hidden_layer_size),
                                                 activation_fn=tf.nn.relu)
        fc_1 = tf.nn.dropout(fc_1, self.output_keep_prob)

        # Regression layer. The dense layer yields shape [batch, 1]; squeeze it
        # to [batch] so it matches self.y. Without the squeeze, `self.y - output`
        # broadcasts to [batch, batch] and the loss averages every
        # target/prediction pair instead of the matching ones.
        output = tf.contrib.layers.fully_connected(fc_1, 1, activation_fn=None)
        output = tf.squeeze(output, axis=1)
        self.pred = output
        print("Output Layer output: ", output.shape)

        with tf.name_scope("Loss"):
            # Mean squared error between targets and predictions.
            squared_error = tf.square(self.y - output)
            self.loss = tf.reduce_mean(squared_error)

        optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        self.train_step = optimizer.minimize(self.loss)

In [0]:
class MyModel:
    """Empty model template: every graph handle starts out as None.

    NOTE(review): this definition reuses the name ``MyModel``. Running this
    cell after the CNN ``MyModel`` cell earlier in the notebook rebinds the
    name to this stub.
    """

    def __init__(self, args):
        """Declare the attributes a model exposes.

        Args:
            args: accepted but not used by the template.
        """
        # Input placeholders
        self.x, self.y = None, None

        # Model network goes here; its prediction is exposed as `pred`.
        self.pred = None

        # Loss definition
        self.loss = None

        # train_step = the optimize operation
        self.train_step = None

In [0]:
MODEL_NAME = 'MY_MODEL'

# Instantiate the network selected by MODEL_NAME.
if MODEL_NAME == 'LSTM':
    model = LSTM(32, 32, False, [], MAXLEN, CHARSIZE, 0.01)
elif MODEL_NAME == 'CNN':
    model = Char_CNN(32, [[32, 3, -1], [32, 3, 3]], [], MAXLEN, CHARSIZE, 0.01)
elif MODEL_NAME == 'MY_MODEL':
    model = MyModel2(32, [[16, 3, -1], [16, 3, 3]], [], MAXLEN, CHARSIZE, 0.0001)
else:
    raise NotImplementedError

# Graph handles that every model exposes.
input_x, output_y = model.x, model.y
pred, loss, train_op = model.pred, model.loss, model.train_step

# Handles that only some models expose.
if MODEL_NAME == 'LSTM':
    input_len = model.x_len
elif MODEL_NAME == 'MY_MODEL':
    output_keep_prob = model.output_keep_prob

In [0]:
# Start training: open a session, then initialise all global variables.
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [0]:
best_rmse = 1000000
best_epoch = -1

EPOCHs = 100
DISPLAY_STEP = 50
PAD_ID = 2             # index written into padded positions
TRAIN_KEEP_PROB = 0.3  # value fed to the dropout keep-probability placeholder while training


def _pad_batch(sequences, maxlen, pad_id=2):
    """Return an int32 array of shape (len(sequences), maxlen).

    Each row holds the first ``maxlen`` items of the matching sequence (longer
    sequences lose their tail); the remaining positions are ``pad_id``.

    Only the row that belongs to a sequence is written. The previous inline
    code assigned long sequences to ``x_pad[idx:, :MAXLEN]``, i.e. to every
    later row as well, so shorter sequences that followed kept stray tokens
    where padding was expected.
    """
    padded = np.full((len(sequences), maxlen), pad_id, dtype=np.int32)
    for row, seq in enumerate(sequences):
        clipped = np.asarray(seq)[:maxlen]
        padded[row, :len(clipped)] = clipped
    return padded


# TRAINING LOOP
for epoch in range(1, EPOCHs + 1):
    epoch_loss = 0.0
    epoch_start = time.time()

    # TRAIN EACH BATCH
    # The loop variables are named batch_* so they do not overwrite the
    # module-level train_x / train_y / test_x / test_y arrays.
    for i, (batch_x, batch_y) in enumerate(train_dataset):
        feed_dict = {
            input_x: _pad_batch(batch_x, MAXLEN, PAD_ID),
            output_y: batch_y,
            output_keep_prob: TRAIN_KEEP_PROB,
        }

        t = time.time()
        # train_op: optimisation step; loss: loss of this batch
        _, batch_loss = sess.run([train_op, loss], feed_dict=feed_dict)
        elapsed = time.time() - t

        epoch_loss += batch_loss

        # PRINT LOSS
        if (i + 1) % DISPLAY_STEP == 0:
            print('[%3d/%3d] loss = %.4f, time elapsed = %.2f' % (i + 1, train_dataset.num_batch, batch_loss, elapsed))
    epoch_end = time.time() - epoch_start
    print('Epoch %3d >> Epoch loss: %.4f , time elapsed %.4f' % (epoch, epoch_loss, epoch_end))

    # EVALUATE ON TEST DATA (dropout disabled: keep probability 1.0)
    se = 0.0
    total = 0
    for batch_x, batch_y in test_dataset:
        feed_dict = {
            input_x: _pad_batch(batch_x, MAXLEN, PAD_ID),
            output_keep_prob: 1.0,
        }
        # Flatten both sides so a (batch, 1) prediction cannot broadcast
        # against (batch,) targets into a (batch, batch) matrix.
        p = np.reshape(sess.run(pred, feed_dict=feed_dict), -1)
        targets = np.reshape(np.asarray(batch_y), -1)
        if p.shape != targets.shape:
            raise ValueError('prediction/target size mismatch: %d vs %d' % (p.size, targets.size))

        # squared error
        se += np.sum(np.square(p - targets))
        total += p.size

    # root mean squared error
    test_rmse = np.sqrt(se / total)
    print('Test RMSE = %.4f' % (test_rmse))

    if best_rmse > test_rmse:
        best_rmse = test_rmse
        best_epoch = epoch

In [0]:
# Report the lowest test RMSE recorded by the training cell and its epoch.
best_summary = 'Best RMSE is %.4f at EPOCH %d' % (best_rmse, best_epoch)
print(best_summary)

In [0]:
!nvidia-smi

In [0]:
import tensorflow as tf
# Fail fast unless TensorFlow reports '/device:GPU:0' as the GPU device.
device_name = tf.test.gpu_device_name()
gpu_found = device_name == '/device:GPU:0'
if not gpu_found:
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))