In [1]:
import sys
import os
import numpy as np
import json
import tensorflow as tf
from tensorflow import keras
import pandas as pd

In [2]:
train_input_data=np.load(open('./data/train_input.npy','rb'))
train_label_data=np.load(open('./data/train_label.npy','rb'))

In [4]:
with open('./data/data_configs.json','r') as f :
    prepro_config = json.load(f)
    #print(prepro_config)

In [5]:
#데이터 나누기
RNG_SEED=1234
VALID_SPLIT=0.2

from sklearn.model_selection import train_test_split

train_input, eval_input, train_label, eval_label = train_test_split(train_input_data, train_label_data, test_size=VALID_SPLIT,
                                                   random_state=RNG_SEED)

In [6]:

#파라미터 변수
BATCH_SIZE=128
NUM_EPOCHS=10
VOCAB_SIZE=prepro_config['vocab_size']
EMB_SIZE=128

def mapping_fn(X,Y=None):
    inputs, labels = {'x':X}, Y
    return inputs, labels

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((train_input, train_label))
    dataset = dataset.shuffle(buffer_size=len(train_input))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn)
    dataset = dataset.repeat(count=NUM_EPOCHS)
    
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((eval_input, eval_label))
    dataset = dataset.shuffle(buffer_size=len(eval_input))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn)
    
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

In [7]:
#합성 신경망+맥스풀링
#Conv1D 활용,총 3개의 합성곱 층, kernel_size=3,4,5

def model_fn(features, labels, mode):
    TRAIN = mode ==tf.estimator.ModeKeys.TRAIN
    EVAL = mode ==tf.estimator.ModeKeys.EVAL
    PREDICT =mode ==tf.estimator.ModeKeys.PREDICT
    
    #embedding layer선언
    embedding_layer = keras.layers.Embedding(VOCAB_SIZE, EMB_SIZE)(features['x'])
    
    #embedding layer에 대한 output에 대해 dropout을 취함
    dropout_emb = keras.layers.Dropout(rate=0.5)(embedding_layer)
    
    #filters=128, kernel_size=3,4,5 길이가 3,4,5인 128개의 다른 필터를 생성
    #n-gram처럼 다양한 각도에서 문장을 봄
    #conv1d는 (배치크기: 문장 숫자, 길이: 각 문장의 단어개수, 채널: 임베딩 출력 차원수)로 입력값을 받음
    
    conv1 = keras.layers.Conv1D(filters=128, kernel_size=3, padding='valid', activation=tf.nn.relu)(dropout_emb)
    pool1 = keras.layers.GlobalMaxPool1D()(conv1)
    
    conv2 = keras.layers.Conv1D(filters=128, kernel_size=4, padding='valid', activation=tf.nn.relu)(dropout_emb)
    pool2 = keras.layers.GlobalMaxPool1D()(conv2)
    
    conv3 = keras.layers.Conv1D(filters=128, kernel_size=5, padding='valid', activation=tf.nn.relu)(dropout_emb)
    pool3 = keras.layers.GlobalMaxPool1D()(conv3)
    
    concat = keras.layers.concatenate([pool1, pool2, pool3]) #모아주기
    
    hidden=keras.layers.Dense(250, activation=tf.nn.relu)(concat)
    dropout_hidden=keras.layers.Dropout(rate=0.5)(hidden)
    logits = keras.layers.Dense(1,name='logits')(dropout_hidden)
    logits = tf.squeeze(logits, axis=-1)
    
    #학습, 검증, 평가의 단계로 나눔
    
    if PREDICT:
        pred=tf.nn.sigmoid(logits)
        return tf.estimator.EstimatorSpec(mode=mode, predictions={'prob':tf.round(pred)})
    
    loss=tf.losses.sigmoid_cross_entropy(labels, logits)
    
    if EVAL:
        pred=tf.nn.sigmoid(logits)
        accuracy=tf.metrics.accuracy(labels, tf.round(pred))
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops={'acc':accuracy})
    
    if TRAIN:
        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(0.001).minimize(loss, global_step)
        
        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss)

In [8]:
model_dir = os.path.join(os.getcwd(), "./checkpoint/cnn/")
os.makedirs(model_dir, exist_ok=True)

In [9]:
#Estimator객체 생성
cnn_est = tf.estimator.Estimator(model_fn, model_dir=model_dir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\PC\\자연어처리\\./checkpoint/cnn/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001F555223708>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [10]:
cnn_est.train(train_input_fn) #학습

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\PC\자연어처리\./checkpoint/cnn/model.ckpt-16866
Instructions for updating:
Use standard file utilities to

<tensorflow_estimator.python.estimator.estimator.Estimator at 0x1f555225748>

In [11]:
cnn_est.evaluate(eval_input_fn) #평가

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-04-24T04:10:55Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\PC\자연어처리\./checkpoint/cnn/model.ckpt-18436
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-04-24-04:10:58
INFO:tensorflow:Saving dict for global step 18436: acc = 0.8806, global_step = 18436, loss = 1.1642536
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 18436: C:\Users\PC\자연어처리\./checkpoint/cnn/model.ckpt-18436


{'acc': 0.8806, 'loss': 1.1642536, 'global_step': 18436}

In [12]:
test_input_data=np.load(open('./data/test_input.npy','rb'))
ids = np.load(open('./data/test_id.npy','rb'),allow_pickle=True)

In [13]:
#예측데이터파이프라인
def mapping_fn2(X):
    inputs = {'x':X}
    return inputs

def test_input_fn2():
    dataset = tf.data.Dataset.from_tensor_slices((test_input_data))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn2)
    
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

In [14]:
prediction=np.array([p['prob'] for p in cnn_est.predict(test_input_fn2)])

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\PC\자연어처리\./checkpoint/cnn/model.ckpt-18436
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [15]:
prediction

array([1., 0., 0., ..., 0., 1., 0.], dtype=float32)

In [16]:
output = pd.DataFrame(data={"id":list(ids),"sentiment":list(prediction)})

In [17]:
output.to_csv('./data/Bag_of_Words_model_test_cnn.csv',index=False,quoting=3)