In [1]:
import collections
import re
import unicodedata
import six
import tensorflow as tf

In [2]:
flags = tf.flags
FLAGS = flags.FLAGS

# Jupyter notebook에서 사용시에만 필요
flags.DEFINE_string('f', '', 'kernel')

## Required parameters
flags.DEFINE_string(
    "data_dir", None,
    "The input data dir. Should contain the .tsv files (or other data files) "
    "for the task.")

flags.DEFINE_string(
    "bert_config_file", None,
    "The config json file corresponding to the pre-trained BERT model. "
    "This specifies the model architecture.")

flags.DEFINE_string("task_name", None, "The name of the task to train.")

flags.DEFINE_string("vocab_file", None,
                    "The vocabulary file that the BERT model was trained on.")

flags.DEFINE_string(
    "output_dir", None,
    "The output directory where the model checkpoints will be written.")

## Other parameters

flags.DEFINE_string(
    "init_checkpoint", None,
    "Initial checkpoint (usually from a pre-trained BERT model).")

flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")

flags.DEFINE_integer(
    "max_seq_length", 128,
    "The maximum total input sequence length after WordPiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
    "than this will be padded.")

flags.DEFINE_bool("do_train", False, "Whether to run training.")

flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")

flags.DEFINE_bool(
    "do_predict", False,
    "Whether to run the model in inference mode on the test set.")

flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")

flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")

flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.")

flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")

flags.DEFINE_float("num_train_epochs", 3.0,
                   "Total number of training epochs to perform.")

flags.DEFINE_float(
    "warmup_proportion", 0.1,
    "Proportion of training to perform linear learning rate warmup for. "
    "E.g., 0.1 = 10% of training.")

flags.DEFINE_integer("save_checkpoints_steps", 1000,
                     "How often to save the model checkpoint.")

flags.DEFINE_integer("iterations_per_loop", 1000,
                     "How many steps to make in each estimator call.")

flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")

tf.flags.DEFINE_string(
    "tpu_name", None,
    "The Cloud TPU to use for training. This should be either the name "
    "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
    "url.")

tf.flags.DEFINE_string(
    "tpu_zone", None,
    "[Optional] GCE zone where the Cloud TPU is located in. If not "
    "specified, we will attempt to automatically detect the GCE project from "
    "metadata.")

tf.flags.DEFINE_string(
    "gcp_project", None,
    "[Optional] Project name for the Cloud TPU-enabled project. If not "
    "specified, we will attempt to automatically detect the GCE project from "
    "metadata.")

tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")

flags.DEFINE_integer(
    "num_tpu_cores", 8,
    "Only used if `use_tpu` is True. Total number of TPU cores to use.")

In [3]:
# 기본 객체 정의
class InputExample:
    
    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
        
class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.
    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it means
    the entire output data won't be generated.
    We use this class instead of `None` because treating `None` as padding
    battches could cause silent errors.
    """
    
# Feature를 담을 객체 정의
class InputFeatures(object):
    
    def __init__(self,
               input_ids,
               input_mask,
               segment_ids,
               label_id,
               is_real_example=True):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        self.is_real_example = is_real_example

In [4]:
# DataProcessor 정의
class DataProcessor(object):
    
    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with tf.io.gfile.GFile(input_file, "r") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                lines.append(line)
            return lines

In [5]:
# 기타 필요한 함수들 정의
def load_vocab(vocab_file):
    vocab = collections.OrderedDict()
    index = 0
    with tf.io.gfile.GFile(vocab_file, "r") as reader:
        while True:
            token = convert_to_unicode(reader.readline())
            if not token:
                break

            ### joonho.lim @ 2019-03-15
            if token.find('n_iters=') == 0 or token.find('max_length=') == 0 :
                continue
            token = token.split('\t')[0]

            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab

def convert_by_vocab(vocab, items):
    output = []
    for item in items:
        output.append(vocab[item])
    return output


def convert_tokens_to_ids(vocab, tokens):
    return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
    return convert_by_vocab(inv_vocab, ids)

def convert_to_unicode(text):
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        elif isinstance(text, unicode):
            return text
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python2 or Python 3?")
        
def printable_text(text):
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text
        elif isinstance(text, unicode):
            return text.encode("utf-8")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python2 or Python 3?")

!

In [6]:
def convert_single_example(ex_index, example,
                           label_list,
                           max_seq_length,
                           tokenizer,
                           openapi_key,
                           komoran=None):
    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            label_id=0,
            is_real_example=False)

    label_map = {label : i for (i, label) in enumerate(label_list)}

    if openapi_key is None:
        tokens_a = ' '.join(
            [i[0] + '/' + i[1]
             for i in komoran.pos(example.text_a)])
    else:
        tokens_a = do_lang(openapi_key, example.text_a)
#     if "openapi error" in tokens_a:
#         tf.logging.info("(%d--%s)" % (ex_index, tokens_a))
    tokens_a = tokenizer.tokenize(tokens_a)

    if example.text_b:
        tokens_b = do_lang(openapi_key, example.text_b)
        if "openapi error" in tokens_a:
            tf.logging.info("(%d--%s)" % (ex_index, tokens_b))
            tokens_b = ' '.join(
                [i[0] + '/' + i[1]
                 for i in Komoran().pos(example.text_b)])
        tokens_b = tokenizer.tokenize(tokens_b)

        def _truncate_seq_pair(tokens_a, tokens_b, max_length):
            while True:
                total_length = len(tokens_a) + len(tokens_b)
                if total_length <= max_length:
                    break
                if len(tokens_a) > len(tokens_b):
                    tokens_a.pop()
                else:
                    tokens_b.pop()
        # Account for [CLS], [SEP],and [SEP] with "-3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length-3)
    else:
        # Account for [CLS] and [SEP] with "-2"
        if len(tokens_a) > max_seq_length-2:
            tokens_a = tokens_a[:(max_seq_length-2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    # tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    # type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    # tokens:   [CLS] the dog is hairy . [SEP]
    # type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for 'type=0' and
    # 'type=1' were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not "strictly" necessary
    # since the [SEP] token unambigiously separates the sequences, but it makes
    # if easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)

    tokens_b = None
    if tokens_b:
        tokens += tokens_b + ["[SEP]"]
        segment_ids += [1] * (len(tokens_b) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label]

    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
                [str(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info(
                "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label: %s (id = %d)" % (example.label, label_id))

    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id,
        is_real_example=True)

    return feature

# GO

In [7]:
from modeling import *

In [8]:
tf.logging.set_verbosity(tf.logging.INFO)

```
  "attention_probs_dropout_prob": 0.1, 
  "directionality": "bidi", 
  "hidden_act": "gelu", 
  "hidden_dropout_prob": 0.1, 
  "hidden_size": 768, 
  "initializer_range": 0.02, 
  "intermediate_size": 3072, 
  "max_position_embeddings": 512, 
  "num_attention_heads": 12, 
  "num_hidden_layers": 12, 
  "pooler_fc_size": 768, 
  "pooler_num_attention_heads": 12, 
  "pooler_num_fc_layers": 3, 
  "pooler_size_per_head": 128, 
  "pooler_type": "first_token_transform", 
  "type_vocab_size": 2, 
  "vocab_size": 30349
```

In [14]:
# Load BERT Config
path = '../KorBERT/2_bert_download_002_bert_morp_tensorflow/002_bert_morp_tensorflow/'
FLAGS.bert_config_file = path + 'bert_config.json'
bert_config = BertConfig.from_json_file(FLAGS.bert_config_file)
bert_config

<BERT CONGIRUATION>
ddress: 0x26a9bb7e860
params:
  vocab_size: 30349
  hidden_size: 768
  num_hidden_layers: 12
  num_attention_heads: 12
  hidden_act: gelu
  intermediate_size: 3072
  hidden_dropout_prob: 0.1
  attention_probs_dropout_prob: 0.1
  max_position_embeddings: 512
  type_vocab_size: 2
  initializer_range: 0.02
  directionality: bidi
  pooler_fc_size: 768
  pooler_num_attention_heads: 12
  pooler_num_fc_layers: 3
  pooler_size_per_head: 128
  pooler_type: first_token_transform

In [15]:
if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))
else:
    print('     (FLAGS) MAX_SEQ_LENGTH           :', FLAGS.max_seq_length)
    print('(BERTConfig) MAX_POSITION_EMBEDDINGS  :', bert_config.max_position_embeddings)

     (FLAGS) MAX_SEQ_LENGTH           : 128
(BERTConfig) MAX_POSITION_EMBEDDINGS  : 512


In [16]:
# do not use tpu
tpu_cluster_resolver = None
if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
else:
    print('Do not use TPU')

Do not use TPU


In [17]:
# BROADCAST = 4
# PER_HOST_V1 = 2
# PER_HOST_V2 = 3
# PER_SHARD_V1 = 1
# SLICED = 5
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
is_per_host

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



3

In [18]:
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver, # None
    master=FLAGS.master, # [Optional] TensorFlow master URL.
    model_dir=FLAGS.output_dir, # None
    save_checkpoints_steps=FLAGS.save_checkpoints_steps, # 1000
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop, # 1000
        num_shards=FLAGS.num_tpu_cores, # 8
        per_host_input_for_training=is_per_host)
)

In [19]:
# tensorflow gpu 사용 가능한지 체크
tf.test.is_gpu_available()

True

In [20]:
# 한국어 vocab 사전을 등록
FLAGS.vocab_file = path + 'vocab.korean_morp.list'

# NAVER NSMC 예제 데이터

In [112]:
import pandas as pd
from glob import glob

In [122]:
%%time
paths = [path.replace('\\', '/') 
         for path in glob('../research_persona/nsmc/raw/*.json')]
res = []
for path in paths:
    with open(path, encoding='utf-8') as data_file:
        res.extend(json.load(data_file))

Wall time: 3.48 s


In [123]:
df = pd.DataFrame(res)
df['y'] = (df.rating.astype(np.int16) > 5).astype(np.int)

In [124]:
# 개행문자 제거
df.review = df.review.map(lambda x: re.sub('[\r\n]', ' ', x)).tolist()

In [121]:
df.loc[599278]

review       오경이 아깝다  이런영화 왜 만든거냐
date                     11.06.14
rating                          3
author                   psyc****
review_id                 5480540
movie_id                    76943
y                               0
Name: 599278, dtype: object

In [125]:
# 리뷰가 비어있는 열 제거
df = df[df.review != '']

In [126]:
df.review

0                                            전체관람가는 아닌것 같아요
1                              디렉터스컷으로봐서 거의 3시간짜리인데 참 흡인력있다
2         태어나 처음으로 가슴아리는 영화였다.  20년이상 지났지만.. 생각하면  또 가슴이...
3         어린시절 고딩때 봤던 때랑 또 결혼하고 나서 봤을때의 느낌은 확실히 다르네요. 뭔가...
4         토토에게 넓은 세상을 보여주고픈 알프레도.. 그가 토토를 위해 정을 떼려고 했던 장...
                                ...                        
712399          아직까지 여운이.... 우연히 봤다가 펑펑 울었네요. 음악때문에 감동이 백배!
712400                   쓸쓸한 가을  감성을 채워주는 영화네요.. 사랑하고시프다...
712401                            단언컨대 올 가을 최고의 영화!! 장담한다!!
712402                    기대없이 봤다가 너무너무 반하게 된 영화! 가을 감성에 딱!
712403       나도 모르게 눈물이 주륵주륵! 가을 감성 영화! 강추!!!!OST너무 좋아요!!!!
Name: review, Length: 712383, dtype: object

In [127]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(df['review'],
                                                      df['y'],
                                                      random_state=42,
                                                      test_size=.5,
                                                      shuffle=True,
                                                      stratify=df['y'])
X_valid, X_test, y_valid, y_test = train_test_split(X_valid,
                                                    y_valid,
                                                    random_state=42,
                                                    test_size=.4,
                                                    shuffle=True,
                                                    stratify=y_valid)

In [128]:
X_train.shape, X_valid.shape, X_test.shape

((356191,), (213715,), (142477,))

In [129]:
y_train.shape, y_valid.shape, y_test.shape

((356191,), (213715,), (142477,))

In [130]:
sets = np.array([356202, 213721, 142481])
sets / sum(sets)

array([0.5       , 0.29999972, 0.20000028])

In [131]:
df_train = pd.concat((X_train, y_train), axis=1)
df_valid = pd.concat((X_valid, y_valid), axis=1)
df_test = pd.concat((X_test, y_test), axis=1)

In [132]:
df_train.to_csv('train.tsv', sep='\t')
df_valid.to_csv('valid.tsv', sep='\t')
df_test.to_csv('test.tsv', sep='\t')

In [133]:
# DataProcessor 제작
class NSMCProcessor(DataProcessor):

    def get_train_examples(self, data_dir, filename='train.tsv'):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, filename)), "train")

    def get_dev_examples(self, data_dir, filename='valid.tsv'):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, filename)), "valid")

    def get_test_examples(self, data_dir, filename='test.tsv'):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, filename)), "test")

    def get_labels(self):
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        examples = []
        try:
            for (i, line) in enumerate(lines):
                if i == 0:
                    continue
                guid = "%s-%s" % (set_type, i)
                text_a = convert_to_unicode(line[1])
                if set_type == "test":
                    label = "0"
                else:
                    label = convert_to_unicode(line[-1])
                examples.append(
                    InputExample(guid=guid, text_a=text_a, label=label))
                prev_line = line
        except IndexError:
            print(i, prev_line)
            return None
        return examples

In [134]:
import os
import csv

In [135]:
df_train.loc[599278]

review    오경이 아깝다  이런영화 왜 만든거냐
y                            0
Name: 599278, dtype: object

In [84]:
df_train.iloc[2]

review    꼬리를 먼저 내린 러시아가 패자인가, 아니면 진정한 용기가 있는 승자인가
y                                                1
Name: 305153, dtype: object

In [136]:
processor = NSMCProcessor()
label_list = processor.get_labels()

# get train samples
train_examples = processor.get_train_examples('', 'train.tsv')
num_train_steps = int(
    len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

In [138]:
# record ETRI model weights
FLAGS.init_checkpoint = path + 'model.ckpt'

In [139]:
import 