In [1]:
import tensorflow as tf

In [2]:
from utils_20191230 import *

In [3]:
tf.logging.set_verbosity(tf.logging.INFO)

In [4]:
# Load BERT Config
path = '../KorBERT/2_bert_download_002_bert_morp_tensorflow/002_bert_morp_tensorflow/'
FLAGS.bert_config_file = path + 'bert_config.json'
bert_config = BertConfig.from_json_file(FLAGS.bert_config_file)

In [5]:
if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))
else:
    print('     (FLAGS) MAX_SEQ_LENGTH           :', FLAGS.max_seq_length)
    print('(BERTConfig) MAX_POSITION_EMBEDDINGS  :', bert_config.max_position_embeddings)

     (FLAGS) MAX_SEQ_LENGTH           : 128
(BERTConfig) MAX_POSITION_EMBEDDINGS  : 512


In [6]:
# do not use tpu
tpu_cluster_resolver = None
if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
else:
    print('Do not use TPU')

Do not use TPU


In [7]:
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    master=FLAGS.master,
    model_dir=FLAGS.output_dir,
    save_checkpoints_steps=FLAGS.save_checkpoints_steps, # 1000
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop, # 1000
        num_shards=FLAGS.num_tpu_cores, # 8
        per_host_input_for_training=is_per_host)
)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [8]:
# tensorflow gpu 사용 가능한지 체크
tf.test.is_gpu_available()

True

In [9]:
# 한국어 vocab 사전을 등록
FLAGS.vocab_file = path + 'vocab.korean_morp.list' 

In [10]:
import pandas as pd

dacon_path = '../dacon문자스미싱/filedown (2)/'
df_train = pd.read_csv(dacon_path + 'train.csv')
df_test = pd.read_csv(dacon_path + 'public_test.csv')

In [11]:
from sklearn.model_selection import train_test_split

df_train = df_train.set_index('id')
df_test = df_test.set_index('id')

X_train, X_valid, y_train, y_valid = train_test_split(
                 df_train[[col for col in df_train.columns if col != 'smishing']], 
                 df_train['smishing'],
                 random_state=42, test_size=.2,
                 stratify=df_train['smishing'])
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((236756, 2), (59189, 2), (236756,), (59189,))

In [12]:
df_train = pd.concat((X_train, y_train), axis=1)
df_valid = pd.concat((X_valid, y_valid), axis=1)

# sample 100개씩 뽑아서 미리 test
df_train.to_csv(dacon_path + 'train.tsv', sep='\t')
df_valid.to_csv(dacon_path + 'dev.tsv', sep='\t')
df_test.to_csv(dacon_path + 'test.tsv', sep='\t')

In [13]:
# DataProcessor 제작
class SmishingProcessor(DataProcessor):

    def get_train_examples(self, data_dir, filename='train.tsv'):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, filename)), "train")

    def get_dev_examples(self, data_dir, filename='dev.tsv'):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, filename)), "dev")

    def get_test_examples(self, data_dir, filename='test.tsv'):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, filename)), "test")

    def get_labels(self):
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, i)
            text_a = convert_to_unicode(line[2])
            if set_type == "test":
                label = "0"
            else:
                label = convert_to_unicode(line[-1])
            examples.append(
                InputExample(guid=guid, text_a=text_a, label=label))
        return examples

In [14]:
processor = SmishingProcessor()
label_list = processor.get_labels()

# get train samples
train_examples = processor.get_train_examples(dacon_path, 'train.tsv')
num_train_steps = int(
    len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

In [15]:
# record ETRI model weights
FLAGS.init_checkpoint = path + 'model.ckpt'

In [16]:
model_fn = model_fn_builder(
    bert_config=bert_config,
    num_labels=len(label_list),             # 2
    init_checkpoint=FLAGS.init_checkpoint,  # None
    learning_rate=FLAGS.learning_rate,      # 5e-05
    num_train_steps=num_train_steps,        # 22195
    num_warmup_steps=num_warmup_steps,      # 2219
    use_tpu=FLAGS.use_tpu,                  # False
    use_one_hot_embeddings=FLAGS.use_tpu)   # False

# If TPU is not available, this will fall back to normal Estimator on CPU
# or GPU
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=FLAGS.use_tpu,                        # False
    model_fn=model_fn,
    config=run_config,
    train_batch_size=FLAGS.train_batch_size,      # 32
    eval_batch_size=FLAGS.eval_batch_size,        # 8
    predict_batch_size=FLAGS.predict_batch_size   # 8
)

INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\jinma\\AppData\\Local\\Temp\\tmp984ihjm8', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001B05F483D30>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, nu

In [17]:
FLAGS.output_dir = './output_dir/smishing/'
tf.gfile.MakeDirs(FLAGS.output_dir)

In [18]:
train_file = os.path.join(FLAGS.output_dir, 'train_nonspacing.tf_record')

In [19]:
from getpass import getpass

openapi_key = getpass(prompt='Password: ', stream=None)

In [20]:
tokenizer = BERTTokenizer(FLAGS.vocab_file, do_lower_case=True)

In [None]:
file_based_convert_examples_to_features(train_examples,
                                        label_list,
                                        FLAGS.max_seq_length,
                                        tokenizer,
                                        openapi_key,
                                        train_file)



INFO:tensorflow:Writing example 0 of 236756
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: train-1
INFO:tensorflow:tokens: [CLS] X X X/SL_ 고객/NNG_ 님/XSN_ ./SF_ X X X/SL_ [SEP]
INFO:tensorflow:input_ids: 2 3047 3047 1496 1291 1123 7 3047 3047 1496 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0