# classifier/match app

In [1]:
import collections
import csv
import os
import modeling
import optimization
import tokenization
import tensorflow as tf

In [2]:
from run_classifier import *

In [5]:
class DataProcessor(object):
  @classmethod
  def _read_tsv(cls, input_file, quotechar=None):
    """Reads a tab separated value file."""
    with tf.gfile.Open(input_file, "r") as f:
      reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
      lines = []
      for line in reader:
        lines.append(line)
      return lines

class ZaProcessor(DataProcessor):
  def get_train_examples(self, data_dir):
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "kd_train.csv")), "train")

  def get_dev_examples(self, data_dir):
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "kd_dev.csv")), "dev")

  def get_labels(self):
    return ["0", "1"]

  def _create_examples(self, lines, set_type):
    examples = []
    for (i, line) in enumerate(lines):
      guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(str(i)))
      text_a = tokenization.convert_to_unicode(line[0])
      text_b = tokenization.convert_to_unicode(line[1])
      label = tokenization.convert_to_unicode(line[-1])
      examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

class ZaIrProcessor(DataProcessor):
  def get_train_examples(self, data_dir):
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "ir_train.csv")), "train")

  def get_dev_examples(self, data_dir):
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "ir_dev.csv")), "dev")

  def get_labels(self):
    return [str(j) for j in range(1,6)]

  def _create_examples(self, lines, set_type):
    examples = []
    for (i, line) in enumerate(lines):
      guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(str(i)))
      text_a = tokenization.convert_to_unicode(line[0])
      text_b = None
      label = tokenization.convert_to_unicode(line[-1])
      examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

processors = {"za": ZaProcessor, "za_ir": ZaIrProcessor}

In [15]:
def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,num_train_steps, num_warmup_steps, 
                     use_tpu, use_one_hot_embeddings):
  """Returns `model_fn` closure for TPUEstimator."""

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    is_real_example = None
    if "is_real_example" in features:
      is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
    else:
      is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    (total_loss, per_example_loss, logits, probabilities) = create_model(
        bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
        num_labels, use_one_hot_embeddings)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      if use_tpu:

        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
      else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:

      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:
      def metric_fn(per_example_loss, label_ids, logits, is_real_example):
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        accuracy = tf.metrics.accuracy(labels=label_ids, predictions=predictions, weights=is_real_example)
        loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
        return {"eval_accuracy": accuracy, "eval_loss": loss}
#         return {"eval_accuracy": accuracy, "eval_loss": loss, "prob": logits, "pred": predictions, "y": label_ids}

      eval_metrics = (metric_fn, [per_example_loss, label_ids, logits, is_real_example])
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=eval_metrics,
          scaffold_fn=scaffold_fn)
    else:
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          predictions={"probabilities": probabilities},
          scaffold_fn=scaffold_fn)
    return output_spec

  return model_fn

In [16]:
do_lower_case = True
max_seq_length = 32
learning_rate = 0.001
num_train_epochs = 3
train_batch_size, eval_batch_size, predict_batch_size = 128, 8, 8

warmup_proportion = 0.1
save_checkpoints_steps, iterations_per_loop = 1000, 1000
use_tpu = False
tpu_name, tpu_zone, gcp_project, master = None, None, None, None
num_tpu_cores =8
tf.logging.set_verbosity(tf.logging.ERROR)

In [17]:
def app(task_name, init_checkpoint, bert_config_file, vocab_file, data_dir, output_dir="model/tmp"):
  tokenization.validate_case_matches_checkpoint(do_lower_case, init_checkpoint)
  bert_config = modeling.BertConfig.from_json_file(bert_config_file)
  if max_seq_length > bert_config.max_position_embeddings:
    raise ValueError("Cannot use sequence length %d because the BERT model was only trained up to sequence length %d" %
                                   (max_seq_length, bert_config.max_position_embeddings))
    
  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))
  processor = processors[task_name]()
  label_list = processor.get_labels()
  print(">>> labels: ", label_list)
  tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(cluster=None, master=None, model_dir=output_dir, save_checkpoints_steps=save_checkpoints_steps,
                                        tpu_config=tf.contrib.tpu.TPUConfig(
                                            iterations_per_loop=iterations_per_loop, num_shards=num_tpu_cores, 
                                            per_host_input_for_training=is_per_host)
                                       )

  train_examples, num_train_steps,  num_warmup_steps = None, None, None
  train_examples = processor.get_train_examples(data_dir)
  num_train_steps = int(len(train_examples) / train_batch_size * num_train_epochs)
  num_warmup_steps = int(num_train_steps * warmup_proportion)
  print(">>>labels cnt: ", len(label_list))

  model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list), init_checkpoint=init_checkpoint,
                              learning_rate=learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps,
                              use_tpu=use_tpu, use_one_hot_embeddings=use_tpu)

  estimator = tf.contrib.tpu.TPUEstimator(use_tpu=use_tpu, model_fn=model_fn, config=run_config,
                                          train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, 
                                          predict_batch_size=predict_batch_size)

  if True:
    train_file = os.path.join("data/", "tmp_train.tf_record")
    file_based_convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer, train_file)
    print("***** Running training *****")
    print("  Num examples = %d", len(train_examples))
    print("  Batch size = %d", train_batch_size)
    print("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(input_file=train_file, seq_length=max_seq_length,is_training=True, drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    print(">>> train finish")

  if True:
    eval_examples = processor.get_dev_examples(data_dir)
    eval_file = os.path.join("data/", "tmp_eval.tf_record")
    num_actual_eval_examples = len(eval_examples)
    file_based_convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer, eval_file)
    print("***** Running evaluation *****")
    print("  Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples,
          len(eval_examples) - num_actual_eval_examples)
    print("  Batch size = %d", eval_batch_size)
    eval_input_fn = file_based_input_fn_builder(input_file=eval_file, seq_length=max_seq_length,is_training=False, drop_remainder=False)
    print(">>> eval result output")
    result = estimator.evaluate(input_fn=eval_input_fn, steps=None)
    pred = estimator.predict(input_fn=eval_input_fn)
    print("***** Eval results *****")
    print(result)
#     for key in sorted(result.keys()):
#         print("  %s = %s", key, str(result[key]))
    return pred

In [18]:
res_check = app(task_name="za", data_dir="data/za_data",
                init_checkpoint="model/chinese_L-12_H-768_A-12/bert_model.ckpt", 
                bert_config_file="model/chinese_L-12_H-768_A-12/bert_config.json", 
                vocab_file="model/chinese_L-12_H-768_A-12/vocab.txt")

>>> labels:  ['0', '1']
>>>labels cnt:  2
***** Running training *****
  Num examples = %d 20591
  Batch size = %d 128
  Num steps = %d 482
>>> train finish
***** Running evaluation *****
  Num examples = %d (%d actual, %d padding) 5148 5148 0
  Batch size = %d 8
>>> eval result output
***** Eval results *****
{'eval_accuracy': 0.42288268, 'eval_loss': 3.3276796, 'loss': 3.3273385, 'global_step': 482}


In [20]:
tmp_res = [r for r in res_check]

In [24]:
import pandas as pd
dfk = pd.DataFrame([list(r['probabilities']) for r in tmp_res])

In [27]:
(dfk[1] > 0.9).mean()

1.0

## 所有的pair都判断为相似，单边倒

In [10]:
res_check

{'eval_accuracy': 0.42288268,
 'eval_loss': 3.3276796,
 'loss': 3.3273385,
 'global_step': 482}

In [9]:
app(task_name="za", 
    data_dir="data/za_data",
    init_checkpoint="model/chinese_wwm_ext_L-12_H-768_A-12/bert_model.ckpt", 
    bert_config_file="model/chinese_wwm_ext_L-12_H-768_A-12/bert_config.json", 
    vocab_file="model/chinese_wwm_ext_L-12_H-768_A-12/vocab.txt")

>>> labels:  ['0', '1']
>>>labels cnt:  2
***** Running training *****
  Num examples = %d 20591
  Batch size = %d 128
  Num steps = %d 482
>>> train finish
***** Running evaluation *****
  Num examples = %d (%d actual, %d padding) 5148 5148 0
  Batch size = %d 8
>>> output shape:  (?, 2)
>>> output prob shape:  (?,)
>>> eval result output
***** Eval results *****
  %s = %s eval_accuracy 0.42288268
  %s = %s eval_loss 3.3276796
  %s = %s global_step 482
  %s = %s loss 3.3273385


In [10]:
app(task_name="za", 
    data_dir="data/za_data",
    init_checkpoint="result/bert_za201908_max/model.ckpt-300000", 
    bert_config_file="model/chinese_wwm_ext_L-12_H-768_A-12/bert_config.json", 
    vocab_file="model/chinese_wwm_ext_L-12_H-768_A-12/vocab.txt")

>>> labels:  ['0', '1']
>>>labels cnt:  2
***** Running training *****
  Num examples = %d 20591
  Batch size = %d 128
  Num steps = %d 482
>>> train finish
***** Running evaluation *****
  Num examples = %d (%d actual, %d padding) 5148 5148 0
  Batch size = %d 8
>>> output shape:  (?, 2)
>>> output prob shape:  (?,)
>>> eval result output
***** Eval results *****
  %s = %s eval_accuracy 0.42288268
  %s = %s eval_loss 3.3276796
  %s = %s global_step 482
  %s = %s loss 3.3273385


## steps of BERT apply
1. update params with flag
2. append some data processor
3. run

```
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
export GLUE_DIR=/path/to/glue
```
```python
python run_classifier.py \
  --task_name=MRPC \
  --do_train=true \
  --do_eval=true \
  --data_dir=$GLUE_DIR/MRPC \
  --vocab_file=$BERT_BASE_DIR/vocab.txt \
  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
  --max_seq_length=128 \
  --train_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=3.0 \
  --output_dir=/tmp/mrpc_output/
```

In [2]:
from run_classifier import *

In [8]:
tf.app.run(main=main, argv=['run_classifier.py', '--task_name', 'za', 
                           '--do_train', 'true',
                           '--do_eval', 'true',
                           '--data_dir', 'data/za_data',
                           '--vocab_file', 'model/chinese_L-12_H-768_A-12/vocab.txt',
                           '--bert_config_file', 'model/chinese_L-12_H-768_A-12/bert_config.json',
                           '--init_checkpoint', 'model/chinese_L-12_H-768_A-12/bert_model.ckpt',
                           '--max_seq_length', '20',
                           '--train_batch_size', '32',
                           '--learning_rate', '2e-5', 
                           '--num_train_epochs', '3.0',
                           '--output_dir', 'result/za_output01/'])

ValueError: Task not found: za

In [19]:
%%!
python run_classifier.py \
  --task_name=za \
  --do_train=true \
  --do_eval=true \
  --data_dir=data/za_data \
  --vocab_file=model/chinese_wwm_ext_L-12_H-768_A-12/vocab.txt \
  --bert_config_file=model/chinese_wwm_ext_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=model/chinese_wwm_ext_L-12_H-768_A-12/bert_model.ckpt \
  --max_seq_length=20 \
  --train_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=3.0 \
  --output_dir=result/za_output/

 "INFO:tensorflow:Using config: {'_model_dir': 'result/za_output/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f74c1190f60>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, computation_shape=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None), '_cluster': None}",
 'INFO:tensorflow:_TPUContext: eval_on_tpu True',
 'INFO:tensorflow:Writing example 0 of 20591',
 'INFO:tensorflow:*** Example ***',
 'INFO:tensorflow:guid: trai

In [None]:
python run_classifier.py \
  --task_name=za_ir \
  --do_train=true \
  --do_eval=true \
  --data_dir=data/za_data \
  --vocab_file=model/chinese_wwm_ext_L-12_H-768_A-12/vocab.txt \
  --bert_config_file=model/chinese_wwm_ext_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=model/chinese_wwm_ext_L-12_H-768_A-12/bert_model.ckpt \
  --max_seq_length=32 \
  --train_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=3.0 \
  --output_dir=result/za_output/

In [20]:
%%!
python run_classifier.py \
  --task_name=za \
  --do_train=true \
  --do_eval=true \
  --data_dir=data/za_data \
  --vocab_file=model/chinese_wwm_ext_L-12_H-768_A-12/vocab.txt \
  --bert_config_file=model/chinese_wwm_ext_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=model/chinese_wwm_ext_L-12_H-768_A-12/bert_model.ckpt \
  --max_seq_length=20 \
  --train_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=12.0 \
  --output_dir=result/za_output/

 "INFO:tensorflow:Using config: {'_model_dir': 'result/za_output/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fcf00110ef0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, computation_shape=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None), '_cluster': None}",
 'INFO:tensorflow:_TPUContext: eval_on_tpu True',
 'INFO:tensorflow:Writing example 0 of 20591',
 'INFO:tensorflow:*** Example ***',
 'INFO:tensorflow:guid: trai

In [1]:
%%!
python run_classifier.py \
  --task_name=za \
  --do_train=true \
  --do_eval=true \
  --data_dir=data/za_data \
  --vocab_file=model/chinese_wwm_ext_L-12_H-768_A-12/vocab.txt \
  --bert_config_file=model/chinese_wwm_ext_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=result/pretraining_output/model.ckpt-20 \
  --max_seq_length=20 \
  --train_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=3.0 \
  --output_dir=result/za_output/

 "INFO:tensorflow:Using config: {'_model_dir': 'result/za_output/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f725f231198>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, computation_shape=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None), '_cluster': None}",
 'INFO:tensorflow:_TPUContext: eval_on_tpu True',
 'INFO:tensorflow:Writing example 0 of 20591',
 'INFO:tensorflow:*** Example ***',
 'INFO:tensorflow:guid: trai

In [2]:
%%!
python run_classifier.py \
  --task_name=za \
  --do_train=true \
  --do_eval=true \
  --data_dir=data/za_data \
  --vocab_file=model/chinese_wwm_ext_L-12_H-768_A-12/vocab.txt \
  --bert_config_file=model/chinese_wwm_ext_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=model/chinese_wwm_ext_L-12_H-768_A-12/bert_model.ckpt \
  --max_seq_length=30 \
  --train_batch_size=128 \
  --learning_rate=2e-5 \
  --num_train_epochs=3.0 \
  --output_dir=result/za_output002/

 "INFO:tensorflow:Using config: {'_model_dir': 'result/za_output002/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f22f6d9d320>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, computation_shape=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None), '_cluster': None}",
 'INFO:tensorflow:_TPUContext: eval_on_tpu True',
 'INFO:tensorflow:Writing example 0 of 20591',
 'INFO:tensorflow:*** Example ***',
 'INFO:tensorflow:guid: t

## train分割为train、test

In [12]:
import pandas as pd
df = pd.read_csv("data/za_data/kd_train.csv_", header=None, sep="\t", names=("a", "b", "y"))

In [13]:
def df_split(out_df):
    test = out_df.sample(frac=0.2, random_state=666)
    train = out_df[~out_df.index.isin(test.index)]
    return train, test

In [14]:
df1, df2 = df_split(df)

In [16]:
df1.y.value_counts(normalize=True), df2.y.value_counts(normalize=True)

(0    0.571463
 1    0.428537
 Name: y, dtype: float64, 0    0.577117
 1    0.422883
 Name: y, dtype: float64)

In [18]:
df1.to_csv("data/za_data/kd_train.csv", index=False, sep="\t", header=False, encoding="utf-8")
df2.to_csv("data/za_data/kd_dev.csv", index=False, sep="\t", header=False, encoding="utf-8")