<a href="https://colab.research.google.com/github/lclazx/nlp_learning/blob/master/classify_text_with_tf_hub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 引入资源和设计辅助类、函数

In [1]:
import tensorflow as tf
import tensorflow_hub as hub

In [0]:
tf.__version__

'1.15.0'

In [0]:
class InputFeature(object):
  def __init__(self,
         input_ids,
         input_mask,
         segment_ids,
         label_ids,
         is_real_example=True):
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.label_ids = label_ids
    self.is_real_example = is_real_example


In [0]:
def input_fn_builder(features, seq_length, is_training, num_labels, drop_remainder):
  all_input_ids = []
  all_input_mask = []
  all_segment_ids = []
  all_label_ids = []

  for feature in features:
    all_input_ids.append(feature.input_ids)
    all_input_mask.append(feature.input_mask)
    all_segment_ids.append(feature.segment_ids)
    all_label_ids.append(feature.label_ids)

    def input_fn(params):
      batch_size = params['batch_size']
      num_examples = len(features)
      d = tf.data.Dataset.from_tensor_slices({
          'input_ids': 
            tf.constant(
                all_input_ids, shape=[num_examples, seq_length],
                dtype=tf.int32),
          'input_mask':
            tf.constant(all_input_mask, shape=[num_examples, seq_length], dtype=tf.int32),
          'segment_ids':
            tf.constant(all_segment_ids, shape=[num_examples, seq_length], dtype=tf.int32),
          'label_ids':
            tf.constant(all_label_ids, shape=[num_examples, num_labels], dtype=tf.int32)
      })

      if is_training:
        d = d.repeat()
        d = d.shuffle(buffer_size=100)
      d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)
      return d
    return input_fn

In [0]:
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
  label_map = {}
  # for(i, label) in enumrate(label_list):
  #   label_map[label] = id
  tokens_a = tokenizer.tokenize(example.text_a)
  tokens_b = None

  if len(tokens_a) > max_seq_length - 2:
    tokens_a = tokens_a[0: (max_seq_length - 2)]
  tokens = []
  segment_ids = []
  tokens.append('[CLS]')
  segment_ids.append(0)
  for token in tokens_a:
    tokens.append(token)
    segment_ids.append(0)
  tokens.append('[SEP]')
  segment_ids.append(0)

  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  input_mask = [1] * len(input_ids)

  while len(input_ids) < max_seq_length:
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)

  assert len(input_ids) == max_seq_length
  assert len(input_mask) == max_seq_length
  assert len(segment_ids) == max_seq_length

  label_ids = [0] * len(label_list)
  for label in example.label:
    label_ids[label] = 1

  feature = InputFeature(input_ids=input_ids, input_mask = input_mask, segment_ids = segment_ids, label_ids = label_ids, is_real_example=True)
  return feature

def conver_examples_to_features(examples, label_list, max_seq_length, tokenizer):
  features=[]
  for(ex_index, example) in enumerate(examples):
    if ex_index%10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
    feature = convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer)
    features.append(feature)
  return features

    

# 步骤1


## 引用 google cloud service 包， 并下载文件

In [0]:
from google.colab import auth
auth.authenticate_user()
from googleapiclient.discovery import build
gcs_service = build('storage', 'v1')

## 创建输出目录

In [0]:

OUTPUT_DIR = 'training_output' #@param
DO_DELETE =  False#@param
USE_BUCKET = True #@param
BUCKET = 'bert_classification'

if USE_BUCKET:
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)
  
if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
    pass
tf.gfile.MakeDirs(OUTPUT_DIR)
  

## 下载文件

In [8]:
from apiclient.http import MediaIoBaseDownload
def download_file(output_dir, source_file):
  with open(output_dir, 'wb') as f:
    request = gcs_service.objects().get_media(bucket=BUCKET, object=source_file)
    media = MediaIoBaseDownload(f, request)
    done = False
    while not done:
      _, done = media.next_chunk()

download_file(output_dir='/tmp/train_data.json', source_file='raw_data/train_data.json/train_data.json')
download_file(output_dir='/tmp/test_data.json', source_file='raw_data/test_data_postag.json/test_data_postag.json')
download_file(output_dir='/tmp/dev_data.json', source_file='raw_data/dev_data.json/dev_data.json')

print('Downloaded')

Downloaded


# 步骤2


## 准备样本

In [0]:
import json
import pandas as pd
def prepare_data(input_file, is_training=True, predicate_list=None, limit=None):
  if not predicate_list:
    predicate_list = []
  data = {
      'text':[],
      'labels':[]      
  }
  with open(input_file, 'r') as f:
    for (index, line) in enumerate(f):
      if limit and index>=limit:
        break
      if index % 10000 == 0:
        print('sample {}', index)
      line_data = json.loads(line)
      text = line_data['text']
      data['text'].append(text)
      labels = []
      for spo in line_data['spo_list']:
        predicate = spo['predicate']
        if is_training:
          if predicate_list.count(predicate) == 0:
            predicate_list.append(predicate)
        else:
          if predicate_list.count(predicate) == 0:
            continue

        predicate_index = predicate_list.index(predicate)
        if labels.count(predicate_index) == 0:
          labels.append(predicate_index)
      data['labels'].append(labels)
  return pd.DataFrame.from_dict(data), predicate_list

## 获取训练数据

In [10]:
train_data, predicate_list = prepare_data('/tmp/train_data.json')

sample {} 0
sample {} 10000
sample {} 20000
sample {} 30000
sample {} 40000
sample {} 50000
sample {} 60000
sample {} 70000
sample {} 80000
sample {} 90000
sample {} 100000
sample {} 110000
sample {} 120000
sample {} 130000
sample {} 140000
sample {} 150000
sample {} 160000
sample {} 170000


In [0]:
train_data

# 步骤3


## 引入BERT

In [11]:
!pip install bert-tensorflow

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

Collecting bert-tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/a6/66/7eb4e8b6ea35b7cc54c322c816f976167a43019750279a8473d355800a93/bert_tensorflow-1.0.1-py2.py3-none-any.whl (67kB)
[K     |████▉                           | 10kB 23.9MB/s eta 0:00:01[K     |█████████▊                      | 20kB 1.7MB/s eta 0:00:01[K     |██████████████▋                 | 30kB 2.5MB/s eta 0:00:01[K     |███████████████████▍            | 40kB 1.7MB/s eta 0:00:01[K     |████████████████████████▎       | 51kB 2.1MB/s eta 0:00:01[K     |█████████████████████████████▏  | 61kB 2.5MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 2.3MB/s 
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.1



## 加载BERT hub

In [12]:
BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_chinese_L-12_H-768_A-12/1'
TRAINABLE = True

def create_tokenizer_from_hub_module():
  with tf.Graph().as_default():
    bert_module = hub.Module(BERT_MODEL_HUB)
    tokenization_info = bert_module(signature='tokenization_info', as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info['vocab_file'], tokenization_info['do_lower_case']])
    return bert.tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore








In [0]:
tokenizer.tokenize("《宝贝》张悬这首歌也是小暖自己从孕期就开始一直循环听的，俏皮好听")

In [13]:
MAX_SEQ_LENGTH=256
predicate_list_map = [0]*len(predicate_list)
for (index, predicate) in enumerate(predicate_list):
  predicate_list_map[index] = index

train_input_examples = train_data.apply(lambda x: bert.run_classifier.InputExample(guid=None, text_a=x['text'], text_b=None, label=x['labels']), axis=1)
train_input_features = conver_examples_to_features(train_input_examples, predicate_list_map, MAX_SEQ_LENGTH, tokenizer)


INFO:tensorflow:Writing example 0 of 173108


INFO:tensorflow:Writing example 0 of 173108


INFO:tensorflow:Writing example 10000 of 173108


INFO:tensorflow:Writing example 10000 of 173108


INFO:tensorflow:Writing example 20000 of 173108


INFO:tensorflow:Writing example 20000 of 173108


INFO:tensorflow:Writing example 30000 of 173108


INFO:tensorflow:Writing example 30000 of 173108


INFO:tensorflow:Writing example 40000 of 173108


INFO:tensorflow:Writing example 40000 of 173108


INFO:tensorflow:Writing example 50000 of 173108


INFO:tensorflow:Writing example 50000 of 173108


INFO:tensorflow:Writing example 60000 of 173108


INFO:tensorflow:Writing example 60000 of 173108


INFO:tensorflow:Writing example 70000 of 173108


INFO:tensorflow:Writing example 70000 of 173108


INFO:tensorflow:Writing example 80000 of 173108


INFO:tensorflow:Writing example 80000 of 173108


INFO:tensorflow:Writing example 90000 of 173108


INFO:tensorflow:Writing example 90000 of 173108


INFO:tensorflow:Writing example 100000 of 173108


INFO:tensorflow:Writing example 100000 of 173108


INFO:tensorflow:Writing example 110000 of 173108


INFO:tensorflow:Writing example 110000 of 173108


INFO:tensorflow:Writing example 120000 of 173108


INFO:tensorflow:Writing example 120000 of 173108


INFO:tensorflow:Writing example 130000 of 173108


INFO:tensorflow:Writing example 130000 of 173108


INFO:tensorflow:Writing example 140000 of 173108


INFO:tensorflow:Writing example 140000 of 173108


INFO:tensorflow:Writing example 150000 of 173108


INFO:tensorflow:Writing example 150000 of 173108


INFO:tensorflow:Writing example 160000 of 173108


INFO:tensorflow:Writing example 160000 of 173108


INFO:tensorflow:Writing example 170000 of 173108


INFO:tensorflow:Writing example 170000 of 173108


In [0]:
train_input_features[0].label_ids

# 步骤4



## 创建model_fn_builder


In [0]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels, num_labels):
  bert_module = hub.Module(BERT_MODEL_HUB, trainable=False)
  bert_input = dict(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids)
  bert_output = bert_module(inputs=bert_input, signature='tokens', as_dict=True)
  print(tf.get_variable_scope().name)

  output_layer = bert_output['pooled_output']
  hidden_size = output_layer.shape[-1].value
  print(input_ids)
  print([num_labels, hidden_size])
  print(num_labels)
 
  output_weights = tf.get_variable('output_weights', [num_labels, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02))  
  output_bias = tf.get_variable('output_bias', [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope('loss'):
    print(output_weights)

    # if not is_predicting:
    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.sigmoid(logits)
    label_ids = tf.cast(labels, tf.float32)
    per_example_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=label_ids), axis=-1)
    loss = tf.reduce_mean(per_example_loss)

  return loss, per_example_loss, logits, log_probs
     

In [0]:
def model_fn_builder(num_labels, learning_rate, num_train_steps, num_warmup_steps):
  def model_fn(features, labels, mode, params):
    input_ids = features['input_ids']
    input_mask = features['input_mask']
    segment_ids = features['segment_ids']
    label_ids = features['label_ids']
    is_predicting = (mode==tf.estimator.ModeKeys.PREDICT)
    (loss, per_example_loss, logits, log_probs) = create_model(
        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

    if not is_predicting:
      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu = False)
      def metric_fn(loss, label_ids, log_probs, per_example_loss):
        predicted_ids = tf.cast(log_probs > 0.5, tf.int32)
        label_ids = tf.cast(label_ids, tf.int32)
        elements_equal = tf.cast(tf.equal(predicted_ids, label_ids), tf.int32)
        row_predict_ids = tf.reduce_sum(elements_equal, -1)
        row_label_ids = tf.reduce_sum(tf.ones_like(label_ids), -1)
        accuracy = tf.metrics.accuracy(labels=row_label_ids, predictions=row_predict_ids)
        loss = tf.metrics.mean(values=per_example_loss)
        return{
            'eval_accuracy': accuracy,
            'eval_loss': loss
        }
      if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(mode = mode, loss = loss, train_op = train_op)
      else:
        eval_metrics = metric_fn(loss, label_ids, log_probs, per_example_loss)
        return tf.estimator.EstimatorSpec(mode = mode, loss = loss, eval_metric_ops=eval_metrics)
    else:
      predictions = {
          'logits': logits,
          'probabilities': log_probs,
      }
      return tf.estimator.EstimatorSpec(mode = mode, predictions=predictions)

  return model_fn


# 步骤5


## 训练

In [0]:
BATCH_SIZE=16
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
WARMUP_PROPORTION = 0.1
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

In [0]:
num_train_steps = int(len(train_input_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)


In [0]:
run_config = tf.estimator.RunConfig(
    model_dir = OUTPUT_DIR,
    save_summary_steps = SAVE_SUMMARY_STEPS,
    save_checkpoints_steps = SAVE_CHECKPOINTS_STEPS
)

In [19]:
model_fn = model_fn_builder(
    num_labels = len(predicate_list),
    learning_rate = LEARNING_RATE,
    num_train_steps = num_train_steps,
    num_warmup_steps = num_warmup_steps
)

estimator = tf.estimator.Estimator(
    model_fn = model_fn,
    config = run_config,
    params = {'batch_size': BATCH_SIZE}
)

INFO:tensorflow:Using config: {'_model_dir': 'gs://bert_classification/training_output', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fcbea6b6080>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': 'gs://bert_classification/training_output', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fcbea6b6080>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [0]:
train_input_fn = input_fn_builder(
    features = train_input_features,
    seq_length = MAX_SEQ_LENGTH,
    is_training = True,
    num_labels = len(predicate_list),
    drop_remainder = False
)

In [0]:
from datetime import datetime
print(f'Begining Training!')
current_time = datetime.now()
estimator.train(input_fn = train_input_fn, max_steps=num_train_steps)
print('Training took time', datetime.now() - current_time)


# 步骤 6

## 测试

In [0]:
!head /tmp/dev_data.json

In [28]:

test_data, _ = prepare_data('/tmp/dev_data.json', is_training=False, predicate_list=predicate_list, limit=3)
print(test_data)
test_input_examples = test_data.apply(lambda x: bert.run_classifier.InputExample(guid=None, text_a=x['text'], text_b=None, label=x['labels']), axis=1)
test_input_features = conver_examples_to_features(test_input_examples, predicate_list, MAX_SEQ_LENGTH, tokenizer)
test_input_fn = input_fn_builder(
    features = test_input_features,
    seq_length = MAX_SEQ_LENGTH,
    is_training = False,
    num_labels = len(predicate_list),
    drop_remainder = False)
estimator.evaluate(input_fn=test_input_fn, steps=None)

sample {} 0
                                                text    labels
0  查尔斯·阿兰基斯（Charles Aránguiz），1989年4月17日出生于智利圣地亚哥...    [9, 3]
1                                      《离开》是由张宇谱曲，演唱   [7, 22]
2  《愤怒的唐僧》由北京吴意波影视文化工作室与优酷电视剧频道联合制作，故事以喜剧元素为主，讲述唐...  [18, 10]
INFO:tensorflow:Writing example 0 of 3


INFO:tensorflow:Writing example 0 of 3


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore



Tensor("IteratorGetNext:0", shape=(?, 256), dtype=int32)
[49, 768]
49
<tf.Variable 'output_weights:0' shape=(49, 768) dtype=float32_ref>


















INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2019-12-09T02:58:58Z


INFO:tensorflow:Starting evaluation at 2019-12-09T02:58:58Z


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from gs://bert_classification/training_output/model.ckpt-5625


INFO:tensorflow:Restoring parameters from gs://bert_classification/training_output/model.ckpt-5625


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Finished evaluation at 2019-12-09-02:59:47


INFO:tensorflow:Finished evaluation at 2019-12-09-02:59:47


INFO:tensorflow:Saving dict for global step 5625: eval_accuracy = 0.6666667, eval_loss = 4.109061, global_step = 5625, loss = 4.109061


INFO:tensorflow:Saving dict for global step 5625: eval_accuracy = 0.6666667, eval_loss = 4.109061, global_step = 5625, loss = 4.109061


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5625: gs://bert_classification/training_output/model.ckpt-5625


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5625: gs://bert_classification/training_output/model.ckpt-5625


{'eval_accuracy': 0.6666667,
 'eval_loss': 4.109061,
 'global_step': 5625,
 'loss': 4.109061}

# 步骤7

## 预测

In [0]:
def get_prediction(in_sentences):
  input_examples = [bert.run_classifier.InputExample(guid="", text_a = x,
                                                     text_b=None, label=[]) for x in in_sentences]

  input_features = conver_examples_to_features(input_examples, predicate_list_map,MAX_SEQ_LENGTH,
                                                tokenizer)
  predict_input_fn = input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH,
                                      is_training=False, num_labels=len(predicate_list), drop_remainder=False)
  predictions = estimator.predict(predict_input_fn)
  return [(sentence, prediction['probabilities'], prediction['logits']) 
          for sentence, prediction in zip(in_sentences, predictions)]
                                                  

In [0]:
predictions = get_prediction([
                             
                              '《愤怒的唐僧》由北京吴意波影视文化工作室与优酷电视剧频道联合制作，故事以喜剧元素为主，讲述唐僧与佛祖打牌，得罪了佛祖，被踢下人间再渡九九八十一难的故事',
                              '李治即位后，萧淑妃受宠，王皇后为了排挤萧淑妃，答应李治让身在感业寺的武则天续起头发，重新纳入后宫',
                              ])
            
# get_prediction([])

In [0]:
predictions

In [25]:
labels = []
for text, probilities, logits in predictions:
  label=[]
  for index,prob in enumerate(probilities):
    if prob>0.3:
      label.append(predicate_list[index])
  labels.append(label)

labels

[['海拔', '创始人', '祖籍']]

In [32]:
in_sentences = [
                             
                              '《愤怒的唐僧》由北京吴意波影视文化工作室与优酷电视剧频道联合制作，故事以喜剧元素为主，讲述唐僧与佛祖打牌，得罪了佛祖，被踢下人间再渡九九八十一难的故事',
                              '李治即位后，萧淑妃受宠，王皇后为了排挤萧淑妃，答应李治让身在感业寺的武则天续起头发，重新纳入后宫',
                              ]

input_examples = [bert.run_classifier.InputExample(guid="", text_a = x,
                                            text_b=None, label=[]) for x in in_sentences]

input_features = conver_examples_to_features(input_examples, predicate_list_map,MAX_SEQ_LENGTH,
                                      tokenizer)
predict_input_fn = input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH,
                            is_training=False, num_labels=len(predicate_list), drop_remainder=False)
predictions = estimator.predict(predict_input_fn)


INFO:tensorflow:Writing example 0 of 2


INFO:tensorflow:Writing example 0 of 2


In [46]:
#[var for var in tf.global_variables() ]
estimator.get_variable_value('output_wegihts')

NotFoundError: ignored