In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tokenization
import run_classifier
import optimization
import modeling

  from ._conv import register_converters as _register_converters


In [2]:
class inputExample:
    """
    a input example
    """
    def __init__(self, words, label):
        self.token_A = [(w, l) for w, l in zip(words, label)]
        
    def __str__(self):
        index = 0
        for ws, ls in self.token_A:
            print('word_%s is %s with label %s' % (index, ws, ls))
            index += 1
            
    def __len__(self):
        return len(self.token_A)
        

In [3]:
def truncate_tokens(tokens, label_id, max_seq_length):
    index = 0
    while len(tokens) > max_seq_length:
        
        if index ==  len(tokens):
            i = np.random.randint(len(tokens))
            tokens.pop(i)
            label_id.pop(i) 
            index -= 1
        else:
            for i in range(index,len(tokens)):
                index = i+1
                if label_id[i] == 'X':
                    tokens.pop(i)
                    label_id.pop(i)
                    index -= 1
                    break
        

In [25]:
def convert_single_example(inputExample, max_seq_length, tokenizer, ex_index, tagtolabel):
    """
    convert existing tokens to Wordpiece tokens
    """    
    def re_tokenize(list_tokens):    
           
        tokens, labels = [], []
        for token_label in list_tokens:
            token, label = token_label[0], token_label[1]       
            w_token = tokenizer.tokenize(token)
            if len(w_token) == 0:
                w_label = []
            else:
                w_label = [label] + ['X'] * (len(w_token) - 1)
            
            tokens.extend(w_token) 
            labels.extend(w_label)
            
        
        assert len(tokens) == len(labels)
        return tokens, labels
    
    tokens_A, labels_A = re_tokenize(inputExample.token_A)
    truncate_tokens(tokens_A, labels_A, max_seq_length -2)
        
    word_piece_token = ['[CLS]']
    label_id = ['X']    
    
    word_piece_token.extend(tokens_A)
    label_id.extend(labels_A)
    
    word_piece_token.append('[SEP]')
    label_id.append('X')
    
    segment_ids = [0] * len(word_piece_token)

    input_ids = tokenizer.convert_tokens_to_ids(word_piece_token)
    input_mask = [1] * len(input_ids)
    
    # Zero-pad up to the sequence length.   
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_id.append('X')

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_id) == max_seq_length

    if ex_index % 3000 == 0:
        tf.logging.info("*** Example ***")
        tf.logging.info("tokens: %s" % " ".join([tokenization.printable_text(x) for x in word_piece_token]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label: %s" % label_id)
    
    labels = []
    for l in label_id:
        labels.append(-1) if l == 'X'else labels.append(tagtolabel[l])
    
    feature = run_classifier.InputFeatures(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids,
      label_id=labels,
      is_real_example=True)
    
    return feature

In [20]:
def convert_examples_to_features(examples, max_seq_length, tokenizer, tagstolabel):

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
            
        feature = convert_single_example(example, max_seq_length, tokenizer, ex_index, tagstolabel)
        features.append(feature)
    return features

In [7]:
def model_fn_builder(bert_config, num_labels, learning_rate,
                     num_train_steps, num_warmup_steps, use_one_hot_embeddings):

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        (total_loss, per_example_loss, logits, probabilities) = create_model(
            bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
            num_labels, use_one_hot_embeddings)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:

            train_op = optimization.create_optimizer(
                  total_loss, learning_rate, num_train_steps, num_warmup_steps, False)
            
            output_spec = tf.estimator.EstimatorSpec(mode, loss = total_loss, train_op = train_op)
            
        elif mode == tf.estimator.ModeKeys.EVAL:
            
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            accuracy = tf.metrics.accuracy(labels=label_ids, predictions=predictions)
            loss = tf.metrics.mean(values=per_example_loss)
        
            eval_metrics = {"eval_accuracy": accuracy, "eval_loss": loss}
            
            output_spec = tf.estimator.EstimatorSpec(mode, loss = total_loss, 
                                                     eval_metric_ops = eval_metrics)
        else:
            output_spec = tf.estimator.EstimatorSpec(mode,  
                                                     predictions={"probabilities": probabilities})
            
        return output_spec

    return model_fn

In [8]:
def input_fn_builder(features, seq_length, is_training):

    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    all_label_ids = []

    for feature in features:
        all_input_ids.append(feature.input_ids)
        all_input_mask.append(feature.input_mask)
        all_segment_ids.append(feature.segment_ids)
        all_label_ids.append(feature.label_id)

    def input_fn(params):

        batch_size = params["batch_size"]

        num_examples = len(features)

        d = tf.data.Dataset.from_tensor_slices({
            "input_ids":
                tf.constant(
                    all_input_ids, shape=[num_examples, seq_length],
                    dtype=tf.int32),
            "input_mask":
                tf.constant(
                    all_input_mask,
                    shape=[num_examples, seq_length],
                    dtype=tf.int32),
            "segment_ids":
                tf.constant(
                    all_segment_ids,
                    shape=[num_examples, seq_length],
                    dtype=tf.int32),
            "label_ids":
                tf.constant(all_label_ids, shape=[num_examples, seq_length], dtype=tf.int32),
        })

        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=100)

        d = d.batch(batch_size=batch_size)
        return d

    return input_fn

In [9]:
vocan_file = 'bert_model/vocab.txt'

In [10]:
tokenizer = tokenization.FullTokenizer(
vocab_file=vocan_file, do_lower_case=True)

In [11]:
data_path = 'data/ner.csv.zip'
dataset_path = 'data/ner_dataset.csv.zip'
data = pd.read_csv(dataset_path, encoding = 'latin')

In [12]:
tags = data['Tag'].unique()
tagstolabel = {t:i for t, i in zip(tags, range(len(tags)))}

In [13]:
list_tokens = []
tokens, lables = [data.loc[0,'Word']], [data.loc[0,'Tag']]
pre_sentence_no = data.loc[0,'Sentence #'].split(' ')

for index, row in data[1:].iterrows():
    sentence_index = row['Sentence #']
    word = row['Word']
    tag = row['Tag']
    
    if isinstance(sentence_index, str):
        sentence_no = sentence_index.split(' ')[1] 
        inputexmplae = inputExample(tokens, lables)
        list_tokens.append(inputexmplae)
        tokens, lables = [word], [tag]

    else:
        tokens.append(word)
        lables.append(tag)
        

In [14]:
del data

In [15]:
len_tokens = [len(example.token_A) for example in list_tokens]
print(np.mean(len_tokens))
print(np.max(len_tokens))

21.864277075774638
104


In [26]:
max_seq_length = 50
features = convert_examples_to_features(list_tokens, max_seq_length, tokenizer, tagstolabel)

INFO:tensorflow:Writing example 0 of 47958
INFO:tensorflow:*** Example ***
INFO:tensorflow:tokens: [CLS] thousands of demonstrators have marched through london to protest the war in iraq and demand the withdrawal of british troops from that country . [SEP]
INFO:tensorflow:input_ids: 101 5190 1997 28337 2031 9847 2083 2414 2000 6186 1996 2162 1999 5712 1998 5157 1996 10534 1997 2329 3629 2013 2008 2406 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:label: ['X', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 

INFO:tensorflow:input_ids: 101 2720 1012 19330 5017 2102 2056 2002 7164 2107 1037 2693 2052 2022 1999 3956 1005 1055 3037 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:label: ['X', 'B-per', 'X', 'I-per', 'X', 'X', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'X', 'O', 'O', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']
INFO:tensorflow:*** Example ***
INFO:tensorflow:tokens: [CLS] later in the day , nine construction workers were killed in a separate roadside bombing in the bar kun ##ar district of kun ##ar province . [SEP]
INFO:tensorflow:input_ids: 101 2101 1999 1996 2154 1010 3157

In [27]:
len(features)

47958

In [28]:
del list_tokens

In [29]:
bert_config = modeling.BertConfig.from_json_file('bert_model/bert_config.json')
num_labels = len(tags)
learning_rate = 0.002
num_train_steps = 50
num_warmup_steps = 0
use_one_hot_embeddings = True

In [27]:
# model_fn = model_fn_builder(bert_config, num_labels, learning_rate,
#                      num_train_steps, num_warmup_steps, use_one_hot_embeddings)
# params = {'batch_size':32}

# estimator = tf.estimator.Estimator(model_fn, params = params)
# is_training = True
# seq_length = max_seq_length

# train_input_fn = input_fn_builder(features, seq_length, is_training)
# estimator.train(train_input_fn, max_steps=50)

In [47]:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):

    model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

    # In the demo, we are doing a simple classification task on the entire
    # segment.  
    #
    # If you want to use the token-level output, use model.get_sequence_output()
    # instead.
    
#     output_layer = model.get_pooled_output()
    output_layer = model.get_sequence_output()
    timesteps = output_layer.shape[1].value

    with tf.variable_scope("loss"):
        if is_training:
          # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
            
        logits = tf.layers.dense(output_layer, units = num_labels)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        
        one_hot_labels = tf.one_hot(labels, depth =num_labels)
           
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        per_example_loss = tf.reduce_sum(per_example_loss, axis = -1)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)

In [109]:
def build_feed_dict(features):

    f_input_ids = []
    f_input_mask = []
    f_segment_ids = []
    f_label_ids = []
    
    for feature in features:
        f_input_ids.append(feature.input_ids)
        f_input_mask.append(feature.input_mask)
        f_segment_ids.append(feature.segment_ids)
        f_label_ids.append(feature.label_id)
    
    feed_dict = {'input_id:0': f_input_ids,
                'input_mask:0': f_input_mask,
                'segment_ids:0': f_segment_ids,
                'label_ids:0': f_label_ids }
    
    return feed_dict
        
    

In [102]:
def buildDataSet(features, is_training, batch_size = 32):
    
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    d = tf.data.Dataset.from_tensor_slices((input_ids,input_mask,segment_ids,label_ids))

    if is_training:
        d = d.shuffle(buffer_size=100)

    d = d.batch(batch_size=batch_size)
    return d

In [103]:
tf.reset_default_graph()
is_training = True
use_one_hot_embeddings = True
epoch = num_train_steps

feature_dict = { 'input_ids':tf.placeholder(dtype = tf.int32, shape = [None, max_seq_length], name = 'input_id'),
                'input_mask':tf.placeholder(dtype = tf.int32, shape = [None, max_seq_length], name = 'input_mask'),
                'segment_ids': tf.placeholder(dtype = tf.int32, shape = [None, max_seq_length], name = 'segment_ids'),
               'label_ids': tf.placeholder(dtype = tf.int32, shape = [None, max_seq_length], name = 'label_ids' )}

train_data = buildDataSet(feature_dict, is_training)

iterator = train_data.make_initializable_iterator()
input_ids, input_mask, segment_ids, label_ids = iterator.get_next()

model = modeling.BertModel(
  config=bert_config,
  is_training=is_training,
  input_ids=input_ids,
  input_mask=input_mask,
  token_type_ids=segment_ids,
  use_one_hot_embeddings=use_one_hot_embeddings)

output_layer = model.get_sequence_output()

if is_training:
    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

logits = tf.layers.dense(output_layer, units = num_labels)
probabilities = tf.nn.softmax(logits, axis=-1)
log_probs = tf.nn.log_softmax(logits, axis=-1)

one_hot_labels = tf.one_hot(label_ids, depth =num_labels)

per_example_loss = - tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
per_example_loss = tf.reduce_sum(per_example_loss, axis = -1)
total_loss = tf.reduce_mean(per_example_loss)

predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
mask = tf.where(label_ids < 0, tf.zeros(tf.shape(label_ids)), tf.ones(tf.shape(label_ids)))
acc, acc_op = tf.metrics.accuracy(labels=label_ids, predictions=predictions, weights = mask)
# loss, loss_op = tf.metrics.mean(values=per_example_loss)

learning_rate = 0.03
num_train_steps = 30
num_warmup_steps = 5
train_op = optimization.create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, False)
# train_op = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

In [112]:
feed_dict = build_feed_dict(features[:64])

In [113]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(epoch):
        sess.run(tf.local_variables_initializer())
        sess.run(iterator.initializer, feed_dict = feed_dict)
        try: 
            while True:
                _, los, accuracy, _ = sess.run([train_op, total_loss, acc, acc_op])
        except tf.errors.OutOfRangeError:
            pass
        print('at epoch %s, loss is %s, accuracy is %s' % (i, los, accuracy))

at epoch 0, loss is 68.59122, accuracy is 0.053416148
at epoch 1, loss is 1585.7644, accuracy is 0.8680352
at epoch 2, loss is 1829.0155, accuracy is 0.028288543
at epoch 3, loss is 200.98743, accuracy is 0.0
at epoch 4, loss is 399.6418, accuracy is 0.008894537
at epoch 5, loss is 120.25573, accuracy is 0.023346303
at epoch 6, loss is 183.97185, accuracy is 0.71936756
at epoch 7, loss is 318.14795, accuracy is 0.71467394
at epoch 8, loss is 120.92107, accuracy is 0.767209
at epoch 9, loss is 146.97801, accuracy is 0.8794326
at epoch 10, loss is 119.70573, accuracy is 0.8737864
at epoch 11, loss is 115.27351, accuracy is 0.86455333
at epoch 12, loss is 96.332794, accuracy is 0.8577181
at epoch 13, loss is 104.02053, accuracy is 0.8551724
at epoch 14, loss is 92.58022, accuracy is 0.86082476
at epoch 15, loss is 90.744095, accuracy is 0.8475452
at epoch 16, loss is 105.41007, accuracy is 0.86576355
at epoch 17, loss is 121.93743, accuracy is 0.871409
at epoch 18, loss is 106.57053, accu