In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tokenization
import run_classifier
import optimization
import modeling
import collections

In [2]:
class inputExample:
    """
    a input example
    """
    def __init__(self, words, label):
        self.token_A = [(w, l) for w, l in zip(words, label)]
        
    def __str__(self):
        index = 0
        for ws, ls in self.token_A:
            print('word_%s is %s with label %s' % (index, ws, ls))
            index += 1
            
    def __len__(self):
        return len(self.token_A)
        

In [3]:
def truncate_tokens(tokens, label_id, max_seq_length):
    index = 0
    while len(tokens) > max_seq_length:
        
        if index ==  len(tokens):
            i = np.random.randint(len(tokens))
            tokens.pop(i)
            label_id.pop(i) 
            index -= 1
        else:
            for i in range(index,len(tokens)):
                index = i+1
                if label_id[i] == 'X':
                    tokens.pop(i)
                    label_id.pop(i)
                    index -= 1
                    break
        

In [4]:
def convert_single_example(inputExample, max_seq_length, tokenizer, ex_index, tagtolabel):
    """
    convert existing tokens to Wordpiece tokens
    """    
    def re_tokenize(list_tokens):    
           
        tokens, labels = [], []
        for token_label in list_tokens:
            token, label = token_label[0], token_label[1]       
            w_token = tokenizer.tokenize(token)
            if len(w_token) == 0:
                w_label = []
            else:
                w_label = [label] + ['X'] * (len(w_token) - 1)
            
            tokens.extend(w_token) 
            labels.extend(w_label)
            
        assert len(tokens) == len(labels)
        return tokens, labels
    
    tokens_A, labels_A = re_tokenize(inputExample.token_A)
    truncate_tokens(tokens_A, labels_A, max_seq_length -2)
        
    word_piece_token = ['[CLS]']
    labels = ['[CLS]']    
    
    word_piece_token.extend(tokens_A)
    labels.extend(labels_A)
    
    word_piece_token.append('[SEP]')
    labels.append('[SEP]')
    
    segment_ids = [0] * len(word_piece_token)

    input_ids = tokenizer.convert_tokens_to_ids(word_piece_token)
    input_mask = [1] * len(input_ids)
    
    # Zero-pad up to the sequence length.   
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        labels.append('N')

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(labels) == max_seq_length
    
    label_ids = []
    for l in labels:
        label_ids.append(0) if l == 'N'else label_ids.append(tagtolabel[l])

    if ex_index % 3000 == 0:
        tf.logging.info("*** Example ***")
        tf.logging.info("tokens: %s" % " ".join([tokenization.printable_text(x) for x in word_piece_token]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label: %s" % labels)
        tf.logging.info("label_ids: %s" % label_ids)
    
    
    feature = run_classifier.InputFeatures(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids,
      label_id=label_ids,
      is_real_example=True)
    
    return feature

In [5]:
def file_based_convert_examples_to_features(examples, max_seq_length, tokenizer, output_file, tagstolabel):

    writer = tf.python_io.TFRecordWriter(output_file)
    features_out = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

        feature = convert_single_example(example, max_seq_length, tokenizer, ex_index, tagstolabel)
        features_out.append(feature)
        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
            return f
        
        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_int_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        features["label_ids"] = create_int_feature(feature.label_id)
        features["is_real_example"] = create_int_feature([int(feature.is_real_example)])

        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()
    return features_out

In [6]:
def model_fn_builder(bert_config, num_labels, learning_rate,
                     num_train_steps, num_warmup_steps, use_one_hot_embeddings):

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        total_loss, per_example_loss, logits, probabilities = create_model(bert_config, is_training, input_ids, 
                                                                     input_mask, segment_ids,
                                                                label_ids, num_labels, use_one_hot_embeddings)

        mask = tf.where(label_ids < 0, tf.zeros(tf.shape(label_ids)), tf.ones(tf.shape(label_ids)))
        predictions = tf.cast(tf.argmax(logits, axis=-1), dtype = tf.int32)

        batch_accuracy = tf.reduce_sum(tf.cast(tf.equal(predictions, label_ids), dtype = tf.float32) * mask) / tf.reduce_sum(mask)

        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        accuracy = tf.metrics.accuracy(labels=label_ids, predictions=predictions)
        loss = tf.metrics.mean(values=per_example_loss)
        
        eval_metrics = {"eval_accuracy": accuracy, "eval_loss": loss}

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            
            train_op = optimization.create_optimizer(
                  total_loss, learning_rate, num_train_steps, num_warmup_steps, False)
            
            output_spec = tf.estimator.EstimatorSpec(mode, loss = total_loss, train_op = train_op, eval_metric_ops = eval_metrics)
            
        elif mode == tf.estimator.ModeKeys.EVAL:
            
            output_spec = tf.estimator.EstimatorSpec(mode, loss = total_loss, 
                                                     eval_metric_ops = eval_metrics)
        else:
            output_spec = tf.estimator.EstimatorSpec(mode,  
                                                     predictions={"probabilities": probabilities})
            
        return output_spec

    return model_fn

In [7]:
def file_based_input_fn_builder(input_file, seq_length, drop_remainder, num_data_to_use = 128, batch_size = 32):

    name_to_features = {
      "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
      "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
      "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
      "label_ids": tf.FixedLenFeature([seq_length], tf.int64),
      "is_real_example": tf.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record, name_to_features):
        example = tf.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t

        return example    
    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.
    d = tf.data.TFRecordDataset(input_file)
    d = d.take(num_data_to_use).cache()
    d = d.shuffle(buffer_size=100)

    d = d.map(lambda record: _decode_record(record, name_to_features)).batch(batch_size =batch_size)

    return d

In [None]:
def generateTagsLabels():
    
    vocan_file = 'bert_model/vocab.txt'
    tokenizer = tokenization.FullTokenizer(vocab_file=vocan_file, do_lower_case=True)
    data_path = 'data/ner.csv.zip'
    dataset_path = 'data/ner_dataset.csv.zip'
    data = pd.read_csv(dataset_path, encoding = 'latin')
    tags = list(data['Tag'].unique())
    tags.extend(['X','[CLS]','[SEP]'])
    tagstolabel = {t:i for i, t in enumerate(tags, 1)}
    
    return tages, tagstolabel, data


In [13]:
def covert_data_to_Example(data):
    list_tokens = []
    tokens, lables = [data.loc[0,'Word']], [data.loc[0,'Tag']]
    pre_sentence_no = data.loc[0,'Sentence #'].split(' ')

    for index, row in data[1:].iterrows():
        sentence_index = row['Sentence #']
        word = row['Word']
        tag = row['Tag']

        if isinstance(sentence_index, str):
            sentence_no = sentence_index.split(' ')[1] 
            inputexmplae = inputExample(tokens, lables)
            list_tokens.append(inputexmplae)
            tokens, lables = [word], [tag]

        else:
            tokens.append(word)
            lables.append(tag)
    return list_tokens
        

In [None]:
def write_Data_to_dist(data, max_seq_length, tokenizer, tagstolabel):
    
    list_tokens = covert_data_to_Example(data)
    del data
    
    len_tokens = [len(example.token_A) for example in list_tokens]
    pd.Series(len_tokens).describe()
    
    max_seq_length = 50
    output_file = 'data_file'
    # features = convert_examples_to_features(list_tokens, max_seq_length, tokenizer, tagstolabel)
    features = file_based_convert_examples_to_features(list_tokens, max_seq_length, tokenizer, output_file, tagstolabel)
    
    print('length of feature %s is' % len(features))
    
    del list_tokens
    del features
    
    

In [14]:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):

    is_train = True
    model = modeling.BertModel(
      config=bert_config,
      is_training=is_train,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

#     output_layer = model.get_pooled_output()
    output_layer = model.get_sequence_output()
    timesteps = output_layer.shape[1].value
    hidden_size = output_layer.shape[-1].value
    
    sequence_length = tf.cast(input_ids > 0, dtype = tf.int32)
    sequence_length = tf.reduce_sum(sequence_length, axis=-1)

    with tf.variable_scope("loss"):
        output_layer = tf.layers.dropout(output_layer, rate = 0.2, training = is_training)     
        logits = tf.layers.dense(output_layer,units = num_labels, kernel_initializer = tf.truncated_normal_initializer(stddev=0.02))
        
        per_example_loss, tran_para = tf.contrib.crf.crf_log_likelihood(logits, labels, sequence_length)
        loss = - tf.reduce_mean(per_example_loss)
        
        decode_tags, best_score = tf.contrib.crf.crf_decode(logits, tran_para, sequence_length)

    return loss, per_example_loss, decode_tags, best_score

In [None]:
tages, tagstolabel, data =  generateTagsLabels()
# write_Data_to_dist(data, max_seq_length, tokenizer, tagstolabel)

In [15]:
bert_config_file = 'bert_model/bert_config.json'
bert_config = modeling.BertConfig.from_json_file(bert_config_file)
use_one_hot_embeddings = True
epoch = 30
batch_size = 32
max_seq_length = 50
input_file = 'data_file'
num_labels = len(tags) + 1
learning_rate = 0.03
num_train_steps = 30
num_warmup_steps = 5

In [20]:
tf.reset_default_graph()

is_training = tf.placeholder_with_default(True, shape =  ())
mask_label = tagstolabel['X']

train_data = file_based_input_fn_builder(input_file, max_seq_length, drop_remainder = True, num_data_to_use = 5000)

iterator = train_data.make_initializable_iterator()
features = iterator.get_next()

input_ids = features['input_ids']
input_mask = features['input_mask']
segment_ids = features['segment_ids']
label_ids = features['label_ids']

total_loss, per_example_loss, predictions, best_score = create_model(bert_config, is_training, input_ids, 
                                                                     input_mask, segment_ids,
                                                                     label_ids, num_labels, use_one_hot_embeddings)

mask = 1 - tf.cast(tf.equal(label_ids, mask_label), dtype = tf.int32)
loss, loss_op = tf.metrics.mean(values=per_example_loss)
accu, accu_op = tf.metrics.accuracy(labels = label_ids, predictions= predictions, weights = mask)
# train_op = optimization.create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, False)
train_op = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

In [21]:
epoch = num_train_steps
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(epoch):
        sess.run(tf.local_variables_initializer())
        sess.run(iterator.initializer)
        try: 
            while True:
                _,los,_, accuracy, _ = sess.run([train_op, loss, loss_op, accu, accu_op])
        except tf.errors.OutOfRangeError:
            pass
        print('at epoch %s, loss is %s, accuracy is %s' % (i, los, accuracy))

at epoch 0, loss is -86.2068, accuracy is 0.851363
at epoch 1, loss is -23.5998, accuracy is 0.909994
at epoch 2, loss is -21.4473, accuracy is 0.927787
at epoch 3, loss is -20.5428, accuracy is 0.928795
at epoch 4, loss is -20.1679, accuracy is 0.9288
at epoch 5, loss is -19.8692, accuracy is 0.928823
at epoch 6, loss is -19.7506, accuracy is 0.928867
at epoch 7, loss is -19.6697, accuracy is 0.928788


KeyboardInterrupt: 