In [1]:
import tensorflow as tf
import os, glob
import numpy as np


import logging
import time
import sys

In [2]:
# Special token strings (presumably stand-ins for unknown words and numeric
# tokens -- nothing in the visible cells uses them yet, so confirm).
UNK, NUM = "$UNK$", "$NUM$"
# NOTE(review): the value below is the digit zero. If it is meant to be the
# "outside" label of a tagging scheme it may need to be the letter "O" --
# confirm against the tag vocabulary.
NONE = "0"

class MyIOError(Exception):
    """Error raised to report that a file could not be located.

    Args:
        filename: path of the missing file; it is interpolated into the
            exception message.
    """

    def __init__(self, filename):
        # The format string previously had no space before the placeholder,
        # which glued the path onto the word "file".
        message = "IOError: Unable to locate file {}".format(filename)
        super(MyIOError, self).__init__(message)
        
class PreprocessData(object):
    """Settings for one data file plus the sequence-padding helpers.

    The constructor only stores its arguments; no file is opened here.

    Args:
        filename: path of the data file.
        processing_word: optional word-processing hook (stored as given).
        processing_tag: optional tag-processing hook (stored as given).
        m_iteration: optional limit stored as given (presumably a maximum
            number of items to read -- confirm where it is consumed).
    """

    def __init__(self, filename, processing_word=None, processing_tag=None, m_iteration=None):
        self.filename = filename
        self.processing_word = processing_word
        self.processing_tag = processing_tag
        self.m_iteration = m_iteration
        self.length = None

    @staticmethod
    def _pad_sequences(sequences, pad_tok, max_length):
        """Pad or truncate every sequence to exactly ``max_length`` items.

        Args:
            sequences: iterable of sequences (lists or tuples).
            pad_tok: value appended to sequences shorter than ``max_length``.
            max_length: target length of every returned sequence.

        Returns:
            ``(sequence_padded, sequence_length)``: the padded lists and, for
            each one, the number of original items kept.
        """
        sequence_padded = []
        sequence_length = []

        for seq in sequences:
            seq = list(seq)
            # Pad with a *list* of tokens, and only by the missing amount.
            padding = [pad_tok] * max(max_length - len(seq), 0)
            sequence_padded.append(seq[:max_length] + padding)
            sequence_length.append(min(len(seq), max_length))

        # Return after the loop so every sequence is processed.
        return sequence_padded, sequence_length

    @staticmethod
    def pad_sequences(sequences, pad_tok, nlevels=1):
        """Pad a batch so that it becomes rectangular.

        Args:
            sequences: for ``nlevels=1`` a list of sequences; for
                ``nlevels=2`` a list of sequences of sequences.
            pad_tok: value used for padding.
            nlevels: nesting depth to pad, 1 or 2.

        Returns:
            ``(sequence_padded, sequence_length)``. With ``nlevels=2`` the
            lengths are themselves padded with 0 to the longest outer
            sequence.

        Raises:
            ValueError: if ``nlevels`` is neither 1 nor 2.
        """
        # Materialise once: the batch is traversed several times below.
        sequences = [list(seq) for seq in sequences]

        if nlevels == 1:
            # ``or [0]`` keeps ``max`` from failing on an empty batch.
            max_length = max([len(seq) for seq in sequences] or [0])
            return PreprocessData._pad_sequences(sequences, pad_tok, max_length)

        if nlevels == 2:
            max_length_word = max(
                [len(item) for seq in sequences for item in seq] or [0])
            sequence_padded = []
            sequence_length = []
            for seq in sequences:
                # Inner level: every item padded to the longest item.
                sp, sl = PreprocessData._pad_sequences(seq, pad_tok, max_length_word)
                sequence_padded.append(sp)
                sequence_length.append(sl)

            max_length_sentence = max([len(seq) for seq in sequences] or [0])

            # Outer level: short sequences get all-padding items and a
            # recorded length of 0 for each of them.
            sequence_padded, _ = PreprocessData._pad_sequences(
                sequence_padded, [pad_tok] * max_length_word, max_length_sentence)
            sequence_length, _ = PreprocessData._pad_sequences(
                sequence_length, 0, max_length_sentence)
            return sequence_padded, sequence_length

        raise ValueError("nlevels must be 1 or 2, got {}".format(nlevels))


# Module-level aliases: other cells call these helpers as plain functions.
_pad_sequences = PreprocessData._pad_sequences
pad_sequences = PreprocessData.pad_sequences
            

In [3]:
class Progbar(object):
    """Small text progress bar that also keeps running averages of metrics.

    Args:
        target: total number of steps expected.
        width: number of characters used for the bar itself.
        verbose: 1 draws the bar on stdout; any other value keeps it silent
            (the averages are still tracked).
    """

    def __init__(self, target, width=30, verbose=1):
        self.width = width
        self.target = target
        self.verbose = verbose
        self.sum_values = {}    # name -> [weighted sum, total weight]
        self.unique_value = {}  # name -> first-seen position (stable print order)
        self.start = time.time()
        self.total_width = 0    # length of the last line written, for erasing
        self.seen_so_far = 0

    def update(self, current, values=None):
        """Advance the bar to step ``current`` and fold in new metric values.

        This method was missing although the training loop calls
        ``prog.update(step, [(name, value)])``.

        Args:
            current: index of the step just completed.
            values: optional list of ``(name, value)`` pairs. Each value is
                weighted by the number of steps elapsed since the previous
                call, and the bar prints the weighted average per name.
        """
        step = current - self.seen_so_far
        for name, value in (values or []):
            if name not in self.sum_values:
                self.sum_values[name] = [value * step, step]
                self.unique_value[name] = len(self.unique_value)
            else:
                self.sum_values[name][0] += value * step
                self.sum_values[name][1] += step
        self.seen_so_far = current

        if self.verbose != 1:
            return

        # Guard against a zero/None target so the ratio is always defined.
        done = min(float(current) / self.target, 1.0) if self.target else 0.0
        filled = int(self.width * done)
        line = "{}/{} [{}{}]".format(
            current, self.target, "=" * filled, "." * (self.width - filled))
        line += " - {:.0f}s".format(time.time() - self.start)
        for name in sorted(self.unique_value, key=self.unique_value.get):
            total, weight = self.sum_values[name]
            line += " - {}: {:.4f}".format(name, total / max(1, weight))

        # Trailing spaces erase leftovers of a longer previous line.
        padding = max(self.total_width - len(line), 0)
        sys.stdout.write("\r" + line + " " * padding)
        self.total_width = len(line)
        if self.target and current >= self.target:
            sys.stdout.write("\n")
        sys.stdout.flush()


In [4]:
# import tensorflow as tf
# import os

class Model(object):
    """Base class bundling TensorFlow 1.x session handling and a training loop.

    Subclasses must provide ``run_epoch(train, dev, epoch)`` and
    ``run_evaluate(test)``; this class calls both but defines neither.

    Args:
        config: configuration object. The methods below read ``log``,
            ``dir_model``, ``dir_output``, ``num_epochs``, ``lr_rate``,
            ``lr_decay`` and ``num_epoch_no_imprv`` from it.
    """

    def __init__(self, config):
        self.config = config
        self.log = config.log
        self.session = None  # set by initialize_session()
        self.save = None     # tf.train.Saver, set by initialize_session()

    def initialize_weights(self, scope):
        """Re-run the initializers of every variable under ``scope``."""
        variables = tf.contrib.framework.get_variables(scope)
        init = tf.variables_initializer(variables)
        self.session.run(init)

    def add_train_op(self, method, lr_rate, loss, clip=-1):
        """Create ``self.train_op`` for the given optimizer and loss.

        Args:
            method: optimizer name, case-insensitive: 'adam', 'adagrad',
                'sgd' or 'rmsprop'.
            lr_rate: learning rate passed to the optimizer.
            loss: tensor to minimise.
            clip: when > 0, gradients are clipped to this global norm.

        Raises:
            NotImplementedError: if ``method`` is not one of the names above.
        """
        _m = method.lower()

        with tf.variable_scope("train_step"):
            if _m == 'adam':
                optimizer = tf.train.AdamOptimizer(lr_rate)
            elif _m == 'adagrad':
                optimizer = tf.train.AdagradOptimizer(lr_rate)
            elif _m == 'sgd':
                optimizer = tf.train.GradientDescentOptimizer(lr_rate)
            elif _m == 'rmsprop':
                optimizer = tf.train.RMSPropOptimizer(lr_rate)
            else:
                raise NotImplementedError("Unknown method {}".format(_m))

            if clip > 0:
                gd, vs = zip(*optimizer.compute_gradients(loss))
                # clip_by_global_norm takes the whole gradient list and
                # returns (clipped_list, global_norm); clip_by_norm works on
                # a single tensor and returns a single tensor.
                gd, _ = tf.clip_by_global_norm(gd, clip)
                self.train_op = optimizer.apply_gradients(zip(gd, vs))
            else:
                self.train_op = optimizer.minimize(loss)

    def initialize_session(self):
        """Open a session, initialise all variables and create the saver."""
        self.log.info("Initialize tf session")
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        self.save = tf.train.Saver()

    def restore_session(self, dir_model):
        """Load saved weights from ``dir_model`` into the current session."""
        self.log.info("Reload the latest trained model...")
        self.save.restore(self.session, dir_model)

    def save_session(self):
        """Save the session under ``config.dir_model``, creating it if needed."""
        if not os.path.exists(self.config.dir_model):
            os.makedirs(self.config.dir_model)
        self.save.save(self.session, self.config.dir_model)

    def close_session(self):
        """Close the TensorFlow session."""
        self.session.close()

    def add_summary(self):
        """Merge all summaries and open a writer on ``config.dir_output``."""
        self.merged = tf.summary.merge_all()
        # The log directory is the attribute ``config.dir_output``; a comma
        # in place of the dot made this a NameError.
        self.file_writer = tf.summary.FileWriter(self.config.dir_output, self.session.graph)

    def train(self, train, dev):
        """Train with learning-rate decay and early stopping.

        After every epoch ``config.lr_rate`` is multiplied by
        ``config.lr_decay``. The session is saved whenever the epoch score
        is at least the best so far; training stops after
        ``config.num_epoch_no_imprv`` consecutive epochs without that.

        Args:
            train: training data, passed through to ``run_epoch``.
            dev: development data, passed through to ``run_epoch``.
        """
        record = 0
        num_epoch_no_imprv = 0
        self.add_summary()

        for epoch in range(self.config.num_epochs):
            self.log.info("Epoch {:} out of {:}".format(epoch + 1,
                        self.config.num_epochs))
            score = self.run_epoch(train, dev, epoch)
            self.config.lr_rate *= self.config.lr_decay

            if score >= record:
                num_epoch_no_imprv = 0
                self.save_session()
                record = score
                self.log.info("New best score recorded...")
            else:
                num_epoch_no_imprv += 1
                if num_epoch_no_imprv >= self.config.num_epoch_no_imprv:
                    self.log.info("Early stopping {} epochs without "\
                            "improvement".format(num_epoch_no_imprv))
                    break

    def evaluate(self, test):
        """Log the metrics returned by ``run_evaluate(test)``."""
        self.log.info("Evaluating on test set")
        metrics = self.run_evaluate(test)
        msg = " - ".join(["{} {:04.2f}".format(k, v)
                for k, v in metrics.items()])
        self.log.info(msg)
        

In [5]:
from baseModel import Model
class NERModel(Model):
    """Bi-LSTM sequence tagger (TensorFlow 1.x) with optional char-LSTM and CRF.

    Args:
        config: configuration object. Besides what the base class reads, the
            methods below use ``vocab_tag``, ``use_chars``, ``use_crf``,
            ``embbedings``, ``nwords``, ``dim_word``, ``nchars``,
            ``dim_char``, ``hidden_size_char``, ``hidden_size_lstm``,
            ``num_tags``, ``method``, ``clip``, ``batch_size``, ``lr_rate``
            and ``drop_out``.
    """

    def __init__(self, config):
        super(NERModel, self).__init__(config)
        # Reverse lookup: tag index -> tag string.
        self.index_to_tag = {idx: tag for tag, idx in self.config.vocab_tag.items()}

    def initialize_placeholder_tensor(self):
        """Create the input placeholders of the graph."""
        # Axis meanings are presumably (batch, sentence length[, word length])
        # -- the shapes below only fix the ranks.
        self.word_id = tf.placeholder(tf.int32, shape=[None, None], name="word_id")
        self.sequence_length = tf.placeholder(tf.int32, shape=[None], name="sequence_length")
        self.char_id = tf.placeholder(tf.int32, shape=[None, None, None], name="char_id")
        self.word_length = tf.placeholder(tf.int32, shape=[None, None], name="word_length")
        self.label = tf.placeholder(tf.int32, shape=[None, None], name="label")
        self.drop_out = tf.placeholder(tf.float32, shape=[], name="drop_out")
        self.lr_rate = tf.placeholder(tf.float32, shape=[], name="learning_rate")

    def feed_dict(self, word, label=None, lr_rate=None, drop_out=None):
        """Pad a batch and build the feed dictionary for ``session.run``.

        Args:
            word: batch of sentences. When ``config.use_chars`` is set each
                entry is unzipped into (character ids, word ids).
            label: optional batch of label sequences.
            lr_rate: optional learning rate to feed.
            drop_out: optional value for the dropout placeholder.

        Returns:
            ``(feed, sequence_length)``: the feed dict and the unpadded
            length of every sentence.
        """
        if self.config.use_chars:
            char_id, word_id = zip(*word)
            word_id, sequence_length = pad_sequences(word_id, 0)
            # The keyword accepted by pad_sequences is ``nlevels``.
            char_id, word_length = pad_sequences(char_id, pad_tok=0, nlevels=2)
        else:
            word_id, sequence_length = pad_sequences(word, 0)

        feed = {self.word_id: word_id, self.sequence_length: sequence_length}

        if self.config.use_chars:
            feed[self.char_id] = char_id
            feed[self.word_length] = word_length

        if label is not None:
            label, _ = pad_sequences(label, 0)
            feed[self.label] = label

        if lr_rate is not None:
            feed[self.lr_rate] = lr_rate

        if drop_out is not None:
            feed[self.drop_out] = drop_out

        return feed, sequence_length

    def word_embbeding_option(self):
        """Build ``self.word_embbedings``: word vectors, plus char-LSTM features
        when ``config.use_chars`` is set, followed by dropout."""
        with tf.variable_scope("words"):
            if self.config.embbedings is None:
                self.log.info("WARNING: randomly initializing word vectors")
                _word_embbedings = tf.get_variable(
                    name="_word_embbedings", dtype=tf.float32,
                    shape=[self.config.nwords, self.config.dim_word])
            else:
                # NOTE(review): ``trainable`` used to receive the embedding
                # matrix itself. The flag name ``train_embbedings`` is an
                # assumption; when absent the tf.Variable default (True)
                # applies -- confirm against the config class.
                _word_embbedings = tf.Variable(
                    self.config.embbedings, name="_word_embbedings",
                    dtype=tf.float32,
                    trainable=bool(getattr(self.config, "train_embbedings", True)))

            word_embbedings = tf.nn.embedding_lookup(
                _word_embbedings, self.word_id, name="word_embbeding")

        with tf.variable_scope("chars"):
            if self.config.use_chars:
                _char_embbedings = tf.get_variable(
                    name="_char_embbeding", dtype=tf.float32,
                    shape=[self.config.nchars, self.config.dim_char])
                char_embbedings = tf.nn.embedding_lookup(
                    _char_embbedings, self.char_id, name="char_embbedings")

                # Fold the first two axes together so that every word is one
                # sequence for the character bi-LSTM.
                s = tf.shape(char_embbedings)
                char_embbedings = tf.reshape(
                    char_embbedings, shape=[s[0]*s[1], s[-2], self.config.dim_char])
                # One length per folded row: a single dimension of size
                # s[0]*s[1] (the bracket used to close too early).
                word_lengths = tf.reshape(self.word_length, shape=[s[0]*s[1]])

                # Character-level bi-LSTM.
                cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, state_is_tuple=True)
                cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, state_is_tuple=True)
                _output = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=cell_fw, cell_bw=cell_bw, inputs=char_embbedings,
                    sequence_length=word_lengths, dtype=tf.float32)

                # Keep only the final hidden state of each direction.
                _, ((_, output_fw), (_, output_bw)) = _output
                output = tf.concat([output_fw, output_bw], axis=-1)

                # Unfold back to the first two axes of the word embeddings
                # and append the char features on the feature (last) axis.
                output = tf.reshape(
                    output, shape=[s[0], s[1], 2*self.config.hidden_size_char])
                word_embbedings = tf.concat([word_embbedings, output], axis=-1)

        self.word_embbedings = tf.nn.dropout(word_embbedings, self.drop_out)

    def logits_option(self):
        """Build ``self.logits`` from a word-level bi-LSTM and a linear layer."""
        with tf.variable_scope("bi-lstm"):
            cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)

            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw, cell_bw=cell_bw, inputs=self.word_embbedings,
                sequence_length=self.sequence_length, dtype=tf.float32)

            # Join the two directions on the feature axis; axis=1 would
            # stack them along the time axis instead.
            output = tf.concat([output_fw, output_bw], axis=-1)
            output = tf.nn.dropout(output, self.drop_out)

        with tf.variable_scope("proj"):
            # ``num_tags`` is used throughout (one line read ``n_tags``).
            W = tf.get_variable("W", dtype=tf.float32,
                                shape=[2*self.config.hidden_size_lstm, self.config.num_tags])
            b = tf.get_variable("b", dtype=tf.float32, shape=[self.config.num_tags],
                                initializer=tf.zeros_initializer())
            num_steps = tf.shape(output)[1]
            output = tf.reshape(output, shape=[-1, 2*self.config.hidden_size_lstm])
            pred = tf.matmul(output, W) + b
            self.logits = tf.reshape(pred, [-1, num_steps, self.config.num_tags])

    def prediction_option(self):
        """Without a CRF, predict the arg-max tag at every position."""
        if not self.config.use_crf:
            self.label_pred = tf.cast(tf.argmax(self.logits, axis=-1), tf.int32)

    def loss_option(self):
        """Build ``self.loss`` (CRF likelihood or masked cross-entropy)."""
        if self.config.use_crf:
            log_similar, trans_params = tf.contrib.crf.crf_log_likelihood(
                self.logits, self.label, self.sequence_length)
            self.trans_params = trans_params  # needed later for decoding
            self.loss = tf.reduce_mean(-log_similar)
        else:
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.label)
            # Padding positions must not contribute to the loss.
            mask = tf.sequence_mask(self.sequence_length)
            losses = tf.boolean_mask(losses, mask)
            self.loss = tf.reduce_mean(losses)

        tf.summary.scalar("loss", self.loss)

    def build(self):
        """Assemble the whole graph, the training op and the session."""
        self.initialize_placeholder_tensor()
        self.word_embbeding_option()
        self.logits_option()
        self.prediction_option()
        self.loss_option()

        self.add_train_op(self.config.method, self.lr_rate, self.loss, self.config.clip)
        self.initialize_session()

    def predict_batch(self, words):
        """Predict tag ids for a batch of sentences.

        Args:
            words: batch in the format accepted by ``feed_dict``.

        Returns:
            ``(labels_pred, sequence_length)``. With a CRF, ``labels_pred``
            is a list of decoded sequences cut to their true length;
            otherwise it is the padded output of ``self.label_pred``.
        """
        fd, sequence_length = self.feed_dict(words, drop_out=1.0)
        if self.config.use_crf:
            bi_seqs = []
            logits, trans_params = self.session.run(
                [self.logits, self.trans_params], feed_dict=fd)

            for logit, seq_len in zip(logits, sequence_length):
                logit = logit[:seq_len]  # drop padded time steps
                bi_seq, bi_score = tf.contrib.crf.viterbi_decode(logit, trans_params)
                bi_seqs += [bi_seq]

            return bi_seqs, sequence_length

        # Non-CRF path (previously fell through and returned None).
        labels_pred = self.session.run(self.label_pred, feed_dict=fd)
        return labels_pred, sequence_length

    def run_epoch(self, train, dev, epoch):
        """Run one training epoch, then score the model on ``dev``.

        Args:
            train: training data handed to ``minibatches``.
            dev: development data handed to ``run_evaluate``.
            epoch: zero-based epoch index, used for summary step numbers.

        Returns:
            The ``"f1"`` entry of the dev metrics.
        """
        batch_size = self.config.batch_size
        num_batch = (len(train) + batch_size - 1) // batch_size
        prog = Progbar(target=num_batch)

        # NOTE(review): ``minibatches`` is not defined in the visible cells;
        # it has to be provided elsewhere before this can run.
        for i, (word, label) in enumerate(minibatches(train, batch_size)):
            fd, _ = self.feed_dict(word, label, self.config.lr_rate, self.config.drop_out)
            _, train_loss, summary = self.session.run(
                [self.train_op, self.loss, self.merged], feed_dict=fd)

            prog.update(i+1, [("train loss", train_loss)])
            if (i % 10 == 0):
                self.file_writer.add_summary(summary, epoch*num_batch+i)

        metrics = self.run_evaluate(dev)
        msg = " - ".join(["{} {:04.2f}".format(k, v)
                for k, v in metrics.items()])
        # The base class stores the logger as ``self.log``.
        self.log.info(msg)

        return metrics["f1"]

    def run_evaluate(self, test):
        """Compute token-level accuracy and F1 on a dataset.

        NOTE(review): the metric is computed per token. A token counts as
        tagged when its id differs from ``config.vocab_tag.get(NONE)``.
        Entity-level (chunk) scoring would need a chunk extractor, which is
        not available in this notebook -- replace if that is what is wanted.

        Args:
            test: data handed to ``minibatches``.

        Returns:
            dict with ``"acc"`` and ``"f1"``, both as percentages.
        """
        outside_id = self.config.vocab_tag.get(NONE)
        accs = []
        correct_pred, total_correct, total_pred = 0., 0., 0.
        for word, label in minibatches(test, self.config.batch_size):
            labels_pred, sequence_lengths = self.predict_batch(word)

            for gold, pred, length in zip(label, labels_pred, sequence_lengths):
                # Ignore padded positions.
                for g, p in zip(gold[:length], pred[:length]):
                    accs.append(g == p)
                    if g != outside_id:
                        total_correct += 1
                    if p != outside_id:
                        total_pred += 1
                        if g == p:
                            correct_pred += 1

        # With no correct prediction every ratio is defined as 0.
        precision = correct_pred / total_pred if correct_pred > 0 else 0.
        recall = correct_pred / total_correct if correct_pred > 0 else 0.
        f1 = 2 * precision * recall / (precision + recall) if correct_pred > 0 else 0.
        acc = float(sum(accs)) / len(accs) if accs else 0.

        return {"acc": 100 * acc, "f1": 100 * f1}

    def evaluate(self, test):
        """Log and return the metrics of ``run_evaluate(test)``."""
        self.log.info("Evaluating on test set")
        metrics = self.run_evaluate(test)
        msg = " - ".join(["{} {:04.2f}".format(k, v)
                for k, v in metrics.items()])
        self.log.info(msg)
        return metrics

        
            
        
        
        
        
        
        
        
        
        
        
        
        
        
        

In [None]:
# tf.argmax?   (IPython-only docstring lookup used while developing; kept as a
# comment so the notebook runs top to bottom -- use help(tf.argmax) if needed)