In [None]:
import tensorflow as tf
import os, glob
import numpy as np


import logging
import time
import sys

In [1]:
import numpy as np
import os

# Vocabulary key used as the fallback id for words that are not in the word vocabulary.
UNK = "$UNK$"
# Token substituted for numeric words before the vocabulary lookup.
NUM = "$NUM$"
# Key looked up in the tag vocabulary to find the id of the "no chunk" tag.
# NOTE(review): this is the digit zero, not the letter "O" — confirm it matches the tag files.
NONE = "0"

class MIOError(Exception):
    """Raised when a data file needed by the pipeline cannot be read.

    Args:
        filename: path of the file that could not be opened. It is embedded
            in the error message and kept on the instance as ``filename``.
    """

    def __init__(self, filename):
        # Keep the path so handlers can inspect it without parsing the message.
        self.filename = filename
        # A space separates the fixed text from the path; without it the two
        # ran together ("...filevocab.txt") and the message was hard to read.
        message = "IOError: Unable to locate file {}".format(filename)
        super(MIOError, self).__init__(message)
        
class PreProcessData(object):
    """Iterate over the sentences of a whitespace-separated tagged text file.

    Each non-blank line holds one token; the first space-separated field is
    the word and the last one is its tag. Blank lines and lines starting with
    ``-DOCSTART-`` end the current sentence.

    The class also groups the vocabulary, embedding, padding, batching and
    chunking helpers as static methods, so they can be called either on the
    class (``PreProcessData.pad_sequences(...)``) or on an instance.

    Args:
        filename: path of the tagged text file.
        processing_word: optional callable applied to every word.
        processing_tag: optional callable applied to every tag.
        m_iteration: optional maximum number of sentences to yield.
    """

    def __init__(self, filename, processing_word=None, processing_tag=None, m_iteration=None):
        self.filename = filename
        self.processing_word = processing_word
        self.processing_tag = processing_tag
        self.m_iteration = m_iteration
        # Cached sentence count, filled in lazily by __len__.
        self.length = None

    def __iter__(self):
        """Yield one ``(words, tags)`` pair of lists per sentence."""
        num_iter = 0

        with open(self.filename) as f:
            word, tag = [], []
            for line in f:
                line = line.strip()
                if not line or line.startswith("-DOCSTART-"):
                    if word:
                        num_iter += 1
                        if self.m_iteration is not None and num_iter > self.m_iteration:
                            # Limit reached: stop without emitting the pending sentence.
                            return
                        yield word, tag
                        word, tag = [], []
                else:
                    ls = line.split(' ')
                    w, t = ls[0], ls[-1]
                    if self.processing_word is not None:
                        w = self.processing_word(w)
                    if self.processing_tag is not None:
                        t = self.processing_tag(t)
                    word.append(w)
                    tag.append(t)

            # A file that does not end with a blank line still has a pending
            # sentence here; emit it instead of silently dropping it.
            if word:
                num_iter += 1
                if self.m_iteration is None or num_iter <= self.m_iteration:
                    yield word, tag

    def __len__(self):
        """Return the number of sentences; counted once by iterating, then cached."""
        if self.length is None:
            self.length = sum(1 for _ in self)
        return self.length

    @staticmethod
    def processing_vocab(data):
        """Collect the set of words and the set of tags found in ``data``.

        Args:
            data: iterable of ``(words, tags)`` pairs.

        Returns:
            Tuple ``(word_vocab, tag_vocab)`` of sets.
        """
        print("Building Vocabulary...")
        w_vocab = set()
        t_vocab = set()
        # A single pass is enough; the earlier nested loop re-read the whole
        # data set once per sentence.
        for word, tag in data:
            w_vocab.update(word)
            t_vocab.update(tag)
        print("- done. {} tokens".format(len(w_vocab)))
        return w_vocab, t_vocab

    @staticmethod
    def processing_char_vocab(data):
        """Collect the set of characters used by the words in ``data``.

        Args:
            data: iterable of ``(words, tags)`` pairs whose words are strings.

        Returns:
            Set of single characters.
        """
        c_vocab = set()
        for words, _ in data:
            for word in words:
                # Updating a set with a string adds its individual characters.
                c_vocab.update(word)
        return c_vocab

    @staticmethod
    def glove_vocab(filename):
        """Return the set of words listed in an embedding text file.

        The word is the first space-separated field of each line.
        """
        print("Buidling Glove Vocabulary ...")
        vocab = set()
        with open(filename) as f:
            for line in f:
                vocab.add(line.strip().split(' ')[0])
        print(" -done. {} tokens".format(len(vocab)))
        return vocab

    @staticmethod
    def writing_vocab(vocab, filename):
        """Write one vocabulary entry per line, without a trailing newline.

        Args:
            vocab: sized iterable of strings.
            filename: destination path; overwritten if it exists.
        """
        print("Writing output file...")
        with open(filename, "w") as f:
            f.write("\n".join(vocab))
        print(" - done. {} tokens". format(len(vocab)))

    @staticmethod
    def load_dict(filename):
        """Load a vocabulary file into a ``{entry: line_index}`` dictionary.

        Raises:
            MIOError: if the file cannot be opened or read.
        """
        try:
            d = dict()
            with open(filename) as f:
                for idx, word in enumerate(f):
                    d[word.strip()] = idx
        except IOError:
            raise MIOError(filename)
        return d

    @staticmethod
    def exp_trimmed_glove_vector(vocab, glove_filename, trimmed_filename, dim):
        """Save the embedding rows of the words in ``vocab`` as a compressed array.

        Args:
            vocab: mapping from word to row index.
            glove_filename: text file with one ``word v1 v2 ...`` entry per line.
            trimmed_filename: destination passed to ``np.savez_compressed``.
            dim: number of values per vector.

        Words of ``vocab`` that are missing from the file keep an all-zero row.
        """
        embeddings = np.zeros([len(vocab), dim])
        with open(glove_filename) as f:
            for line in f:
                fields = line.strip().split(' ')
                word = fields[0]
                if word in vocab:
                    # Parse into a separate value: reusing the name
                    # ``embeddings`` here used to overwrite the whole matrix.
                    vector = [float(x) for x in fields[1:]]
                    embeddings[vocab[word]] = np.asarray(vector)
        np.savez_compressed(trimmed_filename, embeddings=embeddings)

    @staticmethod
    def processing_trimmed_glove_vector(filename):
        """Load the ``embeddings`` array from a saved archive.

        Raises:
            MIOError: if the archive cannot be opened.
        """
        try:
            with np.load(filename) as data:
                return data["embeddings"]
        except IOError:
            raise MIOError(filename)

    @staticmethod
    def get_processing_word(vocab_words=None, vocab_chars=None,
                            lowercase=False, chars=False, allow_unk=True):
        """Build a function that converts a raw word into ids.

        Args:
            vocab_words: optional mapping from word to id.
            vocab_chars: optional mapping from character to id.
            lowercase: lowercase the word before the lookup.
            chars: also return the character ids (needs ``vocab_chars``).
            allow_unk: map unknown words to ``vocab_words[UNK]`` instead of raising.

        Returns:
            A function ``f(word)`` returning ``(char_ids, word)`` when character
            ids are requested and ``word`` otherwise; ``word`` is the id when
            ``vocab_words`` is given and the normalized string when it is not.
        """
        def f(word):
            use_chars = vocab_chars is not None and chars
            if use_chars:
                # Character ids come from the raw word, before normalization;
                # characters missing from the vocabulary are skipped.
                c_id = [vocab_chars[char] for char in word if char in vocab_chars]

            if lowercase:
                # str.lower() returns a new string, so the result must be kept.
                word = word.lower()
            if word.isdigit():
                word = NUM

            if vocab_words is not None:
                if word in vocab_words:
                    word = vocab_words[word]
                elif allow_unk:
                    word = vocab_words[UNK]
                else:
                    raise Exception("UnKnown Key. Please re-check the tag")

            if use_chars:
                return c_id, word
            return word
        return f

    @staticmethod
    def _pad_sequences(sequences, pad_tok, max_length):
        """Pad or truncate every sequence to exactly ``max_length`` items.

        Returns:
            Tuple ``(padded, lengths)`` where ``lengths`` holds the original
            lengths capped at ``max_length``.
        """
        sequence_padded = []
        sequence_length = []
        for seq in sequences:
            seq = list(seq)
            seq_ = seq[:max_length] + [pad_tok] * max(max_length - len(seq), 0)
            sequence_padded.append(seq_)
            sequence_length.append(min(len(seq), max_length))
        # Return after the loop so that every sequence is processed, not just
        # the first one.
        return sequence_padded, sequence_length

    @staticmethod
    def pad_sequences(sequences, pad_tok, nlevels=1):
        """Pad a batch to a rectangular shape.

        Args:
            sequences: batch of sequences (``nlevels=1``) or batch of
                sequences of sequences (``nlevels=2``).
            pad_tok: value used for padding.
            nlevels: nesting depth of the batch, 1 or 2.

        Returns:
            Tuple ``(padded, lengths)``. With ``nlevels=2`` the lengths are
            per inner sequence and padded with 0.

        Raises:
            ValueError: if ``nlevels`` is neither 1 nor 2.
        """
        if nlevels == 1:
            max_length = max(len(seq) for seq in sequences)
            sequence_padded, sequence_length = PreProcessData._pad_sequences(
                sequences, pad_tok, max_length)
        elif nlevels == 2:
            max_length_word = max(max(len(w) for w in seq) for seq in sequences)
            sequence_padded = []
            sequence_length = []
            for seq in sequences:
                sp, s1 = PreProcessData._pad_sequences(seq, pad_tok, max_length_word)
                sequence_padded.append(sp)
                sequence_length.append(s1)

            max_length_sentence = max(len(seq) for seq in sequences)

            # Second pass: pad along the outer axis with all-padding inner rows.
            sequence_padded, _ = PreProcessData._pad_sequences(
                sequence_padded, [pad_tok] * max_length_word, max_length_sentence)
            sequence_length, _ = PreProcessData._pad_sequences(
                sequence_length, 0, max_length_sentence)
        else:
            raise ValueError("nlevels must be 1 or 2, got {}".format(nlevels))

        return sequence_padded, sequence_length

    @staticmethod
    def minibatches(data, size):
        """Yield ``(inputs, labels)`` batches holding at most ``size`` items.

        When the first element of an input is a tuple, the input is
        transposed with ``zip(*x)``.
        """
        b_x, b_y = [], []
        for (x, y) in data:
            if len(b_x) == size:
                yield b_x, b_y
                b_x, b_y = [], []

            if type(x[0]) == tuple:
                x = zip(*x)
            b_x.append(x)
            b_y.append(y)

        if b_x:
            yield b_x, b_y

    @staticmethod
    def get_chunk_type(token, tag):
        """Split the tag name of ``token`` into its class and type.

        Args:
            token: tag id.
            tag: mapping from tag id to a ``CLASS-TYPE`` tag name.

        Returns:
            Tuple ``(tag_class, tag_type)``: the first and second
            dash-separated fields of the tag name.
        """
        parts = tag[token].split('-')
        return parts[0], parts[1]

    @staticmethod
    def get_chunks(seq, tags):
        """Group a sequence of tag ids into chunks.

        Args:
            seq: sequence of tag ids.
            tags: mapping from tag name to tag id; must contain ``NONE``.

        Returns:
            List of ``(chunk_type, start, end)`` tuples with ``end`` exclusive.
        """
        default = tags[NONE]
        tag_id = {idx: tag for tag, idx in tags.items()}
        chunk_list = []
        chunk_type, chunk_start = None, None
        for i, token in enumerate(seq):
            if token == default:
                # The default tag closes any open chunk.
                if chunk_type is not None:
                    chunk_list.append((chunk_type, chunk_start, i))
                    chunk_type, chunk_start = None, None
                continue

            token_chunk_class, token_chunk_type = PreProcessData.get_chunk_type(token, tag_id)
            if chunk_type is None:
                chunk_type, chunk_start = token_chunk_type, i
            elif token_chunk_type != chunk_type or token_chunk_class == "B":
                # A different type or an explicit "B" starts a new chunk.
                chunk_list.append((chunk_type, chunk_start, i))
                chunk_type, chunk_start = token_chunk_type, i

        if chunk_type is not None:
            chunk_list.append((chunk_type, chunk_start, len(seq)))
        return chunk_list
    

In [None]:
import tensorflow as tf
import os
import numpy as np
import logging
import time
import sys

def logging_file(filename):
    """Set up console and file logging and return the logger named 'logger'.

    The root logger is configured through ``logging.basicConfig`` with a bare
    message format, then a file handler writing timestamped records to
    ``filename`` is attached to the root logger.

    Args:
        filename: path of the log file handed to ``logging.FileHandler``.

    Returns:
        The ``logging.Logger`` registered under the name 'logger', at DEBUG level.
    """
    log = logging.getLogger('logger')
    log.setLevel(logging.DEBUG)

    # Root configuration comes first: basicConfig does nothing once the root
    # logger already has a handler.
    logging.basicConfig(format='%(message)s', level=logging.DEBUG)

    file_formatter = logging.Formatter('%(asctime)s:%(levelname)s: %(message)s')
    file_handler = logging.FileHandler(filename)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(file_formatter)

    root = logging.getLogger()
    root.addHandler(file_handler)
    return log

class Progress(object):
    """Text progress bar that also reports running metric values.

    Args:
        target: total number of steps expected.
        width: number of characters inside the bar.
        verbose: 1 redraws the bar on every update; 2 prints a single summary
            line once the target is reached; any other value prints nothing.
    """

    def __init__(self, target, width=30, verbose=1):
        self.width = width
        self.target = target
        self.verbose = verbose
        # Metric name -> [weighted sum, step count] for averaged metrics, or
        # the raw value for metrics passed through ``exact``.
        self.sum_value = {}
        # Metric names in first-seen order; a list, because names are appended.
        self.unique_value = []
        self.start = time.time()
        self.total_width = 0
        self.seen_so_far = 0

    def _metric_value(self, key):
        """Return the running average of an averaged metric, or the raw exact value."""
        stored = self.sum_value[key]
        if isinstance(stored, list):
            return stored[0] / max(1, stored[1])
        return stored

    def update(self, current, value=None, exact=None, strict=None):
        """Record progress up to step ``current`` and redraw the display.

        Args:
            current: index of the step just completed.
            value: optional list of ``(name, value)`` pairs; each value is
                averaged over the steps, weighted by the steps taken since
                the previous update.
            exact: optional list of ``(name, value)`` pairs displayed as given.
            strict: accepted for interface compatibility; not used.
        """
        # None defaults avoid sharing one mutable list between calls.
        value = [] if value is None else value
        exact = [] if exact is None else exact

        step = current - self.seen_so_far
        for k, v in value:
            if k not in self.sum_value:
                self.sum_value[k] = [v * step, step]
                self.unique_value.append(k)
            else:
                self.sum_value[k][0] += v * step
                self.sum_value[k][1] += step

        for k, v in exact:
            if k not in self.sum_value:
                self.unique_value.append(k)
            self.sum_value[k] = v

        self.seen_so_far = current

        now = time.time()

        if self.verbose == 1:
            prev_total_width = self.total_width
            # Move the cursor back to the start of the line before redrawing.
            sys.stdout.write("\b" * prev_total_width)
            sys.stdout.write("\r")

            # Width of the step counter; a non-positive target has no log10.
            if self.target > 0:
                num_digit = int(np.floor(np.log10(self.target))) + 1
            else:
                num_digit = 1
            string_bar = '%%%dd/%%%dd [' % (num_digit, num_digit)
            bar = string_bar % (current, self.target)
            # An empty job is reported as complete instead of dividing by zero.
            prog = float(current) / self.target if self.target else 1.0
            prog_width = int(self.width * prog)
            if prog_width > 0:
                bar += ('=' * (prog_width - 1))
                if current < self.target:
                    bar += '>'
                else:
                    bar += '='
            bar += ('.' * (self.width - prog_width))
            bar += ']'
            sys.stdout.write(bar)
            self.total_width = len(bar)

            if current:
                time_per_unit = (now - self.start) / current
            else:
                time_per_unit = 0
            eta = time_per_unit * (self.target - current)
            info = ''
            if current < self.target:
                info += ' -ETA: %ds ' % eta
            else:
                info += ' -%ds' % (now - self.start)

            for k in self.unique_value:
                # Averaged metrics are stored as [sum, count]; exact metrics
                # are stored as the value itself.
                if isinstance(self.sum_value[k], list):
                    info += ' -%s: %.4f ' % (k, self._metric_value(k))
                else:
                    info += ' -%s: %s ' % (k, self.sum_value[k])

            self.total_width += len(info)

            # Blank out leftovers when the previous line was longer.
            if prev_total_width > self.total_width:
                info += ((prev_total_width - self.total_width) * " ")
            sys.stdout.write(info)
            sys.stdout.flush()

            # End the line once the target is reached, so later output starts
            # on a fresh line.
            if current >= self.target:
                sys.stdout.write("\n")

        if self.verbose == 2:
            if current >= self.target:
                info = '%ds' % (now - self.start)

                for k in self.unique_value:
                    if isinstance(self.sum_value[k], list):
                        info += '- %s: %.4f' % (k, self._metric_value(k))
                    else:
                        info += '- %s: %s' % (k, self.sum_value[k])

                sys.stdout.write(info + "\n")

    def add(self, n, value=None):
        """Advance the bar by ``n`` steps, forwarding ``value`` to ``update``."""
        self.update(self.seen_so_far + n, value)

                

In [None]:
# import tensorflow as tf
# import os

class Model(object):
    """Base class with the TensorFlow session, optimizer and training-loop plumbing.

    Subclasses build the graph and provide ``run_epoch`` and ``run_evaluate``.

    Args:
        config: configuration object. This class reads ``log``, ``dir_model``,
            ``dir_output``, ``num_epochs``, ``lr_rate``, ``lr_decay`` and
            ``num_epoch_no_imprv`` from it.
    """

    def __init__(self, config):
        self.config = config
        self.log = config.log
        # Both are created by initialize_session().
        self.session = None
        self.save = None

    def initialize_weights(self, scope):
        """Re-run the initializers of the variables found under ``scope``."""
        variables = tf.contrib.framework.get_variables(scope)
        init = tf.variables_initializer(variables)
        self.session.run(init)

    def add_train_op(self, method, lr_rate, loss, clip=-1):
        """Create ``self.train_op`` for the requested optimizer.

        Args:
            method: optimizer name, case-insensitive: 'adam', 'adagrad',
                'sgd' or 'rmsprop'.
            lr_rate: learning rate passed to the optimizer.
            loss: tensor to minimize.
            clip: when positive, gradients are clipped to this global norm.

        Raises:
            NotImplementedError: if ``method`` is not one of the known names.
        """
        _m = method.lower()

        with tf.variable_scope("train_step"):
            if _m == 'adam':
                optimizer = tf.train.AdamOptimizer(lr_rate)
            elif _m == 'adagrad':
                optimizer = tf.train.AdagradOptimizer(lr_rate)
            elif _m == 'sgd':
                optimizer = tf.train.GradientDescentOptimizer(lr_rate)
            elif _m == 'rmsprop':
                optimizer = tf.train.RMSPropOptimizer(lr_rate)
            else:
                raise NotImplementedError("Unknown method {}".format(_m))

            if clip > 0:
                gd, vs = zip(*optimizer.compute_gradients(loss))
                # clip_by_global_norm takes the list of gradients and returns
                # (clipped list, global norm); clip_by_norm works on a single
                # tensor and returns a single tensor, so it cannot be
                # unpacked this way.
                gd, gnorm = tf.clip_by_global_norm(gd, clip)
                self.train_op = optimizer.apply_gradients(zip(gd, vs))
            else:
                self.train_op = optimizer.minimize(loss)

    def initialize_session(self):
        """Create the session, initialize all variables and create the saver."""
        self.log.info("Initialize tf session")
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        self.save = tf.train.Saver()

    def restore_session(self, dir_model):
        """Restore the variables saved at ``dir_model`` into the current session."""
        self.log.info("Reload the latest trained model...")
        self.save.restore(self.session, dir_model)

    def save_session(self):
        """Save the session to ``config.dir_model``, creating the directory if needed."""
        if not os.path.exists(self.config.dir_model):
            os.makedirs(self.config.dir_model)
        self.save.save(self.session, self.config.dir_model)

    def close_session(self):
        """Close the TensorFlow session."""
        self.session.close()

    def add_summary(self):
        """Merge all summaries and open a summary writer in ``config.dir_output``."""
        self.merged = tf.summary.merge_all()
        # The directory is an attribute of the config (dot, not comma).
        self.file_writer = tf.summary.FileWriter(self.config.dir_output, self.session.graph)

    def run_epoch(self, train, dev, epoch):
        """Train for one epoch and return the score used for early stopping.

        Must be provided by subclasses.
        """
        raise NotImplementedError("Subclasses must implement run_epoch")

    def run_evaluate(self, test):
        """Return a ``{metric_name: value}`` dictionary for ``test``.

        Must be provided by subclasses that rely on ``evaluate``.
        """
        raise NotImplementedError("Subclasses must implement run_evaluate")

    def train(self, train, dev):
        """Train with learning-rate decay and early stopping.

        After each epoch the learning rate in the config is multiplied by
        ``config.lr_decay``. The session is saved whenever the epoch score is
        at least the best seen so far; training stops after
        ``config.num_epoch_no_imprv`` consecutive epochs without that.

        Args:
            train: training data handed to ``run_epoch``.
            dev: development data handed to ``run_epoch``.
        """
        record = 0
        num_epoch_no_imprv = 0
        self.add_summary()

        for epoch in range(self.config.num_epochs):
            self.log.info("Epoch {:} out of {:}".format(epoch + 1,
                        self.config.num_epochs))
            score = self.run_epoch(train, dev, epoch)
            self.config.lr_rate *= self.config.lr_decay

            if score >= record:
                num_epoch_no_imprv = 0
                self.save_session()
                record = score
                self.log.info("New best score recorded...")
            else:
                num_epoch_no_imprv += 1
                if num_epoch_no_imprv >= self.config.num_epoch_no_imprv:
                    self.log.info("Early stopping {} epochs without "\
                            "improvement".format(num_epoch_no_imprv))
                    break

    def evaluate(self, test):
        """Log the metrics returned by ``run_evaluate`` for ``test``."""
        self.log.info("Evaluating on test set")
        metrics = self.run_evaluate(test)
        msg = " - ".join(["{} {:04.2f}".format(k, v)
                for k, v in metrics.items()])
        self.log.info(msg)
        

In [None]:
import tensorflow as tf
import os
import numpy as np

from model import Model
from utils import Progress
from data_utils import minibatches, pad_sequences, get_chunks

class NERModel(Model):
    """Sequence tagger: word (and optional character) embeddings, a bi-LSTM
    encoder, a linear projection, and either a CRF or a softmax output.

    Args:
        config: configuration object. Besides what ``Model`` reads, this class
            reads ``vocab_list`` (tag name -> tag id), ``use_chars``,
            ``use_crf``, the embedding and hidden sizes, ``n_tag``,
            ``batch_size``, ``drop_out``, ``method``, ``clip`` and
            ``processing_word``.
    """

    def __init__(self, config):
        super(NERModel, self).__init__(config)
        # Reverse lookup: tag id -> tag name.
        self.tag_idx = {idx: tag for tag, idx in self.config.vocab_list.items()}

    def initialize_placeholder_tensor(self):
        """Create the input placeholders of the graph."""
        # [batch_size, max_length_sentence, max_length_word]
        self.c_id = tf.placeholder(tf.int32, shape=[None, None, None], name="char_id")
        # [batch_size, max_length_of_sentence_in_batch]
        self.w_id = tf.placeholder(tf.int32, shape=[None, None], name="word_id")
        # [batch_size, max_length_sentence]
        self.w_len = tf.placeholder(tf.int32, shape=[None, None], name="word_len")
        # [batch_size]
        self.seq_len = tf.placeholder(tf.int32, shape=[None], name="sequence_length")
        # [batch_size, max_length_of_sentence_in_batch]
        self.label = tf.placeholder(tf.int32, shape=[None, None], name="label")
        # Scalars. drop_out is handed to tf.nn.dropout as its second argument.
        self.drop_out = tf.placeholder(tf.float32, shape=[], name="drop_out")
        self.lr_rate = tf.placeholder(tf.float32, shape=[], name="learning_rate")

    def feed_dict(self, word, label=None, lr_rate=None, drop_out=None):
        """Pad a batch and build the feed dictionary for it.

        Args:
            word: batch of sentences; when ``config.use_chars`` is set each
                sentence is an iterable of ``(char_ids, word_ids)``.
            label: optional batch of tag-id sequences.
            lr_rate: optional learning rate.
            drop_out: optional value for the ``drop_out`` placeholder.

        Returns:
            Tuple ``(feed, seq_len)`` where ``seq_len`` holds the unpadded
            sentence lengths.
        """
        if self.config.use_chars:
            c_id, w_id = zip(*word)
            w_id, seq_len = pad_sequences(w_id, 0)
            # The keyword of pad_sequences is ``nlevels``.
            c_id, w_len = pad_sequences(c_id, pad_tok=0, nlevels=2)
        else:
            w_id, seq_len = pad_sequences(word, 0)

        feed = {self.w_id: w_id, self.seq_len: seq_len}

        if self.config.use_chars:
            feed[self.c_id] = c_id
            feed[self.w_len] = w_len

        if label is not None:
            label, _ = pad_sequences(label, 0)
            feed[self.label] = label

        if lr_rate is not None:
            feed[self.lr_rate] = lr_rate

        if drop_out is not None:
            feed[self.drop_out] = drop_out

        return feed, seq_len

    def word_embbeding_option(self):
        """Build ``self.word_embbeding``: word vectors, optionally concatenated
        with the final states of a character-level bi-LSTM, followed by dropout.
        """
        with tf.variable_scope("words"):
            if self.config.embbedings is None:
                self.log.info("WARNING: randomly initializing word vectors")
                _word_embbeding = tf.get_variable(
                    name="_word_embbeding", dtype=tf.float32,
                    shape=[self.config.num_word, self.config.dim_word])
            else:
                _word_embbeding = tf.Variable(
                    self.config.embbedings, name="_word_embbeding",
                    dtype=tf.float32, trainable=self.config.train_embbedings)

            word_embbeding = tf.nn.embedding_lookup(
                _word_embbeding, self.w_id, name="word_embbeding")

        with tf.variable_scope("chars"):
            if self.config.use_chars:
                _char_embbeding = tf.get_variable(
                    name="_char_embbeding", dtype=tf.float32,
                    shape=[self.config.num_char, self.config.dim_char])
                char_embbedings = tf.nn.embedding_lookup(
                    _char_embbeding, self.c_id, name="char_embbeding")
                s = tf.shape(char_embbedings)

                # Flatten batch and sentence axes so every word is one
                # sequence of characters for the character-level RNN.
                char_embbedings = tf.reshape(
                    char_embbedings, shape=[s[0]*s[1], s[-2], self.config.dim_char])
                w_len = tf.reshape(self.w_len, shape=[s[0]*s[1]])

                # define bi-LSTM
                cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, state_is_tuple=True)
                cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, state_is_tuple=True)
                _output = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, char_embbedings,
                    sequence_length=w_len, dtype=tf.float32)

                # Keep only the second element of each direction's final state tuple.
                _, ((_, output_fw), (_, output_bw)) = _output
                output = tf.concat([output_fw, output_bw], axis=-1)

                # Restore the [batch, sentence] axes.
                output = tf.reshape(output, shape=[s[0], s[1], 2*self.config.hidden_size_char])
                word_embbeding = tf.concat([word_embbeding, output], axis=-1)

        self.word_embbeding = tf.nn.dropout(word_embbeding, self.drop_out)

    def logits_option(self):
        """Build ``self.logit``: bi-LSTM over the embeddings, then a linear
        projection to ``config.n_tag`` scores per token.
        """
        with tf.variable_scope("bi-lstm"):
            cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)

            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, self.word_embbeding,
                sequence_length=self.seq_len, dtype=tf.float32)

            output = tf.concat([output_fw, output_bw], axis=-1)
            output = tf.nn.dropout(output, self.drop_out)

        with tf.variable_scope("projection"):
            W = tf.get_variable("W", dtype=tf.float32,
                                shape=[2*self.config.hidden_size_lstm, self.config.n_tag])
            b = tf.get_variable("b", dtype=tf.float32, shape=[self.config.n_tag],
                                initializer=tf.zeros_initializer())
            num_step = tf.shape(output)[1]
            # Flatten to 2-D for the matmul, then restore the time axis.
            output = tf.reshape(output, shape=[-1, 2*self.config.hidden_size_lstm])
            pred = tf.matmul(output, W) + b
            self.logit = tf.reshape(pred, [-1, num_step, self.config.n_tag])

    def prediction_option(self):
        """Without a CRF, predictions are the arg-max of the logits."""
        if not self.config.use_crf:
            self.label_pred = tf.cast(tf.argmax(self.logit, axis=-1), tf.int32)

    def loss_option(self):
        """Build ``self.loss`` (CRF negative log-likelihood or masked
        cross-entropy) and register it as a scalar summary.
        """
        if self.config.use_crf:
            log_similar, trans_params = tf.contrib.crf.crf_log_likelihood(
                self.logit, self.label, self.seq_len)
            self.trans_params = trans_params
            self.loss = tf.reduce_mean(-log_similar)
        else:
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logit, labels=self.label)
            # Ignore the padded positions.
            mask = tf.sequence_mask(self.seq_len)
            loss = tf.boolean_mask(loss, mask)
            self.loss = tf.reduce_mean(loss)

        tf.summary.scalar("loss", self.loss)

    def build(self):
        """Build the whole graph, the training op and the session."""
        self.initialize_placeholder_tensor()
        self.word_embbeding_option()
        self.logits_option()
        self.prediction_option()
        self.loss_option()

        self.add_train_op(self.config.method, self.lr_rate, self.loss, self.config.clip)
        self.initialize_session()

    def predict_batch(self, word):
        """Predict the tag ids of a batch of sentences.

        Returns:
            Tuple ``(predictions, seq_len)``: one sequence of tag ids per
            sentence, and the unpadded sentence lengths.
        """
        fd, seq_len = self.feed_dict(word, drop_out=1.0)
        if self.config.use_crf:
            viterbi_seq = []
            logit, trans_params = self.session.run(
                [self.logit, self.trans_params], feed_dict=fd)
            for lg, sl in zip(logit, seq_len):
                # Decode only the real (unpadded) time steps.
                lg = lg[:sl]
                vi_seq, vi_score = tf.contrib.crf.viterbi_decode(lg, trans_params)
                viterbi_seq += [vi_seq]

            return viterbi_seq, seq_len

        # Softmax output: without this branch the method returned None and
        # callers failed when unpacking the result.
        label_pred = self.session.run(self.label_pred, feed_dict=fd)
        return label_pred, seq_len

    def run_epoch(self, train, dev, epoch):
        """Train for one epoch, then evaluate on ``dev``.

        Returns:
            The "f1-score" entry of the metrics computed on ``dev``.
        """
        batch_size = self.config.batch_size
        num_batch = (len(train) + batch_size - 1) // batch_size
        prog = Progress(target=num_batch)

        for i, (word, label) in enumerate(minibatches(train, batch_size)):
            fd, _ = self.feed_dict(word, label, self.config.lr_rate, self.config.drop_out)
            _, train_loss, summary = self.session.run(
                [self.train_op, self.loss, self.merged], feed_dict=fd)

            prog.update(i+1, [("train loss", train_loss)])
            # Write a summary every 10 batches.
            if (i % 10 == 0):
                self.file_writer.add_summary(summary, epoch*num_batch+i)

        metrics = self.evaluate(dev)
        msg = " - ".join(["{} {:04.2f}".format(k, v) for k, v in metrics.items()])
        self.log.info(msg)

        # The key must match the one produced by evaluate().
        return metrics["f1-score"]

    def evaluate(self, test):
        """Compute token accuracy and chunk-level F1 on ``test``.

        Returns:
            Dictionary with "accuracy" and "f1-score", both scaled to 0-100.
        """
        accuracy = []
        correct_prediction = 0.
        total_correct = 0.
        total_prediction = 0.
        for word, label in minibatches(test, self.config.batch_size):
            label_predict, seq_len = self.predict_batch(word)

            # Score every batch; with this loop outside the batch loop only
            # the last batch was counted.
            for lb, lb_pred, length in zip(label, label_predict, seq_len):
                lb = lb[:length]
                lb_pred = lb_pred[:length]
                accuracy += [a == b for (a, b) in zip(lb, lb_pred)]
                lb_chunks = set(get_chunks(lb, self.config.vocab_list))
                lb_pred_chunks = set(get_chunks(lb_pred, self.config.vocab_list))
                correct_prediction += len(lb_chunks & lb_pred_chunks)
                total_prediction += len(lb_pred_chunks)
                total_correct += len(lb_chunks)

        precision = correct_prediction / total_prediction if correct_prediction > 0 else 0
        recall = correct_prediction / total_correct if correct_prediction > 0 else 0
        f1 = 2*precision*recall / (precision+recall) if correct_prediction > 0 else 0
        # An empty data set has no tokens to average over.
        acc = np.mean(accuracy) if accuracy else 0.

        return {"accuracy": 100*acc, "f1-score": 100*f1}

    def predict(self, raw_word):
        """Tag a single sentence.

        Args:
            raw_word: list of raw word strings.

        Returns:
            List of tag names, one per word.
        """
        word = [self.config.processing_word(w) for w in raw_word]
        if type(word[0]) == tuple:
            word = zip(*word)
        p_id, _ = self.predict_batch([word])
        prediction = [self.tag_idx[idx] for idx in list(p_id[0])]

        return prediction


        

In [None]:
tf.nn.bidirectional_dynamic_rnn?