In [1]:
import os

class DefaultConfig():
    ###     Path     ###
    #   graph Path
    output_path = "results/"
    model_output = output_path + "model.weights/"
    log_path = output_path + "logs/"
    # preprocessed Data path
    test_filename = "data/test_data"
    train_filename = "data/train_data"
    #word2vec_filename = "data/wikipedia-200-mincount-20-window-8-cbow.bin"
    word2vec_filename = "data/wikipedia-100-mincount-30-window-8-cbow.bin"
    # preprocessed Data file names
    tags_filename = "data/tags.txt"
    words_filename = "data/words.txt"
    chars_filename = "data/chars.txt"
    # preprocessed data variables
    UNK = "$UNK$"
    NUM = "$NUM$"
    NONE = "O"

    ###  Hyper parameters    ###
    BATCH_SIZE = 40
    MAX_LENGTH_WORD = 50
    N_EPOCHS = 50
    LR = 0.001
    LR_DECAY = 0.95
    DROPOUT = 0.5
    # Char Embedding (CNN)
    CHAR_EMB_DIM = 120
    FILTER_SIZE = [2, 3, 4, 5]
    N_FILTERS = 128
    # BiLSTM
    HIDDEN_SIZE = 400

    ### Tensorflow op   ###
    NUM_THREADS = 4


In [2]:
import numpy as np
import os
#from config import DefaultConfig as cfg

class conllReader(object):
    """
    This class will iterate over CoNLL dataset.
    """

    def __init__(self, filename, processing_word=None, processing_tag=None):

        self.filename = filename
        self.processing_word = processing_word
        self.processing_tag = processing_tag
        self.length = None

    def __iter__(self):
        with open(self.filename, encoding='utf-8') as f:
            words, tags = [], []
            for line in f:
                line = line.strip()
                if len(line) == 0:
                    if len(words) != 0:
                        yield words, tags
                        words, tags = [], []
                else:
                    ls = line.split('\t')
                    word, tag = ls[1], ls[5]
                    if self.processing_word is not None:
                        word = self.processing_word(word)
                    if self.processing_tag is not None:
                        tag = self.processing_tag(tag)
                    words += [word]
                    tags += [tag]

    def __len__(self):
        """
        Iterates once over the corpus to set and store length
        """
        if self.length is None:
            self.length = 0
            for _ in self:
                self.length += 1

        return self.length


def get_vocabs(datasets):
    """
    Args:
        datasets: a list of dataset objects
    Return:
        a set of all the words in the dataset
    """
    print("Building vocab...")
    vocab_words = set()
    vocab_tags = set()
    for dataset in datasets:
        for words, tags in dataset:
            vocab_words.update(words)
            vocab_tags.update(tags)
    print("- done. {} tokens".format(len(vocab_words)))
    return vocab_words, vocab_tags


def get_char_vocab(dataset):
    """
    Args:
        dataset: a iterator yielding tuples (sentence, tags)
    Returns:
        a set of all the characters in the dataset
    """
    vocab_char = set()
    for words, _ in dataset:
        for word in words:
            vocab_char.update(word)

    return vocab_char


def get_sentences(datasets):
    """
    :param dataset: an iterator yielding tuples (sentence, tags)
    :return: a list of sentences
    """
    sentences = []
    for dataset in datasets:
        for sentence, _ in dataset:
            sentences.append(sentence)

    return sentences

def write_vocab(vocab, filename):
    """
    Writes a vocab to a file
    Args:
        vocab: iterable that yields word
        filename: path to vocab file
    Returns:
        write a word per line
    """
    print("Writing vocab...")
    with open(filename, "w") as f:
        for i, word in enumerate(vocab):
            if i != len(vocab) - 1:
                f.write("{}\n".format(word))
            else:
                f.write(word)
    print("- done. {} tokens".format(len(vocab)))

def load_vocab(filename):
    """
    Args:
        filename: file with a word per line
    Returns:
        d: dict[word] = index
    """
    try:
        d = dict()
        with open(filename) as f:
            for idx, word in enumerate(f):
                word = word.strip()
                d[word] = idx

    except IOError:
        print("Error loading file")
    return d

def get_processing_word(vocab_words=None, vocab_chars=None,
                    lowercase=False, chars=False):
    """
    Args:
        vocab: dict[word] = idx
    Returns:
        f("cat") = ([12, 4, 32], 12345)
                 = (list of char ids, word id)
    """
    def f(word):
        # 0. get chars of words
        if vocab_chars is not None and chars == True:
            char_ids = []
            for char in word:
                # ignore chars out of vocabulary
                if char in vocab_chars:
                    char_ids += [vocab_chars[char]]

        # 1. preprocess word
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = cfg.NUM

        # 2. get id of word
        if vocab_words is not None:
            if word in vocab_words:
                word = vocab_words[word]
            else:
                word = vocab_words[cfg.UNK]

        # 3. return tuple char ids, word id
        if vocab_chars is not None and chars == True:
            return char_ids, word
        else:
            return word

    return f

In [3]:
#from config import DefaultConfig as cfg

def pad_sequences(sequences, pad_token, type):
    '''
    add pad_token to the words, or sentences to have same length
    :param sequences: a list of words or sentences
    :param pad_token: the value should be added to all sequences
    :param type: either 'words' or 'sentences'
    :return: a list of words or sentences with same length
    '''
    if type == 'sentences':
        max_length = max(map(lambda x: len(x), sequences))
        sequence_padded, sequence_length = add_pad(sequences, pad_token, max_length)

    elif type == 'words':
        max_length_word = cfg.MAX_LENGTH_WORD#max([max(map(lambda x : len(x), seq)) for seq in sequences])
        sequence_padded, sequence_length = [], []
        for seq in sequences:
            sp, sl = add_pad(seq, pad_token, max_length_word)
            sequence_padded += [sp]
            sequence_length += [sl]

        max_length_sentence = max(map(lambda x: len(x), sequences))
        sequence_padded, _ = add_pad(sequence_padded, [pad_token]*max_length_word, max_length_sentence)
        sequence_length, _ = add_pad(sequence_length, 0, max_length_sentence)

    return sequence_padded, sequence_length


def add_pad(sequences, pad_token, max_length):
    '''
    add pad to sequences
    :param sequences: a list
    :param pad_token: pad token
    :param max_length: maximum length to be padded
    :return: Padded sequence and sequence length
    '''
    sequence_padded, sequence_length = [], []

    for seq in sequences:
        seq = list(seq)
        seq_ = seq[:max_length] + [pad_token]*max(max_length - len(seq), 0)
        sequence_padded += [seq_]
        sequence_length += [min(len(seq), max_length)]

    return sequence_padded, sequence_length


def batch_gen(data, minibatch_size):
    """
    Args:
        data: generator of (sentence, tags) tuples
        minibatch_size: (int)
    Returns:
        list of tuples
    """
    x_batch, y_batch = [], []
    for (x, y) in data:
        if len(x_batch) == minibatch_size:
            yield x_batch, y_batch
            x_batch, y_batch = [], []

        if type(x[0]) == tuple:
            x = zip(*x)
        x_batch += [x]
        y_batch += [y]

    if len(x_batch) != 0:
        yield x_batch, y_batch


def get_chunk_type(tok, idx_to_tag):
    """
    Args:
        tok: id of token, ex 4
        idx_to_tag: dictionary {4: "B-PER", ...}
    Returns:
        tuple: "B", "PER"
    """
    tag_name = idx_to_tag[tok]
    tag_class = tag_name.split('-')[0]
    tag_type = tag_name.split('-')[-1]
    return tag_class, tag_type


def get_chunks(seq, tags):
    """
    Args:
        seq: [4, 4, 0, 0, ...] sequence of labels
        tags: dict["O"] = 4
    Returns:
        list of (chunk_type, chunk_start, chunk_end)
    Example:
        seq = [4, 5, 0, 3]
        tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3}
        result = [("PER", 0, 2), ("LOC", 3, 4)]
    """
    default = tags[cfg.NONE]
    idx_to_tag = {idx: tag for tag, idx in tags.items()}
    chunks = []
    chunk_type, chunk_start = None, None
    for i, tok in enumerate(seq):
        # End of a chunk 1
        if tok == default and chunk_type is not None:
            # Add a chunk.
            chunk = (chunk_type, chunk_start, i)
            chunks.append(chunk)
            chunk_type, chunk_start = None, None

        # End of a chunk + start of a chunk!
        elif tok != default:
            tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag)
            if chunk_type is None:
                chunk_type, chunk_start = tok_chunk_type, i
            elif tok_chunk_type != chunk_type or tok_chunk_class == "B":
                chunk = (chunk_type, chunk_start, i)
                chunks.append(chunk)
                chunk_type, chunk_start = tok_chunk_type, i
        else:
            pass
    # end condition
    if chunk_type is not None:
        chunk = (chunk_type, chunk_start, len(seq))
        chunks.append(chunk)

    return chunks

In [5]:
#from config import DefaultConfig as cfg

def pad_sequences(sequences, pad_token, type):
    '''
    add pad_token to the words, or sentences to have same length
    :param sequences: a list of words or sentences
    :param pad_token: the value should be added to all sequences
    :param type: either 'words' or 'sentences'
    :return: a list of words or sentences with same length
    '''
    if type == 'sentences':
        max_length = max(map(lambda x: len(x), sequences))
        sequence_padded, sequence_length = add_pad(sequences, pad_token, max_length)

    elif type == 'words':
        max_length_word = cfg.MAX_LENGTH_WORD#max([max(map(lambda x : len(x), seq)) for seq in sequences])
        sequence_padded, sequence_length = [], []
        for seq in sequences:
            sp, sl = add_pad(seq, pad_token, max_length_word)
            sequence_padded += [sp]
            sequence_length += [sl]

        max_length_sentence = max(map(lambda x: len(x), sequences))
        sequence_padded, _ = add_pad(sequence_padded, [pad_token]*max_length_word, max_length_sentence)
        sequence_length, _ = add_pad(sequence_length, 0, max_length_sentence)

    return sequence_padded, sequence_length


def add_pad(sequences, pad_token, max_length):
    '''
    add pad to sequences
    :param sequences: a list
    :param pad_token: pad token
    :param max_length: maximum length to be padded
    :return: Padded sequence and sequence length
    '''
    sequence_padded, sequence_length = [], []

    for seq in sequences:
        seq = list(seq)
        seq_ = seq[:max_length] + [pad_token]*max(max_length - len(seq), 0)
        sequence_padded += [seq_]
        sequence_length += [min(len(seq), max_length)]

    return sequence_padded, sequence_length


def batch_gen(data, minibatch_size):
    """
    Args:
        data: generator of (sentence, tags) tuples
        minibatch_size: (int)
    Returns:
        list of tuples
    """
    x_batch, y_batch = [], []
    for (x, y) in data:
        if len(x_batch) == minibatch_size:
            yield x_batch, y_batch
            x_batch, y_batch = [], []

        if type(x[0]) == tuple:
            x = zip(*x)
        x_batch += [x]
        y_batch += [y]

    if len(x_batch) != 0:
        yield x_batch, y_batch


def get_chunk_type(tok, idx_to_tag):
    """
    Args:
        tok: id of token, ex 4
        idx_to_tag: dictionary {4: "B-PER", ...}
    Returns:
        tuple: "B", "PER"
    """
    tag_name = idx_to_tag[tok]
    tag_class = tag_name.split('-')[0]
    tag_type = tag_name.split('-')[-1]
    return tag_class, tag_type


def get_chunks(seq, tags):
    """
    Args:
        seq: [4, 4, 0, 0, ...] sequence of labels
        tags: dict["O"] = 4
    Returns:
        list of (chunk_type, chunk_start, chunk_end)
    Example:
        seq = [4, 5, 0, 3]
        tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3}
        result = [("PER", 0, 2), ("LOC", 3, 4)]
    """
    default = tags[cfg.NONE]
    idx_to_tag = {idx: tag for tag, idx in tags.items()}
    chunks = []
    chunk_type, chunk_start = None, None
    for i, tok in enumerate(seq):
        # End of a chunk 1
        if tok == default and chunk_type is not None:
            # Add a chunk.
            chunk = (chunk_type, chunk_start, i)
            chunks.append(chunk)
            chunk_type, chunk_start = None, None

        # End of a chunk + start of a chunk!
        elif tok != default:
            tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag)
            if chunk_type is None:
                chunk_type, chunk_start = tok_chunk_type, i
            elif tok_chunk_type != chunk_type or tok_chunk_class == "B":
                chunk = (chunk_type, chunk_start, i)
                chunks.append(chunk)
                chunk_type, chunk_start = tok_chunk_type, i
        else:
            pass
    # end condition
    if chunk_type is not None:
        chunk = (chunk_type, chunk_start, len(seq))
        chunks.append(chunk)

    return chunks

In [8]:
import tensorflow as tf
#from data_helper import pad_sequences, batch_gen, get_chunks
import numpy as np


class Model(object):
    def __init__(self, config, embeddings, ntags, nchars):
        '''
        Tensorflow model
        :param embeddings: word2vec embedding file which loaded
        :param ntags: number of tags
        :param nchars: number of chars
        '''
        self.cfg = config
        self.embeddings = embeddings
        self.nchars = nchars
        self.ntags = ntags

        self.add_placeholders()                 # Initial placeholders
        self.add_word_embeddings_op()           # add embedding operation to graph
        self.add_logits_op()                    # add logits operation to graph
        self.add_loss_op()                      # add loss operation to graph
        self.add_train_op()                     # add train (optimzier) operation to graph

        # Merge all summaries into a single op
        self.merged_summary_op = tf.summary.merge_all()

    def add_placeholders(self):
        '''
        Initial placeholders
        '''
        # Shape = (batch size, max length of sentences in batch)
        self.word_ids = tf.placeholder(tf.int32, shape=[None, None], name="word_ids")

        # Shape = (batch size)
        self.sentences_lengths = tf.placeholder(tf.int32, shape=[None], name="sentences_lengths")

        # Shape = (batch size, max length of sentences, max length of words)
        self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None], name="char_ids")

        # Shape = (batch size, max length of sentences)
        self.word_lengths = tf.placeholder(tf.int32, shape=[None, None], name="word_length")

        # Shape = (batch size, max length of sentences)
        self.labels = tf.placeholder(tf.int32, shape=[None, None], name="labels")

        # Learning rate for Optimization
        self.lr = tf.placeholder(tf.float32, shape=[], name="Learning_rate")

        # Dropout
        self.dropout = tf.placeholder(tf.float32, shape=[], name="Dropout")


    def add_word_embeddings_op(self):
        '''
        Add word embedings + Char CNN operation to graph
        '''
        with tf.variable_scope("words"):
            _word_embeddings = tf.Variable(self.embeddings, name="_word_embeddings", dtype=tf.float32, trainable=False)
            word_embeddings = tf.nn.embedding_lookup(_word_embeddings, self.word_ids, name="word_embeddings")

        with tf.variable_scope("chars"):
            xavi = tf.contrib.layers.xavier_initializer
            # Get char level embeddings matrix
            _char_embeddings = tf.get_variable("_char_embeddings", shape=[self.nchars, self.cfg.CHAR_EMB_DIM],
                                               dtype=tf.float32,
                                               initializer=xavi())
            self.char_embeddings = tf.nn.embedding_lookup(_char_embeddings,
                                                     self.char_ids,
                                                     name="char_embeddings")
            # get shape of char embd matrix
            s = tf.shape(self.char_embeddings)
            # Reshape char_embd matrix to [batches * sentence length , max_word_length , char_embedding_size]
            self.char_embeddings = tf.reshape(self.char_embeddings, [-1, self.cfg.MAX_LENGTH_WORD, self.cfg.CHAR_EMB_DIM])
            # Add one dimension at the end of char_emb matrix to have shape like:
            # [batches, height, width, channels] like an image. Here channel=1
            self.embedded_chars_expanded = tf.expand_dims(self.char_embeddings, -1)

            # Create a convolution + maxpool layer for each filter size
            pooled_outputs = []
            # Here we do convolution over [words x char_emb] with different filter sizes [2,3,4,5].
            for i, filter_size in enumerate(self.cfg.FILTER_SIZE):
                with tf.name_scope("conv-maxpool-%s" % filter_size):
                    # Define convolution filter Layer with shape = [filter_height, filter_width, in_channels, out_channels]
                    filter_shape = [filter_size, self.cfg.CHAR_EMB_DIM, 1, self.cfg.N_FILTERS]
                    W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W_char")
                    b = tf.Variable(tf.constant(0.1, shape=[self.cfg.N_FILTERS]), name="b_char")
                    # conv return shape= [batch * sentence_length, MAX_LENGTH_WORD - FILTER_SIZE + 1, 1, N_FILTERS]
                    conv = tf.nn.conv2d(
                        self.embedded_chars_expanded,
                        W,
                        strides=[1, 1, 1, 1],
                        padding="VALID",
                        name="conv")
                    # Apply nonlinearity
                    h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") # h has same shape as conv
                    # Maxpooling over the outputs
                    # return shape= [Batch_size, 1, 1, N_FILTERS]
                    pooled = tf.nn.max_pool(
                        h,
                        ksize=[1, self.cfg.MAX_LENGTH_WORD - filter_size + 1, 1, 1],
                        strides=[1, 1, 1, 1],
                        padding='VALID',
                        name="pool")
                    # Add all convolution outputs to a list
                    pooled_outputs.append(pooled)

            # Combine all the pooled features
            num_filters_total = self.cfg.N_FILTERS * len(self.cfg.FILTER_SIZE)
            # Concatinate all pooled features over 3th dimension
            self.h_pool = tf.concat(pooled_outputs, 3) # has shape= [Batch_size, 1, 1, len(FILTER_SIZE) * N_FILTERS]
            # Reshape data to shape= [Batch_Size, Sentence_Length, len(FILTER_SIZE) * N_FILTERS]
            self.h_pool_flat = tf.reshape(self.h_pool, [-1, s[1], num_filters_total])
            # Add char features to embedding words. Shape= [Batch_Size, Sentence_Length, Word_Emb_length + len(FILTER_SIZE) * N_FILTERS]
            word_embeddings = tf.concat([word_embeddings, self.h_pool_flat], axis=-1)
        # add Dropout regularization
        self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout)


    def get_feed_dict(self, words, labels=None, lr=None, dropout=None):
        """
        add pad to the data and build feed data for tensorflow
        :param words: data
        :param labels: labels
        :param lr: learning rate
        :param dropout: dropout probability
        :return: padded data with their corresponding length
        """
        # Unzip data to char_ids and word_ids
        char_ids, word_ids = zip(*words)
        # pad sentence to maximum sentence length of current batch
        word_ids, sentences_lengths = pad_sequences(word_ids, 0, type='sentences')
        # pad words to maximum word length of current batch
        char_ids, word_lengths = pad_sequences(char_ids, pad_token=0, type='words')

        feed = {
            self.word_ids: word_ids,
            self.sentences_lengths: sentences_lengths,
            self.char_ids: char_ids,
            self.word_lengths: word_lengths
        }

        if labels is not None:
            labels, _ = pad_sequences(labels, 0, type='sentences')
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        return feed, sentences_lengths


    def add_logits_op(self):
        """
        Adds logits to Model. We use BiLSTM + fully connected layer to predict word sequences labels
        """
        with tf.variable_scope("bi-lstm"):
            # Define Forwards cell
            cell_fw = tf.contrib.rnn.LSTMCell(self.cfg.HIDDEN_SIZE)
            # Define Backwards cell
            cell_bw = tf.contrib.rnn.LSTMCell(self.cfg.HIDDEN_SIZE)
            # Run BiLSTM
            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(cell_fw,
                                                                        cell_bw, self.word_embeddings,
                                                                        sequence_length=self.sentences_lengths,
                                                                        dtype=tf.float32)
            # Concatenate Forward and backward over last axis
            # The shape is: [Batch_size, Sentence_length, 2*HIDDEN_SIZE]
            rnn_output = tf.concat([output_fw, output_bw], axis=-1)
            # Apply Dropout regularization
            rnn_output = tf.nn.dropout(rnn_output, self.dropout)

        with tf.variable_scope("proj"):
            # Define weights and Biases
            W1 = tf.get_variable("W1", shape=[2 * self.cfg.HIDDEN_SIZE, self.cfg.HIDDEN_SIZE],
                                dtype=tf.float32,
                                initializer=tf.contrib.layers.xavier_initializer())

            b1 = tf.get_variable("b1", shape=[self.cfg.HIDDEN_SIZE], dtype=tf.float32,
                                initializer=tf.zeros_initializer())

            W2 = tf.get_variable("W2", shape=[self.cfg.HIDDEN_SIZE, self.ntags],
                                dtype=tf.float32,
                                initializer=tf.contrib.layers.xavier_initializer())

            b2 = tf.get_variable("b2", shape=[self.ntags], dtype=tf.float32,
                                initializer=tf.zeros_initializer())
            # get sentence length
            ntime_steps = tf.shape(rnn_output)[1]
            # Reshape to 2D to calculate W1. shape= [Batch_size * sentences_length, 2*HIDDEN_SIZE]
            rnn_output = tf.reshape(rnn_output, [-1, 2 * self.cfg.HIDDEN_SIZE])
            # Apply projection, return [Batch_size * sentences_length, HIDDEN_SIZE]
            w1_output = tf.matmul(rnn_output, W1) + b1
            # Apply nonlinearity
            w1_output = tf.nn.relu(w1_output, name="w1_relu")
            # Apply Dropout regularization
            w1_output = tf.nn.dropout(w1_output, self.dropout)
            # Apply projection, return shape= [Batch_size * sentences_length, N_Tags]
            pred = tf.matmul(w1_output, W2) + b2
            # Return back to shape= [[Batch_size , sentences_length, N_Tags]
            self.logits = tf.reshape(pred, [-1, ntime_steps, self.ntags])


    def add_loss_op(self):
        """
        Adds loss to Model
        """
        # Get highest probabilty of predicted labels
        self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1), tf.int32)
        # Compute loss
        losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.labels)
        # Use Mask to eliminate Zeros paddings
        mask = tf.sequence_mask(self.sentences_lengths)
        losses = tf.boolean_mask(losses, mask)
        # assign loss to self
        self.loss = tf.reduce_mean(losses)

        # Create a summary to monitor loss
        tf.summary.scalar("loss", self.loss)


    def add_train_op(self):
        """
        Add train_op to Model
        """
        with tf.variable_scope("train_step"):
            # In each epoch iteration, the Learning Rate will decay which defined in config file
            optimizer = tf.train.AdamOptimizer(self.lr)
            self.train_op = optimizer.minimize(self.loss)


    def predict_batch(self, sess, words, labels):
        """
        Args:
            sess: a tensorflow session
            words: list of sentences
            labels: list of true labels
        Returns:
            labels_pred: list of labels for each sentence
            sequence_length: length of sentences
            loss: loss of current batch
        """
        # get the feed dictionnary
        fd, sequence_lengths = self.get_feed_dict(words, labels, dropout=1.0)
        # Run Tensorflow graph
        labels_pred, loss = sess.run([self.labels_pred, self.loss], feed_dict=fd)
        return labels_pred, sequence_lengths, loss


    def run_evaluate(self, sess, test, tags):
        """
        Evaluates performance on dev set
        Args:
            sess: tensorflow session
            test: dataset that yields tuple of sentences, tags
            tags: {tag: index} dictionary
        Returns:
            accuracy
            f1 score
            loss
            Precision
            Recall
        This code honored to:
        https://guillaumegenthial.github.io/sequence-tagging-with-tensorflow.html
        """
        accs = []
        losses = 0.0
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for words, labels in batch_gen(test, self.cfg.BATCH_SIZE):
            labels_pred, sequence_lengths, loss = self.predict_batch(sess, words, labels)
            losses += loss
            for lab, lab_pred, length in zip(labels, labels_pred, sequence_lengths):
                lab = lab[:length] #TODO: it is useless!
                lab_pred = lab_pred[:length]
                accs += [a==b for (a, b) in zip(lab, lab_pred)]
                lab_chunks = set(get_chunks(lab, tags))
                lab_pred_chunks = set(get_chunks(lab_pred, tags))
                correct_preds += len(lab_chunks & lab_pred_chunks)
                total_preds += len(lab_pred_chunks)
                total_correct += len(lab_chunks)

        p = correct_preds / total_preds if correct_preds > 0 else 0
        r = correct_preds / total_correct if correct_preds > 0 else 0
        f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
        acc = np.mean(accs)
         # Create a summary to monitor accuracy
        tf.summary.scalar("accuracy", acc)
        # Create a summary to monitor Precision
        tf.summary.scalar("accuracy", p)
        # Create a summary to monitor Recall
        tf.summary.scalar("accuracy", r)
        return acc, f1, losses, p,r

In [11]:
#import conll_reader as cr
#from config import DefaultConfig
import gensim
import os
import sys
import tensorflow as tf
#from model import Model
#from data_helper import batch_gen


def train_model(cfg, train_set, dev_set, embed, tags, chars):

    # Build Model
    model = Model(cfg, embed, len(tags), len(chars))

    # initial session
    with tf.Session() as sess:
    # with tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=cfg.NUM_THREADS)) as sess:
        sess.run(tf.global_variables_initializer())
        # create log writer
        summary_writer = tf.summary.FileWriter(cfg.log_path, graph=tf.get_default_graph())
        # run epoch
        for epoch in range(cfg.N_EPOCHS):
            train_losses = 0.0
            validation_loss = 0.0
            accuracy = 0.0
            # Run batches
            i = 0 # counter for summary results.
            for words, labels in batch_gen(train_set, cfg.BATCH_SIZE):
                fd, _ = model.get_feed_dict(words, labels, cfg.LR, cfg.DROPOUT)
                # train model
                _, train_loss, summary = sess.run([model.train_op, model.loss, model.merged_summary_op], feed_dict=fd)
                train_losses += train_loss
                # Write logs at every iteration
                summary_writer.add_summary(summary, epoch * cfg.BATCH_SIZE + i)
                i += 1
            # Evaluate model after training
            accuracy, f1, validation_loss, p, r = model.run_evaluate(sess, dev_set, tags)
            # decay learning rate
            cfg.LR *= cfg.LR_DECAY

            print("epoch %d - train loss: %.2f, validation loss: %.2f, accuracy: %.2f  with f1: %.2f , P: %.2f , R: %.2f " % \
                (epoch + 1, train_losses, validation_loss, accuracy * 100, f1 * 100, p * 100, r * 100))


if __name__ == "__main__":
    if len(sys.argv) != 4:
        sys.stderr.write("Usage: %s wikipedia-xxx-mincount-xx-window-x-cbow.bin TRAIN_SET DEV_SET\n" % sys.argv[0])
        sys.exit(1)

    cfg = DefaultConfig()

    # check if data not processed, generate tags, words, chars
    if not (os.path.exists(cfg.words_filename) & os.path.exists(cfg.tags_filename) & os.path.exists(cfg.chars_filename)):
        print("preprocessed Data not found. processing data...")
        train = cr.conllReader(sys.argv[2])
        test = cr.conllReader(sys.argv[3])

        # get words and tags vocabulary from whole data
        vocab_words, vocab_tags = cr.get_vocabs([train, test])
        # Add unknown token and number to vocab
        vocab_words.add(cfg.UNK)
        vocab_words.add(cfg.NUM)
        # save all words and tags to file
        cr.write_vocab(vocab_tags, cfg.tags_filename)
        cr.write_vocab(vocab_words, cfg.words_filename)
        # get and save chars from dataset to file
        vocab_chars = cr.get_char_vocab(train)
        cr.write_vocab(vocab_chars, cfg.chars_filename)

    # load preprocessed vocabs
    try:
        vocab_words = cr.load_vocab(cfg.words_filename)
        vocab_tags  = cr.load_vocab(cfg.tags_filename)
        vocab_chars = cr.load_vocab(cfg.chars_filename)
    except IOError:
        print("Error loading words, tags, chars files")

    # Load wikipedia-200-mincount-20-window-8-cbow embedding file
    # Load wikipedia-xxx-mincount-xx-window-x-cbow embedding file
    try:
        word2vec = gensim.models.KeyedVectors.load_word2vec_format(sys.argv[1], binary=True)
        embeddings = word2vec.syn0
    except IOError:
        print("error loading file with genism: wikipedia-200-mincount-20-window-8-cbow")

    # assign processing options to processing function
    processing_word = cr.get_processing_word(vocab_words, vocab_chars,
                    lowercase=False, chars=True)
    processing_tag  = cr.get_processing_word(vocab_tags,
                    lowercase=False)
    # read trian and test set
    train = cr.conllReader(sys.argv[2], processing_word, processing_tag)
    test = cr.conllReader(sys.argv[3], processing_word, processing_tag)
    # train and test model
    train_model(cfg, train, test,  embeddings, vocab_tags, vocab_chars)

Usage: C:\LocalDiskD\ProgrammingRequisites\Anaconda\lib\site-packages\ipykernel_launcher.py wikipedia-xxx-mincount-xx-window-x-cbow.bin TRAIN_SET DEV_SET


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
