## Imports come here

In [1]:
# Show which Keras release is in use; the recorded output of this cell is 2.1.5.
import keras
print(keras.__version__)

Using TensorFlow backend.
  return f(*args, **kwds)


2.1.5


In [2]:
import codecs
import math
import os
import pickle
import re
from collections import OrderedDict

import keras
import numpy as np
from keras.callbacks import ModelCheckpoint
from keras.layers import Bidirectional,Input,merge #ChainCRF
from keras.layers.convolutional import Convolution1D, MaxPooling1D #Convolution2D, MaxPooling2D
from keras.layers.core import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import TimeDistributed
from keras.models import Model
from keras.optimizers import SGD
from keras.preprocessing.sequence import pad_sequences
#from keras.utils.np_utils import accuracy
#from keras.utils import np_utils

## Parameters

In [3]:
# Experiment configuration, built from ordered (key, value) pairs so the keys
# keep the order in which they are listed here.
parameters = OrderedDict([
    ('train', "./data/eng.train"),  # Path to the training file
    ('dev', "./data/eng.testa"),  # Path to the dev file
    ('test', "./data/eng.testb"),  # Path to the test file
    ('tag_scheme', "iob"),  # iob or iobes
    ('lower', True),  # Lowercase words when building / looking up the word vocabulary
    ('zeros', True),  # Replace every digit by 0
    ('char_dim', 30),  # Char embedding dimension
    ('char_lstm_dim', 25),  # Char LSTM hidden layer size
    ('char_bidirect', True),  # Use a bidirectional LSTM for chars
    ('word_dim', 100),  # Token embedding dimension
    ('word_lstm_dim', 200),  # Token LSTM hidden layer size
    ('word_bidirect', True),  # Use a bidirectional LSTM for words
    ('embedding_path', "./embeddings/glove/glove.6B.100d.txt"),  # Location of pretrained embeddings
    # ('all_emb', 1),  # Load all embeddings (disabled)
    ('cap_dim', 4),  # Capitalization feature dimension (0 to disable)
    ('crf', 1),  # Use CRF (0 to disable)
    ('dropout', 0.5),  # Dropout on the input (0 = no dropout)
    # ('lr_method', "sgd-lr_.005"),  # Learning method (SGD, Adadelta, Adam..) (disabled)
    ('epoch', 50),  # Number of epochs to run
    ('weights', ""),  # Path to pretrained weights from a previous run
    ('reload', ""),  # Path to reload the last saved model
])

## Load sentences

##### Data Preprocessing

In [4]:
def zero_digits(s):
    """
    Replace every digit in a string by a zero.

    Returns a new string; `s` itself is not modified.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    return re.sub(r'\d', '0', s)

In [5]:
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.

    path  -- file to read (decoded as UTF-8).
    lower -- accepted for interface compatibility; not used in this function.
    zeros -- when true, every digit on a line is replaced by 0 via zero_digits.

    Returns a list of sentences; each sentence is a list of token rows, and
    each row is the whitespace-split columns of one line. Sentences whose
    first column of the first row contains 'DOCSTART' are dropped.
    Raises AssertionError for a non-empty line with fewer than two columns.
    """
    sentences = []
    sentence = []
    # The context manager closes the file even if a malformed line aborts parsing.
    with codecs.open(path, 'r', 'utf8') as stream:
        for line in stream:
            line = zero_digits(line.rstrip()) if zeros else line.rstrip()
            if not line:
                # Blank line: end of the current sentence (if any).
                if len(sentence) > 0:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
            else:
                word = line.split()
                assert len(word) >= 2
                sentence.append(word)
    # The file may not end with a blank line; flush the last sentence.
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences

In [6]:
# Read the three corpus splits (same order as before: train, test, dev).
train_sentences, test_sentences, dev_sentences = (
    load_sentences(parameters[split_name], parameters['lower'], parameters['zeros'])
    for split_name in ('train', 'test', 'dev')
)

## Tag format checker

##### Function to modify and check Tagging Scheme

In [7]:
def iob2(tags):
    """
    Validate an IOB tag sequence and normalise it to IOB2 in place.

    Returns False as soon as a tag other than 'O' is not of the form
    'I-xxx' / 'B-xxx' (exactly one hyphen); returns True otherwise.
    An 'I-' tag that opens an entity (first position, after 'O', or after a
    tag with a different suffix) is rewritten to 'B-' inside `tags`.
    """
    for idx in range(len(tags)):
        current = tags[idx]
        if current == 'O':
            continue
        parts = current.split('-')
        if len(parts) != 2 or parts[0] not in ('I', 'B'):
            return False
        if parts[0] == 'B':
            continue
        # 'I-' tag: it starts a new entity unless it continues the previous one.
        opens_entity = (
            idx == 0
            or tags[idx - 1] == 'O'
            or tags[idx - 1][1:] != current[1:]
        )
        if opens_entity:  # conversion IOB1 to IOB2
            tags[idx] = 'B' + current[1:]
    return True


def iob_iobes(tags):
    """
    IOB -> IOBES

    Returns a new list; `tags` is left untouched. A 'B-'/'I-' tag that is not
    followed by an 'I-' tag becomes 'S-'/'E-' respectively. Raises Exception
    for any tag that is neither 'O' nor prefixed by 'B' or 'I'.
    """
    converted = []
    last_index = len(tags) - 1
    for pos, tag in enumerate(tags):
        if tag == 'O':
            converted.append(tag)
            continue
        prefix = tag.split('-')[0]
        if prefix not in ('B', 'I'):
            raise Exception('Invalid IOB format!')
        # Does the entity carry on into the next position?
        followed_by_inside = pos < last_index and tags[pos + 1].split('-')[0] == 'I'
        if followed_by_inside:
            converted.append(tag)
        elif prefix == 'B':
            converted.append(tag.replace('B-', 'S-'))
        else:
            converted.append(tag.replace('I-', 'E-'))
    return converted


def iobes_iob(tags):
    """
    IOBES -> IOB

    Returns a new list in which 'S-' tags become 'B-' and 'E-' tags become
    'I-'; tags prefixed by 'B', 'I' or 'O' are copied unchanged. Any other
    prefix raises Exception.
    """
    # prefix -> (substring to replace, replacement)
    rewrites = {'S': ('S-', 'B-'), 'E': ('E-', 'I-')}
    converted = []
    for tag in tags:
        prefix = tag.split('-')[0]
        if prefix in ('B', 'I', 'O'):
            converted.append(tag)
        elif prefix in rewrites:
            old, new = rewrites[prefix]
            converted.append(tag.replace(old, new))
        else:
            raise Exception('Invalid format!')
    return converted

In [8]:
def update_tag_scheme(sentences, tag_scheme):
    """
    Validate the tags of every sentence and rewrite them in place.

    The last column of each token row is taken as its tag. Tags are first
    checked / normalised with iob2; with tag_scheme 'iob' the normalised tags
    are written back, with 'iobes' they are converted via iob_iobes first.
    Raises Exception for a sentence that fails the IOB check or for an
    unknown tag_scheme.
    """
    for index, sentence in enumerate(sentences):
        tags = [row[-1] for row in sentence]
        # Check that tags are given in the IOB format
        if not iob2(tags):
            s_str = '\n'.join(' '.join(row) for row in sentence)
            raise Exception('Sentences should be given in IOB format! ' +
                            'Please check sentence %i:\n%s' % (index, s_str))
        if tag_scheme == 'iob':
            # iob2 already converted any IOB1 tags to IOB2 inside `tags`
            final_tags = tags
        elif tag_scheme == 'iobes':
            final_tags = iob_iobes(tags)
        else:
            raise Exception('Unknown tagging scheme!')
        for row, new_tag in zip(sentence, final_tags):
            row[-1] = new_tag

# Normalise the tags of every split to the configured scheme (train, dev, test).
for split_sentences in (train_sentences, dev_sentences, test_sentences):
    update_tag_scheme(split_sentences, parameters['tag_scheme'])

In [9]:
def create_dico(item_list):
    """
    Count items across a list of iterables.

    `item_list` must be a list (asserted); each element is iterated and every
    item found is tallied. Returns a plain dict mapping item -> count.
    """
    assert type(item_list) is list
    counts = {}
    for group in item_list:
        for element in group:
            counts[element] = counts.get(element, 0) + 1
    return counts

def create_mapping(dico):
    """
    Build item <-> ID lookups from a dictionary of counts.

    IDs are assigned by decreasing count, ties broken by the item itself in
    ascending order. Returns (item_to_id, id_to_item).
    """
    ranked = sorted(dico, key=lambda item: (-dico[item], item))
    id_to_item = dict(enumerate(ranked))
    item_to_id = {item: idx for idx, item in enumerate(ranked)}
    return item_to_id, id_to_item

def word_mapping(sentences, lower):
    """
    Build the word count dictionary and the word <-> ID lookups.

    The first column of each token row is the word; it is lowercased when
    `lower` is true. '<UNK>' is added with a count of 10000000 before the
    lookups are built. Returns (dico, word_to_id, id_to_word).
    """
    def normalise(token):
        return token.lower() if lower else token

    words = [[normalise(row[0]) for row in sentence] for sentence in sentences]
    dico = create_dico(words)
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico)
    total_tokens = sum(map(len, words))
    print("Found %i unique words (%i in total)" % (len(dico), total_tokens))
    return dico, word_to_id, id_to_word

def char_mapping(sentences):
    """
    Build the character count dictionary and the char <-> ID lookups.

    The words (first column) of each sentence are concatenated into one
    string, whose characters are then counted. Returns
    (dico, char_to_id, id_to_char).
    """
    joined_sentences = []
    for sentence in sentences:
        joined_sentences.append("".join(row[0] for row in sentence))
    dico = create_dico(joined_sentences)
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique characters" % len(dico))
    return dico, char_to_id, id_to_char

def tag_mapping(sentences):
    """
    Build the tag count dictionary and the tag <-> ID lookups.

    The last column of each token row is its tag. Returns
    (dico, tag_to_id, id_to_tag).
    """
    tags_per_sentence = []
    for sentence in sentences:
        tags_per_sentence.append([row[-1] for row in sentence])
    dico = create_dico(tags_per_sentence)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag

In [10]:
# Build the word / character / tag vocabularies from the training sentences only.
dico_words,word_to_id,id_to_word = word_mapping(train_sentences, parameters['lower'])
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

Found 17493 unique words (203621 in total)
Found 75 unique characters
Found 9 unique named entity tags


In [11]:
def cap_feature(s):
    """
    Capitalization feature of a token:
    0 = equal to its lowercased form
    1 = equal to its uppercased form
    2 = first character is unchanged by upper()
    3 = anything else (a capital somewhere other than the first character)
    """
    if s == s.lower():
        return 0
    if s == s.upper():
        return 1
    return 2 if s[0] == s[0].upper() else 3

In [12]:
def prepare_dataset(sentences, word_to_id, char_to_id, tag_to_id, lower=False):
    """
    Convert sentences into index-based records. Returns a list with one
    dictionary per sentence containing:
        - 'str_words': the raw words (first column of each row)
        - 'words': word indexes ('<UNK>' index for words missing from word_to_id)
        - 'chars': per-word char indexes (chars missing from char_to_id are skipped)
        - 'caps': capitalization feature of each raw word
        - 'tags': tag indexes (last column of each row)
    Words are lowercased before the word lookup when `lower` is true.
    """
    dataset = []
    for sentence in sentences:
        tokens = [row[0] for row in sentence]

        word_ids = []
        for token in tokens:
            key = token.lower() if lower else token
            if key not in word_to_id:
                key = '<UNK>'
            word_ids.append(word_to_id[key])

        # Skip characters that are not in the training set
        char_ids = [
            [char_to_id[ch] for ch in token if ch in char_to_id]
            for token in tokens
        ]

        dataset.append({
            'str_words': tokens,
            'words': word_ids,
            'chars': char_ids,
            'caps': [cap_feature(token) for token in tokens],
            'tags': [tag_to_id[row[-1]] for row in sentence],
        })
    return dataset

# Index every split with the vocabularies built from the training data.
train_data, dev_data, test_data = (
    prepare_dataset(split_sentences, word_to_id, char_to_id, tag_to_id, parameters['lower'])
    for split_sentences in (train_sentences, dev_sentences, test_sentences)
)
print("{} / {} / {} sentences in train / dev / test.".format(len(train_data), len(dev_data), len(test_data)))

14041 / 3250 / 3453 sentences in train / dev / test.


In [13]:
# Model parameters
parameters.update([
    ('word_vocab_size', len(dico_words)),  # total vocab size for words
    ('char_vocab_size', len(dico_chars)),  # total vocab size for characters
    ('batch_size', 10),  # batch size
    ('cnn_nb_filters', 30),  # number of filters in the CNN
    ('cnn_window_length', 3),  # window length in the CNN
])

# SGD parameters
parameters.update([
    ('learning_rate', 0.015),  # learning rate for SGD
    ('decay_rate', 0.05),  # decay rate for the learning rate
    ('momentum', 0.9),  # momentum parameter for SGD
    ('clipvalue', 5.0),  # gradient clipping value
    ('max_words', 100),  # a sentence can have at most 100 words
    ('max_chars', 20),  # a word can have at most 20 chars
])

In [14]:
def pad_parameter (dataSet, key,max_words,max_chars):
    """
    Left-pad (with "<UNK>") or clip the list stored under `key` in every row
    of `dataSet` so that each result has `max_words` entries.

    Lists longer than `max_words` keep their first `max_words` entries.
    `max_chars` is accepted but not used. Returns a list of lists.
    """
    padded_rows = []
    for row in dataSet:
        words = row[key]
        missing = max_words - len(words)
        filler = ["<UNK>"] * missing if missing > 0 else []
        # we clip sentences bigger than max_words
        padded_rows.append(filler + words[:max_words])
    return padded_rows

def CreateX_Y(dataSet,max_words=100,maxCharLength=20):
    """
    Turn prepared sentence records into fixed-size model inputs.

    dataSet       -- list of dicts with 'words', 'tags', 'caps', 'chars' and
                     'str_words' entries (as built by prepare_dataset).
    max_words     -- number of word slots per sentence.
    maxCharLength -- number of character slots per word.

    Returns (Words_id, tag, caps, char, str_words):
      * Words_id, tag, caps -- results of pad_sequences with maxlen=max_words
      * char      -- array with max_words * maxCharLength char ids per sentence
      * str_words -- the raw words, left-padded with "<UNK>" by pad_parameter

    Every output is padded on the left and, for sentences longer than
    max_words, keeps the FIRST max_words tokens.
    """
    Words_id = [row['words'] for row in dataSet]
    tag = [row['tags'] for row in dataSet]
    caps = [row['caps'] for row in dataSet]

    str_words = pad_parameter(dataSet,'str_words',max_words,maxCharLength)

    char = []
    for row in dataSet:
        kept_words = row['chars'][:max_words]
        # Missing words become all-zero character slots in front of the sentence.
        sentence = [0] * (maxCharLength * (max_words - len(kept_words)))
        for word in kept_words:
            clipped = word[:maxCharLength]
            # Left-pad each word to exactly maxCharLength ids.
            sentence.extend([0] * (maxCharLength - len(clipped)))
            sentence.extend(clipped)
        # max_words words per sentence, each of which has maxCharLength chars
        char.append(sentence)

    # pad_sequences truncates from the front by default, which would keep the
    # LAST max_words tokens while `char` and `str_words` keep the FIRST ones.
    # truncating='post' keeps all five outputs aligned for long sentences.
    Words_id = pad_sequences(Words_id,maxlen=max_words,truncating='post')
    tag = pad_sequences(tag,maxlen=max_words,truncating='post')
    caps = pad_sequences(caps,maxlen=max_words,truncating='post')
    char = np.asarray(char)

    return Words_id,tag,caps,char,str_words

In [15]:
# Build the padded model inputs for each split (train, test, dev).
(Words_id_train, tag_train, caps_train, char_train, Words_str_train) = CreateX_Y(
    train_data, parameters['max_words'], parameters['max_chars'])
(Words_id_test, tag_test, caps_test, char_test, Words_str_test) = CreateX_Y(
    test_data, parameters['max_words'], parameters['max_chars'])
(Words_id_dev, tag_dev, caps_dev, char_dev, Words_str_dev) = CreateX_Y(
    dev_data, parameters['max_words'], parameters['max_chars'])

In [16]:
# Number of distinct tags seen in the training data.
parameters['tag_label_size'] = len(tag_to_id)

In [17]:
# Append a trailing axis of size 1 to each tag array.
# NOTE: re-running this cell adds yet another axis.
tag_train, tag_dev, tag_test = (
    np.expand_dims(tag_array, -1) for tag_array in (tag_train, tag_dev, tag_test)
)

In [18]:
def initialize_embed_matrix(word_to_id,ext_emb_path,word_vocab_size,word_embedding_dim):
    """
    Build the initial word-embedding matrix from a pretrained embedding file.

    word_to_id         -- mapping word -> row index in the matrix.
    ext_emb_path       -- text file with one "<word> <v1> ... <vN>" entry per line.
    word_vocab_size    -- vocabulary size; the matrix has word_vocab_size + 1 rows.
    word_embedding_dim -- number of columns of the matrix.

    Words present in the file get their pretrained vector. Every other word in
    word_to_id gets a vector drawn uniformly from
    [-sqrt(3/dim), +sqrt(3/dim)]. Rows not referenced by word_to_id stay zero.
    Returns the matrix as a numpy array.
    """
    #based on https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
    embeddings_index = {}
    # Explicit encoding so the result does not depend on the platform default;
    # the context manager closes the file even if a line fails to parse.
    with open(ext_emb_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # tolerate blank lines
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embedding_weights_range = math.sqrt(3.0 / word_embedding_dim)
    embedding_matrix = np.zeros((word_vocab_size + 1, word_embedding_dim))
    for word, i in word_to_id.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            # Size follows word_embedding_dim (it used to be hard-coded to 100,
            # which broke any other embedding dimension).
            embedding_matrix[i] = np.random.uniform(low=-embedding_weights_range,
                                                    high=embedding_weights_range,
                                                    size=word_embedding_dim)
    return embedding_matrix

# Initial weights for the word embedding layer, seeded from the pretrained vectors.
embedding_matrix = initialize_embed_matrix(
    word_to_id,
    parameters['embedding_path'],
    parameters['word_vocab_size'],
    parameters['word_dim'],
)

##### Model Build

In [None]:
def build_model(parameters,embedding_matrix =None, weightsPath = None):
    """
    Build and compile the tagging model: word, capitalization and char-CNN
    features are concatenated, passed through a bidirectional LSTM, a
    per-timestep Dense layer and a ChainCRF output layer.

    parameters       -- configuration mapping; the keys read are listed below.
    embedding_matrix -- optional initial weights for the word embedding layer.
    weightsPath      -- optional path of saved model weights to load before
                        compiling.

    The model takes its inputs in the order [word_input, caps_input, char_input].
    Returns the compiled model (its summary is printed as a side effect).
    """
    lstm_dim = parameters['word_lstm_dim']
    word_vocab_size = parameters['word_vocab_size']
    char_vocab_size = parameters['char_vocab_size']
    char_embedding_dim = parameters['char_dim']
    word_embedding_dim = parameters['word_dim']
    maxCharSize = parameters['max_chars']
    # 'cap_size' is not set anywhere in the visible notebook; cap_feature emits
    # the values 0..3, so default to 4 distinct capitalization classes.
    cap_size = parameters.get('cap_size', 4)
    cap_embed_size = parameters['cap_dim']
    max_words = parameters['max_words']
    nb_filters = parameters['cnn_nb_filters']
    window_length = parameters['cnn_window_length']
    learning_rate = parameters['learning_rate']
    decay_rate = parameters['decay_rate']
    momentum = parameters['momentum']
    clipvalue = parameters['clipvalue']
    tag_label_size = parameters['tag_label_size']
    dropout = parameters['dropout']

    # Characters arrive flattened: maxCharSize ids per word, max_words words per sentence.
    char_input = Input(shape=(maxCharSize * max_words,), dtype='int32', name='char_input')
    char_emb = Embedding(char_vocab_size, char_embedding_dim, input_length=max_words*maxCharSize, dropout=dropout, name='char_emb')(char_input)
    char_cnn = Convolution1D(nb_filter=nb_filters,filter_length= window_length, activation='tanh', border_mode='full') (char_emb)
    # Pooling over maxCharSize steps is meant to yield one feature vector per word.
    char_max_pooling = MaxPooling1D(pool_length=maxCharSize) (char_cnn)

    #based on https://github.com/pressrelations/keras/blob/a2d358e17ea7979983c3c6704390fe2d4b29bbbf/examples/conll2000_bi_lstm_crf.py
    word_input = Input(shape=(max_words,), dtype='int32', name='word_input')
    if (embedding_matrix is not None):
        word_emb = Embedding(word_vocab_size+1, word_embedding_dim,weights=[embedding_matrix], input_length=max_words, dropout=0, name='word_emb')(word_input)
    else:
        word_emb = Embedding(word_vocab_size+1, word_embedding_dim, input_length=max_words, dropout=0, name='word_emb')(word_input)

    caps_input = Input(shape=(max_words,), dtype='int32', name='caps_input')
    caps_emb = Embedding(cap_size, cap_embed_size, input_length=None, dropout=dropout, name='caps_emb')(caps_input)
    # Concatenate the three feature sets along the feature axis (axis 2).
    total_emb = merge([word_emb, caps_emb,char_max_pooling], mode='concat', concat_axis=2,name ='total_emb')
    emb_droput = Dropout(dropout)(total_emb)
    bilstm_word  = Bidirectional(LSTM(lstm_dim,inner_init='uniform', forget_bias_init='one',return_sequences=True))(emb_droput)
    bilstm_word_d = Dropout(dropout)(bilstm_word)

    dense = TimeDistributed(Dense(tag_label_size))(bilstm_word_d)
    # NOTE(review): ChainCRF is not imported anywhere in the visible notebook
    # (it only appears in a comment on the keras.layers import line); it has to
    # be provided by a CRF-enabled Keras build before this function can run.
    crf = ChainCRF()
    crf_output = crf(dense)
    # SGD with gradient clipping
    #info on nesterov http://stats.stackexchange.com/questions/211334/keras-how-does-sgd-learning-rate-decay-work
    sgd = SGD(lr=learning_rate, decay=decay_rate, momentum=momentum, nesterov=False,clipvalue = clipvalue)

    model = Model(input=[word_input,caps_input,char_input], output=[crf_output])
    if(weightsPath):
        model.load_weights(weightsPath)
    model.compile(loss=crf.sparse_loss,
                  optimizer=sgd,
                  metrics=['sparse_categorical_accuracy'])

    model.summary()
    return model

def train_model (model,parameters,Words_id_train,caps_train,char_train,tag_train,Words_id_dev=None,caps_dev=None,char_dev = None,tag_dev=None):
    """
    Fit `model` on the training arrays and return it.

    parameters -- configuration mapping; 'batch_size' and 'epoch' are read.
    *_train    -- training inputs (words, caps, chars) and target tags.
    *_dev      -- optional validation inputs and tags. Validation is only
                  enabled when all four are given; passing only some of them
                  raises ValueError.

    A ModelCheckpoint callback saves the weights whenever the training loss
    improves.
    """
    # define the checkpoint
    filepath="weights-improvement-BiLSTM-All-no-wd-{epoch:02d}-{loss:.4f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    batch_size = parameters['batch_size']
    epoch_number = parameters['epoch']

    fit_kwargs = dict(batch_size=batch_size, nb_epoch=epoch_number, callbacks=callbacks_list)

    # The dev arguments default to None; only hand a validation set to fit()
    # when a complete one was supplied, instead of passing ([None, None, None], None).
    dev_parts = [Words_id_dev, caps_dev, char_dev, tag_dev]
    provided = [part is not None for part in dev_parts]
    if all(provided):
        fit_kwargs['validation_data'] = ([Words_id_dev,caps_dev,char_dev], tag_dev)
    elif any(provided):
        raise ValueError('Words_id_dev, caps_dev, char_dev and tag_dev must be given together')

    model.fit([Words_id_train,caps_train,char_train], tag_train, **fit_kwargs)
    return model

In [None]:
# `weights` was never defined and 'pre_emb' was never set in `parameters`;
# read both from the configuration instead.
weights = parameters.get('weights') or None
# Fall back to 'embedding_path', the key the configuration cell actually defines.
use_pretrained_emb = bool(parameters.get('pre_emb', parameters.get('embedding_path')))

if weights:
    assert os.path.isfile(weights), "weights file not found: %s" % weights

model = build_model(
    parameters,
    embedding_matrix=embedding_matrix if use_pretrained_emb else None,
    weightsPath=weights,
)

In [None]:
# Persist the vocabularies and the configuration next to the notebook.
# Each file is opened in a `with` block so the handle is flushed and closed.
for pickle_name, pickle_obj in (
    ("word_to_id.pkl", word_to_id),
    ("char_to_id.pkl", char_to_id),
    ("tag_to_id.pkl", tag_to_id),
    ("id_to_tag.pkl", id_to_tag),
    ("parameters.pkl", parameters),
):
    with open(pickle_name, 'wb') as pickle_file:
        pickle.dump(pickle_obj, pickle_file)

In [None]:
# Train with the dev split as validation data, then score the test split.
model = train_model(
    model, parameters,
    Words_id_train, caps_train, char_train, tag_train,
    Words_id_dev=Words_id_dev, caps_dev=caps_dev, char_dev=char_dev, tag_dev=tag_dev,
)
scores = model.evaluate([Words_id_test, caps_test, char_test], tag_test, verbose=0)