In [1]:
# Load our training data
import json
import random
import numpy
with open("data/ner_train.json") as f:
    data=json.load(f)
print(data[0])

# We need to gather the texts, into a list
texts=[one_example["text"] for one_example in data]
labels=[one_example["tags"] for one_example in data] # This is now a list of lists just like the texts variable
print(texts[:2])
print(labels[:2])
texts[0][0] = 'EU '
print(texts[0])

# Lets do the same thing for the validation data
# We use a separate validation set, since generally using sentences from the same documents as train/validation results in overly optimistic scores
with open("data/ner_test.json") as f:
    validation_data=json.load(f)
validation_texts=[one_example["text"] for one_example in validation_data]
validation_labels=[one_example["tags"] for one_example in validation_data]

{'text': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'tags': ['I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}
[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn']]
[['I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['I-PER', 'I-PER']]
['EU ', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']


In [16]:
# Use gensim to read the embedding model

from gensim.models import KeyedVectors

vector_model=KeyedVectors.load_word2vec_format("data/wiki-news-300d-1M.vec", binary=False, limit=50000)

# sort based on the index to make sure they are in the correct order
words=[k for k,v in sorted(vector_model.vocab.items(), key=lambda x:x[1].index)]
print("Words from embedding model:",len(words))
print("First 50 words:",words[:50])

# Normalize the vectors

print("Before normalization:",vector_model.get_vector("in")[:10])
vector_model.init_sims(replace=True)
print("After normalization:",vector_model.get_vector("in")[:10])

# Build vocabulary mappings

vocabulary={"<SPECIAL>": 0, "<OOV>": 1} # zero has a special meaning in sequence models, prevent using it for a normal word
for word in words:
    vocabulary.setdefault(word, len(vocabulary))

print("Words in vocabulary:",len(vocabulary))
inversed_vocabulary={value:key for key, value in vocabulary.items()} # inverse the dictionary

# Label mappings
label_set = set([label for sentence_labels in labels for label in sentence_labels])
label_map = {label: index for index, label in enumerate(label_set)}
                
# Embedding matrix

def load_pretrained_embeddings(vocab, embedding_model):
    """ vocab: vocabulary from our data vectorizer, embedding_model: model loaded with gensim """
    pretrained_embeddings=numpy.random.uniform(low=-0.05, high=0.05, size=(len(vocab)-1,embedding_model.vectors.shape[1]))
    pretrained_embeddings = numpy.vstack((numpy.zeros(shape=(1,embedding_model.vectors.shape[1])), pretrained_embeddings))
    found=0
    for word,idx in vocab.items():
        if word in embedding_model.vocab:
            pretrained_embeddings[idx]=embedding_model.get_vector(word)
            found+=1
            
    print("Found pretrained vectors for {found} words.".format(found=found))
    return pretrained_embeddings

pretrained=load_pretrained_embeddings(vocabulary, vector_model)

Words from embedding model: 50000
First 50 words: [',', 'the', '.', 'and', 'of', 'to', 'in', 'a', '"', ':', ')', 'that', '(', 'is', 'for', 'on', '*', 'with', 'as', 'it', 'The', 'or', 'was', "'", "'s", 'by', 'from', 'at', 'I', 'this', 'you', '/', 'are', '=', 'not', '-', 'have', '?', 'be', 'which', ';', 'all', 'his', 'has', 'one', 'their', 'about', 'but', 'an', '|']
Before normalization: [-0.0234 -0.0268 -0.0838  0.0386 -0.0321  0.0628  0.0281 -0.0252  0.0269
 -0.0063]
After normalization: [-0.0163762  -0.01875564 -0.05864638  0.02701372 -0.02246478  0.04394979
  0.01966543 -0.0176359   0.01882563 -0.00440898]
Words in vocabulary: 50002
Found pretrained vectors for 50000 words.


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.03059016, -0.01876584,  0.04221981, ...,  0.04360978,
        -0.01395623,  0.00355736],
       [ 0.0644694 ,  0.00534741,  0.0003605 , ...,  0.00300417,
         0.07047773, -0.02403333],
       ...,
       [-0.03178235, -0.05295902, -0.02630622, ..., -0.00124773,
         0.00859544,  0.01202669],
       [-0.06698205, -0.02592853, -0.00983124, ..., -0.05610647,
        -0.02697288,  0.09305462],
       [ 0.05303287,  0.01965057,  0.02571739, ..., -0.10144904,
         0.01290309, -0.05312166]])

In [2]:
docs = []
sentences = []
sentiments = []

for sentences, sentiment in zip(texts, labels):
    sentences_cleaned = [sent.lower() for sent in sentences]
    docs.append(sentences_cleaned)
    sentiments.append(sentiment)

len(docs), len(sentiments)

(14041, 14041)

In [3]:
gold_docs = []
gold_sentences = []
gold_sentiments = []

for sentences, sentiment in zip(validation_texts, validation_labels):
    sentences_cleaned = [sent.lower() for sent in sentences]
    gold_docs.append(sentences_cleaned)
    gold_sentiments.append(sentiment)

len(gold_docs), len(gold_sentiments)

(3250, 3250)

In [4]:
txt = ''
for doc in docs:
    for s in doc:
        for c in s:
            txt += c
chars = set(txt)

print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 59


In [6]:
import numpy

def char_vectorizer(vocab, texts, label_map, labels=None):
    vectorized_data = [] # turn text into numbers based on our vocabulary mapping
    vectorized_labels = [] # same thing for the labels
    sentence_lengths = [] # Number of tokens in each sentence
    
    for i, one_example in enumerate(texts):
        vectorized_example = []
        vectorized_example_labels = []
        for word in one_example:
            vectorized_example.append(vocab.get(word, 1)) # 1 is our index for out-of-vocabulary tokens
        if labels:
            for label in labels[i]:
                vectorized_example_labels.append(label_map[label])

        vectorized_data.append(vectorized_example)
        vectorized_labels.append(vectorized_example_labels)
        
        sentence_lengths.append(len(one_example))
        
    vectorized_data = numpy.array(vectorized_data) # turn python list into numpy matrix
    vectorized_labels = numpy.array(vectorized_labels)
    
    return vectorized_data, vectorized_labels, sentence_lengths

vectorized_data, vectorized_labels, lengths=char_vectorizer(char_indices, texts, label_map, labels)
validation_vectorized_data, validation_vectorized_labels, validation_lengths=vectorizer(char_indices, validation_texts, label_map, validation_labels)

NameError: name 'label_map' is not defined

In [7]:
import numpy as np
maxlen = 1024
max_sentences = 30

X = np.ones((len(docs), max_sentences, maxlen), dtype=np.int64) * -1
y = np.array(sentiments)

for i, doc in enumerate(docs):
    for j, sentence in enumerate(doc):
        if j < max_sentences:
            for t, char in enumerate(sentence[-maxlen:]):
                X[i, j, (maxlen-1-t)] = char_indices[char]

In [10]:
X.shape
#y.shape

(14041, 30, 1024)

In [11]:
Xv = np.ones((len(gold_docs), max_sentences, maxlen), dtype=np.int64) * -1
yv = np.array(gold_sentiments)

for i, doc in enumerate(gold_docs):
    for j, sentence in enumerate(doc):
        if j < max_sentences:
            for t, char in enumerate(sentence[-maxlen:]):
                Xv[i, j, (maxlen-1-t)] = char_indices[char]

In [12]:
import keras

def evaluate(predictions, gold, lengths):
    pred_entities = [_convert_to_entities(labels[:lengths[i]]) for i, labels in enumerate(predictions)]
    
    gold_entities = [_convert_to_entities(labels[:lengths[i], 0]) for i, labels in enumerate(gold)]
    
    tp = sum([len(pe.intersection(gold_entities[i])) for i, pe in enumerate(pred_entities)])
    pred_count = sum([len(e) for e in pred_entities])
    try:
        precision = tp / pred_count
        recall = tp / sum([len(e) for e in gold_entities])
        fscore = 2 * precision * recall / (precision + recall)
    except Exception as e:
        precision, recall, fscore = 0.0, 0.0, 0.0
    print('\nPrecision/Recall/F-score: %s / %s / %s' % (precision, recall, fscore))


def _convert_to_entities(input_sequence):
    """
    Reads a sequence of tags and converts them into a set of entities.
    """
    entities = []
    current_entity = []
    previous_tag = label_map['O']
    for i, tag in enumerate(input_sequence):
        if tag != previous_tag and tag != label_map['O']: # New entity starts
            if len(current_entity) > 0:
                entities.append(current_entity)
                current_entity = []
            current_entity.append((tag, i))
        elif tag == label_map['O']: # Entity has ended
            if len(current_entity) > 0:
                entities.append(current_entity)
                current_entity = []
        elif tag == previous_tag: # Current entity continues
            current_entity.append((tag, i))
        previous_tag = tag
    
    # Add the last entity to our entity list if the sentences ends with an entity
    if len(current_entity) > 0:
        entities.append(current_entity)
    
    entity_offsets = set()
    
    for e in entities:
        entity_offsets.add((e[0][0], e[0][1], e[-1][1]+1))
    
    return entity_offsets

class EvaluateEntities(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        pred = numpy.argmax(self.model.predict(Xv), axis=-1)
        evaluate(pred, yv, validation_lengths)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [18]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Activation, Conv1D, TimeDistributed, LSTM, Bidirectional, MaxPooling1D
from keras.optimizers import SGD, Adam

vector_size= pretrained.shape[1]

In [19]:
##CHAR

#inp=Input(shape=(sequence_len,))
inp = Input(shape=(maxlen, len(chars)),dtype='float32')

conv = Conv1D(256, 3,
                     border_mode='valid', activation='relu',
                     input_shape=(maxlen, len(chars)))(inp)
conv = MaxPooling1D(pool_length=3)(conv)

embeddings=Embedding(len(vocabulary), vector_size, mask_zero=False, trainable=False, weights=[pretrained])(conv)
cnn = Conv1D(100,3, activation='relu', padding='same')(embeddings)
#cnn1 = Conv1D(100,3, activation='relu', padding='same')(inp)
#dense=TimeDistributed(Dense(class_count, activation="softmax"))(cnn)
#cnn2=Conv1D(100,3, activation='relu', padding='same')(dense)
outp=TimeDistributed(Dense(class_count, activation="softmax"))(cnn)
model=Model(inputs=[inp], outputs=[outp])

lr_cnn=0.01
batch_size_cnn=100
epochs_cnn=10

optimizer=Adam(lr=lr_cnn) # define the learning rate
model.compile(optimizer=optimizer,loss="sparse_categorical_crossentropy", sample_weight_mode='temporal')

# This is our model for outputting the time step wise kernel activations.
cnn_out_model=Model(inputs=[inp], outputs=[cnn])
# We have to compile the model, but we nerver train it directly
cnn_out_model.compile(optimizer=optimizer,loss="sparse_categorical_crossentropy",sample_weight_mode='temporal')

print(model.summary())

# train
hist_cnn=model.fit(vectorized_data_padded,vectorized_labels_padded, sample_weight=weights, batch_size=batch_size_cnn,verbose=1,epochs=epochs_cnn, callbacks=[EvaluateEntities()])

  
  if __name__ == '__main__':


ValueError: Input 0 is incompatible with layer conv1d_4: expected ndim=3, found ndim=4