In this notebook we'll try to create NER system.
The purpose of this system is to extract Persons, Organizations and Locations

![img](http://researchkb.files.wordpress.com/2014/02/ner.png)

In [1]:
import sys, json, codecs, csv
import numpy as np
import random
import pandas as pd
import tqdm
import itertools
import seaborn as sns

# visualization
import matplotlib
import numpy as np
from sklearn.model_selection import train_test_split
from itertools import chain
import matplotlib.pyplot as plt
%matplotlib inline  

Import of keras layers

In [2]:
from keras.layers import Input, LSTM, Embedding, Dense, Dropout
from keras.layers.wrappers import Bidirectional
from keras.models import Model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


dataset can be downloaded from this source:
https://github.com/EuropeanaNewspapers/ner-corpora

In [3]:
with open('enp_FR.bnf.bio.txt') as f:
    text = f.read()

In [4]:
print(text[:100])

Emmanuel I-PER
DESOLES I-PER
de O
LOU O
Directeur O
politique O
BГЉ>ГЂCTION O
ET O
ADMINISTRATION O



In [5]:
words = []
types = []
for item in text.split('\n'):
    item = item.strip()
    if len(item) == 0:
        continue
    [w, t] = item.split(' ')
    words.append(w)
    types.append(t)

In [6]:
unique_types = list(set(types))
type2id = {x:index for index, x in enumerate(unique_types)}
print(type2id)

{'I-LOC': 0, 'O': 1, 'I-PER': 2, 'I-ORG': 3}


In [7]:
from collections import Counter
word2count = Counter(words)
MAX_WORD_COUNT = 30000
top_words = [x[0] for x in sorted(word2count.items(), key=lambda x: x[1], reverse=True)][:MAX_WORD_COUNT]
word2id = {x:index+1 for index, x in enumerate(top_words)}

In [8]:
train_dataset, test_dataset = train_test_split(list(zip(words, types)), train_size=0.7)



In [9]:
from keras_contrib.layers import CRF

In [41]:
crf = CRF(10, sparse_target=True)

input = Input(shape=(None,))
out = Embedding(input_dim=len(word2id)+1, output_dim=200)(input)
out=Dropout(0.01)(out)
out = Bidirectional(LSTM(200, activation='relu', return_sequences=True))(out)
out = Dense(len(type2id), activation='softmax')(out)
out =crf(out)
model = Model(input, out)
model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, None)              0         
_________________________________________________________________
embedding_11 (Embedding)     (None, None, 200)         6000200   
_________________________________________________________________
dropout_11 (Dropout)         (None, None, 200)         0         
_________________________________________________________________
bidirectional_10 (Bidirectio (None, None, 400)         641600    
_________________________________________________________________
dense_9 (Dense)              (None, None, 4)           1604      
_________________________________________________________________
crf_10 (CRF)                 (None, None, 10)          170       
Total params: 6,643,574
Trainable params: 6,643,574
Non-trainable params: 0
_________________________________________________________________


In [42]:
def getWordId(w):
    return 0 if not w in word2id else word2id[w]

def gen_batches(dataset, batch_size=64, seq_size=32, batch_count=100):
    random.shuffle(dataset)
    
    features = np.zeros((batch_size, seq_size))
    labels = np.zeros((batch_size, seq_size, len(type2id)))
    for _ in range(batch_count):
        for seq_index in range(batch_size):
            left = random.randint(0, len(dataset) - seq_size)
            features[seq_index,:] = [getWordId(x[0]) for x in dataset[left:left+seq_size]]
            labels[seq_index,:] = 0
            for i,(_,t) in enumerate(dataset[left:left+seq_size]):
                labels[seq_index,i] = 0
                labels[seq_index,i,type2id[t]] = 1
        yield features, labels
        
def encode_text(sentence):
    words = sentence.split()
    result = np.zeros((len(words),))
    for i,w in enumerate(words):
        result[i] = getWordId(w)
    return result

In [43]:
from keras.callbacks import TensorBoard
import tensorflow as tf
import os, shutil


def write_log(callback, names, logs, batch_no):
    for name, value in zip(names, logs):
        summary = tf.Summary()
        summary_value = summary.value.add()
        summary_value.simple_value = value
        summary_value.tag = name
        callback.writer.add_summary(summary, batch_no)
        callback.writer.flush()
        
logs_dir = './logs'
callback = TensorBoard(logs_dir)
callback.set_model(model)

In [44]:
unique_types

['I-LOC', 'O', 'I-PER', 'I-ORG']

In [45]:
for epoch in range(100):
    losses = []
    print(epoch)
    for x,y in gen_batches(train_dataset, batch_count=32):
        loss = model.train_on_batch(x, y)
        losses.append(loss)
    train_loss = np.mean(losses)
        
    losses = []
    for x,y in gen_batches(test_dataset, batch_count=32):
        loss = model.test_on_batch(x, y)
        losses.append(loss)
    test_loss = np.mean(losses)
    print("train "+str(train_loss))
    print("test "+str(test_loss))
    #write_log(callback, ['train', 'test'], [train_loss, test_loss], epoch)

0
train 1.0137541
test 0.90717036
1
train 0.86487234
test 0.8513824
2
train 1.1388707
test 1.2253613
3
train 1.187392
test 1.147805
4
train 1.1118658
test 1.074653
5
train 1.0406841
test 1.0062783
6
train 0.9747646
test 0.94327676
7
train 0.91443115
test 0.88625985
8
train 0.8607714
test 0.8348105
9
train 0.8132162
test 0.7916608
10
train 0.7715155
test 0.75343114
11
train 0.73651576
test 0.7233517
12
train 0.70887506
test 0.69655097
13
train 0.68594545
test 0.676216
14
train 0.6670594
test 0.6609539
15
train 0.65152615
test 0.6457351
16
train 0.64005005
test 0.63508034
17
train 0.62988985
test 0.62694365
18
train 0.6215066
test 0.6200566
19
train 0.61611867
test 0.61394095
20
train 0.6115825
test 0.61069304
21
train 0.6047672
test 0.606082
22
train 0.60289836
test 0.6019052
23
train 0.59874696
test 0.5982297
24
train 0.5952654
test 0.59464014
25
train 0.59361255
test 0.59372914
26
train 0.59064424
test 0.59266216
27
train 0.58902705
test 0.5879674
28
train 0.5862354
test 0.58812064
29

# Test model

Let's review how model works in production!

In [55]:
query = test_dataset[160:260]
query_words = [x[0] for x in query]
query_types = [x[1] for x in query]
result = model.predict_on_batch(encode_text(" ".join(query_words)).reshape((1, -1)))[0]
for index in range(result.shape[0]):
    w = query_words[index]
    true_type = query_types[index]
    pred_type = unique_types[np.argmax(result[index,:])] 
    print("{}:\t{}\t{}".format(w, pred_type, true_type))

.:	O	O
.:	O	O
et:	O	O
locataires:	O	O
,:	O	O
,:	O	O
dГ©gagГ©e:	O	O
verrez:	O	O
CORDONNIERS:	O	O
Husselein:	I-PER	I-PER
internat:	O	O
89:	O	O
,:	O	O
niveau:	O	O
AcadГ©mie:	I-ORG	I-ORG
toujours:	O	O
,:	O	O
il:	O	O
d':	O	O
parmi:	O	O
qui:	O	O
mГ©decine:	I-ORG	O
,:	O	O
par:	O	O
et:	O	O
de:	O	O
ВЈr:	O	O
,:	O	O
a:	O	O
coiffeur:	O	O
exister:	O	O
,:	O	O
entr'ouverte:	O	O
.:	O	O
utiles:	O	O
dans:	O	O
oГ№:	O	O
sur:	O	O
Seine:	I-ORG	I-LOC
domicile:	O	O
':	O	O
вЂ“:	O	O
Rennes:	I-LOC	I-LOC
des:	O	O
Г©lГЁvent:	O	O
de:	O	O
Grenelle:	I-LOC	I-LOC
la:	O	O
.:	O	O
.:	O	O
entiГЁre:	O	O
ArrivГ©s:	O	O
reprend:	O	O
lots:	O	O
inf:	O	O
usines:	O	O
,:	O	O
73В»:	O	O
DE:	O	O
Le:	O	O
C':	O	O
prГ©sidГ©e:	O	O
Emile-Zola:	O	I-LOC
avancer:	O	O
salles:	O	O
1900:	O	O
.:	O	O
de:	O	O
tiers:	O	O
rencontre:	O	O
dГ©:	O	O
la:	O	O
ces:	O	O
ministГ©rielle:	O	O
mourant:	O	O
regards:	O	O
de:	O	O
sur:	O	O
courant:	O	O
l':	O	O
que:	O	O
de:	O	O
porte:	O	O
et:	O	O
du:	O	O
as:	O	O
carte:	O	O
.:	O	O
journal:	O	O
connaissance:	O	O
fer:	O

# Home task

- 3 points: make the model better
- 7 points: implement the model with CRF layer (https://github.com/Hironsan/keras-crf-layer)

**Adam**

76
train 0.067345284
test 0.20488992

77
train 0.06869629
test 0.19885963

78
train 0.066755325
test 0.19162706

**Adamax**

2
train 0.068575904
test 0.19806913

3
train 0.06759596
test 0.18924537

4
train 0.06957598
test 0.18730828