## NER system


In [1]:
import sys, json, codecs, csv
import numpy as np
import random
import pandas as pd
import tqdm
import itertools
import seaborn as sns

# visualization
import matplotlib
import numpy as np
from sklearn.model_selection import train_test_split
from itertools import chain
import matplotlib.pyplot as plt
%matplotlib inline

from keras.layers import Input, LSTM, Embedding, Dense
from keras.layers.wrappers import Bidirectional
from keras.models import Model

from collections import Counter
from keras.callbacks import TensorBoard
import tensorflow as tf
import os, shutil

Using TensorFlow backend.


In [15]:
import urllib
#urllib.request.urlretrieve(
    #"https://raw.githubusercontent.com/EuropeanaNewspapers/ner-corpora/master/enp_FR.bnf.bio/enp_FR.bnf.bio",
    #"enp_FR.bnf.bio")
with open("enp_FR.bnf.bio",encoding='utf-8') as f:
    text = f.read()
    
print(text[:100])

Emmanuel I-PER
DESOLES I-PER
de O
LOU O
Directeur O
politique O
BÊ>ÀCTION O
ET O
ADMINISTRATION O
9&


In [16]:
words = []
types = []
for item in text.split('\n'):
    item = item.strip()
    if len(item) == 0:
        continue
    [w, t] = item.split(' ')
    words.append(w)
    types.append(t)

unique_types = list(set(types))
type2id = {x:index for index, x in enumerate(unique_types)}

print(type2id)

word2count = Counter(words)
MAX_WORD_COUNT = 30000
top_words = [x[0] for x in sorted(word2count.items(), key=lambda x: x[1], reverse=True)][:MAX_WORD_COUNT]
word2id = {x:index+1 for index, x in enumerate(top_words)}

train_dataset, test_dataset = train_test_split(list(zip(words, types)), train_size=0.7)

{'I-LOC': 0, 'I-ORG': 1, 'I-PER': 2, 'O': 3}




In [17]:
train_dataset[:10]

[('assurément', 'O'),
 ('que', 'O'),
 ('fera', 'O'),
 ('pu', 'O'),
 ('des', 'O'),
 ('la', 'O'),
 ('demie', 'O'),
 ('Réhel', 'I-PER'),
 ('se', 'O'),
 ('les', 'O')]

In [18]:
input = Input(shape=(None,))
out = Embedding(input_dim=len(word2id)+1, output_dim=200)(input)
out = Bidirectional(LSTM(200, activation='relu', return_sequences=True))(out)
out = Dense(len(type2id), activation='softmax')(out)
model = Model(input, out)
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, None, 200)         6000200   
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 400)         641600    
_________________________________________________________________
dense_2 (Dense)              (None, None, 4)           1604      
Total params: 6,643,404
Trainable params: 6,643,404
Non-trainable params: 0
_________________________________________________________________


In [19]:
def getWordId(w):
    return 0 if not w in word2id else word2id[w]

def gen_batches(dataset, batch_size=64, seq_size=32, batch_count=100):
    random.shuffle(dataset)
    
    features = np.zeros((batch_size, seq_size))
    labels = np.zeros((batch_size, seq_size, len(type2id)))
    for _ in range(batch_count):
        for seq_index in range(batch_size):
            left = random.randint(0, len(dataset) - seq_size)
            features[seq_index,:] = [getWordId(x[0]) for x in dataset[left:left+seq_size]]
            labels[seq_index,:] = 0
            for i,(_,t) in enumerate(dataset[left:left+seq_size]):
                labels[seq_index,i] = 0
                labels[seq_index,i,type2id[t]] = 1
        yield features, labels
        
def encode_text(sentence):
    words = sentence.split()
    result = np.zeros((len(words),))
    for i,w in enumerate(words):
        result[i] = getWordId(w)
    return result


def write_log(callback, names, logs, batch_no):
    for name, value in zip(names, logs):
        summary = tf.Summary()
        summary_value = summary.value.add()
        summary_value.simple_value = value
        summary_value.tag = name
        callback.writer.add_summary(summary, batch_no)
        callback.writer.flush()

In [20]:
from tqdm import tqdm

logs_dir = './logs'
callback = TensorBoard(logs_dir)
callback.set_model(model)

for epoch in tqdm(range(100)):
    losses = []
    for x,y in gen_batches(train_dataset, batch_count=32):
        loss = model.train_on_batch(x, y)
        losses.append(loss)
    train_loss = np.mean(losses)
        
    losses = []
    for x,y in gen_batches(test_dataset, batch_count=32):
        loss = model.test_on_batch(x, y)
        losses.append(loss)
    test_loss = np.mean(losses)
    
    write_log(callback, ['train', 'test'], [train_loss, test_loss], epoch)


query = test_dataset[160:260]
query_words = [x[0] for x in query]
query_types = [x[1] for x in query]
result = model.predict_on_batch(encode_text(" ".join(query_words)).reshape((1, -1)))[0]
for index in range(result.shape[0]):
    w = query_words[index]
    true_type = query_types[index]
    pred_type = unique_types[np.argmax(result[index,:])] 

 64%|███████████████████████████████████████████████████▊                             | 64/100 [12:38<06:55, 11.54s/it]

KeyboardInterrupt: 