In [None]:
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

In [None]:
# Load the token-level NER dataset and forward-fill missing cells so every
# row carries a value in each column (presumably "Sentence #" is only set on
# the first token of a sentence -- verify against the CSV).
data = pd.read_csv("/content/ner_dataset.csv", encoding="latin1")
# DataFrame.ffill() is the supported spelling; fillna(method="ffill") is
# deprecated in recent pandas releases.
data = data.ffill()

# Unique vocabulary and tag set. They are sorted because set iteration order
# for strings can differ between interpreter sessions, and the tokenizers
# fitted on these lists assign indices in list order -- an unsorted list
# would make the word/tag indices change on every kernel restart.
words = sorted(set(data["Word"].values))
tags = sorted(set(data["Tag"].values))

In [None]:
# Collect every sentence's words and tags into two parallel lists of lists.
def to_list_words(sentence):
    """Return the "Word" column of one sentence group as a plain list."""
    return sentence["Word"].values.tolist()


def to_list_tags(sentence):
    """Return the "Tag" column of one sentence group as a plain list."""
    return sentence["Tag"].values.tolist()


# One group per distinct "Sentence #" value; both applies walk the groups in
# the same order, so words_list[k] and tags_list[k] describe the same sentence.
sentence_groups = data.groupby("Sentence #")
words_list = sentence_groups.apply(to_list_words).to_list()
tags_list = sentence_groups.apply(to_list_tags).to_list()

In [None]:
# Word-level tokenizer: no character filtering and no lower-casing, so every
# distinct surface form keeps its own index.
# oov_token must be a string token, not an int: it is stored as a vocabulary
# entry (it still receives index 1, which later cells compare against), and a
# non-string entry breaks sequences_to_texts, which joins entries with str.join.
t = Tokenizer(filters='', lower=False, oov_token='<UNK>')
t.fit_on_texts(words)
# Index of every vocabulary word, in the same order as `words`.
encoded_words = t.texts_to_sequences(words)

In [None]:
# Tag tokenizer: case-sensitive and unfiltered so labels are kept verbatim.
# No OOV token is configured here.
t_tags = Tokenizer(lower=False, filters='')
t_tags.fit_on_texts(tags)
# Index of every tag, in the same order as `tags`.
encoded_tags = t_tags.texts_to_sequences(tags)

In [None]:
# Encode every sentence as word indices and pad at the end to 81 positions.
word_index_rows = t.texts_to_sequences(words_list)
X = pad_sequences(word_index_rows, maxlen=81, padding='post')

# Same treatment for the tag sequences, then one-hot encode each row.
# The class count is len(tags) + 1: one class per tag plus index 0, which
# pad_sequences uses as its fill value.
tag_index_rows = t_tags.texts_to_sequences(tags_list)
tag_index_rows = pad_sequences(tag_index_rows, maxlen=81, padding='post')
n_classes = len(tags) + 1
Y = [to_categorical(row, num_classes=n_classes) for row in tag_index_rows]

In [None]:
# Hold out 10% of the sentences for testing. random_state is fixed so the
# split -- and therefore the metrics reported further down -- is the same on
# every Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=42
)

In [None]:
!pip -q install git+https://www.github.com/keras-team/keras-contrib.git sklearn_crfsuite
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional, TimeDistributed, Dropout
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy


input_sequence = Input(shape = (81,))
model = Embedding(input_dim = len(words)+1, output_dim = 48, input_length=81, mask_zero=True)(input_sequence)
model = Bidirectional(LSTM(units=64, return_sequences=True, recurrent_dropout=0.1)) (model)
model = TimeDistributed(Dense(48, activation="relu"))(model)
crf = CRF(units = len(tags) + 1)
output_sequence = crf(model)
model = Model(input_sequence,output_sequence)
model.compile(optimizer="rmsprop", loss=crf_loss, metrics=[crf_accuracy])

model.summary()

  Building wheel for keras-contrib (setup.py) ... [?25l[?25hdone
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 81)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 81, 48)            1688592   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 81, 128)           57856     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 81, 48)            6192      
_________________________________________________________________
crf_3 (CRF)                  (None, 81, 18)            1242      
Total params: 1,753,882
Trainable params: 1,753,882
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Fit for 8 epochs in batches of 256, with validation_split=0.1 on the
# training data, then run the trained model over the test inputs.
history = model.fit(
    x=X_train,
    y=np.array(Y_train),
    batch_size=256,
    epochs=8,
    validation_split=0.1,
)
prediction = model.predict(X_test)

Train on 38846 samples, validate on 4317 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [None]:
# !pip -q install sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report

# Collapse the one-hot / score vectors to one tag index per position.
Y_test_ = np.argmax(Y_test, -1)
prediction_ = np.argmax(prediction, -1)

# Index 0 is the padding class and has no entry in the tag tokenizer, so give
# it an explicit label instead of letting it be dropped silently.
index_to_tag = dict(t_tags.index_word)
index_to_tag[0] = "PAD"

# Keep exactly the positions that hold a real (non-padding) gold tag, and take
# the prediction at those same positions. Dropping zeros from the gold and the
# predicted sequences independently (what sequences_to_texts does) shifts the
# two against each other whenever the model predicts class 0 on a real token.
Y_test_taged = []
prediction_taged = []
for true_row, pred_row in zip(Y_test_, prediction_):
    is_real_token = true_row != 0
    Y_test_taged.append([index_to_tag[int(i)] for i in true_row[is_real_token]])
    prediction_taged.append([index_to_tag[int(i)] for i in pred_row[is_real_token]])

metrics = flat_classification_report(Y_test_taged, prediction_taged)
print(metrics)

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        36
       B-eve       0.67      0.13      0.22        31
       B-geo       0.85      0.90      0.87      3754
       B-gpe       0.97      0.93      0.95      1558
       B-nat       0.00      0.00      0.00        22
       B-org       0.81      0.70      0.75      2057
       B-per       0.82      0.84      0.83      1630
       B-tim       0.94      0.84      0.89      2024
       I-art       0.00      0.00      0.00        23
       I-eve       0.00      0.00      0.00        23
       I-geo       0.77      0.81      0.79       710
       I-gpe       0.79      0.55      0.65        20
       I-nat       0.00      0.00      0.00         4
       I-org       0.80      0.78      0.79      1758
       I-per       0.85      0.88      0.87      1682
       I-tim       0.90      0.70      0.79       642
           O       0.99      0.99      0.99     88745

   micro avg       0.97   

In [None]:
quote = 'The 1906 San Francisco earthquake was the biggest earthquake lol lol that has ever hit San Francisco on April 18, 1906'
# Split on single spaces and trim surrounding punctuation from every token.
quote = [word.strip(',!.?:') for word in quote.split(' ')]

# Tokens that the word tokenizer maps to its OOV index (1) are withheld from
# the model and reported as UNK; everything else is tagged.
UNK_words = []
known_words = []
for word in quote:
    if t.texts_to_sequences([word]) == [[1]]:
        UNK_words.append(word)
    else:
        known_words.append(word)

# truncating='post' keeps the first 81 known words, so position k of the
# prediction always belongs to known word k (the default truncates the front).
quote_input = t.texts_to_sequences([known_words])
quote_input = pad_sequences(sequences=quote_input, maxlen=81, padding='post', truncating='post')
pred = model.predict(quote_input)
pred = np.argmax(pred, -1)

# Read the tags by position rather than through sequences_to_texts, which
# silently drops index 0 and would shift every later tag by one word.
index_to_tag = dict(t_tags.index_word)
index_to_tag[0] = 'PAD'
pred_taged = [index_to_tag[int(i)] for i in pred[0][:len(known_words)]]

# Walk the original token order, consuming one predicted tag per known word.
# Known words beyond the 81-position window have no prediction and show N/A.
remaining_tags = iter(pred_taged)
for word in quote:
    if word in UNK_words:
        print('{:20}=====>         UNK'.format(word))
    else:
        print('{:20}=====>         {:7}'.format(word, next(remaining_tags, 'N/A')))