In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from keras import layers
from keras.models import Model

Essential info about tagged entities:
```
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon
```

In [None]:
# Load the token-level dataset (one row per word; later cells read its
# "Sentence #", "Word", "POS" and "Tag" columns). The file is decoded as Latin-1.
data = pd.read_csv("ner_dataset.csv", encoding="latin1")

In [None]:
data.head(50)

In [None]:
# Forward-fill missing values so every row inherits the last non-null value
# above it. Presumably "Sentence #" is only populated on the first token of
# each sentence -- TODO confirm against the raw file.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
data = data.ffill()

In [None]:
data.head(50)

In [None]:
print("Unique words in corpus:", data['Word'].nunique())
print("Unique tags in corpus:", data['Tag'].nunique())

In [None]:
# Vocabulary: every distinct word in first-seen order, followed by the
# artificial "ENDPAD" token as the final entry.
words = [*data['Word'].unique(), "ENDPAD"]
# Label set: every distinct tag in first-seen order.
tags = [*data['Tag'].unique()]

In [None]:
print("Unique tags are:", tags)

In [None]:
# Vocabulary size (including "ENDPAD") and number of distinct tags.
num_words, num_tags = len(words), len(tags)

In [None]:
num_words

In [None]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences.

    The frame must provide the columns "Sentence #", "Word", "POS" and "Tag".
    Rows sharing the same "Sentence #" value are collected into one list of
    ``(word, pos, tag)`` tuples.

    Attributes:
        n_sent: 1-based number of the sentence the next ``get_next`` call returns.
        data: the frame passed to the constructor.
        empty: always ``False``; kept for backward compatibility.
        grouped: Series indexed by "Sentence #" value, one tuple list per sentence.
        sentences: the values of ``grouped`` as a plain list, in group-key
            order (groupby sorts keys, so the order is lexicographic).
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, pos, tag) tuple per row of the sentence group.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        # Select only the columns the aggregation reads, so apply() does not
        # operate on the grouping column (deprecated behaviour in recent pandas).
        self.grouped = (
            self.data.groupby("Sentence #")[["Word", "POS", "Tag"]].apply(agg_func)
        )
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or ``None`` once no such sentence exists.

        Sentences are looked up by the key ``"Sentence: <n_sent>"``; the
        counter only advances after a successful lookup.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error
            # is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return s

In [None]:
# Build the per-sentence view of the corpus: `sentences` is a list in which
# each entry is one sentence's list of (word, POS, tag) tuples.
getter = SentenceGetter(data)
sentences = getter.sentences

In [None]:
len(sentences)

In [None]:
sentences[0]

In [None]:
# Word ids are 1-based (id 0 is never assigned); tag ids are 0-based.
word2idx = {word: idx for idx, word in enumerate(words, start=1)}
tag2idx = dict(zip(tags, range(len(tags))))

In [None]:
# Show only a small sample: dumping the whole vocabulary mapping floods the
# cell output and bloats the saved notebook.
dict(list(word2idx.items())[:10])

In [None]:
# Distribution of sentence lengths (in tokens); useful for choosing max_len.
fig, ax = plt.subplots()
ax.hist([len(s) for s in sentences], bins=50)
ax.set(title="Sentence length distribution",
       xlabel="Tokens per sentence",
       ylabel="Number of sentences")
plt.show()

In [None]:
# Encode every sentence as the list of its word ids (first tuple field = word).
X1 = [[word2idx[word] for word, _pos, _tag in sent] for sent in sentences]

In [None]:
type(X1[0])

In [None]:
X1[0]

In [None]:
# Fixed sequence length: sentences are padded or truncated to this many tokens,
# and the model's Input layer is built with the same length.
max_len = 50

# **pad_sequences example**

In [None]:
nums = [[1], [2, 3], [4, 5, 6]]
sequence.pad_sequences(nums)

In [None]:
nums = [[1], [2, 3], [4, 5, 6]]
sequence.pad_sequences(nums,maxlen=2)

In [None]:
# Pad (at the end) / truncate every id sequence to max_len.
# Padding uses id 0, which word2idx never assigns (its ids start at 1), so pad
# positions cannot be confused with a real word. The previous pad value,
# num_words-1, is the id of the last real vocabulary word, not of "ENDPAD"
# (whose id is num_words). Id 0 is also inside the Embedding's valid range
# [0, num_words), and words[0 - 1] resolves to the trailing "ENDPAD" entry
# when ids are mapped back to text.
X = sequence.pad_sequences(maxlen=max_len,
                  sequences=X1, padding="post",
                  value=0)

In [None]:
X[0]

In [None]:
# Encode every sentence as the list of its tag ids (third tuple field = tag).
y1 = [[tag2idx[tag] for _word, _pos, tag in sent] for sent in sentences]

In [None]:
# Pad (at the end) / truncate the tag-id sequences to max_len, filling the
# padded positions with the id of the "O" tag.
# NOTE(review): padded positions are therefore indistinguishable from real "O"
# tokens in the loss and in the accuracy metric -- confirm this is intended.
y = sequence.pad_sequences(maxlen=max_len,
                  sequences=y1,
                  padding="post",
                  value=tag2idx["O"])

In [None]:
# Hold out 20% of the sentences; random_state is fixed so the split is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=1)

In [None]:
X_train[0]

In [None]:
y_train[0]

In [None]:
# Sequence tagger: word ids -> embeddings -> BiLSTM -> per-token softmax over tags.
input_word = layers.Input(shape=(max_len,))
# input_dim must exceed the largest word id fed to the layer.
# The deprecated `input_length` argument is dropped: the Input layer above
# already fixes the sequence length.
embedding_layer = layers.Embedding(input_dim=num_words,
                                   output_dim=50)(input_word)
# Drops whole embedding channels rather than individual elements.
dropout = layers.SpatialDropout1D(0.1)(embedding_layer)
# return_sequences=True keeps one output vector per token, as tagging requires.
bid_lstm = layers.Bidirectional(
    layers.LSTM(units=100, return_sequences=True,
                recurrent_dropout=0.1))(dropout)
# The same Dense classifier is applied independently at every time step.
output = layers.TimeDistributed(
    layers.Dense(num_tags, activation="softmax"))(bid_lstm)
model = Model(input_word, output)

In [None]:
model.summary()

In [None]:
# The targets in y are integer tag ids (not one-hot vectors), hence the
# sparse variant of categorical cross-entropy.
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

In [None]:
# Train for a few epochs and keep the returned History object for plotting.
# NOTE(review): the held-out test split doubles as validation data here, so
# there is no separate, untouched test set -- confirm this is acceptable.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test,y_test),
    batch_size=32, 
    epochs=3,
)

In [None]:
# Per-epoch training/validation metrics, one row per epoch. Read them from the
# History object returned by fit() rather than from the implicit
# `model.history` attribute, so this cell is tied to the run it reports on.
metrics = pd.DataFrame(history.history)
metrics.head()

In [None]:
metrics[['accuracy','val_accuracy']].plot()

In [None]:
metrics[['loss','val_loss']].plot()

In [None]:
# Qualitative check: tag one held-out sentence and print, token by token,
# the word next to its true and predicted tag.
i = 20
# predict() expects a batch, so wrap the single sentence in a batch of size 1;
# argmax over the last axis turns per-tag probabilities into tag ids.
p = np.argmax(model.predict(np.array([X_test[i]])), axis=-1)
y_true = y_test[i]
print("{:15}{:5}\t {}\n".format("Word", "True", "Pred"))
print("-" * 30)
for word_id, true_id, pred_id in zip(X_test[i], y_true, p[0]):
    # Word ids are 1-based, so subtract 1 to index back into `words`.
    print("{:15}{}\t{}".format(words[word_id - 1], tags[true_id], tags[pred_id]))