In [None]:
import numpy as np 
import pandas as pd 


**Importing the dataset for the named entity recognition model**

In [None]:
# Load the annotated corpus. Malformed rows are skipped with a warning rather than
# aborting the load. `on_bad_lines` replaces `error_bad_lines`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0.
dataset = pd.read_csv("Ner_English_Annotated.csv", encoding="ISO-8859-1", on_bad_lines="warn")

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
dataset.info()

In [None]:
# Preview the first rows of the loaded frame.
dataset.head()

> **Create a list of lists of (word, tag) tuples so that each sentence is kept separate from the others**

In [None]:
class SentenceGetter(object):
    """Group a token-level frame into sentences of ``(word, tag)`` tuples.

    The frame must provide the columns ``sentence_idx``, ``word`` and ``tag``.

    Attributes:
        n_sent: 1-based number of the sentence that ``get_next`` returns next.
        dataset: the frame passed to the constructor.
        empty: flag initialised to ``False``; not modified by this class.
        grouped: Series indexed by ``sentence_idx`` whose values are the
            per-sentence lists of ``(word, tag)`` tuples.
        sentences: plain list of those per-sentence lists, in group order.
    """

    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False

        def to_pairs(group):
            # Pair every word of one sentence with its tag.
            return list(zip(group["word"].values.tolist(),
                            group["tag"].values.tolist()))

        self.grouped = self.dataset.groupby("sentence_idx").apply(to_pairs)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the next sentence, or ``None`` when there is no such sentence.

        The sentence is looked up under the label ``"Sentence: <n>"`` first and,
        if that label does not exist, under the bare number ``<n>``, so both
        string-labelled and numeric ``sentence_idx`` columns work. The internal
        counter only advances when a sentence is found.
        """
        for key in ("Sentence: {}".format(self.n_sent), self.n_sent):
            try:
                s = self.grouped.loc[key]
            except (KeyError, TypeError):
                # Only "label not present / wrong label type" is an expected
                # miss; any other error is allowed to surface.
                continue
            self.n_sent += 1
            return s
        return None

In [None]:
# Group the token rows of the loaded frame into sentences.
getter = SentenceGetter(dataset)

In [None]:
# One list of (word, tag) tuples per sentence.
sentences = getter.sentences

In [None]:
# Spot-check one grouped sentence (the sixth in group order).
print(sentences[5])

In [None]:
# Number of tokens in the longest sentence of the corpus.
maxlen = max(map(len, sentences))
print('Maximum sequence length:', maxlen)

In [None]:
# Check how long sentences are so that we can pad them
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")

In [None]:
# Histogram of sentence lengths (tokens per sentence).
sentence_lengths = list(map(len, sentences))
plt.hist(sentence_lengths, bins=50)
plt.show()

In [None]:
# Vocabulary in order of first appearance. Iterating a set of strings yields a
# different order after every kernel restart (string hashes are randomised per
# process), which would silently change every word index between runs.
words = dataset["word"].unique().tolist()
# Padding token; appended last so its index is always len(words) - 1.
words.append("ENDPAD")

In [None]:
# Vocabulary size, including the "ENDPAD" padding token.
n_words = len(words)
n_words

In [None]:
# Tag set in order of first appearance, so that tag indices are identical on every
# run (the iteration order of a set of strings changes between kernel restarts).
tags = dataset["tag"].unique().tolist()

In [None]:
# Number of distinct tags, i.e. the width of the model's output layer.
n_tags = len(tags)
n_tags

**Mapping words and tags to integer indices (the `words` and `tags` lists map the indices back)**

In [None]:
# Lookup tables from token / tag to its position in `words` / `tags`.
word2idx = dict(zip(words, range(len(words))))
tag2idx = dict(zip(tags, range(len(tags))))

In [None]:
# Sanity check of the word lookup; raises KeyError if 'Obama' is not in the vocabulary.
word2idx['Obama']

In [None]:
# Index of the "O" tag; the same index is used later as the padding value for y.
tag2idx["O"]

In [None]:
from keras.preprocessing.sequence import pad_sequences
# Encode every sentence as the list of its word indices.
X = [[word2idx[word] for word, _tag in sentence] for sentence in sentences]

In [None]:
# Right-pad every sentence to 140 tokens with the last vocabulary index (n_words - 1).
X = pad_sequences(
    sequences=X,
    maxlen=140,
    padding="post",
    value=n_words - 1,
)

In [None]:
# Encode every sentence as the list of its tag indices.
y = [[tag2idx[tag] for _word, tag in sentence] for sentence in sentences]

In [None]:
# Right-pad the tag sequences to 140 positions, labelling the padding as "O".
y = pad_sequences(
    sequences=y,
    maxlen=140,
    padding="post",
    value=tag2idx["O"],
)

In [None]:
from keras.utils import to_categorical
# One-hot encode the tag indices of every sentence (n_tags classes per position).
y = [to_categorical(tag_ids, num_classes=n_tags) for tag_ids in y]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the sentences for testing. The fixed random_state makes the split
# (and therefore every downstream result) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

In [None]:
# Network graph: token ids -> embeddings -> dropout -> BiLSTM -> per-token softmax.
# `input` and `out` are consumed by the cell that wraps the graph in a Model.
input = Input(shape=(140,))
embedded = Embedding(input_dim=n_words, output_dim=140, input_length=140)(input)
regularised = Dropout(0.1)(embedded)
bilstm = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))
model = bilstm(regularised)
softmax_per_token = TimeDistributed(Dense(n_tags, activation="softmax"))
out = softmax_per_token(model)

In [None]:
# Wrap the input and output tensors into a trainable Keras model.
model = Model(inputs=input, outputs=out)

In [None]:
# Categorical cross-entropy over the one-hot tag targets, optimised with Adam.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for one epoch, holding back 20% of the training data for validation.
history = model.fit(
    X_train,
    np.array(y_train),
    batch_size=32,
    epochs=1,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Compare the model's prediction with the gold tags for one held-out sentence.
i = 0
p = model.predict(np.array([X_test[i]]))
p = np.argmax(p, axis=-1)
# y_test rows are one-hot encoded, so argmax recovers the gold tag index per token.
true_tags = np.argmax(y_test[i], axis=-1)
print("{:14} ({:5}): {}".format("Word", "True", "Pred"))
# The header announces three columns; print the gold tag as well so each row
# matches it (previously only the word and the prediction were printed).
for w, t, pred in zip(X_test[i], true_tags, p[0]):
    print("{:14} ({:5}): {}".format(words[w], tags[t], tags[pred]))

In [None]:
# Predicted tag indices for the sentence inspected in the previous cell.
p[0]

In [None]:
# Layer-by-layer overview of the network.
model.summary()