In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/named-entity-recognistionner/NER.csv


# Importing Libraries

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split


In [2]:
data = pd.read_csv("/kaggle/input/named-entity-recognistionner/NER.csv")
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


- Words are broken into columns, which are our feature X.
-  And the tag column will be our label Y.

In [3]:
def get_map(data, token_or_a_tag):
    tok2indx = {}
    idx2tok = {}
    vocab = []
    if token_or_a_tag == 'token':
        vocab = list(set(data['Word'].to_list())) # Set to remove the duplicates from the list 
    else:
        vocab = list(set(data['Tag'].to_list()))
        print(vocab)
    
    
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)} #iterators over the vocab and generates pairs 
    idx2idx = {idx:tok for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok

In [4]:
token2idx, idx2token = get_map(data, 'token')
tag2idx, idx2tag = get_map(data,'tag')
tag2idx

['I-tim', 'B-gpe', 'I-gpe', 'I-eve', 'I-geo', 'O', 'B-per', 'I-org', 'B-tim', 'B-nat', 'B-org', 'I-art', 'I-per', 'B-geo', 'I-nat', 'B-eve', 'B-art']


{'I-tim': 0,
 'B-gpe': 1,
 'I-gpe': 2,
 'I-eve': 3,
 'I-geo': 4,
 'O': 5,
 'B-per': 6,
 'I-org': 7,
 'B-tim': 8,
 'B-nat': 9,
 'B-org': 10,
 'I-art': 11,
 'I-per': 12,
 'B-geo': 13,
 'I-nat': 14,
 'B-eve': 15,
 'B-art': 16}

In [5]:
tag2idx

{'I-tim': 0,
 'B-gpe': 1,
 'I-gpe': 2,
 'I-eve': 3,
 'I-geo': 4,
 'O': 5,
 'B-per': 6,
 'I-org': 7,
 'B-tim': 8,
 'B-nat': 9,
 'B-org': 10,
 'I-art': 11,
 'I-per': 12,
 'B-geo': 13,
 'I-nat': 14,
 'B-eve': 15,
 'B-art': 16}

In [6]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)
data_fillna = data.fillna(method='ffill', axis=0)
# Groupby and collect columns
# Sentence # column is not used as the index of the resulting data_group dataframe.
data_group = data_fillna.groupby(['Sentence #'],as_index=False)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))
# all the words in the sentence, their corresponding POS tags, their original tags, their word indices, and their tag indices. 

  


In [7]:
data_group

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[3290, 18254, 1317, 23803, 29599, 11879, 31142...","[5, 5, 5, 5, 5, 5, 13, 5, 5, 5, 5, 5, 13, 5, 5..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[27174, 11716, 26179, 15498, 19043, 1800, 2607...","[1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[17398, 6071, 29966, 31432, 19924, 7543, 16477...","[5, 5, 8, 5, 5, 5, 5, 5, 13, 5, 5, 5, 5, 5, 10..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[25436, 14325, 16123, 7687, 33794, 136, 26167,...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[19055, 4687, 5343, 26508, 18895, 21857, 31523...","[13, 5, 5, 6, 12, 5, 8, 5, 13, 5, 1, 5, 1, 5, ..."
...,...,...,...,...,...,...
47954,Sentence: 9995,"[Opposition, leader, Mir, Hossein, Mousavi, ha...","[NNP, NN, NNP, NNP, NNP, VBZ, VBN, PRP, VBZ, T...","[O, O, O, B-per, I-per, O, O, O, O, O, O, O, O...","[5478, 17083, 9845, 25619, 30522, 31815, 21857...","[5, 5, 5, 6, 12, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,..."
47955,Sentence: 9996,"[On, Thursday, ,, Iranian, state, media, publi...","[IN, NNP, ,, JJ, NN, NNS, VBN, DT, NN, IN, DT,...","[O, B-tim, O, B-gpe, O, O, O, O, O, O, O, O, B...","[22349, 32072, 6497, 27174, 24496, 28663, 1218...","[5, 8, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 10, 7, 5,..."
47956,Sentence: 9997,"[Following, Iran, 's, disputed, June, 12, elec...","[VBG, NNP, POS, JJ, NNP, CD, NNS, ,, NNS, NNS,...","[O, B-geo, O, O, B-tim, I-tim, O, O, O, O, O, ...","[27744, 33821, 17832, 27898, 5267, 13550, 2148...","[5, 13, 5, 5, 8, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5,..."
47957,Sentence: 9998,"[Since, then, ,, authorities, have, held, publ...","[IN, RB, ,, NNS, VBP, VBN, JJ, NNS, IN, DT, VB...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[21126, 7216, 6497, 3897, 23803, 2592, 34413, ...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."


In [8]:
def preprocing_the_data(data_group, data):
    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx['O'])
    n_tags = len(tag2idx)
    print(tag2idx["O"] )
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    #Split train, test and validation set
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)

    print('\ntrain_tokens length:', len(train_tokens))
    print('\ntrain_tags length:', len(train_tags))
    print('\ntest_tokens length:', len(test_tokens))
    print('\ntest_tags:', len(test_tags))
    print('\nval_tokens:', len(val_tokens))
    print('\nval_tags:', len(val_tags))
    
    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = preprocing_the_data(data_group, data)

5

train_tokens length: 32372

train_tags length: 32372

test_tokens length: 4796

test_tags: 4796

val_tokens: 10791

val_tags: 10791


In [9]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [10]:
input_dim = len(list(set(data['Word'].to_list())))+1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)

In [11]:
def get_bilstm_lstm_model():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    model.add(TimeDistributed(Dense(n_tags, activation="relu")))

    #Optimiser 
    # adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

In [12]:
def train_model(X, y, model):
    loss = list()
    for i in range(25):
        # fit model for one epoch on this sequence
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [13]:
results = pd.DataFrame()
model_bilstm_lstm = get_bilstm_lstm_model()
plot_model(model_bilstm_lstm)
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 104, 64)           2251072   
                                                                 
 bidirectional (Bidirectiona  (None, 104, 128)         66048     
 l)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 104, 64)           49408     
                                                                 
 time_distributed (TimeDistr  (None, 104, 17)          1105      
 ibuted)                                                         
                                                                 
Total params: 2,367,633
Trainable params: 2,367,633
Non-trainable params: 0
_________________________________________________________________


In [None]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
text = nlp('Hi, My name is Aman Kharwal \n I am from India \n I want to work with Google \n Steve Jobs is My Inspiration')
displacy.render(text, style = 'ent', jupyter=True)