In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
## Reference: https://www.kaggle.com/code/nicolasbernardin/ner-with-simple-rnn-and-99-accuracy

import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/FourthBrain ML Engineer/4B_GLG/Datasets/ner_dataset.csv', encoding="latin1")
df = df.fillna(method="ffill")
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [4]:
df = df.drop(['POS'], axis=1)
df = df.groupby('Sentence #').agg(list)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Word,Tag
0,"[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
1,"[Iranian, officials, say, they, expect, to, ge...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
2,"[Helicopter, gunships, Saturday, pounded, mili...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O..."
3,"[They, left, after, a, tense, hour-long, stand...","[O, O, O, O, O, O, O, O, O, O, O]"
4,"[U.N., relief, coordinator, Jan, Egeland, said...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo..."


In [5]:
# Get Class Numbers and List

tag_list = []
for element in df.Tag:
    for i in element:
        if i not in tag_list:
            tag_list.append(i)
num_classes = len(tag_list)
print('Number of classes : {} \nItems : {}'.format(num_classes,tag_list))

Number of classes : 17 
Items : ['O', 'B-geo', 'B-gpe', 'B-tim', 'B-org', 'I-geo', 'B-per', 'I-per', 'I-org', 'I-tim', 'B-art', 'I-art', 'B-nat', 'I-gpe', 'I-nat', 'B-eve', 'I-eve']


In [6]:
# Get Number of Words

vocab = [str(item).lower() for element in df.Word for item in element]
vocab = list(set(vocab))
nb_mots = len(vocab)
print('Number of different words : ',nb_mots)
vocab[:10]

Number of different words :  31817


['empowerment',
 'streaming',
 'cruised',
 'was',
 'alperon',
 'corruption-free',
 'colonies',
 'dabous',
 'hinted',
 'zaldivar']

In [7]:
# Tokenize the Text

import tensorflow as tf

tokenizer_txt = tf.keras.preprocessing.text.Tokenizer(num_words=nb_mots, filters=None)
tokenizer_txt.fit_on_texts(df.Word)

word2idx = tokenizer_txt.word_index
idx2word = tokenizer_txt.index_word
vocab_size = tokenizer_txt.num_words

print(vocab_size)

31817


In [8]:
#show the 20 first words
for i in range(1,20):
    print(idx2word[i])

the
.
,
in
of
to
a
and
's
for
has
on
is
that
have
with
said
was
at


In [9]:
# Tokenize the Tags

tokenizer_tag = tf.keras.preprocessing.text.Tokenizer(num_words=num_classes)
tokenizer_tag.fit_on_texts(df.Tag)

word2idx_tag = tokenizer_tag.word_index
idx2word_tag = tokenizer_tag.index_word
vocab_size_tag = tokenizer_tag.num_words

word2idx_tag

{'b-art': 11,
 'b-eve': 12,
 'b-geo': 2,
 'b-gpe': 8,
 'b-nat': 15,
 'b-org': 4,
 'b-per': 6,
 'b-tim': 3,
 'i-art': 13,
 'i-eve': 14,
 'i-geo': 9,
 'i-gpe': 16,
 'i-nat': 17,
 'i-org': 7,
 'i-per': 5,
 'i-tim': 10,
 'o': 1}

In [10]:
# Change the Text and Tag to Integer Lists

X = tokenizer_txt.texts_to_sequences(df.Word)
y = tokenizer_tag.texts_to_sequences(df.Tag)
print(len(X[0]), X[0])
print(len(y[0]), y[0])

24 [253, 5, 966, 15, 1794, 237, 467, 6, 522, 1, 128, 4, 60, 8, 570, 1, 832, 5, 185, 89, 21, 14, 55, 2]
24 [1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1]


In [11]:
# Pad Sequences to have the Same Length

longueur = df.Tag.apply(lambda x: len(x))
long_max = max(longueur)
print("Tag max Tength : ",long_max)

X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=long_max, padding='post', truncating='post')
y = tf.keras.preprocessing.sequence.pad_sequences(y, maxlen=long_max, padding='post', truncating='post')

Tag max Tength :  104


In [12]:
# Check that X and y have been completed by 0 to have a length of 104

print(len(X[0]), X[0])
print(len(y[0]), y[0])

104 [ 253    5  966   15 1794  237  467    6  522    1  128    4   60    8
  570    1  832    5  185   89   21   14   55    2    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
104 [1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 8 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
# Simple RNN model, could be more complex to achieve better results

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, RNN, GRUCell, Dropout, Dense

out_dim = 64

model = Sequential(layers= [
    Embedding(input_dim=nb_mots+1, output_dim=out_dim, input_length=long_max),
    RNN(cell=GRUCell(out_dim), return_sequences=True),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 104, 64)           2036352   
                                                                 
 rnn (RNN)                   (None, 104, 64)           24960     
                                                                 
 dropout (Dropout)           (None, 104, 64)           0         
                                                                 
 dense (Dense)               (None, 104, 128)          8320      
                                                                 
 dropout_1 (Dropout)         (None, 104, 128)          0         
                                                                 
 dense_1 (Dense)             (None, 104, 17)           2193      
                                                                 
Total params: 2,071,825
Trainable params: 2,071,825
Non-

In [15]:
# Define a custom loss function to mask the zeros added at the end of each sequence

Sparse_loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

def Sparse_loss_function(real, pred):
    # Mask
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    # Avoid type error
    mask = tf.cast(mask, dtype=pred.dtype)
    # Loss function
    loss_ = Sparse_loss_object(real, pred)
    # Apply mask on loss function
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [16]:
# Compile and train the model on a few epochs

model.compile(optimizer='adam', loss=Sparse_loss_function, metrics=['accuracy'])
history = model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=5,batch_size=128,workers=-1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
from sklearn.metrics import confusion_matrix
import numpy as np

prob = model.predict(X_test)
pred = prob.argmax(axis=-1)

print(confusion_matrix(y_test.argmax(axis=1), pred.argmax(axis=1)))

[[2451   42   25 ...    0    0    0]
 [  37  491   46 ...    0    0    0]
 [  17   18  442 ...    0    0    0]
 ...
 [   0    0    0 ...    0    0    0]
 [   1    0    0 ...    0    0    0]
 [   0    0    0 ...    0    0    1]]
