In [19]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed

In [15]:
# Load the token-level NER dataset. Per the df.info() output below it has the
# columns 'Sentence #', 'Word', 'POS' and 'Tag'.
df = pd.read_csv('data/ner_dataset.csv', encoding="ISO-8859-1")

# Forward-fill missing cells from the previous row. Presumably the sentence
# marker is only present on the first token of each sentence -- TODO confirm
# against the raw CSV. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling and fills the same cells.
df = df.ffill()

# Turn the sentence marker (text of the form '<label>:<number>') into a plain
# integer id by taking what follows the first colon.
df['Sentence #'] = df['Sentence #'].apply(lambda marker: int(marker.split(':')[1]))

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Sentence #  1048575 non-null  int64 
 1   Word        1048575 non-null  object
 2   POS         1048575 non-null  object
 3   Tag         1048575 non-null  object
dtypes: int64(1), object(3)
memory usage: 32.0+ MB


In [21]:
# Collapse the token rows into one row per sentence: the words and the tags of
# each sentence are joined into single space-separated strings.
grouped_data = (
    df.groupby("Sentence #")
    .agg(Word=("Word", " ".join), Tag=("Tag", " ".join))
    .reset_index()
)
grouped_data.head(10)

Unnamed: 0,Sentence #,Word,Tag
0,1,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
1,2,Families of soldiers killed in the conflict jo...,O O O O O O O O O O O O O O O O O O B-per O O ...
2,3,They marched from the Houses of Parliament to ...,O O O O O O O O O O O B-geo I-geo O
3,4,"Police put the number of marchers at 10,000 wh...",O O O O O O O O O O O O O O O
4,5,The protest comes on the eve of the annual con...,O O O O O O O O O O O B-geo O O B-org I-org O ...
5,6,The party is divided over Britain 's participa...,O O O O O B-gpe O O O O B-geo O O O O O O O B-...
6,7,The London march came ahead of anti-war protes...,O B-geo O O O O O O O O O O O O B-geo O B-geo ...
7,8,The International Atomic Energy Agency is to h...,O B-org I-org I-org I-org O O O O O O O O B-ge...
8,9,Iran this week restarted parts of the conversi...,B-gpe O O O O O O O O O O B-geo O O O
9,10,Iranian officials say they expect to get acces...,B-gpe O O O O O O O O O O O O O O B-tim O O O ...


In [22]:
# Special symbols that must own index 0 in their respective mappings, because
# the padding step later in this notebook fills sequences with the value 0.
PAD_TOKEN = "PAD"
OUTSIDE_TAG = "O"

train_texts = grouped_data["Word"].tolist()
train_labels = grouped_data["Tag"].tolist()

# Create a vocabulary: every distinct whitespace-separated word (plus the
# padding token) and every distinct tag.
vocab = {word for text in train_texts for word in text.split()}
vocab.add(PAD_TOKEN)
vocab_size = len(vocab)
tag_vocab = {tag for label in train_labels for tag in label.split()}
tag_vocab_size = len(tag_vocab)

# Create mappings between words/tags and their indices.
# Iterating a set of strings yields a different order on every interpreter
# run, so the indices are assigned from a sorted order instead. The sort key
# places PAD_TOKEN / OUTSIDE_TAG first, so that index 0 (the padding value)
# means "padding" for words and "outside any entity" for tags rather than an
# arbitrary real word or tag.
word2idx = {
    word: idx
    for idx, word in enumerate(sorted(vocab, key=lambda w: (w != PAD_TOKEN, w)))
}
tag2idx = {
    tag: idx
    for idx, tag in enumerate(sorted(tag_vocab, key=lambda t: (t != OUTSIDE_TAG, t)))
}

# Convert text and labels into numerical sequences (one list of ints per sentence).
train_text_sequences = [[word2idx[word] for word in text.split()] for text in train_texts]
train_label_sequences = [[tag2idx[tag] for tag in label.split()] for label in train_labels]


In [35]:
# Length of the longest sentence; every sequence is padded up to this length.
max_seq_length = max(len(seq) for seq in train_text_sequences)

# Pad explicitly with the PAD word index and the "O" tag index instead of
# relying on the default fill value 0, which is only correct if those symbols
# happen to be mapped to index 0.
pad_word_idx = word2idx["PAD"]
pad_tag_idx = tag2idx.get("O", 0)  # fall back to 0 if the data has no "O" tag

train_text_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    train_text_sequences, maxlen=max_seq_length, padding="post", value=pad_word_idx
)
train_label_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    train_label_sequences, maxlen=max_seq_length, padding="post", value=pad_tag_idx
)

# Record the sequence length on disk. Mode "w" overwrites the file: the
# previous append mode concatenated the number again on every re-run of this
# cell (e.g. "104104"). The with-block closes the handle even on error.
with open("maxlenseq", "w") as f:
    f.write(f"{max_seq_length}")


In [24]:
# Layer widths used by the tagger.
EMBEDDING_DIM = 64
LSTM_UNITS = 64

# Sequence tagger: word indices -> embeddings -> LSTM over the sequence ->
# a softmax over the tag vocabulary at every time step.
input_layer = Input(shape=(max_seq_length,))
# The sequence length is already fixed by the Input shape, so the redundant
# `input_length` argument (deprecated in Keras 3) is not passed.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM)(input_layer)
# return_sequences=True keeps one output vector per time step, which the
# per-token classifier below requires.
lstm_layer = LSTM(units=LSTM_UNITS, return_sequences=True)(embedding_layer)
output_layer = TimeDistributed(Dense(units=tag_vocab_size, activation="softmax"))(lstm_layer)

model = Model(input_layer, output_layer)
# The labels are integer tag indices (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])


In [25]:
# Train the tagger on the padded sequences, holding out 20% of the samples
# for validation.
model.fit(
    x=train_text_sequences,
    y=train_label_sequences,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2a69ead8790>

In [30]:
# Persist the trained model to disk under a fixed path.
model_path = "ner_model.model"
model.save(model_path)



INFO:tensorflow:Assets written to: ner_model.model\assets


INFO:tensorflow:Assets written to: ner_model.model\assets


In [37]:
import pickle


# Invert the tag mapping (index -> tag), presumably so predicted indices can
# be decoded back into tag names at inference time.
idx2tag = dict((idx, tag) for tag, idx in tag2idx.items())

# Bundle the padded sequence length and both lookup tables, then pickle them
# into a single file.
variables = dict(maxlenseq=max_seq_length, word2idx=word2idx, idx2tag=idx2tag)
with open('file.var', 'wb') as var_file:
    pickle.dump(variables, var_file)