Reading the Dataset

In [10]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, TimeDistributed, Dense, concatenate, Conv1D, GlobalMaxPooling1D

# File paths
train_file = r"H:\NER\eng.train"
testa_file = r"H:\NER\eng.testa"
testb_file = r"H:\NER\eng.testb"

# Function to read CoNLL files
def read_conll_file(filepath):
    sentences = []
    labels = []
    with open(filepath, "r", encoding="utf-8") as f:
        sent = []
        label = []
        for line in f:
            line = line.strip()
            if not line:
                if sent:
                    sentences.append(sent)
                    labels.append(label)
                    sent, label = [], []
            else:
                parts = line.split()
                word = parts[0]
                ner = parts[-1]
                sent.append(word)
                label.append(ner)
        if sent:
            sentences.append(sent)
            labels.append(label)
    return sentences, labels

# Read train and test files
train_sentences, train_labels = read_conll_file(train_file)
testa_sentences, testa_labels = read_conll_file(testa_file)
testb_sentences, testb_labels = read_conll_file(testb_file)

# Combine test sets
test_sentences = testa_sentences + testb_sentences
test_labels = testa_labels + testb_labels

print("Number of training sentences:", len(train_sentences))
print("Number of test sentences:", len(test_sentences))


Number of training sentences: 15020
Number of test sentences: 7150


Turning words & tags into numbers

In [11]:
# Word vocabulary
words = list(set(word for s in train_sentences for word in s))
words.append("PAD")
words.append("UNK")
word2idx = {w: i for i, w in enumerate(words)}
vocab_size = len(word2idx)

# Tag vocabulary
tags = list(set(tag for l in train_labels for tag in l))
tags.append("PAD")
tag2idx = {t: i for i, t in enumerate(tags)}
idx2tag = {i: t for t, i in tag2idx.items()}
tag_size = len(tag2idx)

# Max sentence length
max_len = max(len(s) for s in train_sentences)

# Convert words to indices
X_train = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in train_sentences]
X_test = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in test_sentences]

X_train = pad_sequences(X_train, maxlen=max_len, padding="post", value=word2idx["PAD"])
X_test = pad_sequences(X_test, maxlen=max_len, padding="post", value=word2idx["PAD"])

# Convert tags to indices
y_train = [[tag2idx[t] for t in l] for l in train_labels]
y_test = [[tag2idx.get(t, tag2idx["PAD"]) for t in l] for l in test_labels]

y_train = pad_sequences(y_train, maxlen=max_len, padding="post", value=tag2idx["PAD"])
y_test = pad_sequences(y_test, maxlen=max_len, padding="post", value=tag2idx["PAD"])

# One-hot encoding for softmax
y_train = tf.keras.utils.to_categorical(y_train, num_classes=tag_size)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=tag_size)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)


X_train shape: (15020, 113)
y_train shape: (15020, 113, 10)


Character encoding

In [12]:
# Character vocabulary
chars = set(char for s in train_sentences for w in s for char in w)
chars.add("PAD")
chars.add("UNK")
char2idx = {c: i for i, c in enumerate(chars)}
idx2char = {i: c for c, i in char2idx.items()}

max_word_len = 12  # max characters per word

# Convert sentences to char indices
def sentences_to_char_indices(sentences, max_len, max_word_len):
    X_char = []
    for sent in sentences:
        sent_chars = []
        for word in sent[:max_len]:
            word_chars = [char2idx.get(c, char2idx["UNK"]) for c in word[:max_word_len]]
            word_chars += [char2idx["PAD"]] * (max_word_len - len(word_chars))
            sent_chars.append(word_chars)
        sent_chars += [[char2idx["PAD"]] * max_word_len] * (max_len - len(sent_chars))
        X_char.append(sent_chars)
    return np.array(X_char)

X_train_char = sentences_to_char_indices(train_sentences, max_len, max_word_len)
X_test_char = sentences_to_char_indices(test_sentences, max_len, max_word_len)

print("X_train_char shape:", X_train_char.shape)
print("X_test_char shape:", X_test_char.shape)


X_train_char shape: (15020, 113, 12)
X_test_char shape: (7150, 113, 12)


Building the Model

In [13]:
# Word input
word_in = Input(shape=(max_len,))
word_embed = Embedding(input_dim=vocab_size, output_dim=80, mask_zero=True)(word_in)

# Character input
char_in = Input(shape=(max_len, max_word_len))
char_embed = TimeDistributed(Embedding(input_dim=len(char2idx), output_dim=16))(char_in)
char_embed = TimeDistributed(Conv1D(filters=20, kernel_size=3, padding='same', activation='relu'))(char_embed)
char_embed = TimeDistributed(GlobalMaxPooling1D())(char_embed)

# Combine embeddings
x = concatenate([word_embed, char_embed])
x = Bidirectional(LSTM(units=48, return_sequences=True))(x)
output = TimeDistributed(Dense(tag_size, activation='softmax'))(x)

# Build and compile model
model_fast = Model([word_in, char_in], output)
model_fast.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model_fast.summary()


In [14]:
history = model_fast.fit(
    [X_train, X_train_char],
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2
)


Epoch 1/10
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 192ms/step - accuracy: 0.9737 - loss: 0.1056 - val_accuracy: 0.9841 - val_loss: 0.0520
Epoch 2/10
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 110ms/step - accuracy: 0.9913 - loss: 0.0308 - val_accuracy: 0.9920 - val_loss: 0.0278
Epoch 3/10
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 109ms/step - accuracy: 0.9973 - loss: 0.0119 - val_accuracy: 0.9942 - val_loss: 0.0200
Epoch 4/10
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 110ms/step - accuracy: 0.9988 - loss: 0.0057 - val_accuracy: 0.9946 - val_loss: 0.0190
Epoch 5/10
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 109ms/step - accuracy: 0.9994 - loss: 0.0032 - val_accuracy: 0.9949 - val_loss: 0.0177
Epoch 6/10
[1m376/376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 110ms/step - accuracy: 0.9996 - loss: 0.0020 - val_accuracy: 0.9948 - val_loss: 0.0188
Epoch 7/1

In [36]:
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Raw paragraph
raw_paragraph = """
I eat apple every day , I leave in India , I leave in Mumbai .
"""

# Split paragraph into sentences using regex
sentences = re.split(r'(?<=[.!?])\s+', raw_paragraph.strip())

# Split each sentence into words
test_sentences = [s.split() for s in sentences]

# Store all predictions
all_predictions = []

for test_sentence in test_sentences:
    # Convert words to indices
    test_seq = [word2idx.get(w, word2idx["UNK"]) for w in test_sentence]
    test_seq = pad_sequences([test_seq], maxlen=max_len, padding="post", value=word2idx["PAD"])

    # Predict
    pred = model_fast.predict([test_seq, sentences_to_char_indices([test_sentence], max_len, max_word_len)], verbose=0)
    pred_labels = [idx2tag[np.argmax(i)] for i in pred[0][:len(test_sentence)]]

    all_predictions.append(list(zip(test_sentence, pred_labels)))

# Print predictions
for i, pred in enumerate(all_predictions):
    print(f"Sentence {i+1}: {pred}")


Sentence 1: [('I', 'O'), ('eat', 'O'), ('apple', 'O'), ('every', 'O'), ('day', 'O'), (',', 'O'), ('I', 'O'), ('leave', 'O'), ('in', 'O'), ('India', 'B-LOC'), (',', 'O'), ('I', 'O'), ('leave', 'O'), ('in', 'O'), ('Mumbai', 'B-LOC'), ('.', 'O')]
