## NER Model

In [14]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices raised by the
# libraries used below; consider narrowing it to specific categories/modules.
warnings.filterwarnings("ignore")

#### Importing the Libraries

In [15]:
# Importing the necessary libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from sklearn_crfsuite import CRF, metrics
from sklearn.metrics import classification_report

#### Loading the Data

In [16]:
# Reading the csv file
df = pd.read_csv("ner dataset/data/ner_dataset.csv", encoding='latin1')

In [17]:
# 'Sentence #' contains missing values; forward-fill them so every row with a
# null label inherits the most recent non-null sentence label above it.
df['Sentence #'] = df['Sentence #'].ffill(axis=0)

In [18]:
df['Tag'].unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [19]:
df['Tag'].value_counts()

Tag
O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64

In [20]:
# Drop the rows whose 'Word' is null (author's note: about 10 such rows, located
# at the start of a new sentence).
# NOTE(review): read_csv may turn literal tokens such as "null"/"NA" into NaN;
# confirm these rows are genuinely empty rather than real words lost on load.
df.dropna(subset='Word', inplace=True)

#### Feature Engineering

In [21]:
# Group the dataframe by sentence and collect each sentence as a list of
# (word, tag) tuples. Only 'Word' and 'Tag' are selected before apply(), so the
# lambda never operates on the grouping column itself (newer pandas versions
# deprecate apply() touching the grouping columns).
sentences = df.groupby('Sentence #')[['Word', 'Tag']].apply(
    lambda group: list(zip(group['Word'].tolist(), group['Tag'].tolist()))
)

In [22]:
# Materialise the grouped result as a plain Python list with one entry per
# sentence, each entry being that sentence's list of (word, tag) tuples.
sentences = list(sentences)

In [23]:
len(sentences)

47959

In [24]:
# Hold out 20% of the sentences as the test set; the split is done at sentence
# level and is seeded (random_state=42) so it is repeatable.
train_val_sentences, test_sentences = train_test_split(sentences, test_size=0.2, random_state=42)

# Split the remaining 80% again: 10% of it (about 8% of all sentences) becomes
# the validation set, the rest is used for training.
train_sentences, val_sentences = train_test_split(train_val_sentences, test_size=0.1, random_state=42)

In [25]:
# Viewing the shape of Train test and validation dataset
print(f"Number of sentences - Train: {len(train_sentences)}, Validation: {len(val_sentences)}, Test: {len(test_sentences)}")

Number of sentences - Train: 34530, Validation: 3837, Test: 9592


In [26]:
# Creating the function for calculating Precision, Recall, F1 Score and Accuracy
def evaluation_ner(true_tags, pred_tags):
    """Score predicted tags against gold tags at token level.

    Args:
        true_tags: flat sequence of gold tags.
        pred_tags: flat sequence of predicted tags, aligned with ``true_tags``.

    Returns:
        A ``(precision, recall, f1, accuracy)`` tuple. Precision, recall and F1
        are support-weighted averages over the classes (``average='weighted'``).
    """
    # Element 3 of the sklearn result (support) is not needed here.
    weighted_scores = precision_recall_fscore_support(true_tags, pred_tags, average='weighted')
    return (*weighted_scores[:3], accuracy_score(true_tags, pred_tags))

In [27]:
## CRF

In [28]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: sequence of items whose first element is the word
            (e.g. ``(word, tag)`` tuples).
        i: index of the token to describe.

    Returns:
        dict with a constant bias, the word itself, its lower-cased form,
        upper-case and digit flags, and the neighbouring words. ``'BOS'`` /
        ``'EOS'`` stand in for the missing neighbour at the sentence edges.
    """
    token = sent[i][0]
    at_start = i <= 0
    at_end = i >= len(sent) - 1
    return {
        'bias': 1.0,
        'word': token,
        'word.lower': token.lower(),
        'word.isupper': token.isupper(),
        'word.isdigit': token.isdigit(),
        # Beginning of Sentence marker when there is no previous token
        'prev_word': 'BOS' if at_start else sent[i-1][0],
        # End of Sentence marker when there is no following token
        'next_word': 'EOS' if at_end else sent[i+1][0],
    }

In [29]:
def sent2features(sentence):
    """Return one feature dict per token of ``sentence``, in token order."""
    feature_dicts = []
    for position in range(len(sentence)):
        feature_dicts.append(word2features(sentence, position))
    return feature_dicts

In [30]:
def sent2labels(sentence):
    """Return the tag of every ``(word, tag)`` pair in ``sentence``, in order."""
    labels = []
    for _word, label in sentence:
        labels.append(label)
    return labels

In [31]:
# Turn each split into CRF inputs: X_* holds one list of feature dicts per
# sentence, y_* holds the matching list of tags per sentence.

X_train = list(map(sent2features, train_sentences))
y_train = list(map(sent2labels, train_sentences))

X_val = list(map(sent2features, val_sentences))
y_val = list(map(sent2labels, val_sentences))

X_test = list(map(sent2features, test_sentences))
y_test = list(map(sent2labels, test_sentences))

## CRF

In [32]:
# Instantiate the CRF model; nothing is trained here — fitting happens in the
# next cell via crf.fit().
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100)

In [33]:
# Fit the model on the training data.
# NOTE(review): the AttributeError handler presumably works around a known
# incompatibility between sklearn-crfsuite and recent scikit-learn releases —
# confirm. The error is still tolerated (best effort), but it is now reported
# instead of being discarded silently, so a failed fit cannot go unnoticed.
try:
    crf.fit(X_train, y_train)
except AttributeError as fit_error:
    print(f"crf.fit raised AttributeError (ignored): {fit_error}")

In [34]:
# Predicting on validation dataset
val_predictions = crf.predict(X_val)

In [35]:
# Flatten the per-sentence label lists into token-level lists for comparison.
# Both are built from val_sentences in the same order.
# NOTE(review): assumes crf.predict returns exactly one tag per token — confirm.
true_val_labels = [label for sentence in y_val for label in sentence]
pred_val_labels = [label for sentence in val_predictions for label in sentence]

# Evaluate on the validation set (token-level scores from evaluation_ner)
precision, recall, f1, accuracy = evaluation_ner(true_val_labels, pred_val_labels)
print(f"Validation - \n\nPrecision: {precision}, \nRecall: {recall}, \nF1 Score: {f1}, \nAccuracy: {accuracy}")

Validation - 

Precision: 0.9643564539525207, 
Recall: 0.9658495654779742, 
F1 Score: 0.9647773327883615, 
Accuracy: 0.9658495654779742


In [36]:
# Predicting on test dataset
test_predictions = crf.predict(X_test)

In [37]:
# Flatten the per-sentence label lists into token-level lists for comparison
true_test_labels = [label for sentence in y_test for label in sentence]
pred_test_labels = [label for sentence in test_predictions for label in sentence]

# Evaluate on the held-out test set; the header is labelled "Test" so these
# numbers cannot be mistaken for the validation metrics printed earlier.
precision, recall, f1, accuracy = evaluation_ner(true_test_labels, pred_test_labels)
print(f"Test - \n\nPrecision: {precision}, \nRecall: {recall}, \nF1 Score: {f1}, \nAccuracy: {accuracy}")

Validation - 

Precision: 0.9653775829209574, 
Recall: 0.9670344334564518, 
F1 Score: 0.9658597854652271, 
Accuracy: 0.9670344334564518


In [38]:
# Print classification report
print(classification_report(true_test_labels, pred_test_labels))

              precision    recall  f1-score   support

       B-art       0.35      0.09      0.14        94
       B-eve       0.60      0.34      0.44        70
       B-geo       0.87      0.88      0.88      7558
       B-gpe       0.97      0.93      0.95      3142
       B-nat       0.68      0.33      0.44        40
       B-org       0.82      0.69      0.75      4151
       B-per       0.86      0.77      0.82      3400
       B-tim       0.93      0.87      0.90      4077
       I-art       0.27      0.05      0.08        84
       I-eve       0.53      0.25      0.34        65
       I-geo       0.82      0.79      0.80      1462
       I-gpe       0.94      0.48      0.64        33
       I-nat       0.86      0.46      0.60        13
       I-org       0.82      0.75      0.78      3394
       I-per       0.87      0.85      0.86      3406
       I-tim       0.82      0.79      0.81      1251
           O       0.98      0.99      0.99    177585

    accuracy              

In [53]:
x = [i[0] for i in test_sentences[0]]

In [57]:
" ".join(x)

"The report calls on President Bush and Congress to urge Chinese officials not to use the global war against terrorism as a pretext to suppress minorities ' rights ."

In [58]:
def predict_ner_tags_from_string(input_sentence):
    """Tag a raw sentence with the trained CRF model.

    The sentence is tokenised on whitespace only, so punctuation must already
    be separated by spaces (e.g. ``"rights ."``) to form its own token.

    Args:
        input_sentence: the text to tag.

    Returns:
        A list of ``(word, predicted_tag)`` tuples in token order. An empty or
        whitespace-only input yields ``[]`` without invoking the model.
    """
    # Creating list of words
    words = input_sentence.split()
    if not words:
        # Nothing to tag; skip feature extraction and prediction entirely.
        return []

    # sent2features only reads the word of each pair, so the tag slot is a placeholder
    formatted_sentence = [(word, "") for word in words]

    # Extract features and predict using the CRF model (single-sentence batch)
    features = sent2features(formatted_sentence)
    predicted_tags = crf.predict([features])[0]

    # Combine words and their predicted tags
    return list(zip(words, predicted_tags))

In [60]:
# Example usage.
# NOTE(review): input() waits for keyboard entry, so this cell blocks an
# unattended "Run All"; consider a hard-coded example sentence instead.
user_input = input("Enter a sentence for NER tagging: ")
predicted_tags = predict_ner_tags_from_string(user_input)
print("Predicted NER Tags:", predicted_tags)

Enter a sentence for NER tagging:  The report calls on President Bush and Congress to urge Chinese officials not to use the global war against terrorism as a pretext to suppress minorities ' rights .


Predicted NER Tags: [('The', 'O'), ('report', 'O'), ('calls', 'O'), ('on', 'O'), ('President', 'B-per'), ('Bush', 'I-per'), ('and', 'O'), ('Congress', 'B-org'), ('to', 'O'), ('urge', 'O'), ('Chinese', 'B-gpe'), ('officials', 'O'), ('not', 'O'), ('to', 'O'), ('use', 'O'), ('the', 'O'), ('global', 'O'), ('war', 'O'), ('against', 'O'), ('terrorism', 'O'), ('as', 'O'), ('a', 'O'), ('pretext', 'O'), ('to', 'O'), ('suppress', 'O'), ('minorities', 'O'), ("'", 'O'), ('rights', 'O'), ('.', 'O')]


## BiLSTM

In [24]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [25]:
# Extract the vocabulary and the tag set.
# sorted() (rather than list(set(...))) gives every word/tag the same index on
# every run; the iteration order of a set of strings is not stable across
# Python processes, which would make word2idx/tag2idx irreproducible.
words = sorted(set(df["Word"].values))
tags = sorted(set(df["Tag"].values))

In [26]:
# Add padding tokens to the list of words and tags.
# Guarded so that re-running this cell does not append duplicates, which would
# otherwise grow the vocabulary and shift the padding index on every re-run.
if "PADword" not in words:
    words.append("PADword")
if "PADtag" not in tags:
    tags.append("PADtag")

In [27]:
# Lookup tables: word -> index, tag -> index, and the inverse index -> tag
word2idx = dict(zip(words, range(len(words))))
tag2idx = dict(zip(tags, range(len(tags))))
idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))

In [28]:
# Encode every sentence as two parallel lists: word indices (X) and tag indices (y)
X = [[word2idx[token] for token, _ in sentence] for sentence in sentences]
y = [[tag2idx[label] for _, label in sentence] for sentence in sentences]

In [63]:
df['Sentence #'].value_counts()

Sentence #
Sentence: 22480    104
Sentence: 33481     81
Sentence: 40153     73
Sentence: 21167     72
Sentence: 21776     70
                  ... 
Sentence: 1595       2
Sentence: 40249      2
Sentence: 37093      2
Sentence: 8412       1
Sentence: 38917      1
Name: count, Length: 47959, dtype: int64

In [29]:
MAX_LEN=50

In [30]:
# Pad every sequence to MAX_LEN, appending the padding index at the end
# (padding='post'). The sentence-length counts above show sentences of up to 104
# tokens, so sequences longer than MAX_LEN are cut down to MAX_LEN.
# NOTE(review): `truncating` is left at its default — confirm which end of an
# over-long sentence is dropped and whether that is intended.
X = pad_sequences(X, maxlen=MAX_LEN, padding='post', value=word2idx["PADword"])
y = pad_sequences(y, maxlen=MAX_LEN, padding='post', value=tag2idx["PADtag"])

In [31]:
# Convert labels to categorical (one-hot encoding), one array per sentence.
# NOTE: this rebinds `y`, so the cell is not idempotent — re-running it without
# re-running the padding cell would one-hot encode already encoded data.
y = [to_categorical(i, num_classes=len(tag2idx)) for i in y]

In [32]:
# Split the data into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [33]:
# Define the BiLSTM Model
input = tf.keras.Input(shape=(MAX_LEN,))

In [34]:
# Embedding layer
model = tf.keras.layers.Embedding(input_dim=len(words), output_dim=100, input_length=MAX_LEN)(input)

# BiLSTM layer
model = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)

# Dense layer
model = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(len(tag2idx), activation="softmax"))(model)

In [35]:
# Build the model
model = tf.keras.Model(input, model)

In [36]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [37]:
# Print model summary
model.summary()

In [38]:
# Train the model
history = model.fit(X_train, np.array(y_train), validation_data=(X_val, np.array(y_val)), batch_size=32, epochs=5, verbose=1)

Epoch 1/5
[1m1080/1080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 60ms/step - accuracy: 0.9332 - loss: 0.2998 - val_accuracy: 0.9830 - val_loss: 0.0568
Epoch 2/5
[1m1080/1080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 63ms/step - accuracy: 0.9865 - loss: 0.0442 - val_accuracy: 0.9854 - val_loss: 0.0482
Epoch 3/5
[1m1080/1080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 63ms/step - accuracy: 0.9900 - loss: 0.0318 - val_accuracy: 0.9855 - val_loss: 0.0480
Epoch 4/5
[1m1080/1080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 75ms/step - accuracy: 0.9918 - loss: 0.0253 - val_accuracy: 0.9857 - val_loss: 0.0489
Epoch 5/5
[1m1080/1080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 64ms/step - accuracy: 0.9931 - loss: 0.0210 - val_accuracy: 0.9853 - val_loss: 0.0519


In [39]:
# Evaluate the model
test_pred = model.predict(X_test, verbose=1)

[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step


In [40]:
# Convert predictions to label format
pred_labels = np.argmax(test_pred, axis=-1)
true_labels = np.argmax(np.array(y_test), axis=-1)

In [41]:
# Flatten the labels for evaluation, dropping padding positions.
# One mask, derived from the TRUE labels, is applied to both arrays so the two
# flat lists always have the same length and stay aligned token-for-token.
# Filtering each array by its own PAD values would shift the lists against each
# other whenever the model predicts PADtag on a real token (or a real tag on
# padding). A PADtag predicted on a real token is kept and counted as an error.
pad_tag_idx = tag2idx['PADtag']
real_token_mask = true_labels != pad_tag_idx
flat_true_labels = [idx2tag[i] for i in true_labels[real_token_mask]]
flat_pred_labels = [idx2tag[i] for i in pred_labels[real_token_mask]]

In [42]:
# Evaluate on the held-out test set; the header is labelled "Test" because these
# metrics are computed from the test split, not the validation split.
precision, recall, f1, accuracy = evaluation_ner(flat_true_labels, flat_pred_labels)
print(f"Test - \n\nPrecision: {precision}, \nRecall: {recall}, \nF1 Score: {f1}, \nAccuracy: {accuracy}")

Validation - 

Precision: 0.9671529855975101, 
Recall: 0.9678869988602603, 
F1 Score: 0.9673189764422029, 
Accuracy: 0.9678869988602603


In [43]:
# Print classification report
print(classification_report(flat_true_labels, flat_pred_labels))

              precision    recall  f1-score   support

       B-art       0.42      0.05      0.09        94
       B-eve       0.62      0.26      0.36        70
       B-geo       0.86      0.89      0.87      7556
       B-gpe       0.96      0.94      0.95      3137
       B-nat       0.33      0.28      0.30        40
       B-org       0.76      0.73      0.75      4148
       B-per       0.85      0.80      0.82      3397
       B-tim       0.92      0.88      0.90      4074
       I-art       0.00      0.00      0.00        84
       I-eve       0.47      0.12      0.20        65
       I-geo       0.79      0.79      0.79      1461
       I-gpe       0.86      0.55      0.67        33
       I-nat       1.00      0.08      0.14        13
       I-org       0.78      0.79      0.79      3391
       I-per       0.86      0.88      0.87      3404
       I-tim       0.76      0.83      0.79      1249
           O       0.99      0.99      0.99    177481

    accuracy              