In [28]:
# write the list of necessary packages here:
!pip install pandas
!pip install nltk
!pip install spacy
!pip install scikit-learn
!pip install numpy
!pip install tqdm
!pip install tensorflow



# Training a model on Named Entity Recognition task

## Token classification refers to the task of classifying individual tokens in a sentence. One of the most common token classification tasks is Named Entity Recognition (NER). NER attempts to find a label for each entity in a sentence, such as a person, location, or organization. In this assignment, you will learn how to train a model on the CoNLL 2023 NER Dataset dataset to detect new entities.

## Loading the dataset

In [29]:
# import your packages here:
import pandas as pd
import nltk
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import spacy
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, TimeDistributed
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.utils import to_categorical

# Load spaCy model for token processing
nlp = spacy.load("en_core_web_sm")

In [30]:
train_df = pd.read_csv("train.txt", header=0, sep=" ")
val_df = pd.read_csv("val.txt", header=0, sep=" ")
test_df = pd.read_csv("test.txt", header=0, sep=" ")

print(f"{train_df.shape}, {val_df.shape}, {test_df.shape}")

(204566, 4), (51577, 4), (46665, 4)


In [31]:
"""
# google colab crashes due to ram. so i had to use small part of dataset.

train_df = train_df.iloc[:10000]
val_df = val_df.iloc[:10000]
test_df = test_df.iloc[:10000]

print(f"{train_df.shape}, {val_df.shape}, {test_df.shape}")
"""

'\n# google colab crashes due to ram. so i had to use small part of dataset.\n\ntrain_df = train_df.iloc[:10000]\nval_df = val_df.iloc[:10000]\ntest_df = test_df.iloc[:10000]\n\nprint(f"{train_df.shape}, {val_df.shape}, {test_df.shape}")\n'

### The CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on a separate line and there is an empty line after each sentence. The first item on each line is a word, the second a part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags and the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only if two phrases of the same type immediately follow each other, the first word of the second phrase will have tag B-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Here is an example:


In [32]:
train_df.head()

Unnamed: 0,-DOCSTART-,-X-,-X-.1,O
0,EU,NNP,B-NP,B-ORG
1,rejects,VBZ,B-VP,O
2,German,JJ,B-NP,B-MISC
3,call,NN,I-NP,O
4,to,TO,B-VP,O


In [33]:
val_df.head()

Unnamed: 0,-DOCSTART-,-X-,-X-.1,O
0,CRICKET,NNP,B-NP,O
1,-,:,O,O
2,LEICESTERSHIRE,NNP,B-NP,B-ORG
3,TAKE,NNP,I-NP,O
4,OVER,IN,B-PP,O


In [34]:
test_df.head()

Unnamed: 0,-DOCSTART-,-X-,-X-.1,O
0,SOCCER,NN,B-NP,O
1,-,:,O,O
2,JAPAN,NNP,B-NP,B-LOC
3,GET,VB,B-VP,O
4,LUCKY,NNP,B-NP,O


In [35]:
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

labels_vocab = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
labels_vocab_reverse = {v:k for k,v in labels_vocab.items()}

In [36]:
labels_vocab_reverse

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

# Feature Extraction

## You need to extract features for each token. The features can be: • Basic features: Token itself, token lowercase, prefix/suffix of the token. • Context features: Neighboring tokens (previous/next token). • Linguistic features: Part-of-speech (POS) tags or word shapes (capitalization, digits, etc.). Note that you are expected to briefly mention which features you employ for training your model.

In [37]:
columns = ["Token", "POS", "Chunk", "NER"]      # for ease of use
train_df.columns = columns
val_df.columns = columns
test_df.columns = columns

In [38]:
val_df.head()

Unnamed: 0,Token,POS,Chunk,NER
0,CRICKET,NNP,B-NP,O
1,-,:,O,O
2,LEICESTERSHIRE,NNP,B-NP,B-ORG
3,TAKE,NNP,I-NP,O
4,OVER,IN,B-PP,O


In [39]:
train_df

Unnamed: 0,Token,POS,Chunk,NER
0,EU,NNP,B-NP,B-ORG
1,rejects,VBZ,B-VP,O
2,German,JJ,B-NP,B-MISC
3,call,NN,I-NP,O
4,to,TO,B-VP,O
...,...,...,...,...
204561,three,CD,I-NP,O
204562,Swansea,NN,B-NP,B-ORG
204563,1,CD,I-NP,O
204564,Lincoln,NNP,I-NP,B-ORG


In [40]:
test_df.head()

Unnamed: 0,Token,POS,Chunk,NER
0,SOCCER,NN,B-NP,O
1,-,:,O,O
2,JAPAN,NNP,B-NP,B-LOC
3,GET,VB,B-VP,O
4,LUCKY,NNP,B-NP,O


In [53]:
# write your code here:

# Function to extract suffix using custom method
def extract_suffix(word):
    # Define common suffixes to look for
    suffixes = ["ing", "ed", "ly", "s", "es", "er", "est", "ment", "ness", "tion"]

    # Check if the word ends with any of the suffixes
    for suffix in suffixes:
        if word.endswith(suffix):
            return suffix
    return ""  # Return empty string if no suffix is found

# Function to extract prefix using spaCy
def extract_prefix(word):
    doc = nlp(word)  # Process the word with spaCy
    # Extract prefix based on patterns
    for token in doc:
        # Try to identify the prefix using simple heuristic rules
        # For example, check if the token starts with common prefixes like "un", "re", "dis" etc.
        # This can be extended with a more sophisticated method or word list.
        if token.text.startswith("un"):
            return "un"
        elif token.text.startswith("dis"):
            return "dis"
        elif token.text.startswith("re"):
            return "re"
        # Add more common prefixes as needed.
    return ""  # If no prefix found, return empty



def data_preprocessing(df):
    sentences = []  # Initialize an empty list to hold all the sentences
    sentence = []   # Initialize an empty list to hold the tokens of the current sentence

    # Iterate through rows of the dataframe with error handling
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
        try:
            token = row["Token"]  # Get the token (word) from the current row

            # Skip the -DOCSTART- line as it's not part of the actual data
            if token == "-DOCSTART-" or pd.isna(token):  # Check if the token is NaN or missing
                continue

            # Ensure token is a string before processing
            token = str(token)  # Convert token to string to avoid float/NaN issues

            # Process token using spaCy (lemmatization)
            #doc = nlp(token)  # Process the token with spaCy
            #lemmatized_token = doc[0].lemma_  # Get the lemmatized form of the token

            # Extract prefix and suffix using custom methods
            prefix = extract_prefix(token) if extract_prefix(token) else "<NONE>"
            suffix = extract_suffix(token) if extract_suffix(token) else "<NONE>"

            # Prepare features for each token
            token_lowercase = token.lower()  # Lowercase version of the token
            prev_token = sentence[-1]["token"] if sentence else "<START>"  # Previous token (for context)
            next_token = df.iloc[index + 1]["Token"] if index + 1 < len(df) else "<END>"  # Next token (for context)

            # Store features for the current token
            sentence.append({
                "token": token,  # Original token
                "token_lowercase": token_lowercase,  # Lowercased token
                #"lemmatized_token": lemmatized_token,  # Lemmatized token
                "prefix": prefix,  # Prefix of the token (if any)
                "suffix": suffix,  # Suffix of the token (if any)
                "prev_token": prev_token,  # Previous token (for context)
                "next_token": next_token,  # Next token (for context)
                "pos_tag": row["POS"],  # POS tag (e.g., noun, verb)
                "chunk_tag": row["Chunk"],  # Chunk tag (e.g., noun phrase, verb phrase)
                "is_capitalized": token[0].isupper(),  # Is the first letter capitalized?
                "has_digits": token.isdigit(),  # Only mark as True if token consists entirely of digits
                "is_alphanumeric": token.isalnum(),  # Is the token alphanumeric (letters + digits)?
                "ner_label": row["NER"]  # NER label (e.g., PERSON, ORGANIZATION, etc.)
            })

            # Sentence boundary detection: if token is punctuation, mark the end of the sentence
            if token in [".", "!", "?", ";"]:
                sentences.append(sentence)  # Add the sentence to the sentences list
                sentence = []  # Reset the sentence list for the next sentence

        except Exception as e:
            print(f"Error processing token at index {index}: {e}")
            print(f"Token: {token}")
            continue  # Skip to the next token

    # Handle the last sentence (if any)
    if sentence:
        sentences.append(sentence)

    return sentences  # Return the list of processed sentences



In [42]:
train_sentences = data_preprocessing(train_df)

# Check the preprocessed data
print(train_sentences[0])  # Print first sentence for inspection

Processing rows: 100%|██████████| 204566/204566 [28:02<00:00, 121.59it/s]

[{'token': 'EU', 'token_lowercase': 'eu', 'prefix': '<NONE>', 'suffix': '<NONE>', 'prev_token': '<START>', 'next_token': 'rejects', 'pos_tag': 'NNP', 'chunk_tag': 'B-NP', 'is_capitalized': True, 'has_digits': False, 'is_alphanumeric': True, 'ner_label': 'B-ORG'}, {'token': 'rejects', 'token_lowercase': 'rejects', 'prefix': 're', 'suffix': 's', 'prev_token': 'EU', 'next_token': 'German', 'pos_tag': 'VBZ', 'chunk_tag': 'B-VP', 'is_capitalized': False, 'has_digits': False, 'is_alphanumeric': True, 'ner_label': 'O'}, {'token': 'German', 'token_lowercase': 'german', 'prefix': '<NONE>', 'suffix': '<NONE>', 'prev_token': 'rejects', 'next_token': 'call', 'pos_tag': 'JJ', 'chunk_tag': 'B-NP', 'is_capitalized': True, 'has_digits': False, 'is_alphanumeric': True, 'ner_label': 'B-MISC'}, {'token': 'call', 'token_lowercase': 'call', 'prefix': '<NONE>', 'suffix': '<NONE>', 'prev_token': 'German', 'next_token': 'to', 'pos_tag': 'NN', 'chunk_tag': 'I-NP', 'is_capitalized': False, 'has_digits': False, 




In [43]:
val_sentences = data_preprocessing(val_df)

# Check the preprocessed data
print(val_sentences[0])  # Print first sentence for inspection

Processing rows: 100%|██████████| 51577/51577 [06:51<00:00, 125.21it/s]

[{'token': 'CRICKET', 'token_lowercase': 'cricket', 'prefix': '<NONE>', 'suffix': '<NONE>', 'prev_token': '<START>', 'next_token': '-', 'pos_tag': 'NNP', 'chunk_tag': 'B-NP', 'is_capitalized': True, 'has_digits': False, 'is_alphanumeric': True, 'ner_label': 'O'}, {'token': '-', 'token_lowercase': '-', 'prefix': '<NONE>', 'suffix': '<NONE>', 'prev_token': 'CRICKET', 'next_token': 'LEICESTERSHIRE', 'pos_tag': ':', 'chunk_tag': 'O', 'is_capitalized': False, 'has_digits': False, 'is_alphanumeric': False, 'ner_label': 'O'}, {'token': 'LEICESTERSHIRE', 'token_lowercase': 'leicestershire', 'prefix': '<NONE>', 'suffix': '<NONE>', 'prev_token': '-', 'next_token': 'TAKE', 'pos_tag': 'NNP', 'chunk_tag': 'B-NP', 'is_capitalized': True, 'has_digits': False, 'is_alphanumeric': True, 'ner_label': 'B-ORG'}, {'token': 'TAKE', 'token_lowercase': 'take', 'prefix': '<NONE>', 'suffix': '<NONE>', 'prev_token': 'LEICESTERSHIRE', 'next_token': 'OVER', 'pos_tag': 'NNP', 'chunk_tag': 'I-NP', 'is_capitalized': T




In [44]:
test_sentences = data_preprocessing(test_df)

# Check the preprocessed data
print(test_sentences[0])  # Print first  sentence for inspection

Processing rows: 100%|██████████| 46665/46665 [06:21<00:00, 122.43it/s]

[{'token': 'SOCCER', 'token_lowercase': 'soccer', 'prefix': '<NONE>', 'suffix': '<NONE>', 'prev_token': '<START>', 'next_token': '-', 'pos_tag': 'NN', 'chunk_tag': 'B-NP', 'is_capitalized': True, 'has_digits': False, 'is_alphanumeric': True, 'ner_label': 'O'}, {'token': '-', 'token_lowercase': '-', 'prefix': '<NONE>', 'suffix': '<NONE>', 'prev_token': 'SOCCER', 'next_token': 'JAPAN', 'pos_tag': ':', 'chunk_tag': 'O', 'is_capitalized': False, 'has_digits': False, 'is_alphanumeric': False, 'ner_label': 'O'}, {'token': 'JAPAN', 'token_lowercase': 'japan', 'prefix': '<NONE>', 'suffix': '<NONE>', 'prev_token': '-', 'next_token': 'GET', 'pos_tag': 'NNP', 'chunk_tag': 'B-NP', 'is_capitalized': True, 'has_digits': False, 'is_alphanumeric': True, 'ner_label': 'B-LOC'}, {'token': 'GET', 'token_lowercase': 'get', 'prefix': '<NONE>', 'suffix': '<NONE>', 'prev_token': 'JAPAN', 'next_token': 'LUCKY', 'pos_tag': 'VB', 'chunk_tag': 'B-VP', 'is_capitalized': True, 'has_digits': False, 'is_alphanumeric'




# Train a NER Classifier Model

## Implement one of the following classifiers for recognizing multiple entity types (e.g., person, organization, location): Conditional Random Field (CRF), biLSTM or multinomial logistic regression. Select only one and provide a brief explanation for your choice of model.

For the implementation of Named Entity Recognition (NER) involving multiple entity types, I have selected the biLSTM (Bidirectional Long Short-Term Memory) model over alternatives like CRF or multinomial logistic regression. It has some advantages.



Handling Sequential Data: NER is a sequence labeling task, and biLSTM is particularly well-suited for such problems. It captures context from both preceding (left) and succeeding (right) words in a sentence. This bidirectional capability enables the model to utilize the full context, which is critical for resolving ambiguities, such as distinguishing between "Apple" (the company) and "apple" (the fruit).

Proven Effectiveness for NER: biLSTM models are known for delivering excellent performance on NER tasks. They effectively learn dependencies between tokens in a sequence while overcoming the limitations of models that consider only past or future context.

Feature Flexibility: biLSTM can efficiently incorporate various features, such as word embeddings, part-of-speech tags, prefixes, and suffixes. These features are essential for accurately identifying and differentiating between entity types.

https://domino.ai/blog/named-entity-recognition-ner-challenges-and-model

https://medium.com/illuin/named-entity-recognition-with-bilstm-cnns-632ba83d3d41

https://www.quora.com/Why-do-we-use-Bi-LSTM-for-Named-Entity-Recognition-instead-of-normal-LSTM

In [45]:
# Preparing the data for training
def prepare_data(sentences):
    X, y = [], []
    for sentence in sentences:
        tokens = [word['token'] for word in sentence]
        labels = [word['ner_label'] for word in sentence]
        X.append(tokens)
        y.append(labels)
    return X, y

# Tokenization and Padding
tokenizer = Tokenizer(lower=True)
X_train, y_train = prepare_data(train_sentences)
X_val, y_val = prepare_data(val_sentences)
X_test, y_test = prepare_data(test_sentences)

# Fit the tokenizer on training data
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding sequences
maxlen = max([len(sentence) for sentence in X_train])
X_train_seq = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_val_seq = pad_sequences(X_val_seq, maxlen=maxlen, padding='post')
X_test_seq = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')

# Label Encoding
all_labels = list(set([label for labels in y_train for label in labels]))
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

y_train_encoded = [label_encoder.transform(labels) for labels in y_train]
y_val_encoded = [label_encoder.transform(labels) for labels in y_val]
y_test_encoded = [label_encoder.transform(labels) for labels in y_test]

# Padding labels
y_train_encoded = pad_sequences(y_train_encoded, maxlen=maxlen, padding='post')
y_val_encoded = pad_sequences(y_val_encoded, maxlen=maxlen, padding='post')
y_test_encoded = pad_sequences(y_test_encoded, maxlen=maxlen, padding='post')

# Convert labels to categorical for the model
y_train_categorical = to_categorical(y_train_encoded, num_classes=len(label_encoder.classes_))
y_val_categorical = to_categorical(y_val_encoded, num_classes=len(label_encoder.classes_))



In [46]:
# Build the biLSTM Model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=maxlen))
model.add(Bidirectional(LSTM(units=16, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model.add(TimeDistributed(Dense(len(label_encoder.classes_), activation='softmax')))

model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    X_train_seq, y_train_categorical,
    validation_data=(X_val_seq, y_val_categorical),
    epochs=3,
    batch_size=8
)



Epoch 1/3
[1m931/931[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2211s[0m 2s/step - accuracy: 0.9898 - loss: 0.0838 - val_accuracy: 0.9947 - val_loss: 0.0290
Epoch 2/3
[1m931/931[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2102s[0m 2s/step - accuracy: 0.9969 - loss: 0.0101 - val_accuracy: 0.9947 - val_loss: 0.0330
Epoch 3/3
[1m931/931[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2120s[0m 2s/step - accuracy: 0.9982 - loss: 0.0063 - val_accuracy: 0.9948 - val_loss: 0.0358


In [55]:
# We can use class_weights for class imbalances


# padding sequences
maxlen = max([len(sentence) for sentence in X_train])
X_train_seq = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_val_seq = pad_sequences(X_val_seq, maxlen=maxlen, padding='post')
X_test_seq = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')

# labels are integer-encoded and padded
y_train_encoded = pad_sequences(y_train_encoded, maxlen=maxlen, padding='post')
y_val_encoded = pad_sequences(y_val_encoded, maxlen=maxlen, padding='post')

# class weight calculation
flat_labels = [label for labels in y_train_encoded for label in labels if label != 0]
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(flat_labels),
    y=flat_labels
)
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

# Build the Model
model2 = Sequential()
model2.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=maxlen))
model2.add(Bidirectional(LSTM(units=4, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)))
model2.add(TimeDistributed(Dense(len(label_encoder.classes_), activation='softmax')))

model2.compile(optimizer=Adam(learning_rate=1.0), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


history = model2.fit(
    X_train_seq, y_train_encoded,
    validation_data=(X_val_seq, y_val_encoded),
    epochs=3,
    batch_size=8
)




Epoch 1/3
[1m931/931[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1396s[0m 1s/step - accuracy: 0.9794 - loss: 0.1494 - val_accuracy: 0.9886 - val_loss: 0.1737
Epoch 2/3
[1m931/931[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1289s[0m 1s/step - accuracy: 0.9878 - loss: 0.1692 - val_accuracy: 0.9890 - val_loss: 0.1730
Epoch 3/3
[1m931/931[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1345s[0m 1s/step - accuracy: 0.9879 - loss: 0.1784 - val_accuracy: 0.9902 - val_loss: 0.1574


# Evaluation
## Evaluate the model on the test set using metrics such as precision, recall, and F1-score

In [56]:
# Evaluate the model on the test set
y_pred = model.predict(X_test_seq)
y_pred_labels = np.argmax(y_pred, axis=-1)

[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 525ms/step


In [57]:
# Decode the predicted labels back to NER tags
y_pred_decoded = label_encoder.inverse_transform(y_pred_labels.flatten())

In [58]:
# Flatten the actual labels and decode them
y_test_flat = y_test_encoded.flatten()
y_test_decoded = label_encoder.inverse_transform(y_test_flat)

In [59]:
# Print Classification Report
print(classification_report(y_test_decoded, y_pred_decoded))

              precision    recall  f1-score   support

       B-LOC       1.00      1.00      1.00   2034857
      B-MISC       0.32      0.22      0.26       702
       B-ORG       0.20      0.16      0.17      1661
       B-PER       0.26      0.16      0.20      1617
       I-LOC       0.40      0.01      0.02       257
      I-MISC       0.10      0.01      0.02       216
       I-ORG       0.28      0.06      0.09       835
       I-PER       0.17      0.05      0.08      1156
           O       0.87      0.82      0.84     37894
         nan       0.68      0.68      0.68       421

    accuracy                           0.99   2079616
   macro avg       0.43      0.32      0.34   2079616
weighted avg       0.99      0.99      0.99   2079616



In [60]:
y_pred2 = model2.predict(X_test_seq)
y_pred2_labels = np.argmax(y_pred2, axis=-1)
y_pred2_decoded = label_encoder.inverse_transform(y_pred2_labels.flatten())
y_test_flat = y_test_encoded.flatten()
y_test_decoded = label_encoder.inverse_transform(y_test_flat)
print(classification_report(y_test_decoded, y_pred2_decoded))

[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 469ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       B-LOC       0.99      1.00      1.00   2034857
      B-MISC       0.00      0.00      0.00       702
       B-ORG       0.00      0.00      0.00      1661
       B-PER       0.00      0.00      0.00      1617
       I-LOC       0.00      0.00      0.00       257
      I-MISC       0.00      0.00      0.00       216
       I-ORG       0.00      0.00      0.00       835
       I-PER       0.00      0.00      0.00      1156
           O       0.84      0.64      0.73     37894
         nan       0.00      0.00      0.00       421

    accuracy                           0.99   2079616
   macro avg       0.18      0.16      0.17   2079616
weighted avg       0.99      0.99      0.99   2079616



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Report

I chose to use only the token_lowercase for simplicity, but if we use the features we discovered during the feature selection process, such as suffix, prefix, pos_tag, previous token, we would get a much more successful model. Of course, since we added these features, we would need a larger model and more epochs. We also need sufficient computational power to do this. We can also reduce the risk of overfitting by adding more data. I preferred to use dropout to reduce the risk of possible overfitting in the model.

We can use the compute_class_weight structure to eliminate the imbalance in the data set. This calculates the weight of each class and less common classes receive higher weights. Instead, structures such as oversampling and focal loss could have been preferred.

As I mentioned above, biLSTM produces good results for NER tasks. We can increase the success of the current model by playing with hyperparameters such as epoch numbers and number of neurons and layers. However, using more advanced models such as transformers architecture can provide more successful results.

https://keras.io/examples/nlp/ner_transformers/

https://medium.com/@pasdan/building-custom-named-entity-recognition-ner-models-transformers-9759f8d547d8