In [12]:
#traditional model

In [13]:
from datasets import load_dataset

dataset = load_dataset("conll2003", trust_remote_code=True)
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [14]:
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()
    }
    if i > 0:
        word1 = sent[i - 1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper()
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper()
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(list(zip(sent["tokens"], sent["ner_tags"])), i) for i in range(len(sent["tokens"]))]

def sent2labels(sent):
    return [tag for tag in sent["ner_tags"]]


In [15]:
X_train = [sent2features(s) for s in dataset["train"]]
y_train = [sent2labels(s) for s in dataset["train"]]

X_test = [sent2features(s) for s in dataset["test"]]
y_test = [sent2labels(s) for s in dataset["test"]]

# Decode label IDs to names for evaluation later
label_names = dataset["train"].features["ner_tags"].feature.names


In [17]:
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

# Convert y_train and y_test to lists of strings
y_train_str = [[str(label) for label in seq] for seq in y_train]
y_test_str = [[str(label) for label in seq] for seq in y_test]

# Fit the CRF model
crf.fit(X_train, y_train_str)


In [18]:
from seqeval.metrics import classification_report

y_pred = crf.predict(X_test)

# Convert label IDs to names
y_test_str = [[label_names[i] for i in seq] for seq in y_test]
y_pred_str = [[label_names[int(i)] for i in seq] for seq in y_pred]

print(classification_report(y_test_str, y_pred_str))


              precision    recall  f1-score   support

         LOC       0.87      0.82      0.84      1668
        MISC       0.80      0.74      0.77       702
         ORG       0.78      0.66      0.72      1661
         PER       0.82      0.84      0.83      1617

   micro avg       0.82      0.77      0.79      5648
   macro avg       0.82      0.77      0.79      5648
weighted avg       0.82      0.77      0.79      5648



In [None]:
#neural network model

In [19]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import numpy as np

dataset = load_dataset("conll2003", trust_remote_code=True)
label_list = dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)


In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

MAX_LEN = 128

# Build vocab
all_words = set()
for row in dataset["train"]:
    all_words.update(row["tokens"])
word2idx = {w: i + 2 for i, w in enumerate(sorted(all_words))}
word2idx["PAD"] = 0
word2idx["UNK"] = 1
idx2word = {i: w for w, i in word2idx.items()}

# Encode tags
tag2idx = {t: i for i, t in enumerate(label_list)}
idx2tag = {i: t for t, i in tag2idx.items()}

def encode_sentence(tokens):
    return [word2idx.get(w, word2idx["UNK"]) for w in tokens]

def encode_labels(tags):
    return [tag2idx[t] for t in tags]

def prepare_data(split):
    X = [encode_sentence(example["tokens"]) for example in dataset[split]]
    y = [encode_labels([label_list[i] for i in example["ner_tags"]]) for example in dataset[split]]
    X = pad_sequences(X, maxlen=MAX_LEN, padding="post")
    y = pad_sequences(y, maxlen=MAX_LEN, padding="post")
    y = [to_categorical(i, num_classes=num_labels) for i in y]
    return np.array(X), np.array(y)

X_train, y_train = prepare_data("train")
X_val, y_val = prepare_data("validation")


In [25]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, TimeDistributed, Dense, Bidirectional

model = Sequential()
model.add(Embedding(input_dim=len(word2idx), output_dim=128, input_length=MAX_LEN, mask_zero=True))
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(TimeDistributed(Dense(num_labels, activation="softmax")))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()


In [23]:
history = model.fit(X_train, y_train, batch_size=32, epochs=3, validation_data=(X_val, y_val))


Epoch 1/3
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 260ms/step - accuracy: 0.9808 - loss: 0.8120 - val_accuracy: 0.9902 - val_loss: 0.3064
Epoch 2/3
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 245ms/step - accuracy: 0.9949 - loss: 0.1600 - val_accuracy: 0.9941 - val_loss: 0.1885
Epoch 3/3
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 253ms/step - accuracy: 0.9989 - loss: 0.0430 - val_accuracy: 0.9946 - val_loss: 0.1667


In [26]:
model.build(input_shape=(None, MAX_LEN))
model.summary()


In [29]:
X_test, y_test = prepare_data("test")
y_pred = model.predict(X_test)

y_pred_labels = np.argmax(y_pred, axis=-1)
y_true_labels = np.argmax(y_test, axis=-1)

def decode_tags(y_seq, y_mask):
    return [
        [idx2tag[idx] for idx, mask in zip(row, mask_row) if mask != 0]
        for row, mask_row in zip(y_seq, y_mask)
    ]

# Create masks to exclude padding tokens
y_mask = np.argmax(y_test, axis=-1) != 0

y_pred_str = decode_tags(y_pred_labels, y_mask)
y_true_str = decode_tags(y_true_labels, y_mask)

from seqeval.metrics import classification_report
print(classification_report(y_true_str, y_pred_str))

import numpy as np

loss, accuracy = model.evaluate(X_test, y_test)
perplexity = np.exp(loss)

print("Cross-entropy Loss:", loss)
print("Perplexity:", perplexity)


[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step
              precision    recall  f1-score   support

         LOC       0.20      0.28      0.23      1668
        MISC       0.10      0.16      0.12       702
         ORG       0.14      0.15      0.14      1661
         PER       0.07      0.06      0.07      1617

   micro avg       0.14      0.16      0.15      5648
   macro avg       0.13      0.16      0.14      5648
weighted avg       0.13      0.16      0.14      5648

[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 37ms/step - accuracy: 0.9045 - loss: 2.1971
Cross-entropy Loss: 2.196669816970825
Perplexity: 8.995008541371172
