In [1]:
import os
import urllib.request
import tarfile
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout
from tensorflow.keras.models import Model

# Download and extract dataset
def download_untar(url, extract_path):
    if not os.path.exists(extract_path):
        os.makedirs(extract_path, exist_ok=True)
        filename = os.path.join(extract_path, 'dataset.tar.gz')
        urllib.request.urlretrieve(url, filename)
        with tarfile.open(filename, "r:gz") as tar:
            tar.extractall(path=extract_path)
        print("Dataset ready!")
    else:
        print("Dataset already exists.")

# Download the dataset
conll_tar_url = 'http://lnsigo.mipt.ru/export/datasets/conll2003.tar.gz'
download_path = 'conll2003/'
download_untar(conll_tar_url, download_path)

# Preprocess dataset
data_types = ['train', 'test', 'valid']
dataset_dict = dict()

for data_type in data_types:
    with open(f'{download_path}{data_type}.txt', 'r') as f:
        xy_list = []
        tokens, tags = [], []
        for line in f:
            items = line.split()
            if len(items) > 1 and '-DOCSTART-' not in items[0]:
                token, tag = items
                tokens.append(token)
                tags.append(tag)
            elif tokens:
                xy_list.append((tokens, tags))
                tokens, tags = [], []
        dataset_dict[data_type] = xy_list

# Display dataset statistics
for key in dataset_dict:
    print(f'Number of sentences in {key}: {len(dataset_dict[key])}')

# Display sample data
print("\nFirst two samples from the training set:")
for i, (tokens, tags) in enumerate(dataset_dict['train'][:2]):
    print(f"Sentence {i + 1}:")
    print(f"Tokens: {tokens}")
    print(f"Tags:   {tags}")


Dataset already exists.
Number of sentences in train: 14041
Number of sentences in test: 3453
Number of sentences in valid: 3250

First two samples from the training set:
Sentence 1:
Tokens: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
Tags:   ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
Sentence 2:
Tokens: ['Peter', 'Blackburn']
Tags:   ['B-PER', 'I-PER']


In [2]:
from sklearn.model_selection import train_test_split

# Prepare data for model
def encode_data(data, vocab, tag_vocab):
    X, y = [], []
    for tokens, tags in data:
        X.append([vocab.get(token, vocab["<UNK>"]) for token in tokens])
        y.append([tag_vocab[tag] for tag in tags])
    return X, y

# Create vocabularies
word_vocab = {"<PAD>": 0, "<UNK>": 1}
tag_vocab = {"<PAD>": 0}
for tokens, tags in dataset_dict["train"]:
    for token in tokens:
        if token not in word_vocab:
            word_vocab[token] = len(word_vocab)
    for tag in tags:
        if tag not in tag_vocab:
            tag_vocab[tag] = len(tag_vocab)

# Encode dataset
X_train, y_train = encode_data(dataset_dict["train"], word_vocab, tag_vocab)
X_test, y_test = encode_data(dataset_dict["test"], word_vocab, tag_vocab)

# Pad sequences dynamically
all_data = X_train + X_test
max_len = max(len(seq) for seq in all_data)

X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_len, padding="post")
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_len, padding="post")
y_train = tf.keras.preprocessing.sequence.pad_sequences(y_train, maxlen=max_len, padding="post")
y_test = tf.keras.preprocessing.sequence.pad_sequences(y_test, maxlen=max_len, padding="post")


In [3]:
# Define model parameters
input_length = max_len
n_words = len(word_vocab)
n_tags = len(tag_vocab)
output_dim = 100  # Word embedding size
lstm_units = 128  # LSTM units

# Build the model
input_layer = Input(shape=(input_length,))
embedding_layer = Embedding(input_dim=n_words, output_dim=output_dim)(input_layer)
lstm_layer = Bidirectional(LSTM(units=lstm_units, return_sequences=True))(embedding_layer)
dropout_layer = Dropout(0.5)(lstm_layer)
output_layer = TimeDistributed(Dense(n_tags, activation="softmax"))(dropout_layer)

model = Model(input_layer, output_layer)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Model summary
model.summary()


In [4]:
epochs = 5
batch_size = 8

history = model.fit(
    X_train, np.expand_dims(y_train, -1),
    validation_data=(X_test, np.expand_dims(y_test, -1)),
    epochs=epochs,
    batch_size=batch_size
)

Epoch 1/5
[1m1756/1756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 53ms/step - accuracy: 0.9744 - loss: 0.1079 - val_accuracy: 0.9919 - val_loss: 0.0268
Epoch 2/5
[1m1756/1756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 55ms/step - accuracy: 0.9969 - loss: 0.0109 - val_accuracy: 0.9937 - val_loss: 0.0222
Epoch 3/5
[1m1756/1756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 54ms/step - accuracy: 0.9990 - loss: 0.0038 - val_accuracy: 0.9935 - val_loss: 0.0250
Epoch 4/5
[1m1756/1756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 53ms/step - accuracy: 0.9995 - loss: 0.0021 - val_accuracy: 0.9941 - val_loss: 0.0237
Epoch 5/5
[1m1756/1756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 55ms/step - accuracy: 0.9997 - loss: 0.0012 - val_accuracy: 0.9934 - val_loss: 0.0280


In [5]:
loss, accuracy = model.evaluate(X_test, np.expand_dims(y_test, -1), batch_size=batch_size)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.9929 - loss: 0.0295
Test Loss: 0.027967985719442368
Test Accuracy: 0.9934041500091553


In [12]:
model.save(filepath="./model_pretrained.keras")