# Imports and Download/Extract Function

In [2]:
import os
import urllib.request
import tarfile
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout # type: ignore
from tensorflow.keras.models import Model # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
from tensorflow.keras.callbacks import EarlyStopping # type: ignore

In [3]:
def download_extract(url, extract_path):
    os.makedirs(extract_path, exist_ok=True)
    filename = os.path.join(extract_path, 'dataset.tar.gz')
    if not os.path.exists(filename):
        print("Téléchargement du dataset...")
        urllib.request.urlretrieve(url, filename)
        print("Extraction du dataset...")
        with tarfile.open(filename, "r:gz") as tar:
            tar.extractall(path=extract_path)
        print("Dataset prêt !")
    else:
        print("Dataset déjà existant.")

conll_tar_url = 'http://lnsigo.mipt.ru/export/datasets/conll2003.tar.gz'
download_path = 'conll2003'
download_extract(conll_tar_url, download_path)

Dataset déjà existant.


# Data Preprocessing :

In [4]:
data_types = ['train', 'test', 'valid']
dataset_dict = {}

for data_type in data_types:
    filepath = os.path.join(download_path, f'{data_type}.txt')
    with open(filepath, 'r', encoding='utf-8') as f:
        dataset_dict[data_type] = []
        sentences, tags = [], []
        for line in f:
            line = line.strip()
            if line and not line.startswith('-DOCSTART-'):
                parts = line.split()
                if len(parts) == 2:
                    token, tag = parts
                    sentences.append(token)
                    tags.append(tag)
            elif sentences:
                dataset_dict[data_type].append((sentences, tags))
                sentences, tags = [], []

for key in dataset_dict:
    print(f'Nombre de phrases dans {key}: {len(dataset_dict[key])}')

Nombre de phrases dans train: 14041
Nombre de phrases dans test: 3453
Nombre de phrases dans valid: 3250


# Vocabulary Creation and Encoding/Padding Function :

In [5]:
word_vocab = {"<PAD>": 0, "<UNK>": 1}
tag_vocab = {"<PAD>": 0}

for data_type in ['train']:
    for tokens, tags in dataset_dict[data_type]:
        for token in tokens:
            if token not in word_vocab:
                word_vocab[token] = len(word_vocab)
        for tag in tags:
            if tag not in tag_vocab:
                tag_vocab[tag] = len(tag_vocab)

def encode_and_pad(data, vocab, tag_vocab, max_len):
    X = [[vocab.get(token, vocab["<UNK>"]) for token in tokens] for tokens, _ in data]
    y = [[tag_vocab[tag] for tag in tags] for _, tags in data]
    X = pad_sequences(X, maxlen=max_len, padding="post")
    y = pad_sequences(y, maxlen=max_len, padding="post")
    return X, y

max_len = max(len(tokens) for data_type in dataset_dict for tokens, _ in dataset_dict[data_type])

X_train, y_train = encode_and_pad(dataset_dict["train"], word_vocab, tag_vocab, max_len)
X_test, y_test = encode_and_pad(dataset_dict["test"], word_vocab, tag_vocab, max_len)
X_val, y_val = encode_and_pad(dataset_dict["valid"], word_vocab, tag_vocab, max_len)

# Model Definition :

In [6]:
input_length = max_len
n_words = len(word_vocab)
n_tags = len(tag_vocab)
output_dim = 100
lstm_units = 128

input_layer = Input(shape=(input_length,))
embedding_layer = Embedding(input_dim=n_words, output_dim=output_dim)(input_layer)
lstm_layer = Bidirectional(LSTM(units=lstm_units, return_sequences=True))(embedding_layer)
dropout_layer = Dropout(0.5)(lstm_layer)
output_layer = TimeDistributed(Dense(n_tags, activation="softmax"))(dropout_layer)

model = Model(input_layer, output_layer)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

model.summary()

# Model Training and Evaluation :

In [7]:
epochs = 6
batch_size = 2

callback = EarlyStopping(monitor='val_loss', patience=2)

history = model.fit(
    X_train, np.expand_dims(y_train, -1),
    validation_data=(X_val, np.expand_dims(y_val, -1)),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[callback]
)

loss, accuracy = model.evaluate(X_test, np.expand_dims(y_test, -1), batch_size=batch_size)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

model.save("saves/model_pretrained.keras")

Epoch 1/6
[1m7021/7021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m281s[0m 39ms/step - accuracy: 0.9834 - loss: 0.0671 - val_accuracy: 0.9946 - val_loss: 0.0186
Epoch 2/6
[1m7021/7021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 38ms/step - accuracy: 0.9983 - loss: 0.0060 - val_accuracy: 0.9958 - val_loss: 0.0151
Epoch 3/6
[1m7021/7021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 38ms/step - accuracy: 0.9993 - loss: 0.0025 - val_accuracy: 0.9960 - val_loss: 0.0157
Epoch 4/6
[1m7021/7021[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 38ms/step - accuracy: 0.9996 - loss: 0.0014 - val_accuracy: 0.9958 - val_loss: 0.0169
[1m1727/1727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 6ms/step - accuracy: 0.9938 - loss: 0.0250
Test Loss: 0.02411620318889618
Test Accuracy: 0.9941114187240601


# Saving tags and vocabs :

In [8]:
with open("saves/word_vocab.json", "w", encoding="utf-8") as f:
    json.dump(word_vocab, f, ensure_ascii=False, indent=4)

with open("saves/tag_vocab.json", "w", encoding="utf-8") as f:
    json.dump(tag_vocab, f, ensure_ascii=False, indent=4)

# Example usage :

In [9]:
model = tf.keras.models.load_model("saves/model_pretrained.keras")

with open("saves/word_vocab.json", "r", encoding="utf-8") as f:
    word_vocab = json.load(f)

with open("saves/tag_vocab.json", "r", encoding="utf-8") as f:
    tag_vocab = json.load(f)

reverse_tag_vocab = {v: k for k, v in tag_vocab.items()}

def predict_entities(sentence):
    words = sentence.split()
    encoded_sentence = [word_vocab.get(word, word_vocab["<UNK>"]) for word in words]
    padded_sentence = pad_sequences([encoded_sentence], maxlen=max_len, padding="post")  
    predictions = model.predict(padded_sentence)

    predicted_tags = []
    for prediction in predictions[0]:
        tag_index = np.argmax(prediction)
        predicted_tags.append(reverse_tag_vocab.get(tag_index))

    aligned_tags = predicted_tags[:len(words)]

    return list(zip(words, aligned_tags))

sentence = "James Bond works at Google INC in New York."
entities = predict_entities(sentence)
print(entities)

sentence2 = "Donald Trump was the president of the United States."
entities2 = predict_entities(sentence2)
print(entities2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 896ms/step
[('James', 'B-PER'), ('Bond', 'I-PER'), ('works', 'O'), ('at', 'O'), ('Google', 'B-LOC'), ('INC', 'I-LOC'), ('in', 'O'), ('New', 'B-LOC'), ('York.', 'I-LOC')]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[('Donald', 'B-PER'), ('Trump', 'I-PER'), ('was', 'O'), ('the', 'O'), ('president', 'O'), ('of', 'O'), ('the', 'O'), ('United', 'B-ORG'), ('States.', 'I-ORG')]
