In [18]:
import numpy as np
import math
from collections import defaultdict

# ========== Lecture du fichier conllu ==========
def read_conllu_file(filepath):
    """Read a CoNLL-U file into a list of sentences of (form, UPOS) pairs.

    A sentence is closed whenever a blank line or a comment line (starting
    with ``#``) is met, and once more at end of file. Lines that do not split
    into exactly 10 tab-separated fields are ignored.

    Multiword-token range lines (ID such as ``3-4``) and empty nodes (ID such
    as ``5.1``) are skipped: they carry ``_`` instead of a real UPOS tag, and
    the syntactic words of a multiword token are listed on their own lines
    right after the range line, so keeping the range line would count the
    contraction twice and add a spurious ``_`` tag to the model.

    Args:
        filepath: Path of the UTF-8 encoded CoNLL-U file.

    Returns:
        A list of sentences; each sentence is a non-empty list of
        ``(word, pos_tag)`` tuples taken from columns 2 (FORM) and 4 (UPOS).
    """
    sentences = []
    sentence = []
    with open(filepath, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()

            # Blank lines and comment lines both act as sentence separators.
            if line == "" or line.startswith("#"):
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            parts = line.split('\t')
            if len(parts) != 10:
                continue

            token_id = parts[0]
            # "3-4" is a multiword-token range, "5.1" is an empty node:
            # neither is a taggable syntactic word.
            if '-' in token_id or '.' in token_id:
                continue

            sentence.append((parts[1], parts[3]))

    # The file may end without a trailing blank line.
    if sentence:
        sentences.append(sentence)
    return sentences

# ========== Comptage des transitions et émissions ==========
def create_dictionaries_fr(sentences):
    """Count emissions, tag-to-tag transitions and tag occurrences.

    Every sentence is treated as if it were preceded by the start marker
    ``'--s--'``, which is counted once per sentence. Words are lower-cased
    before being counted as emissions.

    Args:
        sentences: Iterable of sentences, each a sequence of
            ``(word, tag)`` pairs.

    Returns:
        A tuple ``(emission_counts, transition_counts, tag_counts)`` of
        ``defaultdict(int)`` keyed respectively by ``(tag, lowercased word)``,
        ``(previous tag, tag)`` and ``tag``.
    """
    start_marker = '--s--'
    emission_counts = defaultdict(int)
    transition_counts = defaultdict(int)
    tag_counts = defaultdict(int)

    for tagged_sentence in sentences:
        # One start marker per sentence, even if the sentence is empty.
        tag_counts[start_marker] += 1
        previous = start_marker
        for token, label in tagged_sentence:
            tag_counts[label] += 1
            emission_counts[(label, token.lower())] += 1
            transition_counts[(previous, label)] += 1
            previous = label

    return emission_counts, transition_counts, tag_counts

# ========== Construction du vocabulaire avec <UNK> pour mots rares ==========
def build_vocab(emission_counts, min_freq=2):
    """Build the word-to-column vocabulary, reserving a slot for rare words.

    A word's frequency is the sum of its emission counts over all tags. Words
    whose frequency reaches ``min_freq`` are kept and numbered in sorted
    order; the special entry ``'<UNK>'`` is added last.

    Args:
        emission_counts: Mapping ``(tag, word) -> count``.
        min_freq: Minimum total frequency for a word to get its own index.

    Returns:
        A dict mapping each kept word, and ``'<UNK>'``, to an integer index.
    """
    totals = {}
    for (_, token), occurrences in emission_counts.items():
        totals[token] = totals.get(token, 0) + occurrences

    kept = sorted(token for token, freq in totals.items() if freq >= min_freq)

    vocab = dict(zip(kept, range(len(kept))))
    vocab['<UNK>'] = len(vocab)
    return vocab

# ========== Matrice de transition ==========
def create_transition_matrix_fr(alpha, tag_counts, transition_counts):
    """Build the additively smoothed tag-transition matrix.

    Tags are indexed in sorted order. Each cell is
    ``(count(prev, curr) + alpha) / (count(prev) + alpha * num_tags)``.

    Args:
        alpha: Additive smoothing constant.
        tag_counts: Mapping ``tag -> count``; its keys define the tag set.
        transition_counts: Mapping ``(prev_tag, tag) -> count``; missing
            pairs count as 0.

    Returns:
        A tuple ``(A, tag_to_index)`` where ``A`` is a
        ``(num_tags, num_tags)`` array with rows indexed by the previous tag
        and columns by the current tag, and ``tag_to_index`` maps each tag to
        its row/column.
    """
    ordered_tags = sorted(tag_counts.keys())
    tag_to_index = {name: position for position, name in enumerate(ordered_tags)}
    size = len(ordered_tags)

    A = np.zeros((size, size))

    for row, source in enumerate(ordered_tags):
        # The denominator only depends on the previous tag.
        denominator = tag_counts[source] + alpha * size
        for col, target in enumerate(ordered_tags):
            numerator = transition_counts.get((source, target), 0) + alpha
            A[row, col] = numerator / denominator

    return A, tag_to_index

# ========== Matrice d’émission avec <UNK> ==========
def create_emission_matrix_fr(alpha, tag_counts, emission_counts, vocab):
    """Build the additively smoothed emission matrix with an ``<UNK>`` column.

    Tags are indexed in sorted order. Counts of words missing from ``vocab``
    are accumulated in the ``'<UNK>'`` column. Each row is then normalised as
    ``(count + alpha) / (row_total + alpha * num_words)``.

    Args:
        alpha: Additive smoothing constant.
        tag_counts: Mapping ``tag -> count``; its keys define the tag set.
        emission_counts: Mapping ``(tag, word) -> count``.
        vocab: Mapping ``word -> column index``; must contain ``'<UNK>'``
            whenever ``emission_counts`` holds a word outside the vocabulary.

    Returns:
        A tuple ``(B, tag_to_index, word_to_index)`` where ``B`` is a
        ``(num_tags, num_words)`` array and ``word_to_index`` is the ``vocab``
        object that was passed in.
    """
    ordered_tags = sorted(tag_counts.keys())
    tag_to_index = {name: position for position, name in enumerate(ordered_tags)}
    word_to_index = vocab
    n_rows, n_cols = len(ordered_tags), len(vocab)

    B = np.zeros((n_rows, n_cols))

    # Raw counts, with out-of-vocabulary words folded into <UNK>.
    for (tag, token), freq in emission_counts.items():
        column_key = token if token in word_to_index else '<UNK>'
        B[tag_to_index[tag], word_to_index[column_key]] += freq

    # Row-wise normalisation with smoothing; the row total is taken before
    # any cell of that row is overwritten.
    for row in range(n_rows):
        denominator = sum(B[row, :]) + alpha * n_cols
        for col in word_to_index.values():
            B[row, col] = (B[row, col] + alpha) / denominator

    return B, tag_to_index, word_to_index

# ========== Viterbi Forward ==========
def viterbi_forward_fr(A, B, test_sentence, tag_to_index, word_to_index):
    """Run the forward (scoring) pass of the Viterbi algorithm in log space.

    Words are lower-cased and looked up in ``word_to_index``; unknown words
    fall back to the ``'<UNK>'`` column. A zero transition or emission
    probability is treated as log-probability ``-inf`` instead of raising.

    Args:
        A: Transition matrix indexed ``[previous tag, current tag]``.
        B: Emission matrix indexed ``[tag, word]``.
        test_sentence: Sequence of word strings; may be empty.
        tag_to_index: Mapping ``tag -> index``; must contain ``'--s--'``.
        word_to_index: Mapping ``word -> column``; must contain ``'<UNK>'``.

    Returns:
        A tuple ``(best_probs, best_paths)`` of ``(num_tags, m)`` arrays,
        ``m`` being the sentence length. ``best_probs[j, t]`` is the best
        log-probability of a tag sequence ending in tag ``j`` at position
        ``t``; ``best_paths[j, t]`` is the index of the tag preceding ``j``
        on that sequence (column 0 is left at 0). For an empty sentence both
        arrays have zero columns.
    """
    def _safe_log(p):
        # math.log(0) raises ValueError; an impossible event is -inf in log
        # space. Negative inputs still raise, as they are not probabilities.
        return -math.inf if p == 0 else math.log(p)

    num_tags = len(tag_to_index)
    m = len(test_sentence)
    best_probs = np.full((num_tags, m), -np.inf)
    best_paths = np.zeros((num_tags, m), dtype=int)

    # Nothing to score: avoid indexing test_sentence[0].
    if m == 0:
        return best_probs, best_paths

    s_idx = tag_to_index['--s--']
    unk_idx = word_to_index['<UNK>']

    # Initialisation: transition from the start marker, then emit word 0.
    word_index = word_to_index.get(test_sentence[0].lower(), unk_idx)
    for i in range(num_tags):
        best_probs[i, 0] = _safe_log(A[s_idx, i]) + _safe_log(B[i, word_index])

    # Transition log-probabilities do not depend on the position: take each
    # log once per call instead of once per (position, tag, tag) triple.
    log_A = [[_safe_log(A[k, j]) for j in range(num_tags)]
             for k in range(num_tags)]

    for t in range(1, m):
        word_index = word_to_index.get(test_sentence[t].lower(), unk_idx)

        for j in range(num_tags):
            # The emission term is the same for every predecessor k.
            log_emit = _safe_log(B[j, word_index])
            max_prob = -np.inf
            best_k = 0
            for k in range(num_tags):
                prob = best_probs[k, t-1] + log_A[k][j] + log_emit
                # Strict comparison keeps the lowest-index predecessor on ties.
                if prob > max_prob:
                    max_prob = prob
                    best_k = k
            best_probs[j, t] = max_prob
            best_paths[j, t] = best_k

    return best_probs, best_paths

# ========== Viterbi Backward ==========
def viterbi_backward_fr(best_probs, best_paths, tag_to_index, states):
    """Run the backward (decoding) pass of the Viterbi algorithm.

    Starts from the highest-scoring tag in the last column of ``best_probs``
    and follows the back-pointers in ``best_paths`` down to position 0.

    Args:
        best_probs: ``(num_tags, m)`` array of best log-probabilities.
        best_paths: ``(num_tags, m)`` array of back-pointers (tag indices).
        tag_to_index: Unused; kept so existing callers keep working.
        states: Sequence giving the tag name for each tag index.

    Returns:
        The list of ``m`` predicted tag names; ``[]`` when ``m`` is 0.
    """
    m = best_probs.shape[1]
    # A zero-length sentence has no last column to take the argmax of.
    if m == 0:
        return []

    z = [0] * m
    # np.argmax returns the first maximum, i.e. the lowest tag index on ties.
    z[m - 1] = int(np.argmax(best_probs[:, m - 1]))

    # best_paths[j, t] is the tag index at t-1 on the best path ending in j at t.
    for t in range(m - 1, 0, -1):
        z[t - 1] = int(best_paths[z[t], t])

    return [states[index] for index in z]


In [19]:

import random

def evaluate_model_accuracy(test_sentences, A, B, tag_to_index, word_to_index):
    """Compute token-level tagging accuracy over a set of tagged sentences.

    Each sentence is decoded with ``viterbi_forward_fr`` followed by
    ``viterbi_backward_fr`` and the predicted tags are compared with the
    gold tags position by position. Empty sentences are skipped.

    Args:
        test_sentences: Iterable of sentences, each a sequence of
            ``(word, true_tag)`` pairs.
        A: Transition matrix passed through to the forward pass.
        B: Emission matrix passed through to the forward pass.
        tag_to_index: Mapping ``tag -> index`` used to build both matrices.
        word_to_index: Mapping ``word -> column`` of ``B``.

    Returns:
        ``correct / total`` over all compared tokens, or ``0`` when no token
        was compared.
    """
    # Order tag names by their actual index rather than alphabetically, so
    # decoding stays right however tag_to_index was numbered.
    states = sorted(tag_to_index, key=tag_to_index.get)
    correct = 0
    total = 0

    for sentence in test_sentences:
        # An empty sentence has nothing to score and cannot be decoded.
        if not sentence:
            continue

        words = [word for word, _ in sentence]
        true_tags = [true_tag for _, true_tag in sentence]

        best_probs, best_paths = viterbi_forward_fr(A, B, words, tag_to_index, word_to_index)
        pred_tags = viterbi_backward_fr(best_probs, best_paths, tag_to_index, states)

        for pred_tag, true_tag in zip(pred_tags, true_tags):
            if pred_tag == true_tag:
                correct += 1
            total += 1

    accuracy = correct / total if total > 0 else 0
    return accuracy


def test_random_sentence(test_sentences, A, B, tag_to_index, word_to_index, seed=None):
    """Tag one randomly chosen sentence and print a comparison with its gold tags.

    Prints the sentence, its words, the gold tags, the predicted tags and the
    accuracy on that sentence. Nothing is returned.

    Args:
        test_sentences: Non-empty sequence of sentences, each a sequence of
            ``(word, tag)`` pairs.
        A: Transition matrix passed through to the forward pass.
        B: Emission matrix passed through to the forward pass.
        tag_to_index: Mapping ``tag -> index`` used to build both matrices.
        word_to_index: Mapping ``word -> column`` of ``B``.
        seed: Optional seed. When given, the sentence is drawn from a private
            ``random.Random(seed)`` so the same sentence is picked on every
            run; when ``None`` the module-level generator is used, as before.
    """
    # Order tag names by their actual index rather than alphabetically, so
    # decoding stays right however tag_to_index was numbered.
    states = sorted(tag_to_index, key=tag_to_index.get)

    chooser = random.Random(seed) if seed is not None else random
    sentence = chooser.choice(test_sentences)
    words = [word for word, _ in sentence]
    true_tags = [tag for _, tag in sentence]

    print("=== Phrase testée ===")
    print(" ".join(words))

    best_probs, best_paths = viterbi_forward_fr(A, B, words, tag_to_index, word_to_index)
    pred_tags = viterbi_backward_fr(best_probs, best_paths, tag_to_index, states)

    print("\nMots :")
    print(words)
    print("\nÉtiquettes réelles :")
    print(true_tags)
    print("\nÉtiquettes prédites :")
    print(pred_tags)

    correct = sum(p == t for p, t in zip(pred_tags, true_tags))
    total = len(true_tags)
    accuracy = correct / total if total > 0 else 0
    print(f"\nAccuracy sur cette phrase : {accuracy:.2%}")


def test_custom_sentence(custom_sentence, A, B, tag_to_index, word_to_index):
    states = sorted(tag_to_index.keys())
    words = custom_sentence.split()

    print("=== Phrase testée ===")
    print(" ".join(words))

    best_probs, best_paths = viterbi_forward_fr(A, B, words, tag_to_index, word_to_index)
    pred_tags = viterbi_backward_fr(best_probs, best_paths, tag_to_index, states)

    print("\nMots :")
    print(words)
    print("\nÉtiquettes prédites :")
    print(pred_tags)


In [None]:
# ========= Load the training data =========
train_sentences = read_conllu_file("/content/train.conllu")

# Count emissions, transitions and tag occurrences on the training sentences
emission_counts, transition_counts, tag_counts = create_dictionaries_fr(train_sentences)

# Build the vocabulary: words seen fewer than min_freq times share the <UNK> entry
vocab = build_vocab(emission_counts, min_freq=2)

# Additive smoothing constant used for both matrices
alpha = 0.001

# Transition and emission matrices.
# tag_to_index is assigned by both calls; each builds it from the sorted keys
# of tag_counts, so the second assignment yields the same mapping.
A, tag_to_index = create_transition_matrix_fr(alpha, tag_counts, transition_counts)
B, tag_to_index, word_to_index = create_emission_matrix_fr(alpha, tag_counts, emission_counts, vocab)

# ========= Overall evaluation on the test file =========
test_sentences = read_conllu_file("/content/test.conllu")
accuracy = evaluate_model_accuracy(test_sentences, A, B, tag_to_index, word_to_index)
print(f"\nAccuracy du modèle sur l'ensemble du fichier test : {accuracy:.2%}\n")

# ========= Tag one random sentence =========
# NOTE(review): the sentence is drawn from dev.conllu, not from the test file
# evaluated above.
dev_sentences = read_conllu_file("/content/dev.conllu")
test_random_sentence(dev_sentences, A, B, tag_to_index, word_to_index)
