## Part2-NLP Language Model : *POS Tagging*

### 01. Importing Libraries

In [1]:
import pandas as pd
from conllu import parse
from itertools import chain
import numpy as np
from collections import Counter
import math

### 02. Extracting the data

In [2]:
with open('data/DATA_3/train.conllu', 'r', encoding='utf-8') as file:
    data = file.read()

parsed_data = parse(data)

sentences = []
forms = []
lemmas = []
upos = []

for sentence in parsed_data:
    sent = []
    form = []
    lemma = []
    upos_tag = []
    for token in sentence:
        sent.append(token['sentence'])
        form.append(token['form'])
        lemma.append(token['lemma'])
        upos_tag.append(token['upostag'])
    sentences.append(' '.join(sent))
    forms.append(form)
    lemmas.append(lemma)
    upos.append(upos_tag)


In [3]:
sentences[:5]

["Les commotions cérébrales sont devenu si courantes dans ce sport qu' on les considére presque comme la routine .",
 "L' œuvre est située dans la galerie des de les batailles , dans le château de Versailles .",
 "Le comportement de la Turquie vis-à-vis du de le problème palestinien a fait qu' elle n' est plus en odeur de sainteté auprès de la communauté juive en générale , et américaine en particulier .",
 'Toutefois , les filles adorent les desserts .',
 "Ismene entre et annonce que c' est Farnace qui a mis le feu à la flotte romaine ."]

In [4]:
POS_data = {
    'sentences':sentences,
    'forms':forms,
    'lemmas':lemmas,
    'UPOS': upos
}

In [5]:
POS_df = pd.DataFrame(POS_data)
POS_df

Unnamed: 0,sentences,forms,lemmas,UPOS
0,Les commotions cérébrales sont devenu si coura...,"[Les, commotions, cérébrales, sont, devenu, si...","[le, commotion, cérébral, être, devenir, si, c...","[DET, NFP, ADJFP, AUX, VPPMS, ADV, ADJFP, PREP..."
1,L' œuvre est située dans la galerie des de les...,"[L', œuvre, est, située, dans, la, galerie, de...","[le, œuvre, être, situer, dans, le, galerie, _...","[DET, NFS, AUX, VPPFS, PREP, DETFS, NFS, _, PR..."
2,Le comportement de la Turquie vis-à-vis du de ...,"[Le, comportement, de, la, Turquie, vis-à-vis,...","[le, comportement, de, le, Turquie, vis-à-vis,...","[DETMS, NMS, PREP, DETFS, PROPN, ADV, _, PREP,..."
3,"Toutefois , les filles adorent les desserts .","[Toutefois, ,, les, filles, adorent, les, dess...","[toutefois, ,, le, fille, adorer, le, dessert, .]","[ADV, PUNCT, DET, NFP, VERB, DET, NMP, YPFOR]"
4,Ismene entre et annonce que c' est Farnace qui...,"[Ismene, entre, et, annonce, que, c', est, Far...","[Ismene, entrer, et, annoncer, que, ce, être, ...","[PROPN, VERB, COCO, VERB, COSUB, PDEMMS, AUX, ..."
...,...,...,...,...
14444,"Le 28 mars 1792 , ces territoires formèrent de...","[Le, 28, mars, 1792, ,, ces, territoires, form...","[le, 28, mars, 1792, ,, ce, territoire, former...","[DETMS, CHIF, NOUN, CHIF, PUNCT, DET, NMP, VER..."
14445,Ce débutant de l' année 1983 et double All-Sta...,"[Ce, débutant, de, l', année, 1983, et, double...","[ce, débutant, de, le, année, 1983, et, double...","[PDEMMS, NMS, PREP, DET, NFS, CHIF, COCO, ADJ,..."
14446,La population est alors indigène et fait parti...,"[La, population, est, alors, indigène, et, fai...","[le, population, être, alors, indigène, et, fa...","[DETFS, NFS, AUX, ADV, ADJFS, COCO, VERB, NFS,..."
14447,"Mais MSI propose aussi , pour 699 euros , une ...","[Mais, MSI, propose, aussi, ,, pour, 699, euro...","[mais, MSI, proposer, aussi, ,, pour, 699, eur...","[COCO, PROPN, VERB, ADV, PUNCT, PREP, CHIF, NM..."


### 3.0 Create the Transition matrix

In [6]:
def get_transition_matrix(unique_pos, adjacent_combinations, alpha, print_transition_matrix=False):

    ''' 
    This function aims to calculate the transition matrix.
    Arguments
    ---------
        alpha: number used for smoothing
        tag_counts: a dictionary mapping each tag to its respective count
        transition_counts: transition count for the previous word and tag
        print_transition_matrix: a boolean if you want to print the transition matrix
    Returns
    -------
        transition_matrix : matrix of dimension (num_tags,num_tags)
    '''
    
    num_tags = len(unique_pos)
    transition_matrix = np.zeros((num_tags, num_tags))

    possible_combo = Counter(adjacent_combinations)

    for i, tag1 in enumerate(unique_pos):
        for j, tag2 in enumerate(unique_pos):
            
            count = possible_combo.get((tag1, tag2), 0)
            count_prev_tag = sum(1 for t1, t2 in adjacent_combinations if t1 == tag1)
            transition_matrix[i][j] = (count + alpha) / (count_prev_tag + alpha * num_tags)

    if print_transition_matrix:
        print(transition_matrix)

    return transition_matrix

In [7]:
unique_pos = sorted(set(chain.from_iterable(upos)))
adjacent_combinations = [(tag1, tag2) for tags in upos for tag1, tag2 in zip(tags, tags[1:])]
print(len(Counter(adjacent_combinations)))

1752


In [8]:
transition_matrix = get_transition_matrix(unique_pos, adjacent_combinations, alpha=0.01, print_transition_matrix=True)
transition_matrix

[[1.36996399e-02 1.06353853e-03 1.05300845e-05 ... 1.06353853e-03
  1.43219679e-01 3.47598088e-02]
 [7.52622947e-04 3.37032793e-02 1.87594078e-03 ... 3.74439277e-06
  1.27687538e-01 4.15665042e-02]
 [2.84112593e-04 1.41349549e-06 3.81657917e-02 ... 1.42763044e-04
  1.06720323e-01 5.07459016e-02]
 ...
 [1.77110331e-04 3.52467094e-04 5.27823858e-04 ... 1.43443586e-01
  1.16263288e-01 8.59423497e-03]
 [5.35905681e-04 5.35905681e-04 5.35905681e-04 ... 5.35905681e-04
  5.35905681e-04 5.35905681e-04]
 [1.03224102e-06 1.03224102e-06 1.03224102e-06 ... 1.03224102e-06
  1.03224102e-06 1.03224102e-06]]


array([[1.36996399e-02, 1.06353853e-03, 1.05300845e-05, ...,
        1.06353853e-03, 1.43219679e-01, 3.47598088e-02],
       [7.52622947e-04, 3.37032793e-02, 1.87594078e-03, ...,
        3.74439277e-06, 1.27687538e-01, 4.15665042e-02],
       [2.84112593e-04, 1.41349549e-06, 3.81657917e-02, ...,
        1.42763044e-04, 1.06720323e-01, 5.07459016e-02],
       ...,
       [1.77110331e-04, 3.52467094e-04, 5.27823858e-04, ...,
        1.43443586e-01, 1.16263288e-01, 8.59423497e-03],
       [5.35905681e-04, 5.35905681e-04, 5.35905681e-04, ...,
        5.35905681e-04, 5.35905681e-04, 5.35905681e-04],
       [1.03224102e-06, 1.03224102e-06, 1.03224102e-06, ...,
        1.03224102e-06, 1.03224102e-06, 1.03224102e-06]])

### 4.0 Create the emisson matrix

In [9]:
def get_emission_matrix(alpha, tag_counts, emission_counts, vocab, print_emission_matrix=False):

    '''
    This function aims to calculate the emission matrix.
    Arguments
    --------- 
        alpha: tuning parameter used in smoothing 
        tag_counts: a dictionary mapping each tag to its respective count
        emission_counts: a dictionary where the keys are (tag, word) and the values are the counts
        vocab: a dictionary where keys are words in vocabulary and value is an index
        print_emission_matrix: a boolean if you want to print the emission matrix
    Returns
    -------
        emission_matrix: a matrix of dimension (num_tags, len(vocab))
    '''
    
    num_tags = len(tag_counts)
    all_tags = sorted(tag_counts.keys())
    num_words = len(vocab)
    
    emission_matrix = np.zeros((num_tags, num_words))
    emis_keys = set(list(emission_counts.keys()))
    
    for i in range(num_tags):
        for word in vocab:  
            
            j = vocab[word]
            count = 0      
            key = (word, all_tags[i])

            if key in emis_keys:
                count = emission_counts[key]
                
            count_tag = tag_counts[all_tags[i]]
            emission_matrix[i,j] = (count + alpha) / (count_tag + alpha * num_words)

    if print_emission_matrix:
        print(emission_matrix)


    return emission_matrix

In [10]:
# Extract unique tags and their counts
tag_counts = Counter((chain.from_iterable(POS_data['UPOS'])))

In [11]:
# Extract unique words and create a vocabulary mapping
words = [token for sublist in POS_data['forms'] for token in sublist]
vocab = {word: idx for idx, word in enumerate(set(words))}

In [12]:
# Set the smoothing parameter alpha
alpha = 0.01 

In [13]:
# Make the UPOS list iterable
upos_column = chain.from_iterable(POS_data['UPOS'])
emission_counts = Counter()

# Iterate over each pair of (tag, lemma) in upos_column and corresponding lemma
for forms_list,tag  in zip(POS_data['lemmas'],upos_column):
    for forms,tag  in zip(forms_list,unique_pos):
        emission_counts[(forms,tag)] += 1

In [14]:
emission_matrix = get_emission_matrix(alpha, tag_counts, emission_counts, vocab)

In [15]:
emission_matrix

array([[7.27066505e-06, 7.27066505e-06, 7.27066505e-06, ...,
        7.27066505e-06, 7.27066505e-06, 7.27066505e-06],
       [3.23165470e-06, 3.23165470e-06, 3.23165470e-06, ...,
        3.23165470e-06, 3.26397125e-04, 3.23165470e-06],
       [1.33202085e-06, 1.33202085e-06, 1.33202085e-06, ...,
        1.33202085e-06, 1.33202085e-06, 1.33202085e-06],
       ...,
       [1.62988824e-06, 1.62988824e-06, 1.62988824e-06, ...,
        1.62988824e-06, 1.62988824e-06, 1.62988824e-06],
       [7.25510923e-07, 7.25510923e-07, 7.25510923e-07, ...,
        7.25510923e-07, 7.25510923e-07, 7.25510923e-07],
       [9.89081529e-07, 9.89081529e-07, 9.89081529e-07, ...,
        9.89081529e-07, 9.89081529e-07, 9.89081529e-07]])

In [17]:
np.savetxt('emission_matrix.txt', emission_matrix)

### 4.0 Viterbi algorithm Implementation

In [31]:
def viterbi(obs, start_prob, trans_prob, emit_prob):

    """
    Viterbi algorithm for finding the most likely sequence of hidden states
    that generated a sequence of observed events.
    
    Arguments
    ---------
    obs: list of observed events
    states: list of possible hidden states
    start_prob: initial probability distribution of states
    trans_prob: transition probabilities between states
    emit_prob: emission probabilities of observing events from states
    
    Returns
    -------
    path: most likely sequence of hidden states
    """
    
    T = len(obs)
    N = len(trans_prob)
    
    # Initialize matrices for probabilities and backpointers
    V = np.zeros((N, T))
    backpointers = np.zeros((N, T), dtype=int)
    
    # Initialize first column of V matrix
    for i in range(N):
        V[i, 0] = start_prob[i] * emit_prob[i][0]
        backpointers[i, 0] = 0
    
    # Recursively fill up the V matrix and backpointers
    for t in range(1, T):
        for j in range(N):
            prob = [V[i, t-1] * trans_prob[i][j] * emit_prob[j][t] for i in range(N)]
            V[j, t] = max(prob)
            backpointers[j, t] = np.argmax(prob)
    
    
    # Backtrack to find the most likely path
    path = [np.argmax(V[:, T-1])]
    for t in range(T-1, 0, -1):
        p = path[0]
        path.insert(0, backpointers[p, t])
        path = path
        
    print(path)
    return path

### 5.0 Create the Sentence Tagger

In [32]:
def tagger_sentence(sentence_as_list):
    
    T = len(transition_matrix)

    # extract the emition matrix for the sentence
    # extract the starting probability of each tag
    # use virbeti's algorithm to find the path of the taggs for each word (token) as indexes
    # locate the taggs and return them

    indexed_list_of_vocab = [word for word in vocab.keys()]

    sentence_emition_matrix = np.zeros((T,len(sentence_as_list))) # emition matrix for the given sentence

    obs = []
    for i, word in enumerate(sentence_as_list):
        column_index = indexed_list_of_vocab.index(word)
        for j in range(T):
            sentence_emition_matrix[j,i] = emission_matrix[j,column_index]
        obs.append(column_index)


    start_prob = np.zeros(T)

    # in order to get the starting probability we will calculate how frequent
    # each tag start, let's do it 

    for _, row in POS_df.iterrows():
        first_pos_tag = row['UPOS'][0]
        index = unique_pos.index(first_pos_tag)
        start_prob[index] += 1

    start_prob = start_prob/sum(start_prob) 

    
    path = viterbi(obs,start_prob,transition_matrix,sentence_emition_matrix)

    upos = []

    for i in path:
        upos.append(unique_pos[i])

    return upos

In [36]:
path = tagger_sentence(["Vous","suis","belle"])
path

[9, 33, 45]


['COSUB', 'PPER1S', 'PREFS']

***
**Made By :**
- *Houda Moudni* : houda.moudni@etu.uae.ac.ma
- *Chadi Mountassir* : chadi.mountassir@etu.uae.ac.ma
