In [6]:
import json
import numpy as np
import matplotlib.pyplot as plt
import time
from collections import Counter,defaultdict
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

In [7]:
train_set = json.load(open('./corpus/fr/fr.ftb.train.json', encoding = 'utf-8'))
test_set = json.load(open('./corpus/fr/fr.ftb.test.json', encoding = 'utf-8'))

In [8]:
def words_and_labels(data_set):
    
    words = []
    labels = []
    for sentence,label in data_set:
        for w,l in zip(sentence,label):
            words.append(w)
            labels.append(l)
    
    return words,labels

train_data,train_label = words_and_labels(train_set)

In [200]:
def replace_unknown(sequence):
    """Return a copy of the input sequence where each unknown word is replaced
    by the literal string value 'nan'. Pomegranate will ignore these values
    during computation.
    """
    return [w if w in train_data else 'nan' for w in sequence]

def simplify_decoding(X, model):
    
    """X should be a 1-D sequence of observations for the model to predict"""
    
    w_list = replace_unknown(X)
    
    a = []
    
    for i in range(len(w_list)):
        if w_list[i] == 'nan':
            a.append(i)
            _, state_path_oov = model.viterbi(w_list)
            
    print(a)      
            
    _, state_path = model.viterbi(w_list)
   
    
    state = [state[1].name for state in state_path[1:-1]]
    m = state
    state_oov = [state_oov[1].name for state_oov in state_path_oov[i+1] for i in a]
    print(m)
    return state

In [143]:
def accuracy(X, Y, model):
    """Calculate the prediction accuracy by using the model to decode each sequence
    in the input X and comparing the prediction with the true labels in Y.
    
    The X should be an array whose first dimension is the number of sentences to test,
    and each element of the array should be an iterable of the words in the sequence.
    The arrays X and Y should have the exact same shape.
    
    X = [("See", "Spot", "run"), ("Run", "Spot", "run", "fast"), ...]
    Y = [(), (), ...]
    """
    
    correct = total_predictions = 0
    correct_oov = total_predictions_oov = 0
    for observations, actual_tags in zip(X, Y):
        # The model.viterbi call in simplify_decoding will return None if the HMM
        # raises an error (for example, if a test sentence contains a word that
        # is out of vocabulary for the training set). Any exception counts the
        # full sentence as an error (which makes this a conservative estimate).
        try:
            
            most_likely_tags = simplify_decoding(observations, model)
            #correct_oov += sum(p == t for p, t in zip(most_likely_tags_oov, actual_tags))
            #total_predictions_oov += len(most_likely_tags_oov)
            correct += sum(p == t for p, t in zip(most_likely_tags, actual_tags))
        except:
            pass
        total_predictions += len(observations)
        
    return correct / total_predictions, correct_oov

In [25]:
def pair_counts(data_set):
    """Return a dictionary keyed to each unique value in the first sequence list
    that counts the number of occurrences of the corresponding value from the
    second sequences list.
    
    For example, if sequences_A is tags and sequences_B is the corresponding
    words, then if 1244 sequences contain the word "time" tagged as a NOUN, then
    you should return a dictionary such that pair_counts[NOUN][time] == 1244
    """
    
    dict_label_word = defaultdict(list)
    dict_word = defaultdict(list)

    for sentence,labels in data_set:
        for word,label in zip(sentence,labels):
            dict_label_word[label].append(word)
    
    for k in dict_label_word.keys():
        dict_word[k] = Counter(dict_label_word[k])
        
    return dict_word

In [26]:
def starting_counts(data_set):
    """Return a dictionary keyed to each unique value in the input sequences list
    that counts the number of occurrences where that value is at the beginning of
    a sequence.
    
    For example, if 8093 sequences start with NOUN, then you should return a
    dictionary such that your_starting_counts[NOUN] == 8093
    """
    
    dict_starting = defaultdict(list)
    
    lab = set()
    for _,labels in data_set:
        for label in labels:
            lab.add(label)
            
    for label in lab:
        dict_starting[label] = len([labels[0] for _,labels in data_set if labels[0]==label])
    return dict_starting

In [27]:
def ending_counts(data_set):
    """Return a dictionary keyed to each unique value in the input sequences list
    that counts the number of occurrences where that value is at the end of
    a sequence.
    
    For example, if 18 sequences end with DET, then you should return a
    dictionary such that your_starting_counts[DET] == 18
    """
    
    dict_ending = defaultdict(list)
    
    lab = set()
    for _,labels in data_set:
        for label in labels:
            lab.add(label)
            
    for label in lab:
        dict_ending[label] = len([labels[-1] for _,labels in data_set if labels[-1]==label])
    return dict_ending

In [14]:
def unigram_counts(data_set):
    """Return a dictionary keyed to each unique value in the input sequence list that
    counts the number of occurrences of the value in the sequences list. The sequences
    collection should be a 2-dimensional array.
    
    For example, if the tag NOUN appears 275558 times over all the input sequences,
    then you should return a dictionary such that your_unigram_counts[NOUN] == 275558.
    """
    
    dict_tag = defaultdict(int)
    
    for _,labels in data_set:
        for label in labels:
            dict_tag[label] += 1           
    return dict_tag

In [15]:
def bigram_counts(data_set):
    """Return a dictionary keyed to each unique PAIR of values in the input sequences
    list that counts the number of occurrences of pair in the sequences list. The input
    should be a 2-dimensional array.
    
    For example, if the pair of tags (NOUN, VERB) appear 61582 times, then you should
    return a dictionary such that your_bigram_counts[(NOUN, VERB)] == 61582
    """

    dict_bigram = defaultdict(int)
    for _,labels in data_set:
        for i in range(1,len(labels)):
            dict_bigram[(labels[i-1],labels[i])] += 1
            
    return dict_bigram

In [59]:
hmm = HiddenMarkovModel(name="base-hmm-tagger")

In [60]:
label_starts = starting_counts(train_set)
label_ends = ending_counts(train_set)
label_unigrams = unigram_counts(train_set)
label_bigrams = bigram_counts(train_set)

In [61]:
count_label_and_word = pair_counts(train_set)
states = {}
for label, word_dict in count_label_and_word.items(): #data.training_set.tagset
    p_words_given_label_state = defaultdict(float)
    # for each tag/word, calculate P(word|tag)
    for word in word_dict.keys(): # data.training_set.vocab
        p_words_given_label_state[word] = count_label_and_word[label][word] / label_unigrams[label] 
    # create a new state for each tag from the dict of words that represents P(word|tag)
    emission = DiscreteDistribution(dict(p_words_given_label_state))
    states[label] = State(emission, name=label)
    
hmm.add_states(list(states.values()))

In [62]:
label_list = set()
for _,labels in train_set:
    for label in labels:
        label_list.add(label)

In [63]:
for label in label_list:        
    state = states[label]
    hmm.add_transition(hmm.start, state, label_starts[label]/len(label_list))

# Adding end states
for label in label_list: 
    state = states[label]
    hmm.add_transition(state, hmm.end, label_ends[label]/label_unigrams[label])

# Adding pairs
for label1 in label_list: 
    state1 = states[label1]
    for label2 in label_list: 
        state2 = states[label2]
        hmm.add_transition(state1, state2, label_bigrams[(label1, label2)]/label_unigrams[label1])

In [128]:
hmm.bake()

In [161]:
def sentences_and_labels(data_set):
    
    sentences = []
    labels = []
    for sentence,label in data_set:
        sentences.append(sentence)
        labels.append(label)
    
    return sentences,labels

test_data,test_label = sentences_and_labels(test_set)

In [201]:
hmm_training_acc = accuracy(test_data[:10],test_label[:10],hmm)

[]
[]
[]
[18]
[]
[]
[12]
[1, 3]
[]
[]


In [202]:
print(hmm_training_acc)

(0.0, 0)


In [159]:
index = []
print(index)

[]
