In [1]:
from __future__ import print_function
import nltk
import math
import numpy as np
import sys
from nltk.corpus import brown
from nltk.corpus import nps_chat
from nltk.corpus import conll2000

In [2]:
from sklearn.cross_validation import ShuffleSplit

In [3]:
def get_test_sents_tags(test_corpus):
    test_sents = []
    test_tags = []
    for sent in test_corpus:
        test_sents.append([word for (word, tag) in sent])
        test_tags.append([tag for (word, tag) in sent])
    return test_sents, test_tags

In [4]:
ss = ShuffleSplit(len(brown.tagged_sents()), n_iter=1, test_size=0.1, random_state=0)
for train_index, test_index in ss:
    train_brown_word_tags = np.array(brown.tagged_sents())[train_index]
    test_brown_word_tags = np.array(brown.tagged_sents())[test_index]
    brown_test_sents, brown_test_tags = get_test_sents_tags(test_brown_word_tags)

In [5]:
def get_word_tags(tagged_sents):
    word_tags = []
    for sent in tagged_sents:
        word_tags.append(("START", "START"))
        word_tags.extend([(tag[:2], word) for (word, tag) in sent])
        word_tags.append(("END", "END"))
    return word_tags
indexes = [int(x) for x in np.linspace(0, len(train_brown_word_tags), 6)]
train_brown1 = get_word_tags(train_brown_word_tags[:indexes[1]])
train_brown2 = get_word_tags(train_brown_word_tags[:indexes[2]])
train_brown3 = get_word_tags(train_brown_word_tags[:indexes[3]])
train_brown4 = get_word_tags(train_brown_word_tags[:indexes[4]])
train_brown5 = get_word_tags(train_brown_word_tags[:indexes[5]])

In [6]:
print("Size of first chunk: ", len(train_brown1))
print("Size of second chunk: ", len(train_brown2))
print("Size of third chunk: ", len(train_brown3))
print("Size of fourth chunk: ", len(train_brown4))
print("Size of fifth chunk: ", len(train_brown5))

Size of first chunk:  227837
Size of second chunk:  458765
Size of third chunk:  688937
Size of fourth chunk:  917852
Size of fifth chunk:  1150437


In [7]:
def train_hmms(tags, cpd_tags, cpd_tagwords, cfd_tagwords, word_corpus, sentence):
    distinct_tags = tags
    #word_corpus = set(brown.words())
    sentlen = len(sentence)

    viterbi = []
    backpointer = [ ]

    # Inicializacion de las variables de viterbi_1 v_s(1)
    first_viterbi = { }
    first_backpointer = { }
    for tag in distinct_tags:
        # don't record anything for the START tag
        if tag == "START": continue
        first_viterbi[ tag ] = cpd_tags["START"].prob(tag) * cpd_tagwords[tag].prob( sentence[0] )
        first_backpointer[ tag ] = "START"
    viterbi.append(first_viterbi)
    backpointer.append(first_backpointer)

    currbest = max(first_viterbi.keys(), key = lambda tag: first_viterbi[ tag ])

    for wordindex in range(1, len(sentence)):
        this_viterbi = { }
        this_backpointer = { }
        prev_viterbi = viterbi[-1]

        for tag in distinct_tags:
            if tag == "START": continue
            if sentence[wordindex] in word_corpus:
                p_ws = cpd_tagwords[tag].prob(sentence[wordindex])
            else:
                p_ws = 1./cfd_tagwords[tag].N()
            best_previous = max(prev_viterbi.keys(),
                                key = lambda prevtag: \
                                prev_viterbi[ prevtag ] * cpd_tags[prevtag].prob(tag) * 
                                p_ws)

            this_viterbi[ tag ] = prev_viterbi[ best_previous] * \
                cpd_tags[ best_previous ].prob(tag) * p_ws
            this_backpointer[ tag ] = best_previous

        currbest = max(this_viterbi.keys(), key = lambda tag: this_viterbi[ tag ])
        #print( "Word", "'" + sentence[ wordindex] + "'", "current best two-tag sequence:", 
        #      this_backpointer[ currbest], currbest)
        viterbi.append(this_viterbi)
        backpointer.append(this_backpointer)

    prev_viterbi = viterbi[-1]
    best_previous = max(prev_viterbi.keys(), key = lambda prevtag:\
                        prev_viterbi[ prevtag ] * cpd_tags[prevtag].prob("END"))

    prob_tagsequence = prev_viterbi[best_previous] * cpd_tags[ best_previous].prob("END")

    best_tagsequence = ["END", best_previous]
    backpointer.reverse()

    current_best_tag = best_previous
    for bp in backpointer:
        best_tagsequence.append(bp[current_best_tag])
        current_best_tag = bp[current_best_tag]

    best_tagsequence.reverse()
    #print( "The sentence was:", end = " ")
    #for w in sentence: print( w, end = " ")
    #print("\n")
    #print( "The best tag sequence is:", end = " ")
    #for t in best_tagsequence: print (t, end = " ")
    #print("\n")
    #print( "The probability of the best tag sequence is:", prob_tagsequence)
    return prob_tagsequence, best_tagsequence

In [8]:
def evaluateAccuracy(gs, calc, word_sec, word_dict):
    _gs = gs[:]
    _calc = calc[1:-1]
    nc=kc=nt=kt=0.
    for k in range(len(_gs)):
        if _gs[k][:2]==_calc[k][:2]:
            if word_sec[k] in word_dict:
                kc+=1.
            else:
                nc+=1.
        if word_sec[k] in word_dict:
            kt+=1.
        else:
            nt+=1.
    return (nc+kc)/(nt+kt),kc,kt,nc,nt

In [9]:
def train_model(train_brown, test_sents, test_tags, index):
    cfd_tagwords = nltk.ConditionalFreqDist(list(train_brown))
    cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist)

    tags = [tag for (tag, word) in train_brown]
    words = [word for (tag, word) in train_brown]
    cfd_tags= nltk.ConditionalFreqDist(nltk.bigrams(tags))
    cpd_tags = nltk.ConditionalProbDist(cfd_tags, nltk.MLEProbDist)

    distinct_tags = set(tags)
    word_corpus = set(words)
    #return brown_cfd_tagwords, brown_cpd_tagwords, brown_tags, brown_words, brown_cfd_tags

    #distinct_tags, cpd_tags, cpd_tagwords, cfd_tagwords, word_corpus = datasets[dataset]
    #test_sents, test_tags = [brown_test_sents, brown_test_tags]
    accuracies = []
    probabilities = []
    j = 0
    for i in range(len(test_sents)):
        prob_tagsequence, best_tagsequence = train_hmms(distinct_tags, cpd_tags, cpd_tagwords, cfd_tagwords, word_corpus, test_sents[i])
        accuracy,_,_,_,_ = evaluateAccuracy(test_tags[i], best_tagsequence, test_sents[i], word_corpus)
        accuracies.append(accuracy)
        probabilities.append(prob_tagsequence)
        if j%500 == 0: print('Samples processed: ', j)
        j += 1
    #evaluation_dict[index] = accuracies
    #prob_dict[index] = probabilities
    print('Accuracy for {data} is {acc}'.format(data=index, acc=np.mean(accuracies)))
    return accuracies, probabilities

In [None]:
evaluation_dict = {}
prob_dict = {}

accuracies, probabilities = train_model(train_brown1, brown_test_sents, brown_test_tags, 0)
evaluation_dict[0] = accuracies
prob_dict[0] = probabilities
accuracies, probabilities = train_model(train_brown2, brown_test_sents, brown_test_tags, 1)
evaluation_dict[1] = accuracies
prob_dict[1] = probabilities
accuracies, probabilities = train_model(train_brown3, brown_test_sents, brown_test_tags, 2)
evaluation_dict[2] = accuracies
prob_dict[2] = probabilities
accuracies, probabilities = train_model(train_brown4, brown_test_sents, brown_test_tags, 3)
evaluation_dict[3] = accuracies
prob_dict[3] = probabilities
accuracies, probabilities = train_model(train_brown5, brown_test_sents, brown_test_tags, 4)
evaluation_dict[4] = accuracies
prob_dict[4] = probabilities

Samples processed:  0
Samples processed:  500
Samples processed:  1000
Samples processed:  1500
Samples processed:  2000
Samples processed:  2500
Samples processed:  3000
Samples processed:  3500
Samples processed:  4000
Samples processed:  4500
Samples processed:  5000
Samples processed:  5500
Accuracy for 0 is 0.839853634683
Samples processed:  0
Samples processed:  500
Samples processed:  1000
Samples processed:  1500
Samples processed:  2000
Samples processed:  2500
Samples processed:  3000
Samples processed:  3500
Samples processed:  4000
Samples processed:  4500
Samples processed:  5000
Samples processed:  5500
Accuracy for 1 is 0.883960873057
Samples processed:  0
