# P1 - Small Language Model

In [115]:
from nltk import *
from conllu import *
from treebanks import languages, train_corpus, test_corpus, conllu_corpus
from collections import defaultdict, Counter
from math import log, exp

# HMMs
Write your own code to estimate the transition probabilities and the emission prob- abilities of an HMM, on the basis of (tagged) sentences from a training corpus from Universal Dependencies. Do not forget to involve the start-of-sentence marker ⟨s⟩ and the end-of-sentence marker ⟨/s⟩ in the estimation.
The code in this part is concerned with:
- counting occurrences of one part of speech following another in a training corpus, 
- counting occurrences of words together with parts of speech in a training corpus, 
- relative frequency estimation with smoothing.

As discussed in the lectures, smoothing is necessary to avoid zero probabilities for events that were not witnessed in the training corpus. For emission probabilities, implement Witten-Bell smoothing for unigrams, exactly as on slide 23 of Lecture 5 (use 100000 as value of z). This means that we have one smoothed probability distribution over words for each tag. For transition probabilities, implement Witten-Bell smoothing for bigrams of tags, exactly as on slide 22 of that same lecture (and this in turn will make use of Witten-Bell unigram smoothing that you will have implemented before). To be clear: do not use any existing implementations of Witten-Bell, but make your own, based on your understanding of the lecture notes.

Further write your own code to implement the Viterbi algorithm, which determines the sequence of tags for a given sentence that has the highest probability. To avoid underflow for long sentences, we need to use log probabilities. Implement code to compute accuracy of POS tagging (the percentage of correctly predicted tags).

In [113]:

class HidddenMarkovModel:

    def __init__(self, trainingCorpus, z = 100000): 
        self.z = z
        self.n = 0 # length of training data
        self.m = 0 # number of distinct words                   # TODO can be omitted its just len(self.unigramCounts)
        self.unigramCounts = defaultdict(int)
        self.bigramCount = defaultdict(int)
        self.wordTagCounts = defaultdict(int) # dictionary of dictionaries tracking word occurrences per tag
        self.tagOccurr = defaultdict(int) # total number of word occurrences for tag t
        self.uniqueWordsPerTag = defaultdict(set)
        #self.tagOccurrences = Counter()
        self.__initialiseVars(trainingCorpus)
        # print("number of distinct words including sentence markers: ", self.m)
        # print("total number of tokens including sentence markers: ", self.n)
        self.uniqueTags = []


    # TODO how do i manage beginning and end of sentence markers
    def __initialiseVars(self, trainingCorpus): 
        train_sents = conllu_corpus(train_corpus(trainingCorpus))

        for sent in train_sents:                                                # TODO this might overflow if sentence len = 1
            for i in range(len(sent)):
                if i == 0:
                    self.bigramCount[("<s>", sent[i]['upos'])] += 1
                    self.bigramCount[(sent[i]['upos'], sent[i+1]['upos'])] += 1
                elif i == len(sent) - 1: 
                    self.bigramCount[(sent[i]['upos'], "</s>")] += 1
                else: 
                    self.bigramCount[(sent[i]['upos'], sent[i+1]['upos'])] += 1
                    
        # unigram counts 
        for sent in train_sents:
            #self.addCount("<s>") # count start of sentence
            for token in sent:
                self.__addCount(token)
                #print(token['form'], sep = '', end= " ")
            #self.addCount("</s>") # count end of sentence
        self.m = len(self.unigramCounts)

        self.uniqueTags = list(set([token['upos'] for sent in train_sents for token in sent]))

        wordTagPairs = [(token['form'], token['upos']) for sent in train_sents for token in sent]
        wordTagCounter = Counter(wordTagPairs)
        #self.tagOccurrences = Counter(tag for _, tag in wordTagPairs)
        self.wordTagCounts = defaultdict(lambda: defaultdict(int))
        for (word, tag), count in wordTagCounter.items():
            self.wordTagCounts[tag][word] = count

        for word, tag in wordTagPairs:
            self.uniqueWordsPerTag[tag].add(word)
        # T_tag = {tag: len(words) for tag, words in tag_unique_words.items()} # number of possible words per tag


    def __addCount(self, token): 
        # tokens consist of a word and a tag
        self.n += 1
        self.tagOccurr[token['upos']] += 1
        self.unigramCounts[token['form']] += 1



    def unigramEmissionProbability(self, word, tag):
        tag_cts = self.wordTagCounts[tag]
        t = len(self.uniqueWordsPerTag[tag])
        n = self.tagOccurr[tag]
        # print(t)
        # print(n)
        # print(tag_cts[word])
        if word in tag_cts:
            return float(tag_cts[word] / (n+  t))
        else: #word is unseen
            return float(t/(self.z * (n + t)))
        
    # def __wordUnigramProbability(self, word):
    #     if word in self.unigramCounts:
    #         return self.unigramCounts[word] / (self.n + self.m)
    #     else: 
    #         return self.m / (self.z * (self.m + self.n))

    def __unigramProbability(self, tag):
       # print(len(self.tagOccurr))
        m = len(self.tagOccurr)
        if self.tagOccurr[tag] != 0:
            return self.tagOccurr[tag] / (self.n + m)
        else: 
            return m / (self.z * (self.n + m))
        
        
    def bigramTransitionProbability(self, precedingTag, tag):  
        precCount = self.tagOccurr[precedingTag]
        if self.tagOccurr[precedingTag] == 0:
            # if precedingTag count = 0
            lambd = 0
        else :
            # possibleFollowers counts number of tags that were seen following 'precedingTag'
            possibleFollowers = sum(1 for (tag1, tag2) in self.bigramCount if tag1 == precedingTag) 
            lambd = precCount / (precCount + possibleFollowers)

        # P(tag | prevtag)
        if precCount == 0:
            prob_tag_given_preceding = 0
        else: 
            prob_tag_given_preceding = self.bigramCount[(precedingTag, tag)] / precCount
        # find the witten bell smoothed probability of tag
        unig_prob = self.__unigramProbability(tag)
        return (lambd * prob_tag_given_preceding) + ((1 - lambd) * unig_prob)
    



    # Adding a list of probabilities represented as log probabilities.
    def __logsumexp(vals):
        min_log_prob = -float_info.max
        if len(vals) == 0:
            return min_log_prob
        m = max(vals)
        if m == min_log_prob:
            return min_log_prob
        else:
            return m + log(sum([exp(val - m) for val in vals]))


    def viterbi(self, sent): 
        # Use log probablities -> logsumexptrick.py has stuff on how to use them 
        # # columns = number of words in sent
        # num of rows = number of possible tags
        probTable = [[0] * (len(sent) + 1 ) for _ in range(len(self.uniqueTags))]
        maxProb = [0] * (len(sent) + 1)
        tagPath = [0] * (len(sent) + 1)

        # Initialise 
        for q in range(len(probTable)):
            transitionProb = log(self.bigramTransitionProbability("<s>" , self.uniqueTags[q]))
            emissionProb = log(self.unigramEmissionProbability(sent[0], self.uniqueTags[q]))
            probTable[q, 0] = self.__logsumexp([transitionProb, emissionProb])
        tagPath[0] = 

        # Compute
        for i in range(1, len(sent)):
            for q in range(len(probTable)):
                probTable[q,i] = 
            


        # Finish 

        




In [None]:
# Compute accuracy of POS tagging using a HMM
# Percentage of correctly predicted tags


# def posTaggingAccuracy(testCorpus)
    # testcorpus could either be the name of the corpus or the train sentences 
# make a function that takes in test data
# iterate through the test data 
# pass in hte sentences to viterbi iwhtout tags 
# compare the tags that were outputted by viterbi with the tags in testing data
# return the Percentage of correctly predicted tags

# Language modelling using HMMs
As discussed in the lectures, HMMs can be used to determine the probability of an input sentence, using the forward algorithm. For this, you need to be able to add prob- abilities together that are represented as log probabilities, without getting underflow in the conversion from log probabilities to probabilities and back. See the included logsumexptrick.py for a demonstration.
Write your own code to compute the perplexity of a test corpus. (Consider the length of a corpus to be the total number of actual tokens plus the number of sentences; in effect we count one additional end-of-sentence token for each sentence in addition to the actual words and punctuation tokens.)

In [None]:
# use logsumexptrick.py to add probabilities that are represented as log probabilities

In [None]:
# Compute the perplexity of a test corpus given a HMM
# use length count as described above

# Language modelling using bigrams
Further write your own code to implement estimation of bigram probabilities of input tokens (so words and punctuation tokens, not POS tags). To avoid zero probabilities, you again need Witten-Bell smoothing; you should here be able to reuse the code you implemented earlier for transition probabilities of HMMs.
Again, implement your own code to compute perplexity of a test corpus, given a trained bigram model.

In [None]:
# Estimate bigram probabilities with witten bell smoothing 
# re-use code for transition probabilities of HMMs

In [None]:
# Compute perplexity of a test corpus given a trained bigram model 

# Experiments
Run the developed code for the three treebanks. Train using the training parts of the treebanks, and test using the testing parts of the treebanks. (It is good practice to mainly use the development parts during development of the code.) 

Testing here means:
• computing accuracy of POS tagging using an HMM, 
• computing perplexity using an HMM, and
• computing perplexity using a bigram model.


In [101]:
#lang = "en" 
#lang = "orv" 
lang = "tr" 

train_sents = conllu_corpus(train_corpus(lang))
test_sents = conllu_corpus(test_corpus(lang))

for sent in train_sents:
    for token in sent:
        print(token['form'], '->', token['upos'], sep='', end=' ')
        #print(token['form'], sep = '', end= "")
    print()
    
uniqueTags = set([token['upos'] for sent in train_sents for token in sent])
print(uniqueTags)



Pittsburgh'tan->PROPN Atlanta'ya->PROPN 25->NUM Nisan'da->PROPN gidiş->NOUN 6->NUM Mayıs'ta->PROPN dönüşü->NOUN olan->ADJ bir->DET gidiş->NOUN dönüş->NOUN uçuşunun->NOUN maliyeti->NOUN nedir->VERB 
Şimdi->ADV Fort->PROPN Worth'dan->PROPN ayrılan->ADJ ve->CCONJ en->ADV geç->ADV gelecek->ADJ pazartesi->PROPN akşam->NOUN 2'ye->NOUN kadar->ADP Denver'e->PROPN varacak->ADJ bir->DET uçağa->NOUN ihtiyacım->NOUN var->ADJ 
Önümüzdeki->ADJ çarşamba->PROPN gidiş->NOUN ertesi->ADJ gün->NOUN dönüşü->NOUN olan->ADJ Kansas->PROPN City'den->PROPN Chicago'ya->PROPN giden->ADJ bir->DET uçuşa->NOUN ihtiyacım->NOUN var->ADJ 
yemek->NOUN kodu->NOUN S'in->PROPN anlamı->NOUN nedir->VERB 
Denver'den->PROPN Pittsburgh'a->PROPN yemek->NOUN servisi->NOUN yapan->ADJ yarından->NOUN sonraki->ADJ gün->NOUN için->ADP tüm->DET uçuşları->NOUN göster->VERB 
Bana->PRON yarından->NOUN sonraki->ADJ gün->NOUN için->ADP Atlanta'dan->PROPN Denver'e->PROPN tüm->DET US->NOUN Air->PROPN uçuşlarını->NOUN göster->VERB 
Dallas'tan-

In [114]:
# HMM testing
hmm = HidddenMarkovModel("en")

print("unig: ", hmm.unigramEmissionProbability("yapan", "NOUN")) 
print("unig: ", hmm.unigramEmissionProbability("yapan", "ADJ")) 
print("unig: ", hmm.unigramEmissionProbability("en", "ADV")) 
print("big: ", hmm.bigramTransitionProbability("ADP", "NOUN")) 
print("big: ", hmm.bigramTransitionProbability("NOUN", "NOUN")) 
print("big: ", hmm.bigramTransitionProbability("fgfsgs", "NOUN")) 
#wb.unigramEmissionProbability("vajajay", "NOUN")

unig:  2.400090569455451e-07
unig:  4.839650145772595e-07
unig:  1.11340206185567e-06
big:  0.1297022220263252
big:  0.17886276152344074
big:  0.17713534282602889


In [None]:
# Train - use only testing parts of treebank

In [None]:
# Test - use only testing part of treebank


# call the three functions implemented above 
# • computing accuracy of POS tagging using an HMM, 
# • computing perplexity using an HMM, and
# • computing perplexity using a bigram model.