## 3.1

Trigram probability estimation:

$$P(w_n| w_{1:n-1}) \approx P(w_n | w_{n-2:n-1}) = \frac{C(w_{n-2}\;w_{n-1}\;w_n)}{C(w_{n-2}\;w_{n-1})}$$



## 3.4

In [30]:
def laplace_smoothing(corpus, curr, prefix):
    # preprocessing
    corpus = list(map(lambda s: s.lower(), corpus))
    curr = curr.lower()
    prefix = prefix.lower()
    # generate vocab
    vocab = {w.lower for l in [s.split() for s in corpus] for w in l}
    V = len(vocab)
    # calculate original count
    bigram_count = 0
    prefix_count = 0
    for sent in corpus:
        words = sent.split()
        for ix, word in enumerate(words[:-1]):
            if word == prefix:
                prefix_count += 1
                if words[ix+1] == curr:
                    bigram_count += 1
    # calculate the add-one smoothed bigram counts (Eq 3.24)
    return (bigram_count + 1) / (prefix_count + V)

corpus = [
    '<s> I am Sam </s>',
    '<s> Sam I am </s>',
    '<s> I am Sam </s>',
    '<s> I do not like green eggs and Sam </s>'
]
print(laplace_smoothing(corpus, 'Sam', 'am'))

0.12


## 3.7

Note: probabilities should all be handled in log space :)

In [38]:
def get_unigram_mle(corpus, curr):
    # preprocessing
    corpus = list(map(lambda s: s.lower(), corpus))
    curr = curr.lower()
    # total num tokens
    N = sum(len(sent.split()) for sent in corpus)
    # count curr
    curr_count = 0
    for sent in corpus:
        for word in sent.split():
            if word == curr:
                curr_count += 1
    return curr_count / N

def get_bigram_mle(corpus, curr, prev):
    # preprocessing
    corpus = list(map(lambda s: s.lower(), corpus))
    curr = curr.lower()
    prev = prev.lower()
    # calculate original count
    bigram_count = 0
    prev_count = 0
    for sent in corpus:
        words = sent.split()
        for ix, word in enumerate(words[:-1]):
            if word == prev:
                prev_count += 1
                if words[ix+1] == curr:
                    bigram_count += 1
    # calculate the mle
    return bigram_count / prev_count

def get_interpolated_bigram_mle(corpus, curr, prev, lambda1, lambda2):
    return (lambda1 * get_unigram_mle(corpus, curr) + 
            lambda2 * get_bigram_mle(corpus, curr, prev))
corpus = [
    '<s> I am Sam </s>',
    '<s> Sam I am </s>',
    '<s> I am Sam </s>',
    '<s> I do not like green eggs and Sam </s>'
]

print(get_interpolated_bigram_mle(corpus, 'sam', 'am', 0.5, 0.5))

0.41333333333333333


## 3.8

In [57]:
from collections import defaultdict
import os

def preprocess(filename):
    # assumes lines in file have no trailing whitespace (except newlines)
    with open(filename, 'r') as f:
        lines = f.readlines()
        # join at newlines
        full_text = ' '.join(lines)
        # split sentences by .
        sentences = full_text.split('.')
        # add start/end sentence tags 
        sentences = [f'<s> {sent} </s>'.lower() for sent in sentences]
        return sentences

def get_unigrams(sentences, return_count=False):
    # get counts for all unigrams
    count_dict = defaultdict(int)
    total = 0
    for sentence in sentences:
        words = sentence.split()
        total += len(words)
        for word in words:
            count_dict[word] += 1
    # create probability dict based on counts
    prob_dict = defaultdict(float)
    for word, count in count_dict.items():
        prob_dict[word] = count / total
    if return_count:
        return count_dict
    return prob_dict

def get_bigrams(sentences, get_unigrams, return_count=False):
    # get counts for all bigrams
    count_dict = defaultdict(int)
    for sentence in sentences:
        words = sentence.split()
        for ix, word in enumerate(words[:-1]):
            count_dict[(words[ix], words[ix+1])] += 1
    # create probability dict based on counts
    prob_dict = defaultdict(float)
    unigram_count_dict = get_unigrams(sentences, return_count=True)
    for bigram, count in count_dict.items():
        prob_dict[bigram] = count / unigram_count_dict[bigram[0]]
    if return_count:
        return count_dict
    return prob_dict


## 3.9

In [64]:
sentences = preprocess(os.getcwd() + '/../misc/linguistics_paper1.txt')
ug_prob = get_unigrams(sentences)
bg_prob = get_bigrams(sentences, get_unigrams)
print('unigrams:', sorted(ug_prob.items(), key=lambda t: t[1], reverse=True)[:10])
print(sum(ug_prob.values()))
print('bigrams:', sorted(bg_prob.items(), key=lambda t: t[1], reverse=True)[:10])
print(sum(bg_prob.values()))

unigrams: [('the', 0.05179982440737489), ('of', 0.04477611940298507), ('<s>', 0.029850746268656716), ('</s>', 0.029850746268656716), ('to', 0.024582967515364356), ('a', 0.02370500438981563), ('and', 0.021071115013169446), ('in', 0.02019315188762072), ('their', 0.013169446883230905), ('&', 0.013169446883230905)]
1.0000000000000102
bigrams: [(('act', 'of'), 1.0), (('poses', 'an'), 1.0), (('interesting', 'question'), 1.0), (('question', 'of'), 1.0), (('choose', 'to'), 1.0), (('apply', 'the'), 1.0), (('syntax,', 'and'), 1.0), (('novel', 'language,'), 1.0), (('exhibit', 'a'), 1.0), (('deeper', 'understanding'), 1.0)]
430.9999999999992
