In [225]:
import nltk
from HolbrookCorpus import HolbrookCorpus
from KNBigramLanguageModel import KNBigramLanguageModel

# Auto-reload imported modules before each cell runs, so edits to the local
# .py files are picked up without restarting the kernel.
# Re-running this cell in a live kernel only prints an "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## data processing

In [171]:
# Read the tagged Holbrook training split from disk.
corpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')

# Preview the first ten sentences, one per line, to see how they are rendered
# (sentence boundary markers, and what look like misspellings in parentheses).
first_sentences = corpus.corpus[:10]
for sent in first_sentences:
    print(sent)

<s> 1 </s>
<s> nigel thrush page 48 </s>
<s> i have four in my family dad mum and sister (siter) </s>
<s> my dad works at melton </s>
<s> my sister (siter) goes (go) to tonbury </s>
<s> my mum goes out sometimes </s>
<s> i go to bridgebrook i go out sometimes on tuesday night i go to youth club (clob) </s>
<s> on thursday nights i go bellringing on saturdays i go down to the farm </s>
<s> on sundays i go to church </s>
<s> i go to bed at 10 o clock i watch (wakh) tv at 5 o clock i live in a house </s>


In [226]:
# Build the Kneser-Ney bigram language model from the training corpus.
# The cells below read its n-gram count tables (unigramCounts, bigramCounts,
# trigramCounts) and its precomputed probabilities (knProbs).
clm = KNBigramLanguageModel(corpus)

In [227]:
# Show 5 entries of the unigram count table (the output is ordered by descending count).
clm.showUnigramCounts(clm.unigramCounts, 5)

[('the', 771), ('<s>', 659), ('</s>', 659), ('and', 466), ('to', 314)]

In [228]:
# Despite its name, showUnigramCounts is reused here on the bigram table:
# show 5 entries of the bigram counts, keyed by (previous word, word) tuples.
clm.showUnigramCounts(clm.bigramCounts, 5)

[(('<s>', 'the'), 94),
 (('and', 'the'), 73),
 (('to', 'the'), 71),
 (('in', 'the'), 61),
 (('<s>', 'i'), 60)]

In [229]:
# Same helper again, this time on the trigram table (keys are 3-word tuples).
clm.showUnigramCounts(clm.trigramCounts, 5)

[(('the', 'end', '</s>'), 19),
 (('<s>', 'the', 'end'), 18),
 (('went', 'to', 'the'), 17),
 (('out', 'of', 'the'), 13),
 (('the', 'old', 'man'), 10)]

## kneser-ney smoothing (bigram model)

Let's compute the Kneser-Ney (KN) smoothed probability for the bigram $(w_{i-1}, w_i)=(\text{old}, \text{man})$:

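$$P_{KN}(w_i|w_{i-1})=\frac{\max(c(w_{i-1}, w_i)-d,\ 0)}{c(w_{i-1})}+\lambda(w_{i-1})P_C(w_i)=T_1+T_2$$

Here $d$ is the discount, $\lambda(w_{i-1})=\frac{d}{c(w_{i-1})}\,|\{w : c(w_{i-1}, w)>0\}|$ is the normalizing weight that redistributes the discounted mass, and $P_C(w_i)=\frac{|\{w' : c(w', w_i)>0\}|}{|\{(w', w'') : c(w', w'')>0\}|}$ is the continuation probability (these are the standard Kneser-Ney definitions).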

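Below we compute this probability manually for a few bigrams and compare it with the precomputed value to check that the precomputed probabilities are correct. We also print the standard (unsmoothed) bigram probability $c(w_{i-1}, w_i)/c(w_{i-1})$, labelled `original`; it should be slightly larger than the discounted KN estimate.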

In [230]:
# Bigrams, as (previous word, word) tuples, used below to spot-check the KN probabilities.
bigrams = [('old', 'man'), ('one', 'day'), ('little', 'yellow')]

In [231]:
# For each test bigram, print three probabilities side by side:
#   manual      - KN probability recomputed on the fly by getKnProbManually
#   precomputed - KN probability stored in the model's knProbs table
#   original    - unsmoothed estimate c(w_prev, w) / c(w_prev)
for bigram in bigrams:
    manual_prob = clm.getKnProbManually(bigram)
    precomputed_prob = clm.knProbs[bigram]
    context_word = bigram[0]
    unsmoothed_prob = clm.bigramCounts[bigram] / clm.unigramCounts[context_word]
    print(f'manual:{manual_prob:.4f}',
          f'precomputed:{precomputed_prob:.4f}',
          f'original:{unsmoothed_prob:.4f}')

manual:0.4064 precomputed:0.4064 original:0.4222
manual:0.1276 precomputed:0.1277 original:0.1385
manual:0.0433 precomputed:0.0433 original:0.0690


Let's print the individual terms of the computation ($T_1$, $P_C$, $\lambda$ and $T_2$) for the bigram `('old', 'man')`. They can be used to verify a calculation done by hand.

In [233]:
# Break the KN probability of one bigram into the terms of the formula above.
# With detailed=True the call returns four values, unpacked here as:
#   T1    - the discounted bigram term
#   Pc    - the continuation probability P_C(w_i)
#   lambd - the weight lambda(w_{i-1})
#   T2    - the second term of the sum (lambda * P_C in the formula)
bigram = ('old', 'man')
T1, Pc, lambd, T2 = clm.getKnProbManually(bigram, detailed=True)
print(f'T1={T1:.4f} Pc={Pc:.4f} lambd={lambd:.4f} T2={T2:.4f}')

T1=0.4056 Pc=0.0025 lambd=0.3333 T2=0.0008


Let's check if we have KN probabilities for all bigrams.

In [224]:
# Quick size comparison of the two tables (cheap and easy to eyeball).
print(f'len of KN probs dictionary: {len(clm.knProbs)};', 
      f'len of bigrams dictionary: {len(clm.bigramCounts)}')

# Equal sizes alone do not prove coverage: the tables could have the same
# length but different keys. Verify that every observed bigram really has a
# precomputed KN probability; this is silent when the check passes.
missing = set(clm.bigramCounts) - set(clm.knProbs)
assert not missing, (
    f'{len(missing)} bigrams have no KN probability, e.g. {sorted(missing)[:5]}'
)

len of KN probs dictionary: 6850; len of bigrams dictionary: 6850
