In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from nltk import ngrams
from utils import get_tokenized_sentences
from math import log

In [3]:
def log2(x):
    return log(x, 2)

In [4]:
class WordCounter:
    def __init__(self, sentence_generator):
        self.sentence_generator = sentence_generator
        self.sentence_count = 0
        self.token_count = 0
        self.all_ngram_counts = {}
        for ngram_length in range(1, 6):
            self.all_ngram_counts[ngram_length] = {}
        
    def count(self):
        for sentence in self.sentence_generator:
            if sentence:
                self.sentence_count += 1
            for token in sentence:
                self.token_count += 1
            for ngram_length in range(1, 6):
                ngram_counts = self.all_ngram_counts[ngram_length]
                for i, sentence_ngram in enumerate(ngrams(sentence, ngram_length)):
                    ngram_count = ngram_counts.setdefault(sentence_ngram, {'start': 0, 'all': 0})
                    if i == 0:
                        ngram_count['start'] += 1
                    ngram_count['all'] += 1

In [5]:
trainW_generator = get_tokenized_sentences('data/trainW_token_end.txt')
trainW = WordCounter(trainW_generator)
trainW.count()

In [6]:
trainT_generator = get_tokenized_sentences('data/trainT_token_end.txt')
trainT = WordCounter(trainT_generator)
trainT.count()

In [7]:
test1_generator = get_tokenized_sentences('data/test1_token_end.txt')
test1 = WordCounter(test1_generator)
test1.count()

In [8]:
test2_generator = get_tokenized_sentences('data/test2_token_end.txt')
test2 = WordCounter(test2_generator)
test2.count()

# ngram model

### Unigram model

In [9]:
trainW_unigram_vocab_size = len(trainW.all_ngram_counts[1]) + 1
trainT_unigram_vocab_size = len(trainT.all_ngram_counts[1]) + 1

In [79]:
class UnigramModel():
    def __init__(self, train, k=1):
        self.k = k
        train_unigram_counts = train.all_ngram_counts[1]
        train_unigram_counts['<UNK>'] = {'all': 0, 'start': 0}
        
        self.train_prob_denom = trainT.token_count + len(train_unigram_counts) * self.k
        self.train_prob_noms = {}
        self.train_probs = {}
        
        for unigram, unigram_count in train_unigram_counts.items():
            prob_nom = train_unigram_counts[unigram]['all'] + self.k
            self.train_prob_noms[unigram] = prob_nom
            self.train_probs[unigram] = prob_nom / self.train_prob_denom
        
    def calculate_avg_ll(test):
        self.test_ll
        test_unigram_counts = test.

In [98]:
unigram_model_trainT = UnigramModel(trainT, k=2)

In [101]:
unigram_model_trainT.train_prob_noms['<UNK>']

2

In [104]:
2 / 397658

5.029447414612556e-06

In [105]:
unigram_model_trainT.train_probs['<UNK>']

5.029447414612556e-06

In [102]:
sum(unigram_model_trainT.train_prob_noms.values())

397658

In [103]:
unigram_model_trainT.train_prob_denom

397658

In [55]:
trainT.all_ngram_counts[1]

{('prologue',): {'start': 1, 'all': 1},
 ('<END>',): {'start': 0, 'all': 31072},
 ('the',): {'start': 2965, 'all': 19484},
 ('day',): {'start': 3, 'all': 278},
 ('was',): {'start': 54, 'all': 4294},
 ('grey',): {'start': 22, 'all': 158},
 ('and',): {'start': 767, 'all': 10462},
 ('bitter',): {'start': 1, 'all': 18},
 ('cold',): {'start': 5, 'all': 187},
 ('dogs',): {'start': 4, 'all': 77},
 ('would',): {'start': 55, 'all': 1258},
 ('not',): {'start': 145, 'all': 2047},
 ('take',): {'start': 26, 'all': 319},
 ('scent',): {'start': 0, 'all': 25},
 ('big',): {'start': 5, 'all': 184},
 ('black',): {'start': 15, 'all': 472},
 ('bitch',): {'start': 0, 'all': 15},
 ('had',): {'start': 26, 'all': 3015},
 ('taken',): {'start': 2, 'all': 113},
 ('one',): {'start': 143, 'all': 1186},
 ('sniff',): {'start': 0, 'all': 8},
 ('at',): {'start': 95, 'all': 1773},
 ('bear',): {'start': 0, 'all': 189},
 ('tracks',): {'start': 0, 'all': 3},
 ('backed',): {'start': 0, 'all': 10},
 ('off',): {'start': 4, 'a

In [47]:
sum(value for key, value in unigram_model_trainT.unigram_train_probs.items())

AttributeError: 'UnigramModel' object has no attribute 'unigram_train_probs'

In [28]:
test1_unigram_ll = 0
test1_unigram_counts = test1.all_ngram_counts[1]
trainT_unigram_counts = trainT.all_ngram_counts[1]

for unigram, unigram_count in test1_unigram_counts.items():
    unigram_trainT_count = trainT_unigram_counts.get(unigram, {}).get(unigram, 0)
    unigram_trainT_prob = (unigram_trainT_count + 1)/(trainT.token_count + trainT_unigram_vocab_size)
    test1_unigram_ll += unigram_count['all'] * log2(unigram_trainT_prob)

test1_avg_unigram_ll = test1_unigram_ll / test1.token_count

In [29]:
test1_avg_unigram_ll

-18.5502996309069

In [20]:
trainT.token_count+trainT_unigram_vocab_size

383881

In [17]:
trainT_unigram_counts

{('prologue',): {'start': 1, 'all': 1},
 ('<END>',): {'start': 0, 'all': 31072},
 ('the',): {'start': 2965, 'all': 19484},
 ('day',): {'start': 3, 'all': 278},
 ('was',): {'start': 54, 'all': 4294},
 ('grey',): {'start': 22, 'all': 158},
 ('and',): {'start': 767, 'all': 10462},
 ('bitter',): {'start': 1, 'all': 18},
 ('cold',): {'start': 5, 'all': 187},
 ('dogs',): {'start': 4, 'all': 77},
 ('would',): {'start': 55, 'all': 1258},
 ('not',): {'start': 145, 'all': 2047},
 ('take',): {'start': 26, 'all': 319},
 ('scent',): {'start': 0, 'all': 25},
 ('big',): {'start': 5, 'all': 184},
 ('black',): {'start': 15, 'all': 472},
 ('bitch',): {'start': 0, 'all': 15},
 ('had',): {'start': 26, 'all': 3015},
 ('taken',): {'start': 2, 'all': 113},
 ('one',): {'start': 143, 'all': 1186},
 ('sniff',): {'start': 0, 'all': 8},
 ('at',): {'start': 95, 'all': 1773},
 ('bear',): {'start': 0, 'all': 189},
 ('tracks',): {'start': 0, 'all': 3},
 ('backed',): {'start': 0, 'all': 10},
 ('off',): {'start': 4, 'a

In [None]:
len(set(test1_counter.all_ngram_counts[1]) - set(trainT_counter.all_ngram_counts[1]))

In [None]:
len(set(test2_counter.all_ngram_counts[1]) - set(trainT_counter.all_ngram_counts[1]))