# N-gram models
- An **N-gram** is just a sequence of N consecutive words (or characters) in a text.

N-gram models help to model a language: they predict which word is likely to come next, based on the previous ones.


In [20]:
#from pprint import pprint
import math
import os # Operating System - for work with files and folders
import spacy
from collections import Counter, defaultdict

## Resources
## n-gram model: https://github.com/joshualoehr/ngram-language-model/tree/master

In [11]:
# Load the spaCy "en_core_web_sm" pipeline once; tokenize_sentences() calls it to split the text into sentences.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Read the whole corpus (eng_Anne_full_abbr.txt) into memory.
# The path is resolved relative to the current working directory:
# <cwd>/../data/without_chap_and_title/eng_Anne_full_abbr.txt
base_path = os.getcwd()
file_path = os.path.abspath(os.path.join(base_path, "..", "data", "without_chap_and_title", "eng_Anne_full_abbr.txt"))
# Use a context manager so the file handle is closed as soon as the text is read.
with open(file_path, encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Preview the first 500 characters as a sanity check.
print(text[0:500])

Mistress Rachel Lynde lived just where the Avonlea main road dipped down into a little hollow, fringed with alders and ladies' eardrops and traversed by a brook that had its source away back in the woods of the old Cuthbert place; it was reputed to be an intricate, headlong brook in its earlier course through those woods, with dark secrets of pool and cascade; but by the time it reached Lynde's Hollow it was a quiet, well-conducted little stream, for not even a brook could run past Mistress Rach


In [None]:
# Special marker tokens used by the preprocessing helpers.
SOS, EOS = "<s>", "</s>"  # wrapped around every sentence (start / end)
UNK = "<UNK>"  # stands in for tokens that occur only once

In [14]:
def tokenize_sentences(text):
    """Split ``text`` into sentences and wrap each one in boundary markers.

    Sentence boundaries come from the spaCy pipeline ``nlp``. Every returned
    string has the form ``"{SOS} <sentence> {EOS}"``, where the sentence text
    has its surrounding whitespace stripped.
    """
    marked = []
    for sentence in nlp(text).sents:
        body = sentence.text.strip()
        marked.append(f"{SOS} {body} {EOS}")
    return marked

def replace_singletons(tokens):
    """Return a copy of ``tokens`` with every once-only token replaced by ``UNK``.

    Tokens that occur more than once in ``tokens`` are kept unchanged, and the
    order and length of the sequence are preserved.
    """
    counts = Counter(tokens)
    result = []
    for token in tokens:
        # UNK is only looked up for tokens seen at most once.
        result.append(token if counts[token] > 1 else UNK)
    return result

def preprocess(text):
    """Turn raw ``text`` into a flat list of tokens ready for counting.

    The text is split into marker-wrapped sentences, each sentence is split
    on whitespace, and tokens that occur only once are replaced via
    ``replace_singletons``.
    """
    flat_tokens = [
        token
        for sentence in tokenize_sentences(text)
        for token in sentence.split()
    ]
    return replace_singletons(flat_tokens)

In [15]:
# Preprocess the corpus into a flat token list, then count each distinct token.
tokens = preprocess(text)
vocab = Counter(tokens)
# V = number of distinct tokens (vocabulary size).
V = len(vocab)

In [None]:
# Count unigrams and bigrams.
unigrams_freq = Counter(tokens)
# Materialise the pairs as a list: a bare zip() is a one-shot iterator that
# would be left empty once Counter consumes it, so any later reuse of
# `bigrams` in another cell would silently see no data.
# NOTE: pairs are taken over the flat token list, so bigrams that span a
# sentence boundary (end marker followed by start marker) are counted too.
bigrams = list(zip(tokens, tokens[1:]))
bigrams_freq = Counter(bigrams)

In [None]:
# Negative log-probability of every observed bigram under add-k smoothing
# (k = 1, i.e. add-one):
#     -log((count(prev, word) + k) / (count(prev) + k * V))
# Smaller values mean more probable bigrams. Only bigrams present in
# bigrams_freq get an entry; unseen bigrams are not in `probs`.
k = 1
probs = {
    (prev, word): -math.log((count + k) / (unigrams_freq[prev] + k * V))
    for (prev, word), count in bigrams_freq.items()
}

In [None]:
# Show a sample of the table. `probs` keeps insertion order, so this lists the
# first entries as they were added -- it is not a ranking by probability.
# The values are negative log-probabilities (`probs` stores -math.log(...)).
N_SHOWN = 100  # single source of truth for both the header and the slice
print(f"First {N_SHOWN} bigrams with negative log-probs:")
for bg, p in list(probs.items())[:N_SHOWN]:
    print(bg, round(p, 4))

Top 10 bigrams with log-probs:
('<s>', 'Mistress') 4.8145
('Mistress', 'Rachel') 4.813
('Rachel', 'Lynde') 6.4304
('Lynde', 'lived') 7.9422
('lived', 'just') 7.9273
('just', 'where') 7.9804
('where', 'the') 6.0652
('the', 'Avonlea') 6.4855
('Avonlea', 'main') 7.9363
('main', 'road') 6.6724
('road', 'dipped') 7.9288
('dipped', 'down') 7.9241
('down', 'into') 7.0416
('into', 'a') 5.8204
('a', 'little') 4.4217
('little', 'hollow,') 7.9732
('hollow,', 'fringed') 7.9244
('fringed', 'with') 7.2313
('with', '<UNK>') 4.6677
('<UNK>', 'and') 3.2972
('and', "ladies'") 8.3645
("ladies'", '<UNK>') 7.2311
('and', '<UNK>') 2.9777
('<UNK>', 'by') 6.1126
('by', 'a') 5.6128
('a', 'brook') 6.7463
('brook', 'that') 7.5233
('that', 'had') 6.8521
('had', 'its') 8.0513
('its', '<UNK>') 5.5388
('<UNK>', 'away') 7.9043
('away', 'back') 7.2421
('back', 'in') 6.4431
('in', 'the') 2.8581
('the', 'woods') 6.6397
('woods', 'of') 7.9268
('of', 'the') 3.1861
('the', 'old') 6.08
('old', 'Cuthbert') 7.5335
('Cuthbert'