In [9]:
import math, random, string

In [10]:
################################################################################
# Part 0: Utility Functions
################################################################################

COUNTRY_CODES = ['af', 'cn', 'de', 'fi', 'fr', 'in', 'ir', 'pk', 'za']

def start_pad(c):
    ''' Returns a padding string of length n to append to the front of text
        as a pre-processing step to building n-grams '''
    return '~' * c

def ngrams(c, text):
    ''' Returns the ngrams of the text as tuples where the first element is
        the length-n context and the second is the character '''
    
    # sliding window
    ng = []
    for x in range(len(text)):
        char = text[x]
        if x < c:
            context = start_pad(c - x) + text[:x] # x = 0, 0 chars, x = 1, 1 char, etc
        else:
            context = text[x-c:x]

        ng.append((context, char))
    return ng

def create_ngram_model(model_class, path, c=2, k=0):
    ''' Creates and returns a new n-gram model '''
    model = model_class(c, k)
    with open(path, encoding='utf-8', errors='ignore') as f:
        model.update(f.read())
    return model

In [11]:
ngrams(3, 'abcabc')

[('~~~', 'a'),
 ('~~a', 'b'),
 ('~ab', 'c'),
 ('abc', 'a'),
 ('bca', 'b'),
 ('cab', 'c')]

In [12]:
################################################################################
# Part 1: Basic N-Gram Model
################################################################################

class NgramModel(object):
    ''' A basic n-gram model using add-k smoothing '''

    def __init__(self, c, k):
        self.ngram_max = c
        self.smooth_par = k
        self.vocab = set()
        self.wc = {}

    def get_vocab(self):
        ''' Returns the set of characters in the vocab '''
        return self.vocab

    def update(self, text):
        ''' Updates the model n-grams based on text '''
        unique = set(text)
        self.vocab = self.vocab | unique # | = union
        wc = self.wc
        for g in ngrams(self.ngram_max, text):
            if g[0] in wc:
                if g[1] in wc[g[0]]:
                    wc[g[0]][g[1]] += 1
                else:
                    wc[g[0]][g[1]] = 1
            else:
                wc[g[0]] = {}
                wc[g[0]][g[1]] = 1
        
        self.wc = wc

    def prob(self, context, char):
        ''' Returns the probability of char appearing after context '''
        if context not in self.wc: # return 1/len(V) for a novel context
            return 1/len(self.vocab)
        
        k = self.smooth_par
        v = len(self.vocab)
        p = 0
        counts = self.wc[context]
        if char not in counts:
            c = 0
        else:
            c = counts[char]
            
        # p(char | context) = counts(context, char)/counts(context)
        p = (c + k) / (sum(counts.values()) + (0 if k == 0 else v))

        return p

    def random_char(self, context):
        ''' Returns a random character based on the given context and the 
            n-grams learned by this model '''
        
        r = random.random()
        if context not in self.wc:
            x = r / (1/len(self.vocab)) # equal chance with no context
            x = math.floor(x)
            return list(self.vocab)[x]
        
        counts = self.wc[context]
        total =  sum(counts.values())
        curr = 0

        for k,v in counts.items():
            curr += (v / total)
            if r <= curr:
                return k

        return '-'

    def random_text(self, length):
        ''' Returns text of the specified character length based on the
            n-grams learned by this model '''
        
        c = self.ngram_max
        text = ""

        for i in range(length):
            if i >= c:
                context = text[(i - c):i]
            else:
                context = start_pad(c - i) + text[:i]

            text += self.random_char(context)

        return text

    def perplexity(self, text):
        ''' Returns the perplexity of text based on the n-grams learned by
            this model '''
        perp = 0
        c = self.ngram_max

        for w in range(len(text)): # take the geometric mean of all chars
            if w >= c:
                context = text[(w - c):w]
            else:
                context = start_pad(c - w) + text[:w]
            char = text[w]
            prob = self.prob(context, char)
            if not prob: # catch log(0) <- undefined
                return (float('inf'))

            perp -= math.log(prob) #
        
        perp = perp / (len(text))
        return math.e**(perp)

\begin{align*}
\log\left(Perplexity(W)\right) &= \log\left[\left(\prod_{i = 1}^{N} \frac{1}{P(w_i \mid w_1,\dots,w_{i-1})} \right)^{1/N}\right] \\
                               &= \frac{1}{N}\log\left[\prod_{i = 1}^{N} \frac{1}{P(w_i \mid w_1,\dots,w_{i-1})}\right] \\
                               &= \frac{1}{N} \sum_{i = 1}^{N} \log\left[\frac{1}{P(w_i \mid w_1,\dots,w_{i-1})}\right] \\
                               &= \frac{1}{N} \sum_{i = 1}^{N} \left[\log\left(1\right) - \log\left( P(w_i \mid w_1,\dots,w_{i-1} ) \right)\right] \\
                               &= \frac{1}{N} \sum_{i = 1}^{N} 0 - \log\left( P(w_i \mid w_1,\dots,w_{i-1} ) \right)\\
\log\left(Perplexity(W)\right) &= -\frac{1}{N} \sum_{i = 1}^{N} \log\left( P(w_i \mid w_1,\dots,w_{i-1} ) \right) 


\end{align*}

In [13]:
temp = NgramModel(1, 0)
temp.update('abab')
print(temp.get_vocab())
temp.update('abcd')
print(temp.get_vocab())

print(temp.prob('a','b'))
print(temp.prob('~','c'))
print(temp.prob('b','c'))

{'a', 'b'}
{'a', 'd', 'c', 'b'}
1.0
0.0
0.5


In [14]:
temp = NgramModel(0, 0)
temp.update('abab')
temp.update('abcd')
random.seed(1)
[temp.random_char('') for i in range(25)]

['a',
 'c',
 'c',
 'a',
 'b',
 'b',
 'b',
 'c',
 'a',
 'a',
 'c',
 'b',
 'c',
 'a',
 'b',
 'b',
 'a',
 'd',
 'd',
 'a',
 'a',
 'b',
 'd',
 'b',
 'a']

In [15]:
m = NgramModel(1, 0)
m.update('abab')
m.update('abcd')
random.seed(1)
m.random_text(25)

'abcdddcdabcdbabcdbcdabcda'

In [16]:
m = create_ngram_model(NgramModel, 'data/shakespeare_input.txt',2)
print(m.random_text(250))

Firt be don, bequardoseent. Lore himpon his him, Lornextee imparrociss,
And lin courepat inche
To to er somentrund arthenceight mestatch fich ourd: lothearguand lade this gracteare Dukeetche ill, way clord:
POLIA:
GRANTONSON:
It th bar le joingueelto


In [17]:
m = create_ngram_model(NgramModel, 'data/shakespeare_input.txt',3)
print(m.random_text(250))

First Cupill I
est-il?

Like Henription; the good from honough much bot: whath
where dearls to senance do,
Come stribestrust dust her ple goes itself me like no:
Sol
To wer
work, I'll comes a vilst Served:
O, be arthis.
So find with my fore nake; leg


In [18]:
m = create_ngram_model(NgramModel, 'data/shakespeare_input.txt',4)
print(m.random_text(250))

First Citizens:
Why, waited.

VALENTIO:
What you thine have no far and didst to siness Androus as conventions in me: a barbariation, sways
Drag pear it, and wealthou die, lord Pompey.

MISTRESS FORD:
Greath, I darest with his return, seed most the st


In [19]:
m = create_ngram_model(NgramModel, 'data/shakespeare_input.txt',7)
print(m.random_text(250))

First Citizen:
Ye're honest;
but yet drunk in hand: I am too choleric.

ANTIPHOLUS OF EPHESUS:
I needs as valiant youth will cut off, and be that, my lord,
As if this busy time.

CORIOLANUS:
Not I, sir; let me ha't: I have deposed.

WARWICK:
When we 


In [20]:
m = create_ngram_model(NgramModel, 'data/shakespeare_input.txt',12)
print(m.random_text(500))

First Citizen:
We'll burn his body in the holy place,
And you within the compass of a praemunire,
That therefore never flout at me for what you seek so?

LADY FAULCONBRIDGE:
King Richard thus removed,
Leaving no tract behind.

Painter:
How shall I understand me: over and beside
Signior Baptista Minola,
As if he were forgot;
And on their skins, as on the bark of trees,
Have with our niece a dowry large enough:
What, are you merry,
If worthier friends had not prevented many. Eros, ho!

CLEOPATRA:



In [21]:
temp = NgramModel(1, 0)
temp.update('abab')
temp.update('abcd')
print(temp.perplexity('abcd'))
print(temp.perplexity('abca'))
print(temp.perplexity('abcda'))

1.189207115002721
inf
1.515716566510398


In [22]:
m = NgramModel(1, 1)
m.update('abab')
m.update('abcd')
print(m.prob('a', 'a'))
print(m.prob('a', 'b'))
print(m.prob('c', 'd'))
print(m.prob('d', 'a'))

0.14285714285714285
0.5714285714285714
0.4
0.25


In [23]:
################################################################################
# Part 2: N-Gram Model with Interpolation
################################################################################

class NgramModelWithInterpolation(NgramModel):
    ''' An n-gram model with interpolation '''

    def __init__(self, c, k):
        self.ngram_max = c
        self.smooth_par = k
        self.weights = [1/(c+1)] * (c+1)
        self.vocab = set()
        self.wc = {}

    def set_lambdas(self, lambdas):
        if len(lambdas) != (self.ngram_max + 1):
            print("Error: len(lambdas) does not match max ngram")
        s = sum(lambdas)
        if s < 1:
            print("Error: Weights do not add up to 1.")
        elif s > 1:
           print("sum(lambdas) > 1, normalizing by sum(lambdas)...\n")
           t = [x / s for x in lambdas]
        else:
            t = lambdas
        self.weights = t
        print(f"lambdas = {t}")

    def get_vocab(self):
        return self.vocab

    def update(self, text):
        unique = set(text)
        self.vocab = self.vocab | unique # | = union
        wc = self.wc
        for c in range(self.ngram_max,-1,-1):
            for g in ngrams(c, text):
                if g[0] in wc:
                    if g[1] in wc[g[0]]:
                        wc[g[0]][g[1]] += 1
                    else:
                        wc[g[0]][g[1]] = 1
                else:
                    wc[g[0]] = {}
                    wc[g[0]][g[1]] = 1
            
        self.wc = wc

    def prob(self, context, char):
        p_interp = 0
        lambdas = self.weights

        for l in range(len(lambdas)):
            t_context = context[l:]
            prob = super().prob(t_context, char) # use parent prob 
            p_interp += lambdas[l] * prob

        return p_interp

In [24]:
m = NgramModelWithInterpolation(1, 0)
m.update('abab')

print(m.prob('a', 'a'))
print(m.prob('a', 'b'))

0.25
0.75


In [25]:
m.set_lambdas([3,2])

sum(lambdas) > 1, normalizing by sum(lambdas)...

lambdas = [0.6, 0.4]


In [26]:
print(m.prob('a', 'a'))
print(m.prob('a', 'b'))

0.2
0.8


In [27]:
m = NgramModelWithInterpolation(2, 1)
m.update('abab')
m.update('abcd')

print(m.prob('~a', 'b'))
print(m.prob('ba', 'b'))
print(m.prob('~c', 'd'))
print(m.prob('bc', 'd'))

0.4682539682539682
0.4349206349206349
0.27222222222222225
0.3222222222222222


In [28]:
m = create_ngram_model(NgramModelWithInterpolation, 'data/shakespeare_input.txt',c = 4, k = 1)
m.set_lambdas([10,5,2,1])

Error: len(lambdas) does not match max ngram
sum(lambdas) > 1, normalizing by sum(lambdas)...

lambdas = [0.5555555555555556, 0.2777777777777778, 0.1111111111111111, 0.05555555555555555]


In [29]:
print(m.random_text(500))

First Puck,--thick, he stand I loved,
Both his kings to-day a subjected emperiod! Welcome, cock- punk,
If the dangerous tomb.
He must beat all thee;
O, let myself,
And leave him: and what house of so for ther.

CAPULET:
Find that sense
questions, God and thee thy again! news;
Thy name,
And whore.

LUCIO:
Nay, he is not be sweet his by
This stranging contrivilely object?

Gaoler that tongue, so,
I cannot men purpose pint Albion,
'I mayst the lion's aughts,
Of mine an anothink, harve Gothsayest ho


## Analysis

In [30]:
def get_model_scores(model_class, c = 2, k = 0, lambdas = []):
    model = create_ngram_model(model_class, 'data/shakespeare_input.txt', c = c, k = k)
    scores = {
        'nytimes' : [],
        'shakespeare' : []
    }

    if len(lambdas) > 0:
        model.set_lambdas(lambdas)

    with open('data/nytimes_article.txt', encoding='utf-8', errors = 'ignore') as file:
        for f in file:
            p = model.perplexity(f)
            if f == '\n':
                continue
            scores['nytimes'].append(p)

    with open('data/shakespeare_sonnets.txt', encoding='utf-8', errors = 'ignore') as file:
        for f in file:
            p = model.perplexity(f)
            if f == '\n':
                continue
            scores['shakespeare'].append(p)
    
    return scores

---

pandas is imported for analytics, not for the model.

In [31]:
import pandas as pd

In [338]:
for x in [0,1,5,10,100]:
    s = get_model_scores(NgramModel, c = 2, k = x)
    for k,v in s.items():
        print(f"{k} = {v}")
        print(f"E[{k}] = {sum(v)/len(v)}")
    print('\n')

nytimes = [inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf]
E[nytimes] = inf
shakespeare = [inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, inf, in

In [46]:
for x in [[1,1,1],[10,5,1],[1,5,10],[10,1,10],[1,10,1]]:
    s = get_model_scores(NgramModelWithInterpolation, c = 2, k = 5, lambdas = x)
    for k,v in s.items():
        print(f"{k} = {v}")
        #print(f"E[{k}] = {sum(v)/len(v)}")

sum(lambdas) > 1, normalizing by sum(lambdas)...

lambdas = [0.3333333333333333, 0.3333333333333333, 0.3333333333333333]
nytimes = [12.22632821612132, 11.51858950516417, 11.145620785550184, 12.30420956705738, 13.97761896184441, 12.26194618415351, 15.845342359596945, 16.395883106671416, 17.56885473839405, 12.272558890610709, 12.576266660701862, 11.987098212688835, 15.479410433424334, 14.17370201020421, 11.146382861203483, 12.861130554194302, 11.309528779985744, 10.99955249210193, 12.422142813106271, 14.054209847660823]
shakespeare = [30.3869014008627, 11.098497164605499, 10.310397253536886, 9.546311573858159, 9.619895517016074, 9.054320332396463, 13.009932714608672, 10.773241796130563, 10.47988427830137, 9.427809436723436, 9.942517333764407, 9.671464826592324, 11.938508132692064, 11.412122562243972, 8.296014805674288, 43.85649505506701, 9.919093062964448, 11.757082167585095, 10.949747537772046, 10.355874411382912, 9.731031554171404, 9.32156231451413, 10.252320703960676, 10.4687667320302