# N-Grams and Markov Chains for a Girl

First, read our input text from a file and clean it into a list of words without special characters:

In [90]:
from collections import Counter
from pprint import pprint

def clean(input_text):
    result = input_text

    special_chars = [".", "\n", ";", "?", "!", ":", ",", "(", ")", "[", "]", "\"", "“", "”", "*"]

    for char in special_chars:
        result = result.replace(char, " " if char == "\n" else "")

    return result.lower()


# Clean up the input text
def split_and_dropnulls(input_text):
    words = input_text.split(" ")

    non_empty_words = [word for word in words if word != '']
    return non_empty_words

with open('alice.txt', 'r') as alice_file: 
    alice_text = ' '.join(alice_file.readlines())

alice_words = split_and_dropnulls(clean(alice_text))
pprint(alice_words)

['start',
 'of',
 'the',
 'project',
 'gutenberg',
 'ebook',
 "alice's",
 'adventures',
 'in',
 'wonderland',
 'cover',
 'alice’s',
 'adventures',
 'in',
 'wonderland',
 'by',
 'lewis',
 'carroll',
 'the',
 'millennium',
 'fulcrum',
 'edition',
 '30',
 'contents',
 'chapter',
 'i\tdown',
 'the',
 'rabbit-hole',
 'chapter',
 'ii\tthe',
 'pool',
 'of',
 'tears',
 'chapter',
 'iii\ta',
 'caucus-race',
 'and',
 'a',
 'long',
 'tale',
 'chapter',
 'iv\tthe',
 'rabbit',
 'sends',
 'in',
 'a',
 'little',
 'bill',
 'chapter',
 'v\tadvice',
 'from',
 'a',
 'caterpillar',
 'chapter',
 'vi\tpig',
 'and',
 'pepper',
 'chapter',
 'vii\ta',
 'mad',
 'tea-party',
 'chapter',
 'viii\tthe',
 'queen’s',
 'croquet-ground',
 'chapter',
 'ix\tthe',
 'mock',
 'turtle’s',
 'story',
 'chapter',
 'x\tthe',
 'lobster',
 'quadrille',
 'chapter',
 'xi\twho',
 'stole',
 'the',
 'tarts',
 'chapter',
 'xii\talice’s',
 'evidence',
 'chapter',
 'i',
 'down',
 'the',
 'rabbit-hole',
 'alice',
 'was',
 'beginning',
 'to

Next, let's split the text into pairs of two (later, N) words at a time:

In [91]:
alice_pairs = [(alice_words[i], alice_words[i+1]) for i in range(len(alice_words)-1)]
pprint(alice_pairs)

[('start', 'of'),
 ('of', 'the'),
 ('the', 'project'),
 ('project', 'gutenberg'),
 ('gutenberg', 'ebook'),
 ('ebook', "alice's"),
 ("alice's", 'adventures'),
 ('adventures', 'in'),
 ('in', 'wonderland'),
 ('wonderland', 'cover'),
 ('cover', 'alice’s'),
 ('alice’s', 'adventures'),
 ('adventures', 'in'),
 ('in', 'wonderland'),
 ('wonderland', 'by'),
 ('by', 'lewis'),
 ('lewis', 'carroll'),
 ('carroll', 'the'),
 ('the', 'millennium'),
 ('millennium', 'fulcrum'),
 ('fulcrum', 'edition'),
 ('edition', '30'),
 ('30', 'contents'),
 ('contents', 'chapter'),
 ('chapter', 'i\tdown'),
 ('i\tdown', 'the'),
 ('the', 'rabbit-hole'),
 ('rabbit-hole', 'chapter'),
 ('chapter', 'ii\tthe'),
 ('ii\tthe', 'pool'),
 ('pool', 'of'),
 ('of', 'tears'),
 ('tears', 'chapter'),
 ('chapter', 'iii\ta'),
 ('iii\ta', 'caucus-race'),
 ('caucus-race', 'and'),
 ('and', 'a'),
 ('a', 'long'),
 ('long', 'tale'),
 ('tale', 'chapter'),
 ('chapter', 'iv\tthe'),
 ('iv\tthe', 'rabbit'),
 ('rabbit', 'sends'),
 ('sends', 'in'),
 

Now, we can find the frequency of each pair within the set of pairs:

In [92]:
pair_counts = Counter(alice_pairs)

frequencies = pair_counts.most_common(10)
pprint(frequencies)

[(('said', 'the'), 209),
 (('of', 'the'), 132),
 (('said', 'alice'), 115),
 (('in', 'a'), 98),
 (('and', 'the'), 80),
 (('in', 'the'), 79),
 (('it', 'was'), 73),
 (('to', 'the'), 69),
 (('the', 'queen'), 65),
 (('as', 'she'), 61)]


In [93]:
import sys 

def markov_model(sequence: list, n: int = 2):
    """
    Create a Markov model (represented as a dict) from the given input sequence, 
    using N-grams of size {n}
    """
    model = {}
    sequence = list(sequence[:]) + [None]
    for starting_position in range(len(sequence) - n):
        current_ngram = tuple(sequence[starting_position:starting_position + n])
        next_item = sequence[starting_position + n]
        
        if current_ngram not in model: 
            model[current_ngram] = [next_item]
        else:
            model[current_ngram].append(next_item)

    return model

alice_model = markov_model(alice_text, 5)
print(f'Finished training! The final model has size: {sys.getsizeof(alice_model)} bytes')
# pprint(model)

Finished training! The final model has size: 1310808 bytes


Now that we have a Markov model of our text, we can use it to generate more text that "looks like" the source material:

In [94]:
import random

def generate(n, model, start=None, max_length=100):
    if start is None:
        start = random.choice(list(model.keys()))
    
    output = list(start)

    for i in range(max_length):
        start = tuple(output[-n:])
        next_item = random.choice(model[start])

        if next_item is None:
            break
        else:
            output.append(next_item)

    return output


alice_result = generate(5, alice_model, max_length=2000)
for char in alice_result:
    print(char, end="")


shop of the doors all he set to get dry leaves, and perhaps I shouted three of the White Rabbit. Her life to seemed tone. And the end of it, trotting kind out, “First, that he waiting, and the mushroom again, but heavy sobbing as the Duchess beginning invited,” said the Mock Turtle, there was very glad I’ve tried that case it all say when I get the Gryphon, “you forgotten to her all: in fact she kept on their she work shaking from this voice—the best,
      Beau—ootiful, it down, and something was sentence in a growing over handed besides of executed, yawned in a game indignantly. “Let the door began in which puzzled him.”
 
 “It’s himself in a consider you’ll get here?” said Alice, if I mustard, pine-apples, it was over was dreamingo was so eagerly, found out, alas! either listenced with his was going the pope, was the Dormouse slowly back, she ran to called after her hear that last resource, she went on, “if I know.”
 
 “Who is Dinah, if—if I’d take them what you holding out when she

In [95]:
with open('bible.txt', 'r') as bible_file: 
    bible_text = ' '.join(bible_file.readlines())

# Train a model on it 
bible_model = markov_model(bible_text, n=5)

print(f'Finished training! The final model has size: {sys.getsizeof(bible_model)} bytes')

Finished training! The final model has size: 5242968 bytes


In [96]:
# Generate some more "bible" with it
bible_result = generate(n=5, model=bible_model, max_length=1000)

for char in bible_result:
    print(char, end='')

bai out overcome into all
 given as David every one great
 thou perish? is hid trembled twenty
 angels, for a townclerk had said unto then shall there a greated not hear.
 
 8:33 But his words his for you for two, who goeth eggs, and shalt not hear
 unto their own land of the mountain, and it came of than have not calleth in salvation, boldly unto Aaron’s pertains, and put the peace.
 
 41:8 Be mercy, with all they
 shall round confident to you, and
 Zephaniah, I will bread, and thy mother
 Arnon.
 
 9:31 Rejoice
 to see now a man answer, and
 lifted up
 unto
 three transgressed overtook hot again the blessed it into the son of Asher; and to Achish.
 
 16:15 Then said took upon an her give there is Hymenaeus abundant in the rods.
 
 13:1 Lord GOD, for he that every man spirits; and grey heareth God ways, the sons
 of our eyes:
 behold, even raged up their journ it into an in purposes, behold, how good day, 48:22 To devils, could not my word is uppermost
 iniquity thou hast look under t

Now, let's do it to the Grateful Dead. (Do you know the grateful dead?)

In [97]:
with open('gdead.txt', 'r') as dead_file: 
    dead_text = ' '.join(dead_file.readlines())

# Train a model on it 
dead_model = markov_model(dead_text, n=5)

print(f'Finished training! The final model has size: {sys.getsizeof(dead_model)} bytes')

Finished training! The final model has size: 589920 bytes


In [98]:
# Generate some more "dead songs" with it
dead_result = generate(n=5, model=dead_model, max_length=1000)

for char in dead_result:
    print(char, end='')

to be safe and now I'll bring harden wings and I maybe they were on there any choice,
 But if you anywhere anymore.
 
 When there anything you wanna bring its light on the days when you come.
 Gonna bring her in my bendin' about loud!
 
 Now he's gone, he's more,
 I turned around world.
 
