# Babble Notebook

This notebook provides an easy interface to create babbling text with Markov chains.  A corpus of text is broken down into a set of seeds and outputs.  Each seed is a series of n words, and each output is a word that comes after the seed in the original corpus.  A seed is given to the algorithm, which selects one of that seed's outputs; the first word of the seed is then removed and the output is appended to the end of the seed, allowing the process to be repeated ad infinitum while maintaining the style of the original corpus.

Start the notebook with Cell->Run All, and then enter a comma-separated list of all data sources.  These data sources can either be text files or directories - all contents of directories will be recursively included.  Next, a seed size must be selected (the value of n).  The seed size balances a trade-off between generating text which is comprehensible and text which diverges from the source.  A seed size that is too large will result in the notebook simply regurgitating the source text.

Since the notebook was originally written as part of a poetry/song writing project, it also includes a reverse-babbler that seeks to create phrases of a given length that end in a rhyming word.  The seed and phrase generation values are mutually exclusive between the regular babbler and reverse babbler, with the active mode determined by the "reverse babble" checkbox.

Newly generated text will appear in the right-hand box, and any text which you would like to preserve can be transferred over to the box on the left before the next round of text generation.

In [25]:
#Initially adapted from http://sookocheff.com/post/nlp/ngram-modeling-with-markov-chains/

import random
import pronouncing

class MarkovChain:
    """Word-level Markov chain that can babble forwards or backwards.

    ``memory`` maps a seed (``seedCount`` words joined by single spaces) to
    the list of words that followed it in the corpus.  ``reverseMemory`` maps
    the same kind of seed to the list of words that preceded it.  Duplicates
    are kept in the lists so that ``random.choice`` picks words in proportion
    to how often they occurred.
    """

    def __init__(self):
        self.memory = {}
        self.reverseMemory = {}
        self.seedCount = 1

    def setSeedCount(self, count):
        """Set the number of words per seed used when generating text."""
        self.seedCount = count

    #####
    ##Learning functions
    #####

    def _learn_key(self, key, value):
        """Record that ``value`` followed the seed ``key``."""
        self.memory.setdefault(key, []).append(value)

    def _learn_reverse(self, key, value):
        """Record that ``value`` preceded the seed ``key``."""
        self.reverseMemory.setdefault(key, []).append(value)

    def learn(self, text, seedCount):
        """Tokenise ``text`` and learn both the forward and reverse chains.

        The text is lower-cased and split on any whitespace.  ``seedCount``
        is the number of words per seed used for the learned keys; it is
        independent of the value stored by ``setSeedCount``.
        """
        tokens = text.lower().split()
        self.learnForward(tokens, seedCount)
        self.learnReverse(tokens, seedCount)

    def learnForward(self, tokens, seedCount):
        """Learn every (seed -> following word) pair in ``tokens``."""
        # The last valid seed starts at len(tokens) - seedCount - 1, so the
        # range must run up to (but exclude) len(tokens) - seedCount.
        for i in range(len(tokens) - seedCount):
            seed = " ".join(tokens[i:i + seedCount])
            self._learn_key(seed, tokens[i + seedCount])

    def learnReverse(self, tokens, seedCount):
        """Learn every (seed -> preceding word) pair in ``tokens``."""
        # Start at 1 so a preceding word exists; include the final seed,
        # which starts at len(tokens) - seedCount.
        for i in range(1, len(tokens) - seedCount + 1):
            seed = " ".join(tokens[i:i + seedCount])
            self._learn_reverse(seed, tokens[i - 1])

    def ensureSeedLength(self, current_state, babbleMap, matchLastWord=False):
        """Return a state made of exactly ``self.seedCount`` words if possible.

        * Too many words: keep only the last ``self.seedCount`` words.
        * Too few words: pick a random key of ``babbleMap`` that contains the
          given words as a whole-word run (and, when ``matchLastWord`` is
          true, whose last word equals the state's last word).  If no key
          qualifies, a random key is returned.
        * ``babbleMap`` empty: the (whitespace-normalised) state is returned,
          since there is nothing to choose from.
        """
        words = current_state.split()

        if len(words) > self.seedCount:
            # Trim by words, not characters.
            return " ".join(words[-self.seedCount:])

        state = " ".join(words)
        if len(words) == self.seedCount or not babbleMap:
            return state

        # list() so shuffle/choice work on Python 3 dict views as well.
        babbleKeys = list(babbleMap)
        random.shuffle(babbleKeys)

        if words:
            # Pad with spaces so only whole-word matches count.
            needle = " " + state + " "
            for key in babbleKeys:
                if needle not in " " + key + " ":
                    continue
                if matchLastWord and key.split(" ")[-1] != words[-1]:
                    continue
                return key

        return random.choice(babbleKeys)

    #####
    ## selecting next words for babble
    #####

    def _next(self, current_state):
        """Return a random word that followed ``current_state``.

        Falls back to the outputs of a random seed when the state is unknown.
        """
        next_possible = self.memory.get(current_state)

        #select a random output from a key
        if not next_possible:
            next_possible = self.memory[random.choice(list(self.memory))]

        return random.choice(next_possible)

    def _workBackwards(self, rhymingWord, syllables):
        """Build a phrase ending in ``rhymingWord`` by walking the reverse chain.

        Despite its name, ``syllables`` counts words: one is reserved for the
        starting state and one word is prepended per remaining count.
        """
        syllables -= 1
        current_state = self.ensureSeedLength(rhymingWord, self.reverseMemory, True)
        fullPhrase = current_state

        while syllables > 0 and self.reverseMemory:
            next_possible = self.reverseMemory.get(current_state)

            if not next_possible:
                # Unknown state: continue from a random reverse seed.
                fallbackKey = random.choice(list(self.reverseMemory))
                next_possible = self.reverseMemory[fallbackKey]

            next_word = random.choice(next_possible)

            # Slide the window one word to the left: the new word goes in
            # front and the last word drops off (also correct for one-word
            # states, where nothing but the new word remains).
            stateWords = current_state.split(" ")
            current_state = " ".join([next_word] + stateWords[:-1])

            #apply
            fullPhrase = next_word + " " + fullPhrase
            syllables -= 1

        return fullPhrase

    #####
    ## Output functions
    ####

    def babble(self, amount, current_state):
        """Return ``current_state`` followed by ``amount`` generated words.

        If nothing has been learned yet the seed is returned unchanged.
        """
        fullPhrase = current_state
        if not self.memory:
            return fullPhrase

        #Babble away
        while amount > 0:
            current_state = self.ensureSeedLength(current_state, self.memory)
            nextWord = self._next(current_state)
            fullPhrase += " " + nextWord

            # Slide the window one word to the right.
            stateWords = current_state.split() + [nextWord]
            current_state = " ".join(stateWords[-self.seedCount:])
            amount -= 1

        return fullPhrase

    def reverseBabble(self, rhymingWord, syllableCount, numberOfPhrases):
        """Return ``numberOfPhrases`` phrases that end in a rhyme of ``rhymingWord``.

        For each phrase a random rhyme (from ``pronouncing.rhymes``) that
        occurs as the last word of a learned reverse seed is chosen; if no
        rhyme is known to the chain, ``rhymingWord`` itself is used.  Each
        phrase is emitted as ``"Phrase:\\n<phrase>\\n\\n"``.
        """
        if numberOfPhrases <= 0:
            return ""

        allRhymes = list(pronouncing.rhymes(rhymingWord))
        # Last word of every reverse seed; for one-word seeds this is the
        # seed itself, so a single lookup set covers every seed size.
        endingWords = {key.split(" ")[-1] for key in self.reverseMemory}

        phrases = []
        for _ in range(numberOfPhrases):
            random.shuffle(allRhymes)
            thisRhyme = rhymingWord
            for rhyme in allRhymes:
                if rhyme in endingWords:
                    thisRhyme = rhyme
                    break

            nextPhrase = self._workBackwards(thisRhyme, syllableCount)
            phrases.append("Phrase:\n" + nextPhrase + "\n\n")

        return "".join(phrases)
        
    

In [26]:
from ipywidgets import *
from IPython.display import display
import string
import os

######
## Read in initial text code
######

# Shared chain instance and accumulated corpus text; both are updated by
# submitFiles() and the chain is read by generateNewText().
mc = MarkovChain()
corpus = ""

# Comma-separated list of files and/or directories to load.
fileList = Text(
    description='Files to read in:',
    width = '100%')


# Number of words per seed (n); submitFiles() treats values <= 0 as 1.
wordsInSeed = IntText(
    description="Seed Size:",
    value=1)

# Clicking this triggers submitFiles().
readInFiles = Button(description="Read in files")

def readFromFS(fileName, isDir = False):
    """Return the text of a file, or of every file beneath a directory.

    fileName -- path to a file, or to a directory when ``isDir`` is true.
    isDir    -- when true, ``fileName`` is walked recursively and the
                contents of all files found are concatenated.

    Directory entries are visited in sorted order so the resulting text is
    the same on every run.  Errors from ``open``/``os.listdir`` propagate.
    """
    if not isDir:
        # Context manager so the handle is closed even if read() fails.
        with open(fileName, "r") as f:
            return f.read()

    parts = []
    for entry in sorted(os.listdir(fileName)):
        fullPath = os.path.join(fileName, entry)
        parts.append(readFromFS(fullPath, os.path.isdir(fullPath)))
    return "".join(parts)


def submitFiles(b):
    """Button callback: load the listed sources and train the chain.

    Reads the seed size and the comma-separated source list from the
    widgets, appends the contents of every source to the global ``corpus``,
    removes characters outside printable ASCII (newlines are kept), trains
    ``mc`` on the result, and then swaps the input widgets for the
    text-generation UI.  Does nothing beyond setting the seed size when no
    sources were entered.

    b -- the clicked button (unused).
    """
    global corpus, mc

    seedCount = wordsInSeed.value
    if seedCount <= 0:
        seedCount = 1
    mc.setSeedCount(seedCount)

    # Strip first, then filter, so entries consisting only of whitespace
    # (e.g. the trailing one in "a.txt, ") are dropped instead of being
    # passed on as an empty path.
    sources = [entry.strip() for entry in fileList.value.split(",")]
    sources = [entry for entry in sources if entry]
    if not sources:
        return

    for source in sources:
        corpus += readFromFS(source, os.path.isdir(source))

    #strip non ascii
    corpus = ''.join(c for c in corpus if 32 <= ord(c) <= 126 or c == "\n")

    mc.learn(corpus, seedCount)

    fileList.close()
    wordsInSeed.close()
    readInFiles.close()
    displayTextGen()
    
# Wire the load button to its handler, then show the file-loading controls.
readInFiles.on_click(submitFiles)
    
display(fileList)
display(wordsInSeed)
display(readInFiles)

######
## Generate new Text Code
######

# Heading shown above the reverse-babble controls.
reverseLabel = HTML(
value="<b>Reverse properties</b>")

# Starting seed passed to mc.babble() in forward mode.
keyWord = Text(
description="Babble Seed")

# Word passed to mc.reverseBabble() as the word to rhyme with.
rhyme = Text(
description="Rhyme seed")


# Number of words to generate in forward mode.
outputCount = IntText(
description="Generate Words",
value=100)

# When ticked, generateNewText() uses reverseBabble instead of babble.
reverse = widgets.Checkbox(
    description='Reverse Babble',
    value=False,
)

# Passed to reverseBabble as syllableCount; its label also covers the
# unlabelled reversePhrases box displayed next to it.
reverseSyllables = IntText(
description="Num Syllables/Phrases",
value=5)

# Passed to reverseBabble as numberOfPhrases.
reversePhrases = IntText(
description="",
value=5)

# Scratch area for text the user wants to keep; never written by the code.
preservedText = Textarea(
    description='Preserved Text:',
    width = '100%',
    height = 250
)


# Receives the output of each generation run (overwritten every time).
testText = Textarea(
    description='New Generation:',
    width = '100%',
    height = 250
)

def generateNewText(b):
    """Button callback: generate text and show it in the "New Generation" box.

    Uses the reverse (rhyming) babbler when the "Reverse Babble" checkbox is
    ticked and the forward babbler otherwise.

    b -- the clicked button (unused).
    """
    global mc

    if not reverse.value:
        generated = mc.babble(outputCount.value, keyWord.value)
    else:
        generated = mc.reverseBabble(
            rhyme.value,
            reverseSyllables.value,
            reversePhrases.value,
        )

    testText.value = generated

# Button that triggers a new round of text generation.
button = Button(description="Generate New Text",
                width="30%")

# Each click runs generateNewText().
button.on_click(generateNewText)

def displayTextGen():
    """Show the text-generation UI, top to bottom.

    Order: the two text areas side by side, the forward-babble controls,
    the reverse-babble heading and controls, and finally the generate button.
    """
    layout = (
        HBox((preservedText, testText)),
        HBox((keyWord, outputCount)),
        reverseLabel,
        reverse,
        rhyme,
        HBox((reverseSyllables, reversePhrases)),
        button,
    )
    for element in layout:
        display(element)
    