From dc902ce64fdb8b100a83807d6185f24ba309f38d Mon Sep 17 00:00:00 2001
From: Erik Erlandson
Date: Mon, 26 Aug 2019 13:47:38 -0700
Subject: [PATCH] CompiledText model class

---
 markovify/__init__.py |   1 +
 markovify/compiled.py | 155 ++++++++++++++++++++++++++++++++++++++++++
 test/test_compiled.py | 124 +++++++++++++++++++++++++++++++++
 3 files changed, 280 insertions(+)
 create mode 100644 markovify/compiled.py
 create mode 100644 test/test_compiled.py

diff --git a/markovify/__init__.py b/markovify/__init__.py
index 7355628..a2f0a88 100644
--- a/markovify/__init__.py
+++ b/markovify/__init__.py
@@ -3,3 +3,4 @@
 from .text import Text, NewlineText
 from .splitters import split_into_sentences
 from .utils import combine
+from .compiled import CompiledText
diff --git a/markovify/compiled.py b/markovify/compiled.py
new file mode 100644
index 0000000..bb25baf
--- /dev/null
+++ b/markovify/compiled.py
@@ -0,0 +1,155 @@
+import random
+import bisect
+import re
+from .chain import BEGIN, END, accumulate
+from .text import Text, ParamError, DEFAULT_TRIES
+
+word_split_pattern = re.compile(r"\s+")
+
+
+def word_split(sentence):
+    """
+    Splits `sentence` into a list of words on runs of whitespace.
+    """
+    return word_split_pattern.split(sentence)
+
+
+class CompiledText(object):
+    """
+    A "compiled" version of a Text model, where state transition functions have been
+    pre-computed into a faster and more compact form.
+
+    Every state of the chain is mapped to a pair `(words, cff)`: the words that
+    may follow that state, and the accumulated totals of their model entries,
+    so that the next word can be drawn with one binary search.
+    """
+
+    def __init__(self, model):
+        """
+        model: a Text model to compile
+
+        Raises ParamError if `model` is not a Text instance.
+        """
+        if not isinstance(model, Text):
+            raise ParamError("unrecognized markovify model type %s" % (type(model)))
+
+        def compile_next(next_dict):
+            # words[i] is drawn when the random point lands below cff[i]
+            # but not below cff[i - 1]
+            words = list(next_dict.keys())
+            cff = list(accumulate(next_dict.values()))
+            return (words, cff)
+
+        # the compiled state transition function:
+        self.sxf = {state: compile_next(next_dict)
+                    for (state, next_dict) in model.chain.model.items()}
+        self.state_size = model.state_size
+
+    def _gen_(self, init_state, max_words):
+        """
+        Generates a single sequence of words by walking the compiled chain.
+
+        Starts from `init_state`, or from `self.state_size` BEGIN markers when
+        `init_state` is not given. Leading BEGIN markers are not part of the
+        result. Stops when END is drawn, or as soon as the sequence holds more
+        than `max_words` words (if `max_words` is not None).
+
+        Raises KeyError if a state is not present in the model.
+        """
+        state = tuple(init_state) if init_state else (BEGIN,) * self.state_size
+        # the words of the initial state (minus BEGIN markers) open the sentence
+        seq = list(state)
+        while seq and seq[0] == BEGIN:
+            del seq[0]
+        while True:
+            words, cff = self.sxf[state]
+            # weighted draw: a random point below the total lands in the
+            # interval of exactly one word
+            r = random.random() * cff[-1]
+            word = words[bisect.bisect(cff, r)]
+            if word == END:
+                break
+            seq.append(word)
+            if max_words is not None and len(seq) > max_words:
+                break
+            state = state[1:] + (word,)
+        return seq
+
+    def make_sentence(self, init_state=None, tries=DEFAULT_TRIES, max_words=None):
+        """
+        Attempts `tries` (default: 10) times to generate a valid sentence,
+        based on the model.
+
+        If successful, returns the sentence as a string. If not, returns None.
+
+        If `init_state` (a tuple of `self.state_size` words) is not specified,
+        this method chooses a sentence-start at random, in accordance with
+        the model.
+
+        If `max_words` is specified, it will attempt generation until a sentence
+        of at most max_words long is created (or number of tries expires)
+        """
+        for _ in range(tries):
+            # _gen_ stops as soon as the limit is exceeded, so an over-long
+            # sequence means this attempt has failed
+            seq = self._gen_(init_state, max_words)
+            if max_words is not None and len(seq) > max_words:
+                continue
+            return " ".join(seq)
+        return None
+
+    def make_short_sentence(self, max_chars, min_chars=0, **kwargs):
+        """
+        Tries making a sentence of no more than `max_chars` characters and optionally
+        no less than `min_chars` characters, passing **kwargs to `self.make_sentence`.
+
+        Returns None if no attempt yields a sentence of suitable length.
+        """
+        tries = kwargs.get('tries', DEFAULT_TRIES)
+
+        for _ in range(tries):
+            sentence = self.make_sentence(**kwargs)
+            if sentence and min_chars <= len(sentence) <= max_chars:
+                return sentence
+        return None
+
+    def make_sentence_with_start(self, beginning, strict=True, **kwargs):
+        """
+        Tries making a sentence that begins with `beginning` string,
+        which should be a string of one to `self.state_size` words known
+        to exist in the corpus.
+
+        If strict == True, then markovify will draw its initial inspiration
+        only from sentences that start with the specified word/phrase.
+
+        If strict == False, then markovify will draw its initial inspiration
+        from any sentence containing the specified word/phrase.
+
+        **kwargs are passed to `self.make_sentence`
+
+        Raises ParamError if `beginning` does not hold 1 to `self.state_size`
+        words.
+        """
+        # leading or trailing whitespace would otherwise yield empty "words"
+        split = tuple(word for word in word_split(beginning) if word)
+        word_count = len(split)
+
+        if word_count == self.state_size:
+            init_states = [split]
+        elif 0 < word_count < self.state_size:
+            if strict:
+                init_states = [(BEGIN,) * (self.state_size - word_count) + split]
+            else:
+                # compare against the words of each state, skipping BEGIN markers
+                init_states = [key for key in self.sxf
+                               if tuple(filter(lambda x: x != BEGIN, key))[:word_count] == split]
+                random.shuffle(init_states)
+        else:
+            err_msg = "`make_sentence_with_start` for this model requires a string containing 1 to {0} words. Yours has {1}: {2}".format(self.state_size, word_count, str(split))
+            raise ParamError(err_msg)
+
+        for init_state in init_states:
+            output = self.make_sentence(init_state, **kwargs)
+            if output is not None:
+                return output
+        return None
diff --git a/test/test_compiled.py b/test/test_compiled.py
new file mode 100644
index 0000000..ca82ef7
--- /dev/null
+++ b/test/test_compiled.py
@@ -0,0 +1,124 @@
+import unittest
+import markovify
+import sys, os
+import operator
+
+with open(os.path.join(os.path.dirname(__file__), "texts/sherlock.txt")) as f:
+    sherlock = f.read()
+    sherlock_model = markovify.Text(sherlock)
+    sherlock_compiled = markovify.CompiledText(sherlock_model)
+
+class MarkovifyCompiledTest(unittest.TestCase):
+
+    def test_rejects_non_text_model(self):
+        with self.assertRaises(markovify.text.ParamError):
+            markovify.CompiledText(sherlock_model.chain)
+
+    def test_sherlock(self):
+        text_model = sherlock_compiled
+        sent = text_model.make_sentence()
+        assert len(sent) != 0
+
+    def test_make_sentence_with_start(self):
+        text_model = sherlock_compiled
+        start_str = "Sherlock Holmes"
+        sent = text_model.make_sentence_with_start(start_str)
+        assert sent is not None
+        assert start_str == sent[:len(start_str)]
+
+    def test_make_sentence_with_start_one_word(self):
+        text_model = sherlock_compiled
+        start_str = "Sherlock"
+        sent = text_model.make_sentence_with_start(start_str)
+        assert sent is not None
+        assert start_str == sent[:len(start_str)]
+
+    def test_make_sentence_with_start_surrounding_whitespace(self):
+        text_model = sherlock_compiled
+        start_str = "Sherlock"
+        sent = text_model.make_sentence_with_start("  " + start_str + " ")
+        assert sent is not None
+        assert start_str == sent[:len(start_str)]
+
+    def test_make_sentence_with_start_no_words(self):
+        text_model = sherlock_compiled
+        with self.assertRaises(markovify.text.ParamError):
+            text_model.make_sentence_with_start("   ")
+
+    def test_make_sentence_with_start_one_word_that_doesnt_begin_a_sentence(self):
+        text_model = sherlock_compiled
+        start_str = "dog"
+        with self.assertRaises(KeyError):
+            text_model.make_sentence_with_start(start_str)
+
+    def test_make_sentence_with_word_not_at_start_of_sentence(self):
+        text_model = sherlock_compiled
+        start_str = "dog"
+        sent = text_model.make_sentence_with_start(start_str, strict=False)
+        assert sent is not None
+        assert start_str == sent[:len(start_str)]
+
+    def test_make_sentence_with_words_not_at_start_of_sentence(self):
+        text_model = markovify.Text(sherlock, state_size=3)
+        text_model = markovify.CompiledText(text_model)
+        # " I was " has 128 matches in sherlock.txt
+        # " was I " has 2 matches in sherlock.txt
+        start_str = "was I"
+        sent = text_model.make_sentence_with_start(start_str, strict=False, tries=50)
+        assert sent is not None
+        assert start_str == sent[:len(start_str)]
+
+    def test_make_sentence_with_words_not_at_start_of_sentence_miss(self):
+        text_model = markovify.Text(sherlock, state_size=3)
+        text_model = markovify.CompiledText(text_model)
+        start_str = "was werewolf"
+        sent = text_model.make_sentence_with_start(start_str, strict=False, tries=50)
+        assert sent is None
+
+    def test_make_sentence_with_words_not_at_start_of_sentence_of_state_size(self):
+        text_model = markovify.Text(sherlock, state_size=2)
+        text_model = markovify.CompiledText(text_model)
+        start_str = "was I"
+        sent = text_model.make_sentence_with_start(start_str, strict=False, tries=50)
+        assert sent is not None
+        assert start_str == sent[:len(start_str)]
+
+    def test_make_sentence_with_words_to_many(self):
+        text_model = sherlock_compiled
+        start_str = "dog is good"
+        with self.assertRaises(markovify.text.ParamError):
+            text_model.make_sentence_with_start(start_str, strict=False)
+
+    def test_make_sentence_with_start_three_words(self):
+        start_str = "Sherlock Holmes was"
+        text_model = sherlock_compiled
+        with self.assertRaises(markovify.text.ParamError):
+            text_model.make_sentence_with_start(start_str)
+        text_model = markovify.Text(sherlock, state_size=3)
+        text_model = markovify.CompiledText(text_model)
+        text_model.make_sentence_with_start(start_str)
+        sent = text_model.make_sentence_with_start("Sherlock")
+        assert markovify.chain.BEGIN not in sent
+
+    def test_short_sentence(self):
+        text_model = sherlock_compiled
+        sent = None
+        while sent is None:
+            sent = text_model.make_short_sentence(45)
+        assert len(sent) <= 45
+
+    def test_short_sentence_min_chars(self):
+        text_model = sherlock_compiled
+        sent = None
+        while sent is None:
+            sent = text_model.make_short_sentence(100, min_chars=50)
+        assert len(sent) <= 100
+        assert len(sent) >= 50
+
+    def test_max_words(self):
+        text_model = sherlock_compiled
+        sent = text_model.make_sentence(max_words=0)
+        assert sent is None
+
+if __name__ == '__main__':
+    unittest.main()