Skip to content

Commit

Permalink
Merge pull request #60 from jsvine/pr-55
Browse files Browse the repository at this point in the history
Incorporate PR #55 and related testing
  • Loading branch information
jsvine committed Mar 24, 2017
2 parents d3db848 + c1e27cc commit c11601b
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 30 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ Notes:

- By default, the `make_sentence` method tries, a maximum of 10 times per invocation, to make a sentence that doesn't overlap too much with the original text. If it is successful, the method returns the sentence as a string. If not, it returns `None`. To increase or decrease the number of attempts, use the `tries` keyword argument, e.g., call `.make_sentence(tries=100)`.

- By default, `markovify.Text` tries to generate sentences that don't simply regurgitate chunks of the original text. The default rule is to suppress any generated sentences that exactly overlaps the original text by 15 words or 70% of the sentence's word count. You can change this rule by passing `max_overlap_ratio` and/or `max_overlap_total` to the `make_sentence` method.
- By default, `markovify.Text` tries to generate sentences that don't simply regurgitate chunks of the original text. The default rule is to suppress any generated sentence that exactly overlaps the original text by 15 words or 70% of the sentence's word count. You can change this rule by passing `max_overlap_ratio` and/or `max_overlap_total` to the `make_sentence` method. Alternatively, you can disable this check entirely by passing `test_output=False` to `make_sentence`.

## Advanced Usage

Expand Down Expand Up @@ -197,5 +197,6 @@ Many thanks to the following GitHub users for contributing code and/or ideas:
- [@wodim](https://github.com/wodim)
- [@eh11fx](https://github.com/eh11fx)
- [@ammgws](https://github.com/ammgws)
- [@OtakuMegane](https://github.com/OtakuMegane)

Developed at [BuzzFeed](https://www.buzzfeed.com).
19 changes: 16 additions & 3 deletions markovify/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@ def test_sentence_input(self, sentence):
"""
reject_pat = re.compile(r"(^')|('$)|\s'|'\s|[\"(\(\)\[\])]")
# Decode unicode, mainly to normalize fancy quotation marks
if sentence.__class__.__name__ == "str":
if sentence.__class__.__name__ == "str": # pragma: no cover
decoded = sentence
else:
else: # pragma: no cover
decoded = unidecode(sentence)
# Sentence shouldn't contain problematic characters
if re.search(reject_pat, decoded): return False
Expand Down Expand Up @@ -143,10 +143,18 @@ def make_sentence(self, init_state=None, **kwargs):
If `init_state` (a tuple of `self.chain.state_size` words) is not specified,
this method chooses a sentence-start at random, in accordance with
the model.
If `test_output` is set as False then the `test_sentence_output` check
will be skipped.
If `max_words` is specified, the word count for the sentence will be
evaluated against the provided limit.
"""
tries = kwargs.get('tries', DEFAULT_TRIES)
mor = kwargs.get('max_overlap_ratio', DEFAULT_MAX_OVERLAP_RATIO)
mot = kwargs.get('max_overlap_total', DEFAULT_MAX_OVERLAP_TOTAL)
test_output = kwargs.get('test_output', True)
max_words = kwargs.get('max_words', None)

for _ in range(tries):
if init_state != None:
Expand All @@ -157,7 +165,12 @@ def make_sentence(self, init_state=None, **kwargs):
else:
prefix = []
words = prefix + self.chain.walk(init_state)
if self.test_sentence_output(words, mor, mot):
if max_words != None and len(words) > max_words:
continue
if test_output:
if self.test_sentence_output(words, mor, mot):
return self.word_join(words)
else:
return self.word_join(words)
return None

Expand Down
34 changes: 22 additions & 12 deletions test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,31 @@
def get_sorted(chain_json):
return sorted(chain_json, key=operator.itemgetter(0))

class MarkovifyTest(unittest.TestCase):
with open(os.path.join(os.path.dirname(__file__), "texts/sherlock.txt")) as f:
sherlock = f.read()
sherlock_model = markovify.Text(sherlock)

def setUp(self):
with open(os.path.join(os.path.dirname(__file__), "texts/sherlock.txt")) as f:
self.sherlock = f.read()
class MarkovifyTest(unittest.TestCase):

def test_text_too_small(self):
text = u"Example phrase. This is another example sentence."
text_model = markovify.Text(text)
assert(text_model.make_sentence() == None)

def test_sherlock(self):
text_model = markovify.Text(self.sherlock)
text_model = sherlock_model
sent = text_model.make_sentence()
assert(len(sent) != 0)

def test_json(self):
text_model = markovify.Text(self.sherlock)
text_model = sherlock_model
json_model = text_model.to_json()
new_text_model = markovify.Text.from_json(json_model)
sent = text_model.make_sentence()
assert(len(sent) != 0)

def test_chain(self):
text_model = markovify.Text(self.sherlock)
text_model = sherlock_model
chain_json = text_model.chain.to_json()

stored_chain = markovify.Chain.from_json(chain_json)
Expand All @@ -43,38 +43,48 @@ def test_chain(self):
assert(len(sent) != 0)

def test_make_sentence_with_start(self):
text_model = markovify.Text(self.sherlock)
text_model = sherlock_model
start_str = "Sherlock Holmes"
sent = text_model.make_sentence_with_start(start_str)
assert(sent != None)
assert(start_str == sent[:len(start_str)])

def test_make_sentence_with_start_one_word(self):
text_model = markovify.Text(self.sherlock)
text_model = sherlock_model
start_str = "Sherlock"
sent = text_model.make_sentence_with_start(start_str)
assert(sent != None)
assert(start_str == sent[:len(start_str)])

def test_make_sentence_with_start_three_words(self):
start_str = "Sherlock Holmes was"
text_model = markovify.Text(self.sherlock)
text_model = sherlock_model
try:
text_model.make_sentence_with_start(start_str)
assert(False)
except markovify.text.ParamError:
assert(True)
text_model = markovify.Text(self.sherlock, state_size=3)
text_model = markovify.Text(sherlock, state_size=3)
text_model.make_sentence_with_start(start_str)
text_model.make_sentence_with_start("Sherlock")

def test_short_sentence(self):
text_model = markovify.Text(self.sherlock)
text_model = sherlock_model
sent = None
while sent == None:
sent = text_model.make_short_sentence(45)
assert len(sent) < 45

def test_dont_test_output(self):
text_model = sherlock_model
sent = text_model.make_sentence(test_output=False)
assert sent is not None

def test_max_words(self):
text_model = sherlock_model
sent = text_model.make_sentence(max_words=0)
assert sent is None

def test_newline_text(self):
with open(os.path.join(os.path.dirname(__file__), "texts/senate-bills.txt")) as f:
model = markovify.NewlineText(f.read())
Expand Down
28 changes: 14 additions & 14 deletions test/test_combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,32 @@
def get_sorted(chain_json):
return sorted(chain_json, key=operator.itemgetter(0))

class MarkovifyTest(unittest.TestCase):
with open(os.path.join(os.path.dirname(__file__), "texts/sherlock.txt")) as f:
sherlock = f.read()
sherlock_model = markovify.Text(sherlock)

def setUp(self):
with open(os.path.join(os.path.dirname(__file__), "texts/sherlock.txt")) as f:
self.sherlock = f.read()
class MarkovifyTest(unittest.TestCase):

def test_simple(self):
text_model = markovify.Text(self.sherlock)
text_model = sherlock_model
combo = markovify.combine([ text_model, text_model ], [ 0.5, 0.5 ])
assert(combo.chain.model == text_model.chain.model)

def test_double_weighted(self):
text_model = markovify.Text(self.sherlock)
text_model = sherlock_model
combo = markovify.combine([ text_model, text_model ])
assert(combo.chain.model != text_model.chain.model)

def test_combine_chains(self):
chain = markovify.Text(self.sherlock).chain
chain = sherlock_model.chain
combo = markovify.combine([ chain, chain ])

def test_combine_dicts(self):
_dict = markovify.Text(self.sherlock).chain.model
_dict = sherlock_model.chain.model
combo = markovify.combine([ _dict, _dict ])

def test_combine_lists(self):
_list = list(markovify.Text(self.sherlock).chain.model.items())
_list = list(sherlock_model.chain.model.items())
combo = markovify.combine([ _list, _list ])

def test_bad_types(self):
Expand All @@ -40,19 +40,19 @@ def test_bad_types(self):

def test_bad_weights(self):
with self.assertRaises(Exception) as context:
text_model = markovify.Text(self.sherlock)
text_model = sherlock_model
combo = markovify.combine([ text_model, text_model ], [ 0.5 ])

def test_mismatched_state_sizes(self):
with self.assertRaises(Exception) as context:
text_model_a = markovify.Text(self.sherlock, state_size=2)
text_model_b = markovify.Text(self.sherlock, state_size=3)
text_model_a = markovify.Text(sherlock, state_size=2)
text_model_b = markovify.Text(sherlock, state_size=3)
combo = markovify.combine([ text_model_a, text_model_b ])

def test_mismatched_model_types(self):
with self.assertRaises(Exception) as context:
text_model_a = markovify.Text(self.sherlock)
text_model_b = markovify.NewlineText(self.sherlock)
text_model_a = sherlock_model
text_model_b = markovify.NewlineText(sherlock)
combo = markovify.combine([ text_model_a, text_model_b ])

if __name__ == '__main__':
Expand Down

0 comments on commit c11601b

Please sign in to comment.