Skip to content

Commit

Permalink
Merge a7b7b1a into 6968649
Browse files Browse the repository at this point in the history
  • Loading branch information
kade-robertson committed Feb 11, 2019
2 parents 6968649 + a7b7b1a commit 3c3aa19
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 4 deletions.
20 changes: 16 additions & 4 deletions markovify/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ class ParamError(Exception):

class Text(object):

def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None, retain_original=True):
reject_pat = re.compile(r"(^')|('$)|\s'|'\s|[\"(\(\)\[\])]")

def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None, retain_original=True, well_formed=True, reject_reg=''):
"""
input_text: A string.
state_size: An integer, indicating the number of words in the model's state.
Expand All @@ -24,7 +26,18 @@ def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None,
contains the steps (e.g. words) in the run. If you want to simulate
an infinite process, you can come very close by passing just one, very
long run.
retain_original: Indicates whether to keep the original corpus.
well_formed: Indicates whether sentences should be well-formed. By default this
prevents unmatched quotes and parentheses; a custom regular expression can be
provided instead (see reject_reg).
reject_reg: If well_formed is True, this can be provided to override the
standard rejection pattern.
"""

self.well_formed = well_formed
if well_formed and reject_reg != '':
self.reject_pat = re.compile(reject_reg)

can_make_sentences = parsed_sentences is not None or input_text is not None
self.retain_original = retain_original and can_make_sentences
self.state_size = state_size
Expand Down Expand Up @@ -96,19 +109,18 @@ def word_join(self, words):

def test_sentence_input(self, sentence):
    """
    A basic sentence filter. The default rejects sentences that contain
    the type of punctuation that would look strange on its own
    in a randomly-generated sentence.

    sentence: The candidate sentence to check.

    Returns False when the sentence is empty or whitespace-only, or when
    self.well_formed is set and self.reject_pat matches the sentence.
    Returns True otherwise.
    """
    if not sentence.strip():
        return False
    # Decode unicode, mainly to normalize fancy quotation marks
    if sentence.__class__.__name__ == "str": # pragma: no cover
        decoded = sentence
    else: # pragma: no cover
        decoded = unidecode(sentence)
    # Sentence shouldn't contain problematic characters. Only the
    # instance's pattern is consulted here (no locally compiled default),
    # so well_formed=False disables the check and a custom reject_reg
    # replaces the standard pattern instead of being added on top of it.
    if self.well_formed and self.reject_pat.search(decoded):
        return False
    return True

def generate_corpus(self, text):
Expand Down
4 changes: 4 additions & 0 deletions test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,5 +147,9 @@ def test_bad_json(self):
with self.assertRaises(Exception) as context:
markovify.Chain.from_json(1)

def test_custom_regex(self):
    # The custom pattern matches the only sentence in the corpus, so
    # constructing the model is expected to raise.
    with self.assertRaises(Exception):
        markovify.NewlineText('This sucks.', reject_reg=r'sucks')

if __name__ == '__main__':
unittest.main()

0 comments on commit 3c3aa19

Please sign in to comment.