Skip to content

Commit

Permalink
Merge pull request #75 from jsvine/linewise
Browse files Browse the repository at this point in the history
Accept file-like objects, and to discard original
  • Loading branch information
jsvine committed Sep 2, 2017
2 parents 4880754 + 8d65da4 commit a7ec50e
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 12 deletions.
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,18 @@ reconstituted_model.make_short_sentence(140)

You can also export the underlying Markov chain on its own — i.e., excluding the original corpus and the `state_size` metadata — via `my_text_model.chain.to_json()`.

### Generating `markovify.Text` models from very large corpora

By default, the `markovify.Text` class loads, and retains, your textual corpus, so that it can compare generated sentences with the original (and only emit novel sentences). But, with very large corpora, loading the entire text at once (and retaining it) can be memory-intensive. To overcome this, you can `(a)` read in the corpus line-by-line, and `(b)` tell Markovify not to retain the original:

```python
with open("path/to/my/huge/corpus.txt") as f:
    text_model = markovify.Text(f, retain_original=False)

print(text_model.make_sentence())
```


## Markovify In The Wild

- BuzzFeed's [Tom Friedman Sentence Generator](http://www.buzzfeed.com/jsvine/the-tom-friedman-sentence-generator) / [@mot_namdeirf](https://twitter.com/mot_namdeirf).
Expand Down
2 changes: 1 addition & 1 deletion markovify/__version__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
VERSION_TUPLE = (0, 6, 0)
VERSION_TUPLE = (0, 6, 1)
__version__ = ".".join(map(str, VERSION_TUPLE))
2 changes: 0 additions & 2 deletions markovify/chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@ def build(self, corpus, state_size):
for the "next" item in the chain, along with the count of times it
appears.
"""
if (type(corpus) != list) or (type(corpus[0]) != list):
raise Exception("`corpus` must be list of lists")

# Using a DefaultDict here would be a lot more convenient, however the memory
# usage is far higher.
Expand Down
30 changes: 21 additions & 9 deletions markovify/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class ParamError(Exception):

class Text(object):

def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None):
def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None, retain_original=True):
"""
input_text: A string.
state_size: An integer, indicating the number of words in the model's state.
Expand All @@ -25,11 +25,17 @@ def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None):
long run.
"""
self.state_size = state_size
self.parsed_sentences = parsed_sentences or list(self.generate_corpus(input_text))
self.retain_original = retain_original

# Rejoined text lets us assess the novelty of generated sentences
self.rejoined_text = self.sentence_join(map(self.word_join, self.parsed_sentences))
self.chain = chain or Chain(self.parsed_sentences, state_size)
if retain_original:
self.parsed_sentences = parsed_sentences or list(self.generate_corpus(input_text))

# Rejoined text lets us assess the novelty of generated sentences
self.rejoined_text = self.sentence_join(map(self.word_join, self.parsed_sentences))
self.chain = chain or Chain(self.parsed_sentences, state_size)
else:
parsed = parsed_sentences or self.generate_corpus(input_text)
self.chain = chain or Chain(parsed, state_size)

def to_dict(self):
"""
Expand All @@ -38,7 +44,7 @@ def to_dict(self):
return {
"state_size": self.state_size,
"chain": self.chain.to_json(),
"parsed_sentences": self.parsed_sentences
"parsed_sentences": self.parsed_sentences if self.retain_original else None
}

def to_json(self):
Expand All @@ -53,7 +59,7 @@ def from_dict(cls, obj):
None,
state_size=obj["state_size"],
chain=Chain.from_json(obj["chain"]),
parsed_sentences=obj["parsed_sentences"]
parsed_sentences=obj.get("parsed_sentences")
)

@classmethod
Expand Down Expand Up @@ -91,6 +97,7 @@ def test_sentence_input(self, sentence):
the type of punctuation that would look strange on its own
in a randomly-generated sentence.
"""
if len(sentence.strip()) == 0: return False
reject_pat = re.compile(r"(^')|('$)|\s'|'\s|[\"(\(\)\[\])]")
# Decode unicode, mainly to normalize fancy quotation marks
if sentence.__class__.__name__ == "str": # pragma: no cover
Expand All @@ -107,7 +114,12 @@ def generate_corpus(self, text):
"sentences," each of which is a list of words. Before splitting into
words, the sentences are filtered through `self.test_sentence_input`
"""
sentences = self.sentence_split(text)
if isinstance(text, str):
sentences = self.sentence_split(text)
else:
sentences = []
for line in text:
sentences += self.sentence_split(line)
passing = filter(self.test_sentence_input, sentences)
runs = map(self.word_split, passing)
return runs
Expand Down Expand Up @@ -167,7 +179,7 @@ def make_sentence(self, init_state=None, **kwargs):
words = prefix + self.chain.walk(init_state)
if max_words != None and len(words) > max_words:
continue
if test_output:
if test_output and hasattr(self, "rejoined_text"):
if self.test_sentence_output(words, mor, mot):
return self.word_join(words)
else:
Expand Down
25 changes: 25 additions & 0 deletions test/test_itertext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import unittest
import markovify
import sys, os
import operator

class MarkovifyTest(unittest.TestCase):
    """Tests for building `markovify.Text` models from open file objects."""

    @staticmethod
    def _corpus_path(filename):
        """Return the path of a fixture under the `texts/` directory next to this file."""
        return os.path.join(os.path.dirname(__file__), "texts", filename)

    def _assert_nonempty_sentence(self, model):
        """Generate one sentence from `model` and check it is a non-empty value.

        Uses `unittest` assertion methods rather than bare `assert` so the
        checks still run under `python -O` and failures report the values.
        """
        sent = model.make_sentence()
        self.assertIsNotNone(sent)
        self.assertNotEqual(len(sent), 0)

    def test_simple(self):
        """A model built from a file handle with default options yields a sentence."""
        with open(self._corpus_path("sherlock.txt")) as f:
            sherlock_model = markovify.Text(f)
        self._assert_nonempty_sentence(sherlock_model)

    def test_without_retaining(self):
        """A model built with `retain_original=False` still yields a sentence."""
        with open(self._corpus_path("senate-bills.txt")) as f:
            senate_model = markovify.Text(f, retain_original=False)
        self._assert_nonempty_sentence(senate_model)

if __name__ == '__main__':
unittest.main()


0 comments on commit a7ec50e

Please sign in to comment.