Merge pull request #75 from jsvine/linewise

Accept file-like objects, and to discard original
jsvine · Sep 2, 2017 · a7ec50e · a7ec50e
2 parents 4880754 + 8d65da4
commit a7ec50e
Show file tree

Hide file tree

Showing 5 changed files with 59 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -145,6 +145,18 @@ reconstituted_model.make_short_sentence(140)
 
 You can also export the underlying Markov chain on its own — i.e., excluding the original corpus and the `state_size` metadata — via `my_text_model.chain.to_json()`.
 
+### Generating `markovify.Text` models from very large corpora
+
+By default, the `markovify.Text` class loads, and retains, your textual corpus, so that it can compare generated sentences with the original (and only emit novel sentences). But, with very large corpora, loading the entire text at once (and retaining it) can be memory-intensive. To overcome this, you can `(a)` read in the corpus line-by-line, and `(b)` tell Markovify not to retain the original:
+
+```python
+with open("path/to/my/huge/corpus.txt") as f:
+    text_model = markovify.Text(f, retain_original=False)
+
+print(text_model.make_sentence())
+```
+
+
 ## Markovify In The Wild
 
 - BuzzFeed's [Tom Friedman Sentence Generator](http://www.buzzfeed.com/jsvine/the-tom-friedman-sentence-generator) / [@mot_namdeirf](https://twitter.com/mot_namdeirf).

diff --git a/markovify/__version__.py b/markovify/__version__.py
@@ -1,2 +1,2 @@
-VERSION_TUPLE = (0, 6, 0)
+VERSION_TUPLE = (0, 6, 1)
 __version__ = ".".join(map(str, VERSION_TUPLE))
diff --git a/markovify/chain.py b/markovify/chain.py
@@ -52,8 +52,6 @@ def build(self, corpus, state_size):
         for the "next" item in the chain, along with the count of times it
         appears.
         """
-        if (type(corpus) != list) or (type(corpus[0]) != list):
-            raise Exception("`corpus` must be list of lists")
 
         # Using a DefaultDict here would be a lot more convenient, however the memory
         # usage is far higher.

diff --git a/markovify/text.py b/markovify/text.py
@@ -13,7 +13,7 @@ class ParamError(Exception):
 
 class Text(object):
 
-    def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None):
+    def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None, retain_original=True):
         """
         input_text: A string.
         state_size: An integer, indicating the number of words in the model's state.
@@ -25,11 +25,17 @@ def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None):
               long run.
         """
         self.state_size = state_size
-        self.parsed_sentences = parsed_sentences or list(self.generate_corpus(input_text))
+        self.retain_original = retain_original
 
-        # Rejoined text lets us assess the novelty of generated sentences
-        self.rejoined_text = self.sentence_join(map(self.word_join, self.parsed_sentences))
-        self.chain = chain or Chain(self.parsed_sentences, state_size)
+        if retain_original:
+            self.parsed_sentences = parsed_sentences or list(self.generate_corpus(input_text))
+
+            # Rejoined text lets us assess the novelty of generated sentences
+            self.rejoined_text = self.sentence_join(map(self.word_join, self.parsed_sentences))
+            self.chain = chain or Chain(self.parsed_sentences, state_size)
+        else:
+            parsed = parsed_sentences or self.generate_corpus(input_text)
+            self.chain = chain or Chain(parsed, state_size)
 
     def to_dict(self):
         """
@@ -38,7 +44,7 @@ def to_dict(self):
         return {
             "state_size": self.state_size,
             "chain": self.chain.to_json(),
-            "parsed_sentences": self.parsed_sentences
+            "parsed_sentences": self.parsed_sentences if self.retain_original else None
         }
 
     def to_json(self):
@@ -53,7 +59,7 @@ def from_dict(cls, obj):
             None,
             state_size=obj["state_size"],
             chain=Chain.from_json(obj["chain"]),
-            parsed_sentences=obj["parsed_sentences"]
+            parsed_sentences=obj.get("parsed_sentences")
         )
 
     @classmethod
@@ -91,6 +97,7 @@ def test_sentence_input(self, sentence):
         the type of punctuation that would look strange on its own
         in a randomly-generated sentence. 
         """
+        if len(sentence.strip()) == 0: return False
         reject_pat = re.compile(r"(^')|('$)|\s'|'\s|[\"(\(\)\[\])]")
         # Decode unicode, mainly to normalize fancy quotation marks
         if sentence.__class__.__name__ == "str": # pragma: no cover
@@ -107,7 +114,12 @@ def generate_corpus(self, text):
         "sentences," each of which is a list of words. Before splitting into 
         words, the sentences are filtered through `self.test_sentence_input`
         """
-        sentences = self.sentence_split(text)
+        if isinstance(text, str):
+            sentences = self.sentence_split(text)
+        else:
+            sentences = []
+            for line in text:
+                sentences += self.sentence_split(line)
         passing = filter(self.test_sentence_input, sentences)
         runs = map(self.word_split, passing)
         return runs
@@ -167,7 +179,7 @@ def make_sentence(self, init_state=None, **kwargs):
             words = prefix + self.chain.walk(init_state)
             if max_words != None and len(words) > max_words:
                 continue
-            if test_output:
+            if test_output and hasattr(self, "rejoined_text"):
                 if self.test_sentence_output(words, mor, mot):
                     return self.word_join(words)
             else:

diff --git a/test/test_itertext.py b/test/test_itertext.py
@@ -0,0 +1,25 @@
+import unittest
+import markovify
+import sys, os
+import operator
+
+class MarkovifyTest(unittest.TestCase):
+
+    def test_simple(self):
+        with open(os.path.join(os.path.dirname(__file__), "texts/sherlock.txt")) as f:
+            sherlock_model = markovify.Text(f)
+        sent = sherlock_model.make_sentence()
+        assert sent is not None
+        assert len(sent) != 0
+
+    def test_without_retaining(self):
+        with open(os.path.join(os.path.dirname(__file__), "texts/senate-bills.txt")) as f:
+            senate_model = markovify.Text(f, retain_original=False)
+        sent = senate_model.make_sentence()
+        assert sent is not None
+        assert len(sent) != 0
+
+if __name__ == '__main__':
+    unittest.main()
+
+