Add Dickens example, bump version number to 0.1

larsmans · Jun 14, 2011 · f4466a7 · f4466a7
1 parent 928254c
commit f4466a7
Show file tree

Hide file tree

Showing 6 changed files with 57 additions and 6 deletions.
diff --git a/README.rst b/README.rst
@@ -2,7 +2,7 @@
 be used to create word clouds.
 
 WeighWords does not do visualization of word clouds. For that, you can paste
-its output into http://wordle.net, the `IBM Word-Cloud Generator
+its output into a tool like http://wordle.net or the `IBM Word-Cloud Generator
 <http://www.alphaworks.ibm.com/tech/wordcloud>`_.
 
 Rather than use simple word frequency, it weighs words by statistical models
@@ -14,8 +14,11 @@ word cloud of a single document; you need a bunch of documents to compare to.
 
 References
 ----------
-D. Hiemstra, S. Robertson and H. Zaragoza (2004). Parsimonious Language Models
-for Information Retrieval. Proc. SIGIR'04.
+D. Hiemstra, S. Robertson and H. Zaragoza (2004). `Parsimonious Language Models
+for Information Retrieval
+<http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.4.5806>`_.
+Proc. SIGIR'04.
 
-R. Kaptein, D. Hiemstra and J. Kamps (2010). How different are Language Models
-and word clouds? Proc. ECIR.
+R. Kaptein, D. Hiemstra and J. Kamps (2010). `How different are Language Models
+and word clouds? <http://riannekaptein.woelmuis.nl/2010/kapt-how10.pdf>`_
+Proc. ECIR.
diff --git a/example/1400.txt.utf8.gz b/example/1400.txt.utf8.gz
diff --git a/example/730.txt.utf8.gz b/example/730.txt.utf8.gz
diff --git a/example/766.txt.utf8.gz b/example/766.txt.utf8.gz
diff --git a/example/dickens.py b/example/dickens.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+
+# Find terms that distinguish various novels by Charles Dickens.
+# Note: if the w parameter is set wisely, no stop list is needed.
+
+from weighwords import ParsimoniousLM
+import gzip
+import logging
+import numpy as np
+import re
+
+logger = logging.getLogger(__name__)  # module logger; read_book reports progress through it
+logging.basicConfig(level=logging.INFO)  # INFO level so those progress messages are emitted
+
+top_k = 20  # How many terms per book to retrieve
+
+books = [  # (title, num) pairs; num is the stem of the "<num>.txt.utf8.gz" file read_book opens
+    ('Oliver Twist',       '730'),
+    ('David Copperfield',  '766'),
+    ('Great Expectations', '1400'),
+]
+
+startbook = """*** START OF THIS PROJECT GUTENBERG EBOOK """  # read_book skips all lines up to and including the first one starting with this
+
+
+def read_book(title, num):
+    """Returns generator over words in book num"""
+
+    logger.info("Fetching terms from %s" % title)
+    path = "%s.txt.utf8.gz" % num
+    in_book = False
+    for ln in gzip.open(path):
+        if in_book:
+            for w in re.sub(r"[.,:;!?\"']", " ", ln).lower().split():
+                yield w
+        elif ln.startswith(startbook):
+            in_book = True
+
+
+book_contents = [(title, list(read_book(title, num))) for title, num in books]
+
+model = ParsimoniousLM([terms for title, terms in book_contents], w=.01)
+
+for title, terms in book_contents:
+    print("Top %d words in %s:" % (top_k, title))
+    for term, p in model.top(top_k, terms):
+        print("    %s %.4f" % (term, np.exp(p)))
+    print("")
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name = "weighwords",
-    version = "0.0",
+    version = "0.1",
     author = "Lars Buitinck",
     author_email = "L.J.Buitinck@uva.nl",
     description = "Python library for creating word weights/word clouds from text",