Parse HTML for a product and read the title.

hammer · Apr 24, 2010 · 38aa9d7 · 38aa9d7
1 parent 614a9ff
commit 38aa9d7
Show file tree

Hide file tree

Showing 3 changed files with 91 additions and 19 deletions.
diff --git a/fixtures/product.html b/fixtures/product.html
@@ -0,0 +1,27 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+	"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+    <title>Wordtracker Keywords</title>
+    <link rel="shortcut icon" href="/favicon.gif" type="image/gif" />
+  </head>
+  <body>
+
+
+<div class="hproduct">
+  <a href="http://api.getsatisfaction.com/companies/wordtracker" class="brand">Wordtracker</a>
+  <a href="http://api.getsatisfaction.com/products/45632" class="name uri">Wordtracker Keywords</a>
+
+
+    <img alt="Product_default_small" class="image thumb" src="http://assets1.getsatisfaction.com/images/product_default_small.png" />
+    <p class="description">
+      Wordtracker's keyword research tool.
+    </p>
+
+    <a href="http://api.getsatisfaction.com/products/45632/topics" rel="topics">Topics associated with Wordtracker Keywords</a>
+    <a href="http://api.getsatisfaction.com/products/45632/companies" rel="companies">Companies associated with Wordtracker Keywords</a>
+
+</div>
+  </body>
+</html>
diff --git a/satisfaction.py b/satisfaction.py
@@ -1,26 +1,69 @@
+import urllib
+
 import feedparser
+import lxml.html
 
 
+# Raised by Resource.resource_not_found() when a requested resource is missing.
 class ResourceNotFound(RuntimeError): pass
 
 
 class Resource:
+    """Base class for API resources identified by a resource id.
+
+    Subclasses supply a ``URL`` format string and a ``load_document()``
+    method (provided in this file by the AtomParser/HtmlParser mixins).
+    """
 
+    def __init__(self, resource_id):
+        """Remember the id; the document is loaded lazily on first access."""
+        self.resource_id = resource_id
+        self._document = None
+
     @classmethod
-    def url(cls, resource_id, page):
+    def url(cls, resource_id, page=None):
+        """Return ``cls.URL`` filled in with resource_id.
+
+        A ``?page=<page>`` query string is appended only when page is truthy.
+        """
         url = cls.URL % resource_id
         if page:
             url += '?page=%s' % page
         return url
 
+    def resource_not_found(self):
+        """Raise ResourceNotFound naming the concrete class and the id."""
+        name = self.__class__.__name__
+        raise ResourceNotFound('%s not found: %s' % (name, self.resource_id))
+
+    @property
+    def document(self):
+        """Return the cached document, calling load_document() if unset."""
+        if self._document is None:
+            # load_document() is expected to assign self._document.
+            self.load_document()
+        return self._document
+
+
+class AtomParser:
+    """Mixin that loads a resource's feed with feedparser.
+
+    Relies on the host class for url(), resource_id, _page,
+    resource_not_found() and the _document attribute.
+    """
+
+    def load_document(self):
+        """Parse the feed for the current page and store it in _document.
+
+        Calls resource_not_found() when the parsed feed reports status 404.
+        """
+        feed_url = self.url(self.resource_id, self._page)
+        parsed = feedparser.parse(feed_url)
+        # 'status' may be absent from the parse result, hence .get().
+        if parsed.get('status') == 404:
+            self.resource_not_found()
+        self._document = parsed
+
 
-class Topic(Resource):
+class HtmlParser:
+    """Mixin that loads a resource's HTML page into an lxml document.
+
+    Relies on the host class for url(), resource_id, resource_not_found()
+    and the _document attribute.
+    """
+
+    def load_document(self):
+        """Fetch the resource URL and store the parsed HTML in _document.
+
+        Calls resource_not_found() when the server answers with a 404.
+        """
+        response = urllib.urlopen(self.url(self.resource_id))
+        try:
+            # urllib.urlopen() does not raise on HTTP error responses; the
+            # status code is available from getcode() (None for non-HTTP
+            # URLs such as local files). The 'status' header check is kept
+            # for servers that report the status that way.
+            if (response.getcode() == 404 or
+                    response.headers.getheader('status') == '404'):
+                self.resource_not_found()
+            # The body has to be read explicitly before it can be parsed.
+            html = response.read()
+        finally:
+            response.close()
+        self._document = lxml.html.document_fromstring(html)
+
+
+class Product(Resource, HtmlParser):
+    """A product resource whose page is fetched and parsed as HTML."""
+
+    URL = 'http://api.getsatisfaction.com/products/%s'
+
+    @property
+    def title(self):
+        """Return the text of the first ``a.name`` element in the document.
+
+        Calls resource_not_found() (which raises ResourceNotFound) when the
+        page contains no such element.
+        """
+        # cssselect() returns a list of matching elements, not one element.
+        matches = self.document.cssselect('a.name')
+        if not matches:
+            self.resource_not_found()
+        return matches[0].text_content()
+
+
+class Topic(Resource, AtomParser):
 
     URL = 'http://api.getsatisfaction.com/topics/%s'
 
-    def __init__(self, resource_id):
-        self.resource_id = resource_id
-        self._document = None
+    def __init__(self, *args):
+        Resource.__init__(self, *args)
         self._page = 1
 
     def __iter__(self):
@@ -44,16 +87,6 @@ def load_next_page(self):
         self._document = None
         self._page += 1
 
-    @property
-    def document(self):
-        if self._document is None:
-            url = self.url(self.resource_id, self._page)
-            self._document = feedparser.parse(url)
-        if self._document.get('status', None) == 404:
-            name = self.__class__.__name__
-            raise ResourceNotFound('%s not found: %s' % (name, self.resource_id))
-        return self._document
-
     @property
     def title(self):
         return self.document.feed.title

diff --git a/satisfaction_test.py b/satisfaction_test.py
@@ -16,17 +16,29 @@ def tearDown(self):
             for cls, func in self.original_functions:
                 cls.url = func
 
-    def useFixture(self, cls, name):
-        def stubbed_url(self, topic_id, page):
-            filename = '%s-%s-page-%s.xml' % (cls.__name__.lower(), name, page)
+    def useFixture(self, cls, name=None, extension='xml'):
+        """Replace cls.url with a stub that points at a local fixture file.
+
+        The file name is ``<class>[-<name>][-page-<page>].<extension>``
+        (class name lower-cased) under ./fixtures of the current working
+        directory. The original cls.url is recorded in
+        self.original_functions so that it can be restored later.
+        Pass extension='html' for HTML fixtures.
+        """
+        def stubbed_url(self, topic_id, page=None):
+            # topic_id is ignored: the fixture is chosen by class/name/page.
+            filename = cls.__name__.lower()
+            if name:
+                filename += '-%s' % name
+            if page:
+                filename += '-page-%s' % page
+            filename += '.%s' % extension
             return os.path.join(os.getcwd(), 'fixtures', filename)
         self.original_functions.append((cls, cls.url))
         cls.url = stubbed_url
 
     def topic(self):
+        # Builds a Topic with the placeholder id '1234'; the url stub that
+        # useFixture installs does not use the id it is given.
         # TODO: topic id is ignored in tests - does it work?
         return satisfaction.Topic('1234')
+
+
+class MissingProductTest(TestHelper):
+    # Reading the title of an unknown product must raise ResourceNotFound.
 
+    def test_product_not_found(self):
+        # Building the Product does no loading; the .title access does.
+        missing = satisfaction.Product('bad_product_name')
+        with self.assertRaises(satisfaction.ResourceNotFound):
+            missing.title
+
 
 class MissingTopicTest(TestHelper):
 
@@ -35,7 +47,7 @@ def test_topic_not_found(self):
             self.topic().title
 
 
-class TopicWithNoRepliesTest(TestHelper):
+class TopicWithoutRepliesTest(TestHelper):
 
     def withFixtures(self):
         self.useFixture(satisfaction.Topic, 'without-replies')