Refactor and rewrite Post().get() and its sub-methods

- Refactor the init() cases into two situations: ids / metas
- Define two scenarios: from id(s) and from meta(s)
- This turns out to be 4 sub-tasks:
  * get_content, get_links, get_comments_{serial/parallel}
- Redefine the PostsResult format method as two methods, one for each case
leVirve · Jul 29, 2016 · 79c4744 · 79c4744
1 parent 7f81726
commit 79c4744
Show file tree

Hide file tree

Showing 2 changed files with 99 additions and 70 deletions.
diff --git a/dcard/forums.py b/dcard/forums.py
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals, absolute_import
 
 import logging
-import itertools
 
 from dcard import api
 from dcard.utils import Client, flatten_lists
@@ -50,7 +49,7 @@ def get_metas(self, num=30, sort='new', callback=None):
     def _get_paged_metas(self, pages, sort):
         params = {'popular': False} if sort == 'new' else {}
 
-        for page in range(pages):  
+        for page in range(pages):
             data = self.client.get(self.posts_meta_url, params=params)
 
             if len(data) == 0:

diff --git a/dcard/posts.py b/dcard/posts.py
@@ -13,46 +13,76 @@
 class Post:
 
     reduce_threshold = 1000
+    comments_per_page = 30
     client = Client()
 
-    def __init__(self, metas):
-        if isinstance(metas, list):
-            first = metas[0]
-            ids = [meta['id'] for meta in metas] if isinstance(first, dict) \
-                else metas
+    def __init__(self, metadata):
+        metadata = metadata if isinstance(metadata, list) else [metadata]
+        self.only_id = type(metadata[0]) is int
+
+        self.ids = metadata if self.only_id else [m['id'] for m in metadata]
+        self.metas = metadata if not self.only_id else None
+
+    def get(self, **kwargs):
+        if self.only_id:
+            raw_posts = self.get_posts_by_id(**kwargs)
+            return PostsResult(raw_posts, massive=False)
         else:
-            ids = [metas['id']] if isinstance(metas, dict) \
-                else [metas]
-        self.ids = ids
-
-    def get(self, content=True, comments=True, links=True, callback=None):
-        bundle = {}
-        if links:
-            bundle['links_futures'] = [
-                [
-                    self.client.fut_get(api.post_links_url_pattern.format(post_id=post_id))
-                    for post_id in ids
-                ]
-                for ids in chunks(self.ids, chunck_size=Post.reduce_threshold)
-            ]
-        if content:
-            bundle['content_futures'] = [
-                [
-                    self.client.fut_get(api.post_url_pattern.format(post_id=post_id))
-                    for post_id in ids
-                ]
-                for ids in chunks(self.ids, chunck_size=Post.reduce_threshold)
-            ]
-        if comments:
-            bundle['comments_async'] = [
-                self.client.parallel_tasks(Post._serially_get_comments, ids)
-                for ids in chunks(self.ids, chunck_size=Post.reduce_threshold)
-            ]
-
-        return PostsResult(self.ids, bundle, callback)
-
-    @staticmethod
-    def _serially_get_comments(post_id):
+            raw_posts = self.get_post_by_meta(**kwargs)
+            return PostsResult(raw_posts)
+
+    def get_posts_by_id(self, content=True, links=True, comments=True):
+        return {
+            'content': self.get_content(self.ids) if content else [],
+            'links': self.get_links(self.ids) if links else [],
+            'comments': (
+                self.get_comments_serial(post_id)
+                for post_id in self.ids
+                if comments
+            )
+        }
+
+    def get_post_by_meta(self, content=True, links=True, comments=True):
+        return {
+            'content': self.get_content(self.ids) if content else [],
+            'links': self.get_links(self.ids) if links else [],
+            'comments': (
+                self.get_comments_parallel(meta['id'], meta['commentCount'])
+                for meta in self.metas
+                if comments
+            )
+        }
+
+    @classmethod
+    def get_content(cls, post_ids):
+        content_futures = (
+            cls.client.fut_get(
+                api.post_url_pattern.format(post_id=post_id))
+            for post_id in post_ids
+        )
+        return content_futures
+
+    @classmethod
+    def get_links(cls, post_ids):
+        links_futures = (
+            cls.client.fut_get(
+                api.post_links_url_pattern.format(post_id=post_id))
+            for post_id in post_ids
+        )
+        return links_futures
+
+    @classmethod
+    def get_comments_parallel(cls, post_id, comments_count):
+        pages = -(-comments_count // cls.comments_per_page)
+        comments_futures = (
+            self.client.fut_get(api.post_url_pattern.format(post_id=post_id),
+                params={'after': page * self.comments_per_page})
+            for page in range(pages)
+        )
+
+    @classmethod
+    def get_comments_serial(cls, post_id):
+        print('comment of %d' % post_id)
         comments_url = api.post_comments_url_pattern.format(post_id=post_id)
 
         params = {}
@@ -69,10 +99,14 @@ def _serially_get_comments(post_id):
 
 class PostsResult:
 
-    def __init__(self, ids, bundle, callback=None):
-        self.ids = ids
-        self.results = self.format(bundle, callback)
-        self.downloader = Downloader()
+    downloader = Downloader()
+
+    def __init__(self, bundle, massive=True, callback=None):
+        self.results = list(
+            self.format(bundle, callback)
+            if massive else
+            self.simple_format(bundle, callback)
+        )
 
     def __len__(self):
         return len(self.results)
@@ -83,34 +117,30 @@ def __iter__(self):
     def __getitem__(self, key):
         return self.results[int(key)]
 
+    def simple_format(self, bundle, callback):
+        for content, links, comments in zip_longest(
+            bundle['content'], bundle['links'], bundle['comments']
+        ):
+            post = {}
+            post.update(content.result().json()) if content else None
+            post.update({
+                'links': links.result().json() if links else None,
+                'comments': comments
+            })
+            yield post
+
     def format(self, bundle, callback):
-        logger.info('[PostResult reducer] takes hand.')
-        links_blocks = bundle.get('links_futures', [])
-        content_blocks = bundle.get('content_futures', [])
-        comments_blocks = bundle.get('comments_async', [])
-
-        results = []
-        for links, content, comments in zip_longest(links_blocks, content_blocks, comments_blocks):
-            posts = []
-
-            links = links or []
-            content = content or []
-            comments = comments.get() if comments else []
-            for lnks, cont, cmts in zip_longest(links, content, comments):
-                post = {}
-                post.update(cont.result().json()) if cont else None
-                post.update({
-                    'links': lnks.result().json() if lnks else None,
-                    'comments': cmts,
-                })
-                posts.append(post)
-            results.append(callback(posts) if callback else posts)
-            logger.info('[PostResult reducer] {0} posts processed.'.format(len(posts)))
-
-        if len(results) and isinstance(results[0], list):
-            results = flatten_lists(results)
-
-        return results
+        for content, links, comments in zip_longest(
+            bundle['content'], bundle['links'], bundle['comments']
+        ):
+            post = {}
+            post.update(content.result().json()) if content else None
+            post.update({
+                'links': links.result().json() if links else None,
+                'comments': flatten_lists([cmts.result().json() for cmts in comments]) if comments else None
+            })
+            print(post)
+            yield post
 
     def parse_resources(self):
         parser = ContentParser(self.results)