Skip to content

Commit

Permalink
Refactor and rewrite Post().get() and its sub-methods
Browse files Browse the repository at this point in the history
- refactor the init() cases into two situations: ids / metas
- define two scenarios: from id(s) and meta(s)
- which turns out to need 4 sub-tasks:
  * get_content, get_links, get_comments_{serial/parallel}
- split the PostsResult format method into two, one for each case
  • Loading branch information
leVirve committed Jul 29, 2016
1 parent 7f81726 commit 79c4744
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 70 deletions.
3 changes: 1 addition & 2 deletions dcard/forums.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from __future__ import unicode_literals, absolute_import

import logging
import itertools

from dcard import api
from dcard.utils import Client, flatten_lists
Expand Down Expand Up @@ -50,7 +49,7 @@ def get_metas(self, num=30, sort='new', callback=None):
def _get_paged_metas(self, pages, sort):
params = {'popular': False} if sort == 'new' else {}

for page in range(pages):
for page in range(pages):
data = self.client.get(self.posts_meta_url, params=params)

if len(data) == 0:
Expand Down
166 changes: 98 additions & 68 deletions dcard/posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,46 +13,76 @@
class Post:

reduce_threshold = 1000
comments_per_page = 30
client = Client()

def __init__(self, metas):
if isinstance(metas, list):
first = metas[0]
ids = [meta['id'] for meta in metas] if isinstance(first, dict) \
else metas
def __init__(self, metadata):
metadata = metadata if isinstance(metadata, list) else [metadata]
self.only_id = type(metadata[0]) is int

self.ids = metadata if self.only_id else [m['id'] for m in metadata]
self.metas = metadata if not self.only_id else None

def get(self, **kwargs):
if self.only_id:
raw_posts = self.get_posts_by_id(**kwargs)
return PostsResult(raw_posts, massive=False)
else:
ids = [metas['id']] if isinstance(metas, dict) \
else [metas]
self.ids = ids

def get(self, content=True, comments=True, links=True, callback=None):
bundle = {}
if links:
bundle['links_futures'] = [
[
self.client.fut_get(api.post_links_url_pattern.format(post_id=post_id))
for post_id in ids
]
for ids in chunks(self.ids, chunck_size=Post.reduce_threshold)
]
if content:
bundle['content_futures'] = [
[
self.client.fut_get(api.post_url_pattern.format(post_id=post_id))
for post_id in ids
]
for ids in chunks(self.ids, chunck_size=Post.reduce_threshold)
]
if comments:
bundle['comments_async'] = [
self.client.parallel_tasks(Post._serially_get_comments, ids)
for ids in chunks(self.ids, chunck_size=Post.reduce_threshold)
]

return PostsResult(self.ids, bundle, callback)

@staticmethod
def _serially_get_comments(post_id):
raw_posts = self.get_post_by_meta(**kwargs)
return PostsResult(raw_posts)

def get_posts_by_id(self, content=True, links=True, comments=True):
    """Gather the raw pieces of every post listed in ``self.ids``.

    Args:
        content: when falsy, the ``'content'`` entry is an empty list.
        links: when falsy, the ``'links'`` entry is an empty list.
        comments: when falsy, the ``'comments'`` generator yields nothing.

    Returns:
        A dict with the keys ``'content'``, ``'links'`` and ``'comments'``.
        ``'comments'`` is a generator that calls ``get_comments_serial``
        once per post id as it is iterated.
    """
    post_ids = self.ids
    content_part = self.get_content(post_ids) if content else []
    links_part = self.get_links(post_ids) if links else []
    # The flag is tested inside the generator, so a falsy ``comments``
    # still produces a generator -- just one without any items.
    comments_part = (
        self.get_comments_serial(pid)
        for pid in post_ids
        if comments
    )
    return {
        'content': content_part,
        'links': links_part,
        'comments': comments_part,
    }

def get_post_by_meta(self, content=True, links=True, comments=True):
    """Gather the raw pieces of every post described by ``self.metas``.

    Args:
        content: when falsy, the ``'content'`` entry is an empty list.
        links: when falsy, the ``'links'`` entry is an empty list.
        comments: when falsy, the ``'comments'`` generator yields nothing.

    Returns:
        A dict with the keys ``'content'``, ``'links'`` and ``'comments'``.
        ``'comments'`` is a generator that calls ``get_comments_parallel``
        with each meta's ``'id'`` and ``'commentCount'`` as it is iterated.
    """
    post_ids = self.ids
    content_part = self.get_content(post_ids) if content else []
    links_part = self.get_links(post_ids) if links else []
    # The flag is tested inside the generator, so a falsy ``comments``
    # still produces a generator -- just one without any items.
    comments_part = (
        self.get_comments_parallel(entry['id'], entry['commentCount'])
        for entry in self.metas
        if comments
    )
    return {
        'content': content_part,
        'links': links_part,
        'comments': comments_part,
    }

@classmethod
def get_content(cls, post_ids):
    """Lazily request the content of each post in ``post_ids``.

    Returns a generator that calls ``cls.client.fut_get`` on the post URL
    of one id per step; nothing is requested until it is iterated.
    The yielded objects are presumably futures -- confirm against
    ``Client.fut_get``.
    """
    return (
        cls.client.fut_get(api.post_url_pattern.format(post_id=pid))
        for pid in post_ids
    )

@classmethod
def get_links(cls, post_ids):
    """Lazily request the links of each post in ``post_ids``.

    Returns a generator that calls ``cls.client.fut_get`` on the links URL
    of one id per step; nothing is requested until it is iterated.
    The yielded objects are presumably futures -- confirm against
    ``Client.fut_get``.
    """
    return (
        cls.client.fut_get(api.post_links_url_pattern.format(post_id=pid))
        for pid in post_ids
    )

@classmethod
def get_comments_parallel(cls, post_id, comments_count):
    """Request every comment page of one post without paging serially.

    Args:
        post_id: id of the post whose comments are wanted.
        comments_count: total number of comments on that post; it decides
            how many pages are requested.

    Returns:
        A generator that calls ``cls.client.fut_get`` once per comment
        page as it is iterated. A post with no comments yields nothing.
    """
    # Ceiling division: a partly filled last page still needs a request.
    pages = -(-comments_count // cls.comments_per_page)
    # Comments live under the comments URL (the same pattern
    # get_comments_serial uses), not the post URL.
    comments_url = api.post_comments_url_pattern.format(post_id=post_id)
    # This is a classmethod, so class state is reached through ``cls``;
    # the generator must be returned for the caller to consume it.
    return (
        cls.client.fut_get(
            comments_url,
            params={'after': page * cls.comments_per_page})
        for page in range(pages)
    )

@classmethod
def get_comments_serial(cls, post_id):
print('comment of %d' % post_id)
comments_url = api.post_comments_url_pattern.format(post_id=post_id)

params = {}
Expand All @@ -69,10 +99,14 @@ def _serially_get_comments(post_id):

class PostsResult:

def __init__(self, ids, bundle, callback=None):
self.ids = ids
self.results = self.format(bundle, callback)
self.downloader = Downloader()
downloader = Downloader()

def __init__(self, bundle, massive=True, callback=None):
self.results = list(
self.format(bundle, callback)
if massive else
self.simple_format(bundle, callback)
)

def __len__(self):
return len(self.results)
Expand All @@ -83,34 +117,30 @@ def __iter__(self):
def __getitem__(self, key):
return self.results[int(key)]

def simple_format(self, bundle, callback):
    """Merge the per-post pieces of ``bundle`` into one dict per post.

    Args:
        bundle: mapping with the keys ``'content'``, ``'links'`` and
            ``'comments'``, each an iterable with one entry per post.
            Content and links entries must offer ``.result().json()``.
        callback: accepted for signature parity with ``format``; it is
            not used here.

    Yields:
        One dict per post: the content JSON (when present) plus a
        ``'links'`` key and a ``'comments'`` key. Pieces that are missing
        for a post come out as ``None``.
    """
    # zip_longest pads the shorter iterables with None, so a piece that
    # was switched off (an empty iterable) simply shows up as None.
    for content, links, comments in zip_longest(
            bundle['content'], bundle['links'], bundle['comments']):
        post = {}
        if content is not None:
            post.update(content.result().json())
        post['links'] = links.result().json() if links is not None else None
        post['comments'] = comments
        yield post

def format(self, bundle, callback):
logger.info('[PostResult reducer] takes hand.')
links_blocks = bundle.get('links_futures', [])
content_blocks = bundle.get('content_futures', [])
comments_blocks = bundle.get('comments_async', [])

results = []
for links, content, comments in zip_longest(links_blocks, content_blocks, comments_blocks):
posts = []

links = links or []
content = content or []
comments = comments.get() if comments else []
for lnks, cont, cmts in zip_longest(links, content, comments):
post = {}
post.update(cont.result().json()) if cont else None
post.update({
'links': lnks.result().json() if lnks else None,
'comments': cmts,
})
posts.append(post)
results.append(callback(posts) if callback else posts)
logger.info('[PostResult reducer] {0} posts processed.'.format(len(posts)))

if len(results) and isinstance(results[0], list):
results = flatten_lists(results)

return results
for content, links, comments in zip_longest(
bundle['content'], bundle['links'], bundle['comments']
):
post = {}
post.update(content.result().json()) if content else None
post.update({
'links': links.result().json() if links else None,
'comments': flatten_lists([cmts.result().json() for cmts in comments]) if comments else None
})
print(post)
yield post

def parse_resources(self):
parser = ContentParser(self.results)
Expand Down

0 comments on commit 79c4744

Please sign in to comment.