Skip to content
Browse files

pdf分月归档

  • Loading branch information...
1 parent 6ffa8a2 commit d59a265ab1a0591b1484e5f661f997bc51d66458 @laiwei committed Apr 18, 2012
Showing with 159 additions and 61 deletions.
  1. +61 −24 cronjob/generate_pdf.py
  2. +10 −6 past/model/status.py
  3. +12 −2 past/model/user.py
  4. +2 −2 past/templates/status.html
  5. +6 −0 past/utils/__init__.py
  6. +39 −21 past/utils/pdf.py
  7. +29 −6 past/view/timelines.py
View
85 cronjob/generate_pdf.py
@@ -5,36 +5,73 @@
import time
import datetime
+import calendar
activate_this = '../env/bin/activate_this.py'
execfile(activate_this, dict(__file__=activate_this))
from past.utils.pdf import generate_pdf, get_pdf_filename, is_pdf_file_exists
from past.model.user import User, UserAlias
+from past.model.status import Status
from past import config
+
+def generate(user_id, date, order='asc'):
+    """Build the PDF archive of one user's statuses for the month of `date`.
+
+    Returns without generating anything when the user has no aliases, when
+    the month's PDF file already exists, or when no status ids come back
+    for the month. With order='asc' the id list returned by
+    Status.get_ids_by_date is reversed before it is handed to generate_pdf.
+    Every exception is caught here and its traceback printed, so a failure
+    for one month never propagates to the caller.
+    """
+    try:
+        if not UserAlias.gets_by_user_id(user_id):
+            return
+
+        # Month window: first day at 00:00:00 through last day at 23:59:59.
+        year, month = date.year, date.month
+        last_day = calendar.monthrange(year, month)[1]
+        month_begin = datetime.datetime(year, month, 1)
+        month_end = datetime.datetime(year, month, last_day, 23, 59, 59)
+        pdf_filename = get_pdf_filename(user_id, date.strftime("%Y%m"))
+        print '----generate pdf:', month_begin, ' to ', month_end, ' file is', pdf_filename
+
+        # An existing file for this month is never regenerated.
+        if is_pdf_file_exists(pdf_filename):
+            print '---- %s exists, so ignore...' % pdf_filename
+            return
+
+        status_ids = Status.get_ids_by_date(user_id, month_begin, month_end)
+        if order == 'asc':
+            status_ids = status_ids[::-1]
+        if not status_ids:
+            print '----- status ids is none', status_ids
+            return
+
+        generate_pdf(pdf_filename, user_id, status_ids, capacity=-1)
+        # Success is judged by whether the file now exists.
+        if is_pdf_file_exists(pdf_filename):
+            print '----%s generate pdf for user:%s succ' % (datetime.datetime.now(), user_id)
+        else:
+            print '----%s generate pdf for user:%s fail' % (datetime.datetime.now(), user_id)
+    except Exception:
+        import traceback
+        print '%s %s' % (datetime.datetime.now(), traceback.format_exc())
+
+def generate_pdf_by_user(user_id):
+    """Generate one monthly PDF per month of a user's history.
+
+    Walks month by month from the month of the user's oldest status up to
+    a cutoff in the past, calling generate() once per month. Returns
+    without doing anything when the user does not exist or has no statuses.
+    """
+    user = User.get(user_id)
+    if not user:
+        return
+
+    start_date = Status.get_oldest_create_time(None, user_id)
+    if not start_date:
+        return
+
+    # Cutoff = today's midnight minus the number of days in the current
+    # month. That always lands before the first day of the current month,
+    # so the still-incomplete current month is never archived.
+    # NOTE(review): early in a month this cutoff can still fall before the
+    # 1st of the previous month (e.g. Mar 1-3 after a 28-day February), so
+    # that month is picked up a few days late -- confirm this is acceptable.
+    today = datetime.datetime.now()
+    cutoff = datetime.datetime(today.year, today.month, today.day) - datetime.timedelta(
+        days=calendar.monthrange(today.year, today.month)[1])
+
+    # Start from the FIRST day of the oldest status' month. Stepping from the
+    # raw timestamp is wrong: Jan 31 + 31 days is Mar 3, which skips February,
+    # and a mid-month start can compare greater than the cutoff and drop a
+    # month that is already complete.
+    d = datetime.datetime(start_date.year, start_date.month, 1)
+    while d <= cutoff:
+        generate(user_id, d)
+
+        # d is always a 1st at midnight here, so adding the length of its
+        # month lands exactly on the 1st of the following month.
+        d += datetime.timedelta(days=calendar.monthrange(d.year, d.month)[1])
+
+
if __name__ == "__main__":
-
- #for uid in User.get_ids(0, 10000000):
- for uid in range(634,652):
- try:
- uas = UserAlias.gets_by_user_id(uid)
- if not uas:
- continue
- types = [x.type for x in uas]
- count = 300
- if config.OPENID_TYPE_DICT[config.OPENID_SINA] in types \
- or config.OPENID_TYPE_DICT[config.OPENID_QQ] in types:
- count = 250
- pdf_filename = get_pdf_filename(uid)
- print pdf_filename
- generate_pdf(pdf_filename, uid, 0, count, capacity=-1)
- if not is_pdf_file_exists(pdf_filename):
- print '%s generate pdf for user:%s fail' % (datetime.datetime.now(), uid)
- else:
- print '%s generate pdf for user:%s succ' % (datetime.datetime.now(), uid)
- except Exception, e:
- import traceback
- print '%s %s' % (datetime.datetime.now(), traceback.format_exc())
-
- time.sleep(1)
+ # Page through user ids in ascending order, `limit` ids per query, and
+ # build the monthly PDFs for each user in turn.
+ start = 0
+ limit = 100
+ # NOTE(review): assuming `start += limit` belongs to the while body
+ # (indentation is lost in this view), the loop runs for offsets 0, 100
+ # and 200 only, so users past the first 300 ids are not visited --
+ # confirm this cap is intended.
+ while start <= 200:
+ for uid in User.get_ids_asc(start=start, limit=limit):
+ print '------begin generate pdf of user:', uid
+ generate_pdf_by_user(uid)
+ start += limit
View
16 past/model/status.py
@@ -86,7 +86,7 @@ def _generate_bare_text(self, offset=140):
##TODO:这个clear_cache需要拆分
@classmethod
- def _clear_cache(self, user_id, status_id, cate=None):
+ def _clear_cache(cls, user_id, status_id, cate=None):
if status_id:
mc.delete("status:%s" % status_id)
if user_id:
@@ -197,9 +197,9 @@ def _get_ids(cls, user_id, start=0, limit=20, order="create_time desc", cate=Non
@classmethod
def get_ids_by_date(cls, user_id, start_date, end_date):
+ """Return the ids of a user's statuses whose create_time lies between
+ start_date and end_date (both bounds inclusive), ordered by create_time
+ descending.
+
+ Rows whose category equals config.CATE_DOUBAN_NOTE are left out.
+ """
cursor = db_conn.execute('''select id from status
- where user_id=%s and create_time>=%s and create_time<=%s
- order by time desc''',
- (user_id, start_date, end_date))
+ where user_id=%s and category!=%s and create_time>=%s and create_time<=%s
+ order by create_time desc''',
+ (user_id, config.CATE_DOUBAN_NOTE, start_date, end_date))
rows = cursor.fetchall()
cursor and cursor.close()
return [x[0] for x in rows]
@@ -244,8 +244,12 @@ def get_min_origin_id(cls, cate, user_id):
## just for tecent_weibo
@classmethod
def get_oldest_create_time(cls, cate, user_id):
- cursor = db_conn.execute('''select min(create_time) from status
- where category=%s and user_id=%s''', (cate, user_id))
+ if cate:
+ cursor = db_conn.execute('''select min(create_time) from status
+ where category=%s and user_id=%s''', (cate, user_id))
+ else:
+ cursor = db_conn.execute('''select min(create_time) from status
+ where user_id=%s''', user_id)
row = cursor.fetchone()
cursor and cursor.close()
if row:
View
14 past/model/user.py
@@ -82,9 +82,19 @@ def gets(cls, ids):
@classmethod
@pcache("user:ids")
- def get_ids(cls, start=0, limit=20, order="id desc"):
+ def get_ids(cls, start=0, limit=20):
+ """Return a list of user ids in descending id order.
+
+ start and limit are passed as the two parameters of the SQL
+ `limit %s, %s` clause; the ordering itself is fixed in the SQL text.
+ NOTE(review): how pcache("user:ids") keys its entries is not visible
+ here -- confirm it distinguishes different start/limit values.
+ """
sql = """select id from user
- order by """ + order + """ limit %s, %s"""
+ order by id desc limit %s, %s"""
+ cursor = db_conn.execute(sql, (start, limit))
+ rows = cursor.fetchall()
+ cursor and cursor.close()
+ return [x[0] for x in rows]
+
+ @classmethod
+ @pcache("user:ids:asc")
+ def get_ids_asc(cls, start=0, limit=20):
+ """Return a list of user ids in ascending id order.
+
+ start and limit are passed as the two parameters of the SQL
+ `limit %s, %s` clause.
+ NOTE(review): how pcache("user:ids:asc") keys its entries is not
+ visible here -- confirm it distinguishes different start/limit values.
+ """
+ sql = """select id from user
+ order by id asc limit %s, %s"""
cursor = db_conn.execute(sql, (start, limit))
rows = cursor.fetchall()
cursor and cursor.close()
return [x[0] for x in rows]
View
4 past/templates/status.html
@@ -161,14 +161,14 @@
{% set origin_uri = s.get_origin_uri() %}
<a href="{{origin_uri[1]}}">{{s.title}}</a><br/>
{%if pdf%}
- {{s.text|wrap_long_line}}
+ {{s.text}}
{%else%}
{{s.summary}}
<a href="/post/{{s.id}}">&nbsp;read more</a>
{%endif%}
{%if pdf%}
<div class="time">
- From: {{origin_uri()[0]}} {{s.create_time.strftime("%Y-%m-%d %H:%M:%S")}}</div>
+ From: {{origin_uri[0]}} {{s.create_time.strftime("%Y-%m-%d %H:%M:%S")}}</div>
{%endif%}
{%- endmacro %}
View
6 past/utils/__init__.py
@@ -3,6 +3,7 @@
import re
import time
import datetime
+import imghdr
import httplib2
import random
import string
@@ -71,3 +72,8 @@ def is_valid_email(email):
return EMAILRE.match(email) != None
return False
+
+def is_valid_image(content):
+    """Return True when `content` (raw bytes) starts like a known image format.
+
+    Empty or None content yields False. The recognised formats are the
+    imghdr type names listed below.
+    """
+    if not content:
+        return False
+    # imghdr.what() treats its first argument as a file NAME to open; the
+    # in-memory byte stream has to be passed as the second argument (h),
+    # in which case the filename is ignored.
+    return imghdr.what(None, content) in (
+        'rgb', 'gif', 'pbm', 'pgm',
+        'ppm', 'tiff', 'rast', 'xbm', 'jpeg', 'bmp', 'png')
View
60 past/utils/pdf.py
@@ -13,10 +13,11 @@
from past import app
from past.model.user import User
from past.model.status import Status
-from past.utils import wrap_long_line, filters, randbytes
+from past.utils import wrap_long_line, filters, randbytes, is_valid_image
+from past.utils.escape import clear_html_element
from past import config
-def generate_pdf(filename, uid, start, count, cate=None, with_head=True, capacity=50*1024):
+def generate_pdf(filename, uid, status_ids, with_head=True, capacity=50*1024):
#########Set FONT################
from xhtml2pdf.default import DEFAULT_FONT
@@ -39,8 +40,7 @@ def generate_pdf(filename, uid, start, count, cate=None, with_head=True, capacit
return None
# get status
- ids = Status.get_ids(user_id=uid, start=start, limit=count, cate=cate)
- status_list = Status.gets(ids)
+ status_list = Status.gets(status_ids)
_html = render(user, status_list, with_head)
_pdf = pisaDocument(_html, result, default_css=css, link_callback=link_callback, capacity=capacity)
result.close()
@@ -51,6 +51,10 @@ def generate_pdf(filename, uid, start, count, cate=None, with_head=True, capacit
return None
def render(user, status_list, with_head=True):
+ if not status_list:
+ return
+ date = status_list[0].create_time.strftime("%Y年%m月")
+ date = date.decode("utf8")
if with_head:
_html = u"""<html> <body>
<div id="Top">
@@ -61,14 +65,15 @@ def render(user, status_list, with_head=True):
<div class="box">
""" % (os.path.join(app.root_path, "static/img/logo.png"),
- datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), user.name)
+ date, user.name)
else:
_html = u"""<html> <body><div class="box">"""
from jinja2 import Environment, PackageLoader
env = Environment(loader=PackageLoader('past', 'templates'))
env.filters['wrap_long_line'] = wrap_long_line
env.filters['nl2br'] = filters.nl2br
+ env.filters['clear_html_element'] = clear_html_element
t = env.get_template('status.html')
m = t.module
for s in status_list:
@@ -82,21 +87,29 @@ def render(user, status_list, with_head=True):
r = m.twitter_status(s, pdf=True)
elif s.category == config.CATE_QQWEIBO_STATUS:
r = m.qq_weibo_status(s, pdf=True)
+ elif s.category == config.CATE_WORDPRESS_POST:
+ r = m.wordpress_status(s, pdf=True)
else:
r = ''
if not r:
continue
_html += '''<div class="cell">''' + r + '''</div>'''
- s._clear_cache(user_id = s.user_id, status_id = s.id)
+ Status._clear_cache(user_id = s.user_id, status_id = s.id)
_html += """</div></body></html>"""
return _html
def link_callback(uri, rel):
lower_uri = uri.lower()
- print '%s getting %s' % (datetime.datetime.now(), lower_uri)
+ print '%s getting %s' % (datetime.datetime.now(), uri)
if not (lower_uri.startswith('http://') or
- lower_uri.startswith('https://') or lower_uri.startswith('ftp://')):
- return uri
+ lower_uri.startswith('https://') or
+ lower_uri.startswith('ftp://')):
+ return ''
+ if lower_uri.find(" ") != -1:
+ return ''
+
+ if lower_uri.find("\n") != -1:
+ return ''
d = hashlib.md5()
d.update(uri)
@@ -114,21 +127,26 @@ def link_callback(uri, rel):
if os.path.exists(cache_file) and os.path.getsize(cache_file) > 0:
return cache_file
- resp, content = httplib2.Http().request(uri)
- if resp.status == 200:
- with open(cache_file, 'w') as f:
- f.write(content)
- return cache_file
- else:
- print 'get %s fail, status_code is %s, so return none' % (uri,resp.status)
- return ''
+ ##XXX:暂时不缓存图片文件了,因为磁盘不够用
+ #resp, content = httplib2.Http().request(uri)
+ #if resp.status == 200:
+ # with open(cache_file, 'w') as f:
+ # f.write(content)
+ # return cache_file
+ #else:
+ # print 'get %s fail, status_code is %s, so return none' % (uri,resp.status)
+ # return ''
return uri
-def get_pdf_filename(uid):
- return "thepast.me_%s.pdf" % uid
+def get_pdf_filename(uid, suffix=None):
+    """Return the PDF file name for user `uid`.
+
+    A truthy `suffix` is appended after the uid, separated by an
+    underscore; otherwise the name carries the uid only.
+    """
+    if not suffix:
+        return "thepast.me_%s.pdf" % uid
+    return "thepast.me_%s_%s.pdf" % (uid, suffix)
def get_pdf_full_filename(filename):
+ filename = filename.replace("..", "").replace("/", "")
pdf_file_dir = config.PDF_FILE_DOWNLOAD_DIR
if not os.path.isdir(pdf_file_dir):
@@ -144,6 +162,6 @@ def is_pdf_file_exists(filename):
return True
return False
-def is_user_pdf_file_exists(uid):
- f = get_pdf_filename(uid)
+def is_user_pdf_file_exists(uid, suffix=None):
+ """Return the result of is_pdf_file_exists() for the file name that
+ get_pdf_filename(uid, suffix) produces."""
+ f = get_pdf_filename(uid, suffix)
return is_pdf_file_exists(f)
View
35 past/view/timelines.py
@@ -1,6 +1,8 @@
#-*- coding:utf-8 -*-
import os
-from datetime import datetime
+from datetime import datetime, timedelta
+import calendar
+from collections import defaultdict
from flask import g, request, redirect, url_for, abort, render_template,\
make_response
@@ -179,12 +181,35 @@ def pdf(uid):
user = User.get(uid)
if not user:
abort(404, "No such user")
-
- pdf_filename = get_pdf_filename(user.id)
+
+ intros = [g.user.get_thirdparty_profile(x).get("intro") for x in config.OPENID_TYPE_DICT.values()]
+ intros = filter(None, intros)
+
+ pdf_files = []
+ start_date = Status.get_oldest_create_time(None, user.id)
+ now = datetime.now()
+ d = start_date
+ while d <= now:
+ pdf_filename = get_pdf_filename(user.id, d.strftime("%Y%m"))
+ if is_pdf_file_exists(pdf_filename):
+ pdf_files.append([d, pdf_filename])
+
+ days = calendar.monthrange(d.year, d.month)[1]
+ d += timedelta(days=days)
+ d = datetime(d.year, d.month, 1)
+ files_dict = defaultdict(list)
+ for date, filename in pdf_files:
+ files_dict[date.year].append([date, filename])
+ return render_template("pdf.html", **locals())
+
+@app.route("/pdf/<filename>")
+@require_login()
+def pdf_down(filename):
+ pdf_filename = filename
if not is_pdf_file_exists(pdf_filename):
abort(404, "Please wait one day to download the PDF version, because the vps memory is limited")
- full_file_name = os.path.join(config.PDF_FILE_DOWNLOAD_DIR, pdf_filename)
+ full_file_name = get_pdf_full_filename(pdf_filename)
resp = make_response()
resp.headers['Cache-Control'] = 'no-cache'
resp.headers['Content-Type'] = 'application/pdf'
@@ -194,8 +219,6 @@ def pdf(uid):
resp.headers['X-Accel-Redirect'] = redir
return resp
-
-
## 把status_list构造为month,day的层级结构
def statuses_timelize(status_list):

0 comments on commit d59a265

Please sign in to comment.
Something went wrong with that request. Please try again.