-
Notifications
You must be signed in to change notification settings - Fork 0
/
worker.py
240 lines (201 loc) · 8.37 KB
/
worker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import, print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, hojel'
__docformat__ = 'restructuredtext ko'
import socket, re, datetime
from collections import OrderedDict
from threading import Thread
from lxml.html import fromstring, tostring
from calibre.ebooks.metadata.book.base import Metadata
from calibre.library.comments import sanitize_comments_html
from calibre.utils.cleantext import clean_ascii_chars
class Worker(Thread):  # Get details
    '''
    Get book details from YES24 book page in a separate thread.

    The worker downloads ``url``, parses the page with lxml and, when at
    least a title, an author list and a YES24 id were found, puts a calibre
    ``Metadata`` object on ``result_queue``.  Failures are logged through
    ``log`` instead of being raised, so a broken page never kills the thread
    with an unhandled exception.
    '''

    # Extracts the numeric goods id from a YES24 product URL.
    _YES24_ID_RE = re.compile(r'yes24\.com/24/[Gg]oods/(\d+)')
    # Dates are written as '<year>년 <month>월 <day>일', e.g. '2011년 8월 30일'.
    _DATE_RE = re.compile(r'^(\d+)년\s*(\d+)월\s*(\d+)일$')

    def __init__(self, url, result_queue, browser, log, relevance, plugin, timeout=20):
        '''
        :param url: URL of the YES24 book page to scrape.
        :param result_queue: queue that receives the resulting ``Metadata``.
        :param browser: browser object; a clone of it is used by this worker.
        :param log: logger used for all progress and error reporting.
        :param relevance: value copied to ``Metadata.source_relevance``.
        :param plugin: owning plugin, used for its identifier/cover caches.
        :param timeout: timeout passed to every HTTP request.
        '''
        Thread.__init__(self)
        self.daemon = True
        self.url, self.result_queue = url, result_queue
        self.log, self.timeout = log, timeout
        self.relevance, self.plugin = relevance, plugin
        self.browser = browser.clone_browser()
        self.cover_url = self.yes24_id = self.isbn = None
        # When true, parse_cover() tries a Kyobo image URL built from the ISBN.
        self.kyobo_cover = False

    def run(self):
        '''Thread entry point: run get_details() and log anything it raises.'''
        try:
            self.get_details()
        except Exception:
            self.log.exception('get_details failed for url: %r'%self.url)

    def get_details(self):
        '''Download the book page, parse it into an lxml tree and hand the
        tree to parse_details().  Returns early (after logging) on any
        download or parse failure.'''
        try:
            self.log.info('YES24 url: %r'%self.url)
            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                self.log.error('URL malformed: %r'%self.url)
                return
            args = getattr(e, 'args', None) or [None]
            # The timeout may be raised directly or wrapped as the first
            # argument of another exception; treat both the same way.
            if isinstance(e, socket.timeout) or isinstance(args[0], socket.timeout):
                msg = 'YES24 timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r'%self.url
                self.log.exception(msg)
            return

        raw = raw.decode('euc-kr', errors='replace')

        if 'HTTP 404.' in raw:
            self.log.error('URL malformed: %r'%self.url)
            return

        try:
            root = fromstring(clean_ascii_chars(raw))
        except Exception:
            msg = 'Failed to parse YES24 details page: %r'%self.url
            self.log.exception(msg)
            return

        self.parse_details(root)

    def _try_parse(self, what, parser, *args):
        '''Return ``parser(*args)``; on any error log
        "Error parsing <what> for url: <url>" with the traceback and return
        None so that one broken field does not abort the whole record.'''
        try:
            return parser(*args)
        except Exception:
            self.log.exception('Error parsing %s for url: %r' % (what, self.url))
            return None

    @staticmethod
    def _text_of(node):
        '''Return the stripped text of an lxml element, or of a plain string
        as produced by attribute XPath queries.  Missing text yields "".'''
        text = getattr(node, 'text', node)
        return (text or '').strip()

    @staticmethod
    def _split_series(series_text):
        '''Split "<name>-<index>" into ``(name, float(index))``.

        When there is no hyphen, or the part after the last hyphen is not a
        number, the whole text is the series name and the index is None.
        '''
        name, sep, index_text = series_text.rpartition('-')
        if sep:
            try:
                return (name.strip(), float(index_text))
            except ValueError:
                # Hyphen belongs to the name itself (e.g. "Sci-Fi").
                pass
        return (series_text.strip(), None)

    def parse_details(self, root):
        '''Build a ``Metadata`` object from the parsed page ``root`` and put
        it on the result queue.  Nothing is queued unless a title, at least
        one author and the YES24 id were all found.'''
        yes24_id = self._try_parse('YES24 id', self.parse_yes24_id, self.url)
        title, series, series_index = (
            self._try_parse('title and series', self.parse_title_series, root)
            or (None, None, None))
        authors = self._try_parse('authors', self.parse_authors, root) or []

        if not title or not authors or not yes24_id:
            self.log.error('Could not find title/authors/YES24 id for %r'%self.url)
            self.log.error('YES24: %r Title: %r Authors: %r'%(yes24_id, title,
                authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier('yes24', yes24_id)
        # parse_cover() below reads self.yes24_id and self.isbn, so both must
        # be stored on the instance before it runs.
        self.yes24_id = yes24_id

        isbn = self._try_parse('ISBN', self.parse_isbn, root)
        if isbn:
            self.isbn = mi.isbn = isbn

        mi.comments = self._try_parse('comments', self.parse_comments, root)

        self.cover_url = self._try_parse('cover', self.parse_cover, root)
        mi.has_cover = bool(self.cover_url)
        mi.cover_url = self.cover_url  # This is purely so we can run a test for it!!!

        mi.publisher = self._try_parse('publisher', self.parse_publisher, root)
        mi.pubdate = self._try_parse('published date', self.parse_published_date, root)

        mi.language = 'ko'
        # Read after parse_cover(), which may have adjusted self.relevance.
        mi.source_relevance = self.relevance

        if self.yes24_id and self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.yes24_id)

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)

    def parse_yes24_id(self, url):
        '''Return the numeric goods id found in ``url`` as a string, or None
        when the URL is not a YES24 goods URL.'''
        match = self._YES24_ID_RE.search(url)
        return match.group(1) if match else None

    def parse_title_series(self, root):
        '''Return ``(title, series_name, series_index)``; missing parts are
        None.  The title comes from the ``<h1>`` link, falling back to the
        ``og:title`` meta tag.'''
        title_node = root.xpath('//h1/a')
        if not title_node:
            # This query yields strings, not elements; _text_of handles both.
            title_node = root.xpath('//meta[@property="og:title"]/@content')
        if not title_node:
            return (None, None, None)
        title_text = self._text_of(title_node[0])

        # Series
        series_node = root.xpath('//span[@class="series"]/a')
        if not series_node:
            return (title_text, None, None)
        series_name, series_index = self._split_series(self._text_of(series_node[0]))
        return (title_text, series_name, series_index)

    def parse_authors(self, root):
        '''Return the list of author names, or an empty list when the brief
        info paragraph under ``div#title`` is missing.'''
        brief_nodes = root.xpath('//div[@id="title"]/p')
        if not brief_nodes:
            return []
        # NOTE(review): this relative XPath is evaluated on the document root,
        # not on the brief paragraph, so it presumably never matches and the
        # text fallback below is what runs in practice -- confirm against a
        # live page before re-targeting it at brief_nodes[0].
        author_nodes = root.xpath('a[contains(@href,"author_yn=Y")]')
        if author_nodes:
            names = [self._text_of(a) for a in author_nodes]
        else:
            # The first '|'-separated field holds the contributors; keep the
            # part in front of the " 저/" marker and split it on commas.
            bgrp = brief_nodes[0].text_content().split('|')
            author_text = bgrp[0].split(u" 저/")[0]
            names = [a.strip() for a in author_text.split(',')]
        # Drop blanks so an empty name is never reported as an author.
        return [name for name in names if name]

    def parse_isbn(self, root):
        '''Return the text of the ``dd.isbn10`` entry, or None if absent.'''
        detail_node = root.xpath('//dd[@class="isbn10"]/p')
        if detail_node:
            return self._text_of(detail_node[0]) or None
        return None

    def parse_publisher(self, root):
        '''Return the publisher name from the ``company_yn=Y`` link, or None.'''
        publ_nodes = root.xpath('//div[@id="title"]/p/a[contains(@href,"company_yn=Y")]')
        if publ_nodes:
            return self._text_of(publ_nodes[0]) or None
        return None

    def parse_published_date(self, root):
        '''Return the publication date as a ``datetime``, or None when the
        page has no ``dd.pdDate`` entry.'''
        date_node = root.xpath('//dd[@class="pdDate"]/p')
        if date_node:
            return self._convert_date_text(self._text_of(date_node[0]))
        return None

    def _convert_date_text(self, date_text):
        '''Convert text such as '2011년 8월 30일' (year, month, day) into a
        naive ``datetime.datetime``.

        :raises ValueError: if ``date_text`` is not in that format.
        '''
        match = self._DATE_RE.match(date_text)
        if match is None:
            raise ValueError('Unrecognised date format: %r' % (date_text,))
        year, month, day = (int(part) for part in match.groups())
        return datetime.datetime(year, month, day)

    def parse_comments(self, root):
        '''Return the sanitized HTML of the book description paragraph, or
        None when the page has none.'''
        description_node = root.xpath('//div/h2/img[@title="책소개"]/../../p')
        if not description_node:
            return None
        # encoding='unicode' makes lxml return text rather than bytes.
        comments = tostring(description_node[0], method='html',
                            encoding='unicode').strip()
        if not comments:
            return None
        # The markup comes straight from a web page; clean it before storing.
        return sanitize_comments_html(comments)

    def parse_cover(self, root):
        '''Return the cover image URL taken from the ``og:image`` meta tag
        (or None), caching it on the plugin under the YES24 id.'''
        image_node = root.xpath('//meta[@property="og:image"]/@content')
        if not image_node:
            return None
        page_url = image_node[0].strip()
        # Rewrite a trailing '/M' size suffix to '/L'.
        if page_url.endswith('/M'):
            page_url = page_url[:-2]+'/L'
        if self.kyobo_cover:
            new_cover = self._kyobo_hires_image(self.isbn)
            if new_cover:
                page_url = new_cover
        if not page_url:
            return None
        self.log.info('Cover URL: %s' % page_url)
        if self.yes24_id:
            self.plugin.cache_identifier_to_cover_url(self.yes24_id, page_url)
        # Lower our relevance factor in favour of an ISBN that has a full cover if possible
        # NOTE(review): this adjustment runs when a cover WAS found, which
        # does not match the comment above -- confirm the intended condition.
        self.relevance += 5
        return page_url

    def _is_valid_image(self, img_url):
        '''Placeholder validity check; currently accepts every URL.'''
        return True

    def _kyobo_hires_image(self, isbn):
        '''Return the Kyobo extra-large cover URL for a 13-character ``isbn``
        if it can be opened, otherwise None.'''
        if not isbn or len(isbn) != 13:
            return None
        img_url = "http://image.kyobobook.co.kr/images/book/xlarge/{1:s}/x{0:s}.jpg".format(isbn, isbn[-3:])
        self.log.info("try kyobo image: "+img_url)
        try:
            # Only reachability matters; close the response right away.
            self.browser.open_novisit(img_url, timeout=self.timeout).close()
        except Exception:
            return None
        return img_url