-
Notifications
You must be signed in to change notification settings - Fork 1
/
pipelines.py
29 lines (23 loc) · 1000 Bytes
/
pipelines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import os
import signals
class SaveFilePipeline:
    """Item pipeline that writes each item's downloaded payload to disk.

    The hook names (``open_spider`` / ``process_item``) follow the Scrapy
    item-pipeline interface. After a payload is written, a
    ``signals.file_downloaded`` signal is sent through the spider's crawler.
    """

    # Directory the payloads are written to; resolved in ``open_spider``.
    save_dir = None

    def open_spider(self, spider):
        """Resolve the output directory and make sure it exists.

        Uses ``spider.save_dir`` when the spider defines that attribute,
        otherwise falls back to ``<cwd>/data/files/``.
        """
        save_dir = getattr(
            spider, 'save_dir', os.path.join(os.getcwd(), 'data/files/')
        )
        # makedirs rather than mkdir: the default path is nested, so the
        # intermediate ``data`` directory may be missing too. ``exist_ok``
        # also removes the check-then-create race of isdir() + mkdir().
        os.makedirs(save_dir, exist_ok=True)
        self.save_dir = save_dir

    def process_item(self, item, spider):
        """Write ``item['file']`` to ``save_dir`` and announce the download.

        The ``'file'`` entry (a bytes payload) is removed from the item so
        the returned item no longer carries it; ``'file_name'`` is kept.

        Returns:
            The same item, without its ``'file'`` entry.

        Raises:
            KeyError: if the item has no ``'file'`` or ``'file_name'`` entry.
        """
        file = item.pop('file')
        file_name = item['file_name']
        # A PDF document starts with the magic bytes ``%PDF``.
        is_pdf = file.startswith(b'%PDF')

        # NOTE(review): file_name is joined as-is; if it can come from
        # scraped content, a value containing path separators or ``..``
        # would write outside save_dir -- confirm it is sanitised upstream.
        with open(os.path.join(self.save_dir, file_name), 'wb') as f:
            f.write(file)

        spider.crawler.signals.send_catch_log(
            signal=signals.file_downloaded,
            file_name=file_name,
            total_count=spider.articles_count,
            downloaded_count=spider.downloaded_count,
            is_pdf=is_pdf,
        )
        return item