fixed photo spider

1 parent c63fee3 · commit fef4a3a5aa4897807d575f9842db6026d797c272 · @simonzg committed Jul 17, 2011
@@ -70,3 +70,14 @@ class DianpingImageItem(Item):
image_urls = Field()
images = Field()
image_name = Field()
+
+ def __repr__(self):
+ vals = self.__dict__['_values']
+ result = ""
+ if 'image_name' in vals.keys():
+ result += vals['image_name'] +'| '
+ if 'image_urls' in vals.keys():
+ result += vals['image_urls'][0]
+ if result == '':
+ return 'No INFO!!!'
+ return result
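
For reference, Scrapy's Item type is dict-like, so a __repr__ equivalent to the one added above can be written against the item's mapping interface instead of reaching into __dict__['_values']. A minimal sketch, assuming only the three fields declared in this hunk (the class name here is a stand-in, not the project's):

from scrapy.item import Item, Field

class ImageItemSketch(Item):
    # mirrors DianpingImageItem's fields
    image_urls = Field()
    images = Field()
    image_name = Field()

    def __repr__(self):
        # Item supports dict-style access, so get() replaces the _values lookup
        parts = []
        if self.get('image_name'):
            parts.append(self['image_name'])
        if self.get('image_urls'):
            parts.append(self['image_urls'][0])
        return ' | '.join(parts) or 'No INFO!!!'
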
@@ -9,24 +9,10 @@
from dianping.items import DianpingShopItem
from dianping.db import get_connection
-class ShopIdMiddleware(object):
- image2shop_dict = {}
- props2remember = ['Shop-Id','Image-Name']
-
- def process_request(self, request, spider):
- header_dict = {}
- for prop_name in self.props2remember:
- if prop_name in request.headers.keys():
- header_dict[prop_name] = request.headers[prop_name]
- del request.headers[prop_name]
- if header_dict != {}:
- self.image2shop_dict[request.url] = header_dict
-
+class RefererMiddleware(object):
def process_response(self, request, response, spider):
- if response.url in self.image2shop_dict.keys():
- response.headers.update( self.image2shop_dict[response.url] )
- # response.headers['ShopId'] = self.image2shop_dict[response.url]
- del self.image2shop_dict[response.url]
+ if 'Referer' in request.headers.keys():
+ response.headers['Referer'] = request.headers['Referer']
return response
class RateLimitMiddleware(object):
@@ -45,7 +31,6 @@ def process_response(self, request, response, spider):
return response
class IgnoreExistingURLMiddleware(object):
-
db = get_connection()
def process_request(self, request, spider):
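
The ShopIdMiddleware removed above stashed per-request headers in a dict keyed by URL; the new RefererMiddleware instead just echoes the request's Referer header back onto the response. A quick way to exercise it in isolation; the dianping.middlewares import path is taken from the DOWNLOADER_MIDDLEWARES entry in settings.py, and both URLs are placeholders in the site's format:

from scrapy.http import Request, HtmlResponse
from dianping.middlewares import RefererMiddleware

req = Request('http://www.dianping.com/photos/1234567',
              headers={'Referer': 'http://www.dianping.com/shop/2830624/photos'})
resp = HtmlResponse(req.url, body='<html></html>', request=req)
resp = RefererMiddleware().process_response(req, resp, spider=None)
# the response now carries the Referer the request was sent with,
# which is what extract_image() reads back out further down
assert resp.headers['Referer'] == 'http://www.dianping.com/shop/2830624/photos'
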
@@ -4,7 +4,7 @@
# See: http://doc.scrapy.org/topics/item-pipeline.html
from scrapy import log
from os.path import join
-from scrapy.exceptions import IgnoreRequest
+from scrapy.exceptions import DropItem
import gridfs
from dianping.db import get_connection
@@ -19,10 +19,10 @@ def __init__(self):
def process_item(self, item, spider):
if isinstance(item, DianpingShopItem):
- self.process_detail_item(item, spider)
+ return self.process_detail_item(item, spider)
if isinstance(item, DianpingImageItem):
- self.process_image_item(item, spider)
+ return self.process_image_item(item, spider)
def process_detail_item(self, item, spider):
val_dict = item.__dict__['_values']
@@ -43,8 +43,11 @@ def process_image_item(self, item, spider):
assert len(item['images']) > 0
assert len(item['image_name']) > 0
except:
- raise IgnoreRequest
+ raise DropItem
+ # drop existing images
+ if self.fs.exists({'url':item['images'][0]['url']}):
+ raise DropItem
file = open(join(IMAGES_STORE, item['images'][0]['path']),'r')
self.fs.put(file, filename=item['image_name']+'.jpg', shop_id=item['shop_id'], url=item['images'][0]['url'])
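
The duplicate check added above works because fs.put() stores the extra url (and shop_id) keyword arguments as metadata on each GridFS file, which fs.exists() can then query. Roughly, assuming the same 'images' collection that the cleanup script at the end of this commit uses, with a hypothetical image URL and a hypothetical file under IMAGES_STORE:

import gridfs
from dianping.db import get_connection

db = get_connection()
fs = gridfs.GridFS(db, collection='images')

url = 'http://i1.dpfile.com/photos/example.jpg'      # hypothetical image URL
if fs.exists({'url': url}):
    # the pipeline raises DropItem here rather than storing a second copy
    print 'already stored, dropping'
else:
    f = open('/tmp/images/example.jpg', 'r')         # hypothetical path under IMAGES_STORE
    fs.put(f, filename='example.jpg', shop_id='2830624', url=url)
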
@@ -23,17 +23,21 @@
# --- Middlewares ---
DOWNLOADER_MIDDLEWARES = [
- 'dianping.middlewares.ShopIdMiddleware',
- 'dianping.middlewares.IgnoreVisitedUrlMiddleware',
- 'dianping.middlewares.IgnoreExistingURLMiddleware',
- 'dianping.middlewares.RateLimitMiddleware',
- 'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware',
- 'scrapy.contrib.spidermiddleware.depth.DepthMiddleware',
+ 'scrapy.contrib.spidermiddleware.referer.RefererMiddleware', # add 'Referer' to request based on response
+ 'dianping.middlewares.RefererMiddleware', # update 'Referer' field on response based on request
+ 'dianping.middlewares.IgnoreVisitedUrlMiddleware', # prevent re-visit a url
+ 'dianping.middlewares.IgnoreExistingURLMiddleware', # prevent re-visit a url based on database
+ 'dianping.middlewares.RateLimitMiddleware', # prevent over limit
+ 'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware', # Cache
+ #'scrapy.contrib.spidermiddleware.depth.DepthMiddleware', # depth control
]
# --- Pipelines ---
IMAGES_STORE = '/tmp/images/'
-ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline','dianping.pipelines.DianpingPipeline']
+ITEM_PIPELINES = [
+ 'scrapy.contrib.pipeline.images.ImagesPipeline',
+ 'dianping.pipelines.DianpingPipeline',
+]
# --- Depth limit ---
# DEPTH_LIMIT=10
@@ -79,6 +83,5 @@
if match:
shop_id = match.group(1)
urls.append('http://www.dianping.com/shop/%s/photos' % shop_id)
-print len(urls)
SEEDS['photo'] = urls
# SEED_FILE=join(dirname(__file__), 'seeds', 'major-cities.txt')
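
The seed block just above turns each matched shop id into a .../shop/<id>/photos start URL. A self-contained version of the same construction; the shop-URL regex and the input line are assumptions, since the real pattern and seed source sit earlier in settings.py, outside this hunk:

import re

shop_url_pattern = re.compile(r'http://www\.dianping\.com/shop/(\d+)')      # assumed pattern
urls = []
for line in ['http://www.dianping.com/shop/2830624']:                       # hypothetical seed entry
    match = shop_url_pattern.match(line)
    if match:
        shop_id = match.group(1)
        urls.append('http://www.dianping.com/shop/%s/photos' % shop_id)

SEEDS = {'photo': urls}   # read back by ShopImageSpider.start_requests() below
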
@@ -10,6 +10,7 @@
from scrapy.selector import HtmlXPathSelector
from scrapy.exceptions import IgnoreRequest
+from scrapy import log
from dianping.items import DianpingImageItem
from dianping import settings
@@ -20,11 +21,11 @@ class ShopImageSpider(CrawlSpider):
allowed_domains = ['dianping.com', 'i1.dpfile.com','i2.dpfile.com','i3.dpfile.com']
rules = (
Rule(SgmlLinkExtractor(allow=('shop/\d+/photos(\?pg=\d+)*'), restrict_xpaths="//a[@class='NextPage']", unique=True), callback="parse_image_list_page"), # page list page & next page
- #Rule(SgmlLinkExtractor(allow=('photos/\d+'), restrict_xpaths="//div[@class='gallery-list-wrapper page-block']", unique=True), callback="extract_image"), # photo page
+ Rule(SgmlLinkExtractor(allow=('photos/\d+'), restrict_xpaths="//div[@class='gallery-list-wrapper page-block']", unique=True), callback="extract_image"), # photo page
Rule(SgmlLinkExtractor(allow=('.+\d+p\d+(n\d+)?/?.*$'), restrict_xpaths="//a[@class='NextPage']", unique=True)), # next page
)
- start_urls = ['http://www.dianping.com/shop/2830624/photos',]
+# start_urls = ['http://www.dianping.com/shop/1999627/photos',]
def start_requests(self):
if 'SEEDS' in settings.__dict__.keys():
@@ -42,29 +43,35 @@ def parse_image_list_page(self, response):
hxs = HtmlXPathSelector(response)
selector = SgmlLinkExtractor(allow=('photos/\d+'), restrict_xpaths="//div[@class='gallery-list-wrapper page-block']", unique=True)
next_page_link = SgmlLinkExtractor(allow=('shop/\d+/photos(\?pg=\d+)*'), restrict_xpaths="//a[@class='NextPage']", unique=True)
- match = self.image_list_url_pattern.match(response.url)
- if match:
- shop_id = match.group(1)
-
# Prepare cookies
cookies = {}
if 'Set-Cookie' in response.headers:
for eq in response.headers['Set-Cookie'].split(';'):
k,v = eq.strip().split('=')
cookies[k] = v
+ requests = []
# follow next-page
for link in next_page_link.extract_links(response):
- yield Request(link.url, cookies=cookies, callback=self.parse_image_list_page)
+ req = Request(link.url, cookies=cookies, callback=self.parse_image_list_page)
+ requests.append(req)
# follow image link
for link in selector.extract_links(response):
- img_name = hxs.select('//a[@href="%s"]/img/../../../..//strong/@title' %urlparse(link.url).path).extract()[0]
- yield Request(link.url, headers={'Shop-Id':shop_id, "Image-Name":img_name}, cookies=cookies, callback=self.extract_image)
+ req = Request(link.url, cookies=cookies, callback=self.extract_image)
+ requests.append(req)
+
+ for req in requests:
+ yield req
def extract_image(self, response):
- shop_id = response.headers.get("Shop-Id",'')
- img_name = response.headers.get("Image-Name",'')
+ hxs = HtmlXPathSelector(response)
+
+ shop_id_match = self.image_list_url_pattern.match(response.headers.get('Referer'))
+ if shop_id_match:
+ shop_id = shop_id_match.group(1)
+ img_name = hxs.select('//div[@class="page-main-title"]/h1/text()').extract()[0]
+
if len(shop_id) <= 0 or len(img_name) <= 0:
raise IgnoreRequest
item = DianpingImageItem()
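
With the Referer now preserved on the response, extract_image() recovers the shop id by matching image_list_url_pattern against it and takes the image title from the page itself. A small sketch of that id extraction; the regex is an assumption modelled on the allow patterns in the crawl rules, since the real image_list_url_pattern is defined on the spider class outside this hunk:

import re

image_list_url_pattern = re.compile(r'http://www\.dianping\.com/shop/(\d+)/photos(\?pg=\d+)?')  # assumed
referer = 'http://www.dianping.com/shop/2830624/photos?pg=2'    # hypothetical list-page URL
match = image_list_url_pattern.match(referer)
shop_id = match.group(1) if match else ''
assert shop_id == '2830624'
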
@@ -0,0 +1,11 @@
+from .. import db
+from db import get_connection
+import gridfs
+
+db = get_connection()
+fs = gridfs.GridFS(db, collection='images')
+
+for img in db.images.files.find():
+ if db.images.files.find({'url':img['url']}).count() > 1:
+ print "DELETING: " + img['filename']
+ fs.delete(img['_id'])
