Commit fef4a3a

fixed photo spider

simonzg committed Jul 17, 2011
1 parent c63fee3 commit fef4a3a
Showing 7 changed files with 61 additions and 41 deletions.
11 changes: 11 additions & 0 deletions dianping/dianping/items.py
@@ -70,3 +70,14 @@ class DianpingImageItem(Item):
     image_urls = Field()
     images = Field()
     image_name = Field()
+
+    def __repr__(self):
+        vals = self.__dict__['_values']
+        result = ""
+        if 'image_name' in vals.keys():
+            result += vals['image_name'] +'| '
+        if 'image_urls' in vals.keys():
+            result += vals['image_urls'][0]
+        if result == '':
+            return 'No INFO!!!'
+        return result
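The new __repr__ keeps log output for DianpingImageItem compact: instead of a full dict-style dump it prints the image name and the first image URL, or 'No INFO!!!' when neither is set. A minimal sketch (outside the diff) of what it produces, with made-up field values:

# Illustration only; the values below are hypothetical.
from dianping.items import DianpingImageItem

item = DianpingImageItem()
item['image_name'] = 'storefront'
item['image_urls'] = ['http://i1.dpfile.com/pc/example.jpg']
print repr(item)   # -> storefront| http://i1.dpfile.com/pc/example.jpg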
21 changes: 3 additions & 18 deletions dianping/dianping/middlewares.py
@@ -9,24 +9,10 @@
 from dianping.items import DianpingShopItem
 from dianping.db import get_connection


-class ShopIdMiddleware(object):
-    image2shop_dict = {}
-    props2remember = ['Shop-Id','Image-Name']
-
-    def process_request(self, request, spider):
-        header_dict = {}
-        for prop_name in self.props2remember:
-            if prop_name in request.headers.keys():
-                header_dict[prop_name] = request.headers[prop_name]
-                del request.headers[prop_name]
-        if header_dict != {}:
-            self.image2shop_dict[request.url] = header_dict
-
+class RefererMiddleware(object):
     def process_response(self, request, response, spider):
-        if response.url in self.image2shop_dict.keys():
-            response.headers.update( self.image2shop_dict[response.url] )
-            # response.headers['ShopId'] = self.image2shop_dict[response.url]
-            del self.image2shop_dict[response.url]
+        if 'Referer' in request.headers.keys():
+            response.headers['Referer'] = request.headers['Referer']
         return response


 class RateLimitMiddleware(object):

@@ -45,7 +31,6 @@ def process_response(self, request, response, spider):
         return response


 class IgnoreExistingURLMiddleware(object):
-
     db = get_connection()


     def process_request(self, request, spider):
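The replacement drops ShopIdMiddleware's per-URL bookkeeping: instead of stashing Shop-Id and Image-Name headers keyed by request URL, the new RefererMiddleware simply copies the request's Referer header onto the response, so the spider callback can see which photo-list page (and therefore which shop) the image page was reached from. A standalone sketch (not part of the commit) of that behaviour, using scrapy.http objects and hypothetical URLs:

from scrapy.http import Request, HtmlResponse

class RefererMiddleware(object):
    def process_response(self, request, response, spider):
        if 'Referer' in request.headers.keys():
            response.headers['Referer'] = request.headers['Referer']
        return response

req = Request('http://www.dianping.com/photos/123456',
              headers={'Referer': 'http://www.dianping.com/shop/2830624/photos'})
resp = HtmlResponse(url=req.url, body='<html></html>', request=req)
resp = RefererMiddleware().process_response(req, resp, spider=None)
print resp.headers.get('Referer')   # the photo-list URL the photo page was linked from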
11 changes: 7 additions & 4 deletions dianping/dianping/pipelines.py
@@ -4,7 +4,7 @@
 # See: http://doc.scrapy.org/topics/item-pipeline.html
 from scrapy import log
 from os.path import join
-from scrapy.exceptions import IgnoreRequest
+from scrapy.exceptions import DropItem
 import gridfs

 from dianping.db import get_connection
@@ -19,10 +19,10 @@ def __init__(self):

     def process_item(self, item, spider):
         if isinstance(item, DianpingShopItem):
-            self.process_detail_item(item, spider)
+            return self.process_detail_item(item, spider)

         if isinstance(item, DianpingImageItem):
-            self.process_image_item(item, spider)
+            return self.process_image_item(item, spider)

     def process_detail_item(self, item, spider):
         val_dict = item.__dict__['_values']
@@ -43,8 +43,11 @@ def process_image_item(self, item, spider):
             assert len(item['images']) > 0
             assert len(item['image_name']) > 0
         except:
-            raise IgnoreRequest
+            raise DropItem

+        # drop existing images
+        if self.fs.exists({'url':item['images'][0]['url']}):
+            raise DropItem

         file = open(join(IMAGES_STORE, item['images'][0]['path']),'r')
         self.fs.put(file, filename=item['image_name']+'.jpg', shop_id=item['shop_id'], url=item['images'][0]['url'])
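Two things change here: the pipeline now raises DropItem, the exception item pipelines are expected to use to discard an item (IgnoreRequest belongs to downloader middleware), and it skips images whose URL is already stored in GridFS. A small sketch (outside the pipeline) of the same duplicate check, assuming get_connection() returns the pymongo database the pipeline writes to and using a hypothetical image URL and file path:

import gridfs
from dianping.db import get_connection

fs = gridfs.GridFS(get_connection(), collection='images')

url = 'http://i1.dpfile.com/pc/example.jpg'     # hypothetical image URL
if not fs.exists({'url': url}):                 # same query the pipeline uses
    fs.put(open('/tmp/images/full/example.jpg', 'rb'),
           filename='example.jpg', shop_id='2830624', url=url)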
19 changes: 11 additions & 8 deletions dianping/dianping/settings.py
@@ -23,17 +23,21 @@

 # --- Middlewares ---
 DOWNLOADER_MIDDLEWARES = [
-    'dianping.middlewares.ShopIdMiddleware',
-    'dianping.middlewares.IgnoreVisitedUrlMiddleware',
-    'dianping.middlewares.IgnoreExistingURLMiddleware',
-    'dianping.middlewares.RateLimitMiddleware',
-    'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware',
-    'scrapy.contrib.spidermiddleware.depth.DepthMiddleware',
+    'scrapy.contrib.spidermiddleware.referer.RefererMiddleware', # add 'Referer' to request based on response
+    'dianping.middlewares.RefererMiddleware', # update 'Referer' field on response based on request
+    'dianping.middlewares.IgnoreVisitedUrlMiddleware', # prevent re-visit a url
+    'dianping.middlewares.IgnoreExistingURLMiddleware', # prevent re-visit a url based on database
+    'dianping.middlewares.RateLimitMiddleware', # prevent over limit
+    'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware', # Cache
+    #'scrapy.contrib.spidermiddleware.depth.DepthMiddleware', # depth control
 ]

 # --- Pipelines ---
 IMAGES_STORE = '/tmp/images/'
-ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline','dianping.pipelines.DianpingPipeline']
+ITEM_PIPELINES = [
+    'scrapy.contrib.pipeline.images.ImagesPipeline',
+    'dianping.pipelines.DianpingPipeline',
+]

 # --- Depth limit ---
 # DEPTH_LIMIT=10
@@ -79,6 +83,5 @@
     if match:
         shop_id = match.group(1)
         urls.append('http://www.dianping.com/shop/%s/photos' % shop_id)
-print len(urls)
 SEEDS['photo'] = urls
 # SEED_FILE=join(dirname(__file__), 'seeds', 'major-cities.txt')
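The seed list for the photo spider is built from shop ids, one '/photos' URL per shop; the removed print was just debug output. A sketch of that pattern with an assumed regex and a made-up input list (the real settings.py reads shop URLs from a seed source defined elsewhere in the file):

import re

shop_url_pattern = re.compile(r'shop/(\d+)')    # assumed pattern, not shown in the diff
lines = ['http://www.dianping.com/shop/2830624',
         'http://www.dianping.com/shop/1999627']

urls = []
for line in lines:
    match = shop_url_pattern.search(line)
    if match:
        shop_id = match.group(1)
        urls.append('http://www.dianping.com/shop/%s/photos' % shop_id)

SEEDS = {'photo': urls}   # ShopImageSpider.start_requests() checks settings for SEEDS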
29 changes: 18 additions & 11 deletions dianping/dianping/spiders/photo.py
@@ -10,6 +10,7 @@
 from scrapy.selector import HtmlXPathSelector

 from scrapy.exceptions import IgnoreRequest
+from scrapy import log

 from dianping.items import DianpingImageItem
 from dianping import settings
@@ -20,11 +21,11 @@ class ShopImageSpider(CrawlSpider):
     allowed_domains = ['dianping.com', 'i1.dpfile.com','i2.dpfile.com','i3.dpfile.com']
     rules = (
         Rule(SgmlLinkExtractor(allow=('shop/\d+/photos(\?pg=\d+)*'), restrict_xpaths="//a[@class='NextPage']", unique=True), callback="parse_image_list_page"), # page list page & next page
-        #Rule(SgmlLinkExtractor(allow=('photos/\d+'), restrict_xpaths="//div[@class='gallery-list-wrapper page-block']", unique=True), callback="extract_image"), # photo page
+        Rule(SgmlLinkExtractor(allow=('photos/\d+'), restrict_xpaths="//div[@class='gallery-list-wrapper page-block']", unique=True), callback="extract_image"), # photo page
         Rule(SgmlLinkExtractor(allow=('.+\d+p\d+(n\d+)?/?.*$'), restrict_xpaths="//a[@class='NextPage']", unique=True)), # next page
     )

-    start_urls = ['http://www.dianping.com/shop/2830624/photos',]
+    # start_urls = ['http://www.dianping.com/shop/1999627/photos',]

     def start_requests(self):
         if 'SEEDS' in settings.__dict__.keys():
@@ -42,29 +43,35 @@ def parse_image_list_page(self, response):
         hxs = HtmlXPathSelector(response)
         selector = SgmlLinkExtractor(allow=('photos/\d+'), restrict_xpaths="//div[@class='gallery-list-wrapper page-block']", unique=True)
         next_page_link = SgmlLinkExtractor(allow=('shop/\d+/photos(\?pg=\d+)*'), restrict_xpaths="//a[@class='NextPage']", unique=True)
-        match = self.image_list_url_pattern.match(response.url)
-        if match:
-            shop_id = match.group(1)

         # Prepare cookies
         cookies = {}
         if 'Set-Cookie' in response.headers:
             for eq in response.headers['Set-Cookie'].split(';'):
                 k,v = eq.strip().split('=')
                 cookies[k] = v

+        requests = []
         # follow next-page
         for link in next_page_link.extract_links(response):
-            yield Request(link.url, cookies=cookies, callback=self.parse_image_list_page)
+            req = Request(link.url, cookies=cookies, callback=self.parse_image_list_page)
+            requests.append(req)

         # follow image link
         for link in selector.extract_links(response):
-            img_name = hxs.select('//a[@href="%s"]/img/../../../..//strong/@title' %urlparse(link.url).path).extract()[0]
-            yield Request(link.url, headers={'Shop-Id':shop_id, "Image-Name":img_name}, cookies=cookies, callback=self.extract_image)
+            req = Request(link.url, cookies=cookies, callback=self.extract_image)
+            requests.append(req)

+        for req in requests:
+            yield req

     def extract_image(self, response):
-        shop_id = response.headers.get("Shop-Id",'')
-        img_name = response.headers.get("Image-Name",'')
+        hxs = HtmlXPathSelector(response)
+        shop_id_match = self.image_list_url_pattern.match(response.headers.get('Referer'))
+        if shop_id_match:
+            shop_id = shop_id_match.group(1)
+        img_name = hxs.select('//div[@class="page-main-title"]/h1/text()').extract()[0]

         if len(shop_id) <= 0 or len(img_name) <= 0:
             raise IgnoreRequest
         item = DianpingImageItem()
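extract_image no longer relies on the custom Shop-Id and Image-Name headers: the shop id is parsed out of the Referer header (restored on the response by the RefererMiddleware above) and the image name is read from the photo page itself. A standalone sketch (not part of the commit) of the Referer parsing, with an assumed regex since image_list_url_pattern is defined outside this hunk:

import re

# Assumed to roughly match the spider's image_list_url_pattern.
image_list_url_pattern = re.compile(r'http://www\.dianping\.com/shop/(\d+)/photos')

referer = 'http://www.dianping.com/shop/2830624/photos?pg=3'   # hypothetical Referer value
match = image_list_url_pattern.match(referer)
if match:
    print match.group(1)   # -> 2830624, the shop id stored on the DianpingImageItem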
Empty file.
11 changes: 11 additions & 0 deletions dianping/dianping/utils/remove_duplicate_image.py
@@ -0,0 +1,11 @@
+from .. import db
+from db import get_connection
+import gridfs
+
+db = get_connection()
+fs = gridfs.GridFS(db, collection='images')
+
+for img in db.images.files.find():
+    if db.images.files.find({'url':img['url']}).count() > 1:
+        print "DELETING: " + img['filename']
+        fs.delete(img['_id'])
