Commit
db connects to mongodb@vega
simonzg committed Jun 9, 2011
1 parent 3968afe commit fe98134
Showing 4 changed files with 27 additions and 19 deletions.
3 changes: 2 additions & 1 deletion dianping/dianping/db.py
@@ -1,8 +1,9 @@
import pymongo

whaleshark = '128.210.189.88'
vega = '204.62.14.55'
def get_connection():
conn = pymongo.Connection(whaleshark).dianping
conn = pymongo.Connection(vega).dianping
conn.authenticate('dianping','crawler')
return conn
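
For reference, a minimal usage sketch of the updated helper, assuming the vega host is reachable and the 'dianping'/'crawler' credentials above are valid; the example shop URL is made up, and the shops/link_url query mirrors the lookup PeepMiddleware performs below:

from dianping.db import get_connection

db = get_connection()  # pymongo database handle for 'dianping' on vega
# look up a shop by its page URL, as the middleware does for each request
shop = db.shops.find_one({'link_url': 'http://www.dianping.com/shop/12345'})  # hypothetical URL
if shop is None:
    print 'shop not crawled yet'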

21 changes: 12 additions & 9 deletions dianping/dianping/middlewares.py
@@ -1,8 +1,7 @@
# coding=utf-8
import time

from scrapy import log
from scrapy.http import Request
from scrapy.project import crawler
from scrapy.item import BaseItem
from scrapy.utils.request import request_fingerprint
from scrapy.exceptions import IgnoreRequest
@@ -12,11 +11,14 @@
from dianping.db import get_connection

class PeepMiddleware(object):
limit_indicators = (
'对不起,你访问的太快了',
#'发布的所有内容,未经许可,不得转载',
)

def __init__(self, *args, **kwargs):
super(PeepMiddleware, self).__init__(*args, **kwargs)
self.db = get_connection()
self.n_deny = 0
self.interval = 1

def process_request(self, request, spider):
if self.db.shops.find_one({'link_url':request.url}):
@@ -27,11 +29,12 @@ def process_request(self, request, spider):

def process_response(self, request, response, spider):
# log.msg('Response: %s'%response.url, log.INFO)
if "对不起,你访问的太快了" in response.body:
self.n_deny += 1
sleep_sec = self.n_deny*self.interval
log.msg('Too FAST, wait for %d seconds' %sleep_sec, log.WARNING)
time.sleep(sleep_sec)
for indicator in self.limit_indicators:
if indicator in response.body:
log.msg('TOOOOOO FAST!', log.WARNING)
crawler.engine.close_spider(spider, 'over the limit')
raise IgnoreRequest

return response

class IgnoreVisitedUrlMiddleware(object):
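
The indicator string '对不起,你访问的太快了' is dianping's "sorry, you are visiting too fast" rate-limit notice; rather than sleeping with a growing back-off as before, the middleware now closes the spider and drops the request when it appears. Registering the middlewares is not part of this commit, but a hedged sketch of what the settings.py entry might look like (the priority numbers are arbitrary illustrative choices):

DOWNLOADER_MIDDLEWARES = {
    'dianping.middlewares.PeepMiddleware': 543,
    'dianping.middlewares.IgnoreVisitedUrlMiddleware': 544,
}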
5 changes: 4 additions & 1 deletion dianping/dianping/settings.py
@@ -33,12 +33,15 @@
#CONCURRENT_REQUESTS_PER_SPIDER=1
#CONCURRENT_SPIDERS=1

DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 2
DOWNLOAD_TIMEOUT = 20
RANDOMIZE_DOWNLOAD_DELAY = True

# LOG_FILE = 'crawl.log'
ITEM_PIPELINES = ['dianping.pipelines.DianpingPipeline']

# SEED that the spider starts with
SEEDS= (
'http://www.dianping.com/beijing',
)
SEED_FILE=join(dirname(__file__), 'seeds', 'major-cities.txt')
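
With both SEEDS and SEED_FILE defined, the spider below prefers SEEDS and only falls back to the file. The seed file is plain text with one start URL per line and '#' marking comment lines; a hypothetical excerpt of seeds/major-cities.txt (only the beijing URL actually appears in this commit):

# one city landing page per line
http://www.dianping.com/beijing
#http://www.dianping.com/shanghai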
17 changes: 9 additions & 8 deletions dianping/dianping/spiders/shop.py
@@ -24,10 +24,15 @@ class ShopDetailSpider(CrawlSpider):
# start_urls = ['http://www.dianping.com/search/category/9/10/r1629g4479',]

def start_requests(self):
seed_file = join(dirname(abspath(__file__)), pardir, 'seeds', 'shanghai.txt')
for line in open(settings.SEED_FILE,'r').readlines():
if line.startswith('#'): continue
yield Request(line.strip(), dont_filter=True)
if 'SEEDS' in settings.__dict__.keys():
for seed in settings.SEEDS:
yield Request(seed, dont_filter=True)
elif 'SEED_FILE' in settings.__dict__.keys():
for line in open(settings.SEED_FILE,'r').readlines():
if line.startswith('#'): continue
yield Request(line.strip(), dont_filter=True)
else:
raise KeyError('neither SEEDS nor SEED_FILE defined in settings.py')

def parse_shop_detail(self, response):
print response.url
@@ -121,7 +126,3 @@ def parse_name_count(self, beautifulsoup_tag, tag_name='strong'):
ret.append( (name,count,))
return ret




return item
