Commit fad959e: "removed whitespaces"

jmg committed Nov 14, 2011
1 parent a14ee46

Showing 27 changed files with 236 additions and 237 deletions.

crawley/crawlers/base.py (2 additions, 2 deletions)

@@ -113,8 +113,8 @@ def _initialize_scrapers(self):
        """
        Instanciates all the scraper classes
        """
        self.scrapers = [scraper_class(settings=self.settings) for scraper_class in self.scrapers]

    def _make_request(self, url, data=None):
        """

crawley/crawlers/fast.py (2 additions, 2 deletions)

@@ -2,8 +2,8 @@
from crawley.http.managers import FastRequestManager

class FastCrawler(BaseCrawler):

    def __init__(self, *args, **kwargs):

        BaseCrawler.__init__(self, *args, **kwargs)
        self.request_manager = FastRequestManager()

crawley/crawlers/offline.py (16 additions, 16 deletions)

@@ -4,40 +4,40 @@
from StringIO import StringIO

class OffLineCrawler(BaseCrawler):

    def __init__(self, *args, **kwargs):

        BaseCrawler.__init__(self, *args, **kwargs)

    def _get_response(self, url, data=None):

        response = BaseCrawler._get_response(self, url, data)

        fixer = HTMLFixer(self._url_regex, url, response.raw_html)
        html = fixer.get_fixed_html()

        return html


class HTMLFixer(object):

    def __init__(self, url_regex, url, html):

        self._url_regex = url_regex
        self.url = url
        self.html_tree = XPathExtractor().get_object(html)

    def get_fixed_html(self):

        self._fix_tags("link", "href")
        self._fix_tags("img", "src")

        return etree.tostring(self.html_tree.getroot(), pretty_print=True, method="html")

    def _fix_tags(self, tag, attrib):

        tags = self.html_tree.xpath("//%s" % tag)

        for tag in tags:
            if not self._url_regex.match(tag.attrib[attrib]):
                tag.attrib[attrib] = "%s/%s" % (self.url, tag.attrib[attrib])
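
What _fix_tags accomplishes is easier to see on a toy document. Here is a minimal sketch of the same rewriting using lxml directly, with an invented base URL and regex (crawley's XPathExtractor wrapper is bypassed):

    import re
    from lxml import etree

    url_regex = re.compile(r"^https?://")   # absolute URLs are left alone
    base_url = "http://example.com"         # hypothetical page URL

    tree = etree.HTML('<html><body><img src="logo.png"/></body></html>')
    for img in tree.xpath("//img"):
        if not url_regex.match(img.attrib["src"]):
            # relative src is rewritten to http://example.com/logo.png
            img.attrib["src"] = "%s/%s" % (base_url, img.attrib["src"])

    print etree.tostring(tree, pretty_print=True, method="html")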

crawley/exceptions.py (10 additions, 10 deletions)

@@ -6,38 +6,38 @@ class AuthenticationError(Exception):
    """
    Raised when a login error occurs
    """

    def __init__(self, *args, **kwargs):

        Exception.__init__(self, *args, **kwargs)


class TemplateSyntaxError(Exception):
    """
    DSL Template sintax error
    """

    def __init__(self, line=0, *args, **kwargs):

        self.line = line
        Exception.__init__(self, *args, **kwargs)


class ScraperCantParseError(Exception):
    """
    Raised when a scraper can't parse an html page
    """

    def __init__(self, *args, **kwargs):

        Exception.__init__(self, *args, **kwargs)


class InvalidProjectError(Exception):
    """
    Raised when the user opens a invalid directory with the browser
    """

    def __init__(self, *args, **kwargs):

        Exception.__init__(self, *args, **kwargs)
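
Of these, only TemplateSyntaxError carries extra state: the template line where the DSL parser failed. A brief usage sketch (the line number and message are invented):

    try:
        raise TemplateSyntaxError(7, "unexpected token")
    except TemplateSyntaxError, e:
        print "syntax error at line %d: %s" % (e.line, e)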

crawley/http/cookies.py (14 additions, 14 deletions)

@@ -2,38 +2,38 @@
import urllib2
import cookielib
import tempfile

class CookieHandler(urllib2.HTTPCookieProcessor):
    """
    Cookie jar wrapper for save and load cookie from a file
    """

    COOKIES_FILE = "crawley_cookies"

    def _make_temp_file(self):

        tmp = tempfile.gettempdir()
        self.cookie_file = os.path.join(tmp, self.COOKIES_FILE)

    def __init__(self, *args, **kwargs):

        self._make_temp_file()

        self._jar = cookielib.LWPCookieJar(self.cookie_file)
        urllib2.HTTPCookieProcessor.__init__(self, self._jar, *args, **kwargs)

    def load_cookies(self):
        """
        Load cookies from the file
        """

        if os.path.isfile(self.cookie_file):
            self._jar.load()

    def save_cookies(self):
        """
        Save cookies if the jar is not empty
        """

        if self._jar is not None:
            self._jar.save()
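
A short round-trip sketch of how this handler is meant to be used, assuming it is installed into an opener the way crawley's request managers do (the URL is a placeholder):

    import urllib2

    handler = CookieHandler()
    opener = urllib2.build_opener(handler)

    opener.open("http://example.com")   # any Set-Cookie headers land in the jar
    handler.save_cookies()              # written to e.g. /tmp/crawley_cookies

    # later, possibly in another process:
    handler2 = CookieHandler()
    handler2.load_cookies()             # jar repopulated from the same temp file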

crawley/http/managers.py (9 additions, 9 deletions)

@@ -42,25 +42,25 @@ def __init__(self, settings=None, delay=None, deviation=None):
        self.delay = delay
        self.deviation = deviation
        self.settings = settings

        self._install_opener()

    def _install_opener(self):

        if has_valid_attr(self.settings, 'PROXY_HOST') and has_valid_attr(self.settings, 'PROXY_PORT'):

            proxy_info = { #proxy information
                'user' : getattr(self.settings, 'PROXY_USER', ''),
                'pass' : getattr(self.settings, 'PROXY_PASS', ''),
                'host' : getattr(self.settings, 'PROXY_HOST', ''), #localhost
                'port' : getattr(self.settings, 'PROXY_PORT', 80)
            }

            # build a new opener that uses a proxy requiring authorization
            proxy = urllib2.ProxyHandler({"http" : "http://%(user)s:%(pass)s@%(host)s:%(port)d" % proxy_info})
            self.opener = urllib2.build_opener(proxy, self.cookie_handler)

        else:
            self.opener = urllib2.build_opener(self.cookie_handler)

    def _get_request(self, url):
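
The proxy branch above boils down to %-formatting proxy_info into a single authenticated proxy URL. The same construction in isolation, with invented credentials:

    import urllib2

    proxy_info = {'user': 'bob', 'pass': 'secret', 'host': '127.0.0.1', 'port': 8080}

    # expands to "http://bob:secret@127.0.0.1:8080"
    proxy = urllib2.ProxyHandler({"http": "http://%(user)s:%(pass)s@%(host)s:%(port)d" % proxy_info})
    opener = urllib2.build_opener(proxy)
    # opener.open(...) now routes plain-http requests through the proxy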

crawley/http/request.py (6 additions, 6 deletions)

@@ -32,17 +32,17 @@ def get_response(self, data=None, delay_factor=1):
        Returns the response object from a request.
        Cookies are supported via a CookieHandler object
        """

        """The proxy settings is used as the following dictionary"""

        self._normalize_url()

        request = urllib2.Request(self.url, data, self.headers)

        args = {}
        if config.REQUEST_TIMEOUT is not None:
            args["timeout"] = config.REQUEST_TIMEOUT

        response = self.opener.open(request, **args)
        self.cookie_handler.save_cookies()

@@ -68,14 +68,14 @@ def __init__(self, delay=0, deviation=0, **kwargs):
        deviation = deviation * FACTOR
        randomize = random.randint(-deviation, deviation) / FACTOR

        self.delay = delay + randomize
        Request.__init__(self, **kwargs)

    def get_response(self, data=None, delay_factor=1):
        """
        Waits [delay] miliseconds and then make the request
        """

        delay = self.delay * delay_factor
        time.sleep(delay)
        return Request.get_response(self, data)
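
The FACTOR scaling in DelayedRequest exists because random.randint wants integer bounds while the deviation is naturally fractional. A sketch of just that jitter computation; FACTOR's actual value lives in crawley, 100 is assumed here:

    import random

    FACTOR = 100                    # assumed; the real constant is defined in crawley
    delay, deviation = 2.0, 0.5     # base delay and jitter, in seconds

    deviation = int(deviation * FACTOR)   # 50
    # float() avoids Python 2 integer division truncating the result
    randomize = random.randint(-deviation, deviation) / float(FACTOR)
    effective_delay = delay + randomize   # somewhere in [1.5, 2.5]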

crawley/http/response.py (2 additions, 2 deletions)

@@ -11,8 +11,8 @@ def __init__(self, raw_html=None, extracted_html=None, url=None, response=None):

        self.raw_html = raw_html
        self.html = extracted_html
        self.url = url

        if response is not None:
            self.headers = response.headers
            self.code = response.getcode()

crawley/manager/__init__.py (6 additions, 6 deletions)

@@ -7,20 +7,20 @@ def run_cmd(args):
    """
    Runs a crawley's command
    """

    if len(args) <= 1:
        exit_with_error("Subcommand not specified")

    cmd_name = args[1]
    cmd_args = args[2:]

    cmd = commands[cmd_name](cmd_args)
    cmd.checked_execute()


def manage():
    """
    Called when using crawley command from cmd line
    """

    run_cmd(sys.argv)

crawley/manager/commands/__init__.py (3 additions, 3 deletions)

@@ -10,14 +10,14 @@
from syncdb import SyncDbCommand

class CommandsDict(dict):

    def __getitem__(self, key):

        if key in self:
            return dict.__getitem__(self, key)
        else:
            exit_with_error("[%s] Subcommand not valid" % (key))


commands = CommandsDict()

crawley/manager/commands/browser.py (7 additions, 7 deletions)

@@ -9,17 +9,17 @@ class BrowserCommand(BaseCommand):
    """
    Runs a browser
    """

    name = "browser"

    def validations(self):

        return [(len(self.args) >= 1, "No given url")]

    def execute(self):

        app = QtGui.QApplication(sys.argv)
        main = Browser(self.args[0])
        main.show()
        sys.exit(app.exec_())
