Commit
Bugfix: crash caused by invalid URLs
Fixed a bug that caused the scraper to crash when it encounters an invalid URL.
joelbarmettlerUZH committed Mar 14, 2018
1 parent c9ca591 commit 0360d61
Showing 5 changed files with 24 additions and 15 deletions.
Binary file removed dist/scrapeasy-0.1.tar.gz
Binary file added dist/scrapeasy-0.11.tar.gz
17 changes: 9 additions & 8 deletions scrapeasy/Page.py
@@ -10,12 +10,16 @@ def __init__(self, url, verify=True):
 
         # Define verify behaviour and extract domain from url
         self._verify = verify
+        url = url.replace("%2F", "/")
         self._domain = self.findDomain(url)
 
         # Normalize URL to not contain anything before the domain / subdomain
-        self._url = url[url.index(self._domain):]
+        try:
+            self._url = url[url.index(self._domain):]
+        except ValueError as ve:
+            self._url = url
         if not validators.url("http://"+self._url):
-            raise Exception("Not valid URL: "+url+"!")
+            raise ValueError("Not valid URL: "+url+"!")
 
         # Try getting the header via http request.head
         try:
@@ -36,7 +40,7 @@ def __init__(self, url, verify=True):
         self._media = {}
 
     def __str__(self):
-        return "Page object <"+self._url+"> under the domain "+self._domain
+        return self._url
 
     # Getters for private Page content
     def getURL(self):
@@ -89,9 +93,6 @@ def update(self, tries=5):
             if tries > 0:
                 time.sleep(1)
                 self.update(tries=tries-1)
-            else:
-                print("Current Webpage could not be fetched, url seems to be invalid")
-                print(self)
 
     # Exctract links from all urls that do not define some well-known filetypes that for sure do not contain any html text (unless .txt or .md could, in theory, contain such links)
     def findLinks(self):
@@ -101,7 +102,6 @@ def findLinks(self):
                     ".webm", ".zip", ".ogg"]
         for end in endings:
             if self._url.lower().endswith(end):
-                print("Returning due to non-hypertext file")
                 return
 
         # Parse request as lxml and extract a-tags
@@ -248,6 +248,7 @@ def __init__(self, url, verify=True):
 # Testing
 if __name__=="__main__":
     web = Page("http://mathcourses.ch/mat182.html")
+    print(web.download("pdf", "mathcourses/pdf-files"))
     print(web)
     #web.download("pdf", "mathcourses/pdf-files")
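
A minimal sketch of the new behaviour in Page.py, assuming the Page class is importable from the scrapeasy package as in this repository; the URL string is an arbitrary illustrative value:

# Sketch only: after this commit, a URL that fails the validators.url check
# raises ValueError instead of a bare Exception, so callers can recover.
from scrapeasy import Page

try:
    page = Page("not a real url")
except ValueError as error:
    print("Rejected:", error)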


14 changes: 9 additions & 5 deletions scrapeasy/Website.py
@@ -3,8 +3,9 @@
 
 class Website(object):
     def __init__(self, url, verify=True):
+        url = url.replace("%2F", "/")
         self._domain = self.findDomain(url)
-        self._mainPage = Page(url, self)
+        self._mainPage = Page(url, verify=verify)
 
         #Define empty subpages list and empty media dict
         self._subpages = []
@@ -108,7 +109,7 @@ def download(self, filetype, folder, reinit=False):
     # Find internal links of all subpages, starting from the provided main page
     def findSubpages(self):
         i = 0
-        self._subpages = [Page(self._domain, verify=self._verify)]
+        self._subpages = [self._mainPage]
         while i < len(self._subpages):
             # print("Finding subpage of: "+self._subpages[i].getURL())
             # Ignore these internal rinks when reached
@@ -123,10 +124,13 @@ def findSubpages(self):
             new_links = self._subpages[i].getLinks(intern=True, extern=False)
             for link in new_links:
                 if link not in self.getSubpagesLinks():
-                    self._subpages.append(Page(link, verify=self._verify))
+                    try:
+                        self._subpages.append(Page(link, verify=self._verify))
+                    except ValueError:
+                        print("Invalid URL: "+link)
             i += 1
 
 # Testing
 if __name__ == "__main__":
-    web = Website("https://www.icu.uzh.ch/events/id/207")
-    print(web.get("pdf"))
+    web = Website("http://www.ksreussbuehl.ch/")
+    print(web.getSubpages())
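
A hedged usage sketch of the crawler change above, assuming Website and getSubpages() are importable from the scrapeasy package; the URL is the one from the repository's own test block:

# Sketch only: with the try/except added in findSubpages, an invalid link found
# while crawling is printed and skipped instead of aborting the whole crawl.
from scrapeasy import Website

web = Website("http://www.ksreussbuehl.ch/")
print(web.getSubpages())
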
8 changes: 6 additions & 2 deletions setup.py
@@ -2,14 +2,18 @@
 setup(
   name = 'scrapeasy',
   packages = ['scrapeasy'],
-  version = '0.1',
+  version = '0.12',
   license='MIT',
   description = 'Scraping in python made easy - receive the content you like in just one line of code!',
   author = 'Joel Barmettler',
   author_email = 'joel.barmettler@uzh.ch',
   url = 'https://github.com/joelbarmettlerUZH/Scrapeasy',
-  download_url = 'https://github.com/joelbarmettlerUZH/Scrapeasy/archive/pypi-0_1_2.tar.gz',
+  download_url = 'https://github.com/joelbarmettlerUZH/Scrapeasy/archive/pypi-0_1_3.tar.gz',
   keywords = ['scraping', 'easy', 'scraper', 'website', 'download', 'links', 'images', 'videos'],
+  install_requires=[
+          'validators',
+          'beautifulsoup4',
+        ],
   classifiers=[  # Optional
     # How mature is this project? Common values are
     #   3 - Alpha
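
Since setup.py now declares validators and beautifulsoup4 as install requirements, the URL check that Page.py relies on is available after a plain install. A small sketch of that check; the return values are assumed behaviour of the validators library, not something stated in this diff:

# validators.url returns True for a well-formed URL and a falsy failure object
# for an invalid one (assumed behaviour of the validators library).
import validators

print(bool(validators.url("http://mathcourses.ch/mat182.html")))  # expected: True
print(bool(validators.url("not a real url")))                     # expected: False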
