Commit
Bugfix: crash caused by invalid URLs
Fixed a bug that caused the scraper to crash when it encounters an invalid URL.
joelbarmettlerUZH committed Mar 14, 2018
1 parent c9ca591 commit 0360d61
Showing 5 changed files with 24 additions and 15 deletions.
Binary file removed dist/scrapeasy-0.1.tar.gz
Binary file added dist/scrapeasy-0.11.tar.gz
17 changes: 9 additions & 8 deletions scrapeasy/Page.py
@@ -10,12 +10,16 @@ def __init__(self, url, verify=True):
 
         # Define verify behaviour and extract domain from url
         self._verify = verify
+        url = url.replace("%2F", "/")
         self._domain = self.findDomain(url)
 
         # Normalize URL to not contain anything before the domain / subdomain
-        self._url = url[url.index(self._domain):]
+        try:
+            self._url = url[url.index(self._domain):]
+        except ValueError as ve:
+            self._url = url
         if not validators.url("http://"+self._url):
-            raise Exception("Not valid URL: "+url+"!")
+            raise ValueError("Not valid URL: "+url+"!")
 
         # Try getting the header via http request.head
         try:
@@ -36,7 +40,7 @@ def __init__(self, url, verify=True):
         self._media = {}
 
     def __str__(self):
-        return "Page object <"+self._url+"> under the domain "+self._domain
+        return self._url
 
     # Getters for private Page content
     def getURL(self):
@@ -89,9 +93,6 @@ def update(self, tries=5):
             if tries > 0:
                 time.sleep(1)
                 self.update(tries=tries-1)
-            else:
-                print("Current Webpage could not be fetched, url seems to be invalid")
-                print(self)
 
     # Exctract links from all urls that do not define some well-known filetypes that for sure do not contain any html text (unless .txt or .md could, in theory, contain such links)
     def findLinks(self):
@@ -101,7 +102,6 @@ def findLinks(self):
                     ".webm", ".zip", ".ogg"]
         for end in endings:
             if self._url.lower().endswith(end):
-                print("Returning due to non-hypertext file")
                 return
 
         # Parse request as lxml and extract a-tags
@@ -248,6 +248,7 @@ def __init__(self, url, verify=True):
 # Testing
 if __name__=="__main__":
     web = Page("http://mathcourses.ch/mat182.html")
+    print(web.download("pdf", "mathcourses/pdf-files"))
     print(web)
     #web.download("pdf", "mathcourses/pdf-files")
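
A minimal sketch of the new behaviour in Page.py, assuming the Page class is importable from the scrapeasy package as in this repository; the URL string is an arbitrary illustrative value:

# Sketch only: after this commit, a URL that fails the validators.url check
# raises ValueError instead of a bare Exception, so callers can recover.
from scrapeasy import Page

try:
    page = Page("not a real url")
except ValueError as error:
    print("Rejected:", error)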


14 changes: 9 additions & 5 deletions scrapeasy/Website.py
@@ -3,8 +3,9 @@
 
 class Website(object):
     def __init__(self, url, verify=True):
+        url = url.replace("%2F", "/")
         self._domain = self.findDomain(url)
-        self._mainPage = Page(url, self)
+        self._mainPage = Page(url, verify=verify)
 
         #Define empty subpages list and empty media dict
         self._subpages = []
@@ -108,7 +109,7 @@ def download(self, filetype, folder, reinit=False):
     # Find internal links of all subpages, starting from the provided main page
     def findSubpages(self):
         i = 0
-        self._subpages = [Page(self._domain, verify=self._verify)]
+        self._subpages = [self._mainPage]
         while i < len(self._subpages):
             # print("Finding subpage of: "+self._subpages[i].getURL())
             # Ignore these internal rinks when reached
@@ -123,10 +124,13 @@ def findSubpages(self):
             new_links = self._subpages[i].getLinks(intern=True, extern=False)
             for link in new_links:
                 if link not in self.getSubpagesLinks():
-                    self._subpages.append(Page(link, verify=self._verify))
+                    try:
+                        self._subpages.append(Page(link, verify=self._verify))
+                    except ValueError:
+                        print("Invalid URL: "+link)
             i += 1
 
 # Testing
 if __name__ == "__main__":
-    web = Website("https://www.icu.uzh.ch/events/id/207")
-    print(web.get("pdf"))
+    web = Website("http://www.ksreussbuehl.ch/")
+    print(web.getSubpages())
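
A hedged usage sketch of the crawler change above, assuming Website and getSubpages() are importable from the scrapeasy package; the URL is the one from the repository's own test block:

# Sketch only: with the try/except added in findSubpages, an invalid link found
# while crawling is printed and skipped instead of aborting the whole crawl.
from scrapeasy import Website

web = Website("http://www.ksreussbuehl.ch/")
print(web.getSubpages())
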
8 changes: 6 additions & 2 deletions setup.py
@@ -2,14 +2,18 @@
 setup(
   name = 'scrapeasy',
   packages = ['scrapeasy'],
-  version = '0.1',
+  version = '0.12',
   license='MIT',
   description = 'Scraping in python made easy - receive the content you like in just one line of code!',
   author = 'Joel Barmettler',
   author_email = 'joel.barmettler@uzh.ch',
   url = 'https://github.com/joelbarmettlerUZH/Scrapeasy',
-  download_url = 'https://github.com/joelbarmettlerUZH/Scrapeasy/archive/pypi-0_1_2.tar.gz',
+  download_url = 'https://github.com/joelbarmettlerUZH/Scrapeasy/archive/pypi-0_1_3.tar.gz',
   keywords = ['scraping', 'easy', 'scraper', 'website', 'download', 'links', 'images', 'videos'],
+  install_requires=[
+          'validators',
+          'beautifulsoup4',
+        ],
   classifiers=[  # Optional
     # How mature is this project? Common values are
     #   3 - Alpha
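
Since setup.py now declares validators and beautifulsoup4 as install requirements, the URL check that Page.py relies on is available after a plain install. A small sketch of that check; the return values are assumed behaviour of the validators library, not something stated in this diff:

# validators.url returns True for a well-formed URL and a falsy failure object
# for an invalid one (assumed behaviour of the validators library).
import validators

print(bool(validators.url("http://mathcourses.ch/mat182.html")))  # expected: True
print(bool(validators.url("not a real url")))                     # expected: False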
