Permalink
Browse files

get_links.py added

  • Loading branch information...
1 parent 8ef4cc5 commit 667551a48a7a380785131eeafc9360a6b4165a87 @jabbalaci committed May 8, 2011
Showing with 62 additions and 0 deletions.
  1. +5 −0 README
  2. +57 −0 get_links.py
View
5 README
@@ -7,6 +7,11 @@ Author: Laszlo Szathmary, 2011 (jabba.laci@gmail.com)
Github: https://github.com/jabbalaci/Bash-Utils
+get_links.py
+ Website: https://pythonadventures.wordpress.com/2011/03/10/extract-all-links-from-a-web-page/
+ Extract all links from a web page.
+ Usage: get_links <URL> [<URL>...]
+
open_in_tabs.py
Website: https://ubuntuincident.wordpress.com/2011/03/09/open-urls-in-browser-tabs-simultaneously/
Read URLs from the standard input and open them in separated tabs.
View
57 get_links.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+"""
+Extract all links from a web page
+=================================
+Author: Laszlo Szathmary, 2011 (jabba.laci@gmail.com)
+Website: https://pythonadventures.wordpress.com/2011/03/10/extract-all-links-from-a-web-page/
+GitHub: https://github.com/jabbalaci/Bash-Utils
+
+Given one or more web pages, print all the links they contain.
+
+Usage:
+------
+./get_links.py <URL> [<URL>...]
+"""
+
+import sys
+import urllib
+import urlparse
+
+from BeautifulSoup import BeautifulSoup
+
+
+class MyOpener(urllib.FancyURLopener):
+ version = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15'
+
+
def process(url):
    """Print every hyperlink found on the page at *url*, one per line.

    The page is fetched through :class:`MyOpener`, parsed with
    BeautifulSoup, and the ``href`` of each ``<a>`` tag is resolved against
    *url* so that relative links are printed as absolute URLs.

    :param url: address of the web page to scan.

    Errors raised while opening or reading the page are not caught here and
    propagate to the caller.
    """
    page = MyOpener().open(url)
    try:
        text = page.read()
    finally:
        # Release the connection even if reading the body fails.
        page.close()

    soup = BeautifulSoup(text)

    # href=True restricts the search to anchors that actually carry an href.
    for tag in soup.findAll('a', href=True):
        # Parenthesised single-argument print behaves the same on Python 2.
        print(urlparse.urljoin(url, tag['href']))
# process(url)
+
+
def main():
    """Command-line entry point: extract the links of every URL argument.

    When the script is started without arguments, print a banner and a usage
    line and exit with status 1. Otherwise pass each argument, in order, to
    process().
    """
    if len(sys.argv) != 1:
        # At least one URL was given on the command line.
        for target in sys.argv[1:]:
            process(target)
        return

    program = sys.argv[0]
    print("Jabba's Link Extractor v0.1")
    print("Usage: %s URL [URL]..." % program)
    sys.exit(1)
# main()
+
+#############################################################################
+
# Run the extractor only when this file is executed as a script, so that
# importing the module does not trigger any downloads.
if __name__ == "__main__":
    main()

0 comments on commit 667551a

Please sign in to comment.