Skip to content

Commit

Permalink
introduce -c / --content switch which lets you identify the book by its content hash alone
Browse files Browse the repository at this point in the history

=> it's mutually exclusive to the -l switch

Thanks to flo for this idea and a basic patch.
  • Loading branch information
milianw committed Apr 8, 2009
1 parent be63c25 commit d3c15dc
Showing 1 changed file with 44 additions and 31 deletions.
75 changes: 44 additions & 31 deletions springer_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,55 +17,64 @@ def main(argv):
error("You have to install pdftk.")
if not findInPath("iconv"):
error("You have to install iconv.")

try:
opts, args = getopt.getopt(argv, "hl:", ["help", "link="])
opts, args = getopt.getopt(argv, "hl:c:", ["help", "link=","content="])
except getopt.GetoptError:
error()

link = ""

for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
sys.exit()
elif opt in ("-c", "--content"):
if link != "":
error("-c and -l arguments are mutually exclusive")

link = "http://springerlink.com/content/" + arg
elif opt in ("-l", "--link"):
if link != "":
error("-c and -l arguments are mutually exclusive")

link = arg

if link == "":
error("You have to define a link.")
if not re.match("https?://(www\.)?springerlink.(com|de)/content/[a-z0-9]+(/\?[^/]*)?$", link):
error("Bad link given. See LINK below.")

# remove all arguments from link
link = re.sub(r"/?\?[^/]*$", "/", link)

baseLink = link

chapters = list()
hasFrontMatter = False
hasBackMatter = False

loader = urllib.FancyURLopener()

bookTitle = ""

while True:
# download page source
try:
print "Please wait, link source is being downloaded...\n\t%s" % link
page = loader.open(link).read()
except IOError, e:
error("Bad link given (%s)" % e)

if bookTitle == "":
match = re.search(r'<h2 class="MPReader_Profiles_SpringerLink_Content_PrimitiveHeadingControlName">([^<]+)</h2>', page)
if not match or match.group(1).strip() == "":
error("Could not evaluate book title - bad link?")
else:
bookTitle = match.group(1).strip()
print "\nThe book you are trying to download is called '%s'\n" % bookTitle


# get chapters
for match in re.finditer('href="([^"]+.pdf)"', page):
chapterLink = match.group(1)
Expand All @@ -78,52 +87,52 @@ def main(argv):
if chapterLink[:7] == "http://":
continue
chapters.append(match.group(1))

# get next page
match = re.search(r'<a href="([^"]+)">Next</a>', page)
if match:
link = "http://springerlink.com" + match.group(1).replace("&amp;", "&")
else:
break

if hasFrontMatter:
chapters.insert(0, "front-matter.pdf")

if hasBackMatter:
chapters.append("back-matter.pdf")

if len(chapters) == 0:
error("No chapters found - bad link?")

print "found %d chapters" % len(chapters)

# setup
curDir = os.getcwd()
tempDir = tempfile.mkdtemp()
tempDir = tempfile.mkdtemp()
os.chdir(tempDir)

i = 1
fileList = list()

for chapterLink in chapters:
if chapterLink[0] == "/":
chapterLink = "http://springerlink.com" + chapterLink
else:
chapterLink = baseLink + chapterLink

print "downloading chapter %d/%d" % (i, len(chapters))
localFile, mimeType = geturl(chapterLink, "%d.pdf" % i)

if mimeType.gettype() != "application/pdf":
os.chdir(curDir)
shutil.rmtree(tempDir)
error("downloaded chapter %s has invalid mime type %s - are you allowed to download it?" % (chapterLink, mimeType.gettype()))

fileList.append(localFile)
i += 1

print "merging chapters"

p1 = subprocess.Popen(["echo", bookTitle], stdout=subprocess.PIPE)
p2 = subprocess.Popen(["iconv", "-f", "UTF-8", "-t" ,"ASCII//TRANSLIT"], stdin=p1.stdout, stdout=subprocess.PIPE)
bookTitlePath = p2.communicate()[0]
Expand All @@ -132,40 +141,44 @@ def main(argv):
os.chdir(curDir)
shutil.rmtree(tempDir)
error("could not transliterate book title %s" % bookTitle)

bookTitlePath = bookTitlePath.replace("/", "-")
bookTitlePath = re.sub("\s+", "_", bookTitlePath)

bookTitlePath = curDir + "/%s.pdf" % bookTitlePath

if len(fileList) == 1:
shutil.move(fileList[0], bookTitlePath)
else:
os.system("pdftk %s cat output '%s'" % (" ".join(fileList), bookTitlePath))

# cleanup
os.chdir(curDir)
shutil.rmtree(tempDir)

print "book %s was successfully downloaded, it was saved to %s" % (bookTitle, bookTitlePath)

sys.exit()

# give a usage message
def usage():
    """Print the command-line help text to standard output.

    The program name shown in the synopsis is the basename of
    ``sys.argv[0]``. The function only prints; it does not exit.
    """
    # A parenthesised single-argument print is valid both as a Python 2
    # print statement (the style used by the rest of this file) and as a
    # Python 3 function call.
    print("""Usage:
%s [OPTIONS]
Options:
-h, --help Display this usage message
-l LINK, --link=LINK defines the link of the book you intend to download
-c HASH, --content=HASH builds the link from a given HASH (see below)
You have to set exactly one of these options.
LINK:
The link to the detail page of the ebook of your choice on SpringerLink.
It lists book metadata and has a possibly paginated list of the chapters of the book.
It has the form:
http://springerlink.com/content/HASH/STUFF
Where: HASH is a string consisting of lower-case, latin chars and numbers.
It alone identifies the book you intend to download.
STUFF is optional and looks like ?p=...&p_o=... or similar. Will be stripped.
""" % os.path.basename(sys.argv[0]))

Expand Down

0 comments on commit d3c15dc

Please sign in to comment.