https://bitbucket.org/richardpenman/browsercookie

ScienceDirect uses cookie-based authentication, so you need to send the respective cookies with the request.
The ANONRA_COOKIE seems to be responsible for the login; however, to be sure, we just send all available ScienceDirect cookies with the request.
To find the ScienceDirect cookies, I used browsercookie to get *all* available cookies on the computer (or at least a lot of them). In theory you could add that whole cookie jar to the request, but I thought it nicer to send only the cookies for that domain. (Does requests.get() actually blow all the cookies out into the wild, or only those specific to the domain?) So I tried endlessly until I found a relatively short version. Apparently, retrieving cookies from your computer and cleaning a cookie jar are not standard tasks.


In [1]:
import os, glob
import urllib
import requests
import lxml.html
import browsercookie
import re
import string
import http.cookiejar
import subprocess

In [4]:
# Literature folder below the user's home directory. Joining with a trailing
# empty component makes the resulting path end in a separator.
path = os.path.join(os.path.expanduser('~'), 'Documents', 'Literatur', '')
print(path)
# Cookie jar filled by browsercookie; it is filtered and sent with the
# requests in the following cells.
cj = browsercookie.load()
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0)'

C:\Users\Stefan Boltzmann\Documents\Literatur\


In [3]:
# Keep only the cookies whose domain contains `pattern`; the cookies of
# every other domain are removed from the jar in place.
pattern = 'sciencedirect'
# Collect the domains through the public iteration API rather than the
# private `_cookies` mapping. The set is materialised before the loop
# because clearing a domain mutates the jar being iterated.
k = sorted({cookie.domain for cookie in cj})
for domain in k:
    if pattern not in domain:
        cj.clear(domain)
print(cj)

<CookieJar[<Cookie AMCV_4D6368F454EC41940A4C98A6%40AdobeOrg=2096510701%7CMCIDTS%7C17576%7CMCMID%7C88478019393280596753794609358934452507%7CMCAID%7CNONE%7CMCOPTOUT-1518562007s%7CNONE%7CMCAAMLH-1519159607%7C6%7CMCAAMB-1519159607%7Cj8Odv6LonN4r3an7LhD3WZrU1bUpAkFkkiY1ncBR96t2PTI%7CMCSYNCSOP%7C411-17583%7CvVersion%7C2.0.0 for .sciencedirect.com/>, <Cookie ANONRA_COOKIE=E67072C6D9DA673BA0938E3725B6D3764C8396F1DC3F52B22BFEA1A28F426D5E8A6AF4FE369FD210663A5B7C8DBCA1E174F2AE2D64780FC1 for .sciencedirect.com/>, <Cookie EUID=4ce2724b-d76e-40cf-8e7e-5c6acceaaed5 for .sciencedirect.com/>, <Cookie SD_ORCH=B9EB07B84A0FDA1D0583D0281DD3F5A970B60710F842C3E9B1AD19947A1FCD1C for .sciencedirect.com/>, <Cookie SD_REMOTEACCESS_GL=2f82b3241cca23c868af74a73683337d8bfab1fd8437504c for .sciencedirect.com/>, <Cookie ak_bmsc=E8B82027EB5C7156855C9AD613E1F28D686BD9AD372700006A3FAF5B29DD1D57~pleWSozFH4xYvmjf6nneqLiK+iq7CXlftnr6+0yte4Gphm91yS5VWYTyJayxI7jf+5YetJ315gNopvYwVzZ5nrsV63dpTzEakvclKkO5kNlxi1TvHCK5omCFtAfrys1

In [4]:
# Fetch the article landing page, sending the filtered cookie jar along.
# HTTPS is used so the login cookies are not transmitted in clear text.
url = "https://www.sciencedirect.com/science/article/pii/S037702731830129X"
header = {'User-Agent': user_agent}
# Without a timeout a stalled connection would block the cell forever.
r = requests.get(url, headers=header, cookies=cj, timeout=30)
# Stop here on an HTTP error instead of parsing an error page later on.
r.raise_for_status()
# Quick check that the page carries the PDF link marker.
print("citation_pdf_url" in r.text)
lines = r.text.splitlines()
#print(lines)

True


In [5]:
# Parse the article page and build the target PDF file name from its
# citation metadata (PDF link, online date, title) and the first surname.
dom = lxml.html.fromstring(r.text)
metas = dom.findall('head/meta')
print(len(metas))

# Meta tags to look up, listed in the priority order used for matching.
wanted = ('citation_pdf_url', 'citation_online_date', 'citation_title')
found = {}
for element in metas:
    meta = dict(element.items())
    for key in wanted:
        if key in meta.values():
            # Keep the first occurrence in document order.
            found.setdefault(key, meta['content'])
            break
    if len(found) == len(wanted):
        # All three located. The former `while ~(...)` test could never
        # stop early: bitwise NOT of a bool is -1 or -2, both truthy.
        break

foundpdf = 'citation_pdf_url' in found
founddate = 'citation_online_date' in found
foundtitle = 'citation_title' in found

# Fail with a clear message rather than a NameError, or worse, silently
# reusing values left over from a previously processed article.
missing = [key for key in wanted if key not in found]
if missing:
    raise LookupError('meta tags not found: ' + ', '.join(missing))

pdflink = found['citation_pdf_url']
date = found['citation_online_date']
title = found['citation_title']
print(pdflink, date, title, sep='\n')

surnames = [element.text for element in dom.find_class('text surname')]
if not surnames:
    raise LookupError('no author surname found on the page')
author = surnames[0]
print(author)

# Drop punctuation from the title and hyphenate it for use in a file name.
repl_punct = str.maketrans('', '', string.punctuation)
title = title.translate(repl_punct).replace(' ', '-')

# The online date is split on '/' into year, month and day.
year, month, day = date.split('/')

# The title part is capped at 75 characters to keep the file name short.
pdfname = '{}{}_{}_{}.pdf'.format(path, author, year, title[:75])
print(pdfname)

22
https://www.sciencedirect.com/science/article/pii/S037702731830129X/pdfft?md5=ed9673a5d8599a1bcf4a26357d3c8604&pid=1-s2.0-S037702731830129X-main.pdf
2018/05/30
Physical and mechanical property relationships of a shallow intrusion and volcanic host rock, Pinnacle Ridge, Mt. Ruapehu, New Zealand
Mordensky
C:\Users\Stefan Boltzmann\Documents\Literatur\Mordensky_2018_Physical-and-mechanical-property-relationships-of-a-shallow-intrusion-and-v.pdf


In [6]:
# Download the article PDF and store it under the generated file name.
params = {'download': 'True', 'isDTMRedir': 'True'}
pdf = requests.get(pdflink, headers=header, cookies=cj, allow_redirects=True,
                   params=params, timeout=60)
# Abort on an HTTP error so that an error page is never saved as a ".pdf".
pdf.raise_for_status()
with open(pdfname, 'wb') as f:
    f.write(pdf.content)

In [6]:
# Request the BibTeX citation (including the abstract) for the article and
# store it in a temporary file inside the literature folder.
params = {'format': 'text/x-bibtex', 'pii': url.split('/')[-1], 'withabstract': 'true'}
citation_url = 'https://www.sciencedirect.com/sdfe/arp/cite'
bibtex = requests.get(citation_url, headers=header, params=params, timeout=30)
bibtex.raise_for_status()
# Decoding as ASCII with errors='ignore' silently dropped every non-ASCII
# character (e.g. accented author names). Keep them by reading and writing
# UTF-8 instead.
# NOTE(review): assumes the endpoint answers in UTF-8 -- TODO confirm.
with open(path+'tmp.bibtex', 'w', encoding='utf-8') as f:
    f.write(bibtex.content.decode('utf-8', errors='replace'))

In [None]:
# Start JabRef through Java and let it import the downloaded BibTeX file.
# The directory components are listed explicitly: the former string literal
# relied on invalid escape sequences ("\A", "\L", "\J") and produced an
# empty first component when split.
pathitems = ['AppData', 'Local', 'JabRef']
path2jabref = os.path.join(os.path.expanduser('~'), *pathitems) + os.path.sep
subprocess.run(['java', '-jar', path2jabref+'JabRef-4.3.1.jar', '--importToOpen', path+'tmp.bibtex'])

In [14]:
# Display the JabRef directory computed by the launch cell.
# NOTE(review): the recorded output below lacks the home directory and the
# trailing separator, so it appears to stem from an earlier version of the
# assignment -- re-run to confirm.
path2jabref

'C:\\AppData\\Local\\JabRef'

In [None]:
# Scratch cell: presumably the JabRef directory the author expected.
# NOTE: "\S", "\A", "\L" and "\J" are not valid escape sequences; Python
# keeps the backslash literally but warns about them.
"C:\\Users\Stefan Boltzmann\AppData\Local\JabRef"

In [7]:
# Show what '~' expands to for the current user.
os.path.expanduser('~')

'C:\\Users\\Stefan Boltzmann'

In [13]:
# Look for an entry named JabRef exactly one directory level below the
# home directory (non-recursive search).
glob.glob(os.path.expanduser('~')+'/*/JabRef', recursive=False)

['C:\\Users\\Stefan Boltzmann\\Lokale Einstellungen\\JabRef']

In [None]:
# NOTE(review): unfinished scratch cell. os.path.expandvars() is called
# without its required path argument, so running this line raises TypeError.
# It looks like a draft for building the JabRef command line -- confirm the
# intended argument or remove the cell.
os.path.expandvars() , '--importToOpen', path+'tmp.bibtex'