In [48]:
import re
import json
import typing
from uuid import uuid4
from pathlib import Path

import requests

In [49]:
PDFS = Path("..").resolve() / "assets" / "pdfs"
RE_URL = "(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-].pdf)"

In [7]:
DictLikeJSON = typing.Dict[str, typing.Any]
ListLikeJSON = typing.List[DictLikeJSON]
JSON = typing.Union[DictLikeJSON, ListLikeJSON]

In [2]:
with open("../assets/test.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [55]:
def assign_identifiers(corpus: JSON) -> None:
    """Assign unique identifiers."""
    for document in corpus:
        document["uuid"] = uuid4().hex
        
def prepare_links(corpus: JSON) -> typing.Iterator[typing.Dict[str, str]]:
    """Retrieve urls from .access key"""
    for document in corpus:
        if "Access" not in document:
            continue
        valid_url: str = None
        for url in document["Access"]:
            m = re.search(RE_URL, url)
            try:
                valid_url = re.search(RE_URL, url).group()
            except AttributeError:
                continue
        if valid_url is not None:
            yield {"uuid": document["uuid"], "url": valid_url}

            
def download(metadata: typing.Dict[str, str]) -> None:
    local_filename = PDFS / f"{metadata['uuid']}.pdf"
    with requests.get(metadata["url"], stream=True) as r:
        r.raise_for_status()
        with local_filename.open("wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

In [12]:
assign_identifiers(data)

In [46]:
proper_pdfs = list(prepare_links(data))

In [57]:
for linked_document in prepare_links(data):
    try:
        download(linked_document)
    except Exception as e:
        print(f"failed to download {linked_document['uuid']}: {str(e)}")

failed to download 8317b2487b8a4fad8d20e8b02dd07837: HTTPConnectionPool(host='www.specialer.sam.au.dk', port=80): Max retries exceeded with url: /stat/2010/20085575/20085575.1.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fcb242e3730>: Failed to establish a new connection: [Errno -2] Name or service not known'))
failed to download 4d2c49e167cf4377b61f464338a6b53c: HTTPConnectionPool(host='edocs.nps.edu', port=80): Max retries exceeded with url: /npspubs/scholarly/theses/2002/Dec/02Dec_Gray.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fcb242e3940>, 'Connection to edocs.nps.edu timed out. (connect timeout=None)'))
failed to download 126ad992bcf240be82b6746e6fb2b14a: HTTPConnectionPool(host='edocs.nps.edu', port=80): Max retries exceeded with url: /npspubs/scholarly/theses/2010/Sep/10Sep_Harlambakis.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fcb242e8280>, 'Connection to edocs

In [58]:
with open("../assets/corpus-with-identifiers.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

In [None]:
# !todo
# move pdf files to parse-pdf-grobid folder
# parse
# join back using uuid 
# rename fields!!!