In [1]:
# importing all needed libs
import bs4, json, os, requests, hashlib, sys
from pathlib import Path

# Target directory for the downloaded files (created by a later cell if missing)
dir_path = "marcxmls/zips"

# Substring a file name must contain to be selected for download
wanted_format = '.mrc.xml.gz'

# Base URL of the DNB directory listing that the files are fetched from
url_to_files = 'https://data.dnb.de/DNB/'

In [2]:
def create_checksum_dict():
    """Download the checksum file and parse it into a dictionary.

    Relies on the notebook-level names ``url_to_files``, ``file_to_download``
    (expected to hold the name of the checksum file at call time),
    ``dir_path`` and ``download``. The file that is parsed is always
    ``<dir_path>/001_Pruefsumme_Checksum.txt``.

    Returns:
        dict: maps the first whitespace-separated field of every data line
        (the file name) to the second field (its checksum). The first line
        of the file is treated as a header and skipped.
    """
    # make sure the checksum file is on disk; download() calls sys.exit()
    # itself when the file cannot be retrieved
    download(url_to_files, file_to_download)

    checksum_dict = {}
    with open(dir_path + "/001_Pruefsumme_Checksum.txt", 'r') as infile:
        # skip the header line; the default keeps an empty file from
        # raising StopIteration
        next(infile, None)
        for line in infile:
            # split() without arguments already collapses runs of whitespace
            fields = line.split()
            # ignore blank or incomplete lines instead of failing with IndexError
            if len(fields) < 2:
                continue
            # add file name and checksum as key and value to checksum_dict
            checksum_dict[fields[0]] = fields[1]

    print('created checksum_dict')
    return checksum_dict

In [3]:
def checkhash(filepath):
    """Compare a file's SHA-256 digest with its published checksum.

    The expected digest is looked up in the notebook-level ``checksum_dict``
    under the file's own name (the last component of ``filepath``).

    Args:
        filepath: path (``str`` or ``Path``) of the file to verify.

    Returns:
        bool: True if the calculated digest equals the stored one; False if
        they differ or the file has no entry in ``checksum_dict``.

    Raises:
        SystemExit: if ``checksum_dict`` has not been created yet or is empty.
    """
    # look the dictionary up dynamically so that a missing global produces
    # the intended exit message instead of a NameError
    checksums = globals().get("checksum_dict")
    if not checksums:
        sys.exit("create_checksum_dict before trying to download or checkhash a file")

    sha256_hash = hashlib.sha256()
    with open(filepath, "rb") as f:
        # read and update the hash in blocks of 4K to keep memory use flat
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    calculated_hash = sha256_hash.hexdigest()

    # key the lookup by the file that was actually hashed rather than by the
    # current value of a global loop variable; a missing key gives None,
    # which never equals a hex digest, so the result is False
    dnb_hash = checksums.get(Path(filepath).name)
    return calculated_hash == dnb_hash

In [4]:
def _verify_download(file, filepath, label):
    """Return True for checksum files, otherwise the result of checkhash()."""
    # the checksum file itself has no published hash to compare against
    if 'Checksum' in file:
        return True
    result_of_hashcheck = checkhash(filepath)
    print("checked hash of " + label + ": " + str(result_of_hashcheck))
    return result_of_hashcheck


def download(url, file):
    """Fetch ``file`` from ``url`` into ``dir_path`` unless it already exists.

    Args:
        url: base URL of the remote directory; a trailing slash is optional.
        file: name of the file, appended to ``url`` and to ``dir_path``.

    Returns:
        bool: True for a file whose name contains 'Checksum'; for every
        other file the result of ``checkhash`` on the local copy.

    Raises:
        SystemExit: if the server answers with an HTTP error status.
    """
    filepath = Path(dir_path + "/" + file)

    # an existing local copy is not downloaded again, only verified
    if filepath.exists():
        print(file + " already exists")
        return _verify_download(file, filepath, "file")

    # honour the url argument instead of a hard-coded base address
    url_to_download = url.rstrip("/") + "/" + file
    print("trying to download " + file + " ...")
    resource = requests.get(url_to_download)

    if resource.status_code == 404:
        sys.exit(file + " is not available for download")
    # any other error status would otherwise store the error page as the file
    if resource.status_code >= 400:
        sys.exit("could not download " + file
                 + " (HTTP status " + str(resource.status_code) + ")")

    # write_bytes opens, writes and closes the file; it raises on failure,
    # so no separate existence check is needed afterwards
    filepath.write_bytes(resource.content)

    print("downloaded " + file)
    return _verify_download(file, filepath, "downloaded file")
    

In [6]:
# create target directory if it doesn't exist
try:
    os.makedirs(dir_path)
    print("created " , dir_path)
except FileExistsError:
    print(dir_path + " already exists")

# fetch the directory listing; fail loudly instead of parsing an error page
dnb_data_resource = requests.get(url_to_files)
dnb_data_resource.raise_for_status()

# name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed (and no warning is emitted)
dnb_soup = bs4.BeautifulSoup(dnb_data_resource.content, "html.parser")

# download the files
for x in dnb_soup.select('pre a'):

    # file_to_download stays a notebook-level name on purpose:
    # create_checksum_dict() reads it
    file_to_download = x.attrs.get('href')

    # anchors without an href cannot be downloaded
    if not file_to_download:
        continue

    if 'Checksum' in file_to_download:
        checksum_dict = create_checksum_dict()
    elif wanted_format in file_to_download:
        download(url_to_files, file_to_download)

marcxmls/zips already exists
001_Pruefsumme_Checksum.txt already exists
created checksum_dict
dnb_all_dnbmarc_20200213-1.mrc.xml.gz already exists
checked hash of file: True
dnb_all_dnbmarc_20200213-2.mrc.xml.gz already exists
checked hash of file: True
dnb_all_dnbmarc_20200213-3.mrc.xml.gz already exists
checked hash of file: True
dnb_all_dnbmarc_20200213-4.mrc.xml.gz already exists
checked hash of file: True
