# 3. Get Files

__by Sean Gilleran__  
__Last updated November 29, 2021__  
[https://github.com/seangilleran/ia-compmag-collect](https://github.com/seangilleran/ia-compmag-collect)

This notebook downloads the files listed in the collection metadata.

## 3.1 Setup

### 3.1.1 Imports

In [None]:
from datetime import datetime
import json
import os

import dateparser
import regex as re
import wget

# Matches a year from 1960 to 2019, a hyphen, and two digits whose first digit is 0 or 1
# (e.g. "1985-07"). Used below as a fallback to pull a date out of a file name.
re_date_full = re.compile(r"(19[6-9]\d|20[0-1]\d)-[0-1]\d")
# Matches a bare four-digit year from 1960 to 2019.
re_date_year = re.compile(r"19[6-9]\d|20[0-1]\d")


def ts():
    """Return the current local time as a bracketed tag for log lines.

    The time is formatted with ``%X`` (the locale's time representation).
    """
    now = datetime.now()
    return "[" + now.strftime("%X") + "]"

### 3.1.2 Set Paths

Set metadata and corpus paths.

In [None]:
# Directory that is scanned for the ".json" metadata files.
metadata_path = "./meta"
# Directory where downloaded files end up; also checked to skip files we already have.
download_path = "./downloads"

### 3.1.3 Set Download Formats

List of file formats to download.

In [None]:
# Only files whose "format" metadata field appears in this list are queued for download.
download_formats = ["DjVuTXT"]

## 3.2 Download Files

### 3.2.1 Find Files in Metadata

Put together a list of URLs to download. Skip any file we already have a copy of.

In [None]:
# Load collections from JSON.
# A metadata file that cannot be read or parsed is reported and skipped so that one bad
# file does not abort the whole run.
collections = []
for meta_name in os.listdir(metadata_path):
    if not meta_name.endswith(".json"):
        continue
    meta_file = os.path.join(metadata_path, meta_name)
    try:
        with open(meta_file, "r", encoding="utf-8") as f:
            collections.append(json.load(f))
    except (OSError, ValueError) as err:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        print(f"{ts()} WRN: Could not load {meta_file} ({err}), skipping.")

# Assemble download URLs as (url, local filename) pairs.
downloads = []
skip_count = 0

for collection in collections:
    for item in collection["items"]:

        # Check for metadata. An item stored as a bare string carries no metadata, so
        # the string itself is all we can report.
        if isinstance(item, str):
            print(f"{ts()} WRN: No metadata for {item}, skipping.")
            continue

        # Find date. Both month and year are required; anything dateparser cannot
        # turn into such a date (missing key, None, year-only, unparseable text) is
        # treated as "no date" so the file-name fallback below gets a chance.
        date = item.get("date")
        try:
            parsed = dateparser.parse(date, settings={"REQUIRE_PARTS": ["month", "year"]})
            date = parsed.strftime("%Y-%m") if parsed else None
        except Exception:
            # Best effort: any parser failure just means no usable item-level date.
            date = None

        for file in item["files"]:

            # Check file type.
            if file["format"] not in download_formats:
                skip_count += 1
                continue

            # Find date (if not stored in item metadata). The pattern already matches
            # text of the form "YYYY-MM", so the match is used as-is. Once found, the
            # date is reused for the remaining files of this item.
            if not date:
                match = re_date_full.search(file.get("name") or "")
                if match is None:
                    skip_count += 1
                    continue
                date = match.group(0)

            # Assemble a full filename.
            # NB: a handful of files have local path names baked in somehow. These tend to
            # break things, so let's get rid of them by keeping only the last path
            # component (for both "/" and "\" separators).
            # NOTE(review): the first four characters of the name are dropped; presumably
            # a fixed prefix -- confirm against the metadata.
            base_name = file["name"][4:].split("/")[-1].split("\\")[-1]
            filename = f"{date} - {base_name} - {file['id']}.{file['ext']}"

            # Make sure we don't already have it.
            if os.path.exists(os.path.join(download_path, filename)):
                skip_count += 1
                continue

            # Add it to the pile.
            print(".", end="")
            downloads.append((file["url"], filename))

print(f"\n{ts()} ** DONE! **")
print(f"Found {len(downloads)} files to download ({skip_count} skipped).")


### 3.2.2 Download Files

In [None]:
# Download every queued (url, filename) pair and move it into download_path.
skip_count = 0
download_count = 0

# Create the target directory once up front instead of re-checking it for every file.
# Nothing is created when there is nothing to download.
if downloads:
    os.makedirs(os.path.abspath(download_path), exist_ok=True)

for url, filename in downloads:
    target = os.path.abspath(os.path.join(download_path, filename))

    print(f"{ts()} Downloading {url}...")
    try:
        # wget picks its own local name; the file is moved to its final name below.
        newfile = wget.download(url)
    except Exception as err:
        # Best effort: report why this one failed and carry on with the rest.
        print(f"{ts()} ERR: Could not download file! ({err})")
        skip_count += 1
        continue

    try:
        # os.replace overwrites an existing target on every platform, whereas
        # os.rename refuses to on Windows.
        os.replace(os.path.abspath(newfile), target)
    except OSError as err:
        # A failed move should not abort the remaining downloads.
        print(f"{ts()} ERR: Could not move {newfile} to {target}! ({err})")
        skip_count += 1
        continue

    download_count += 1

print(f"\n{ts()} ** DONE! **")
print(f"Downloaded {download_count} files ({skip_count} skipped).")