/
internetarchive
executable file
·318 lines (291 loc) · 16.1 KB
/
internetarchive
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#!/usr/bin/env node
// Enable verbose debug output for the dweb family of libraries; MUST be set before require(dweb-transports)
process.env.DEBUG="dweb-transports dweb-transports:* dweb-archivecontroller:* dweb-mirror:* parallel-streams:* dweb-mirror:HashStore"; // Get highest level debugging of these two libraries, must be before require(dweb-transports)
//process.env.DEBUG=process.env.DEBUG + " bittorrent-tracker:*"; // Add Webtorrent debugging
// TODO-GUN check using GUN for metadata
const debug = require('debug')("dweb-mirror:internetarchive");
const getopts = require('getopts');
const canonicaljson = require('@stratumn/canonicaljson');
const waterfall = require('async/waterfall');
const yaml = require('js-yaml'); //https://www.npmjs.com/package/js-yaml
// Other IA repos
// noinspection JSUndefinedPropertyAssignment
// DwebTransports is intentionally global: other dweb libraries expect to find it there.
global.DwebTransports = require('@internetarchive/dweb-transports');
// noinspection JSUnusedLocalSymbols
const {ObjectFilter} = require('@internetarchive/dweb-archivecontroller');
//This Repo
// noinspection JSUnusedLocalSymbols
const ArchiveItem = require('./ArchiveItemPatched');
// noinspection JSUnusedLocalSymbols
const ArchiveFile = require('./ArchiveFilePatched');
const CrawlManager = require('./CrawlManager');
const MirrorFS = require('./MirrorFS');
const MirrorConfig = require('./MirrorConfig');
const mirrorHttp = require('./mirrorHttp');
const {registerMDNS, destroyMDNS} = require('./mdns-register');
// ---- Command-line option metadata, consumed by processOpts() below ----
const optsInt = ["depth", "maxFileSize", "concurrency", "limitTotalTasks"]; // Not part of getopts, just documenting what aren't string or boolean
const optsArray = ["level", "transport", "rows", "directory"]; // Options that can be arrays
const optsLowerCase = ["level"]; // Always want lowercase
const optsUpperCase = ["transport"]; // Always want uppercase
const optsJson = ["search", "related"]; // Can be provided as JSON rather than individual sub-opts
//XXX make depth max of depth, level-1, rows
//const opts = getopts("--dummy --level details commute".split(" "),{ // Just for testing different options
// Parse argv; `opts._` will hold the positional identifiers to crawl.
const opts = getopts(process.argv.slice(2),{
alias: { l: "level", r: "rows", h: "help", v: "verbose", d: "depth", c: "crawl", s: "server", m: "maintenance",
"skipFetchFile":"skipfetchfile", "maxFileSize":"maxfilesize", "limitTotalTasks":"limittotaltasks",
"copyDirectory":"copydirectory", "MDNS":"mdns", "NOMDNS": "nomdns"},
boolean: ["h","v", "skipFetchFile", "noCache", "dummy", "NOMDNS"],
//string: ["directory", "search", "related", "depth", "debugidentifier", "maxFileSize", "concurrency", "limitTotalTasks", "transport"],
string: ["directory", "search", "related", "transport", "level", "debugidentifier"], // Not debugidentifier because undefined and "" are not the same.
//default: {transport: "HTTP"}, // Getting defaults from yaml via MirrorConfig
// Any flag not declared above and not one of the documented int opts is a user error: report and exit.
"unknown": option => { if (!optsInt.includes(option)) { console.log("Unknown option", option, ", 'internetarchive -h' for help"); process.exit()} }
});
const httpOrHttps = "http"; // This server is running on http, not https (at least currently)
//TODO make --maintenance also delete locks on any hashstores via new hashstores.maintenance call
// Usage text for -h/--help. NOTE: this is a runtime template literal (with an
// escaped backtick near the end), not a comment — edits here change program output.
const help = `
usage: internetarchive [-hvscm] [--crawl] [--server] [--maintenance] [-l level] [-r rows] [ -d depth ] [--directory path] [--search json] [--related json]
[--debugidentifier identifier] [--maxFileSize bytes] [--concurrency threads] [--limittotaltasks tasks] [--transport TRANSPORT]*
[--skipfetchfile] [--noCache] [--dummy] [identifier]*
-h : help print this text
-v : verbose tell us which config being run (default is currently pretty verbose)
-q : quiet (TODO implement this)
-s : server run http server
-c : crawl run crawler
-m : maintenance check for things like 0 length files or partials etc and ensure in IPFS
-l level : Crawl the identifiers to a certain level, valid values are:
"tile" for just enough to print a collection page, including the thumbnail image
"metadata" and the full metadata, which will be useful once local search is implemented
"details" and enough to paint a page, including for example a lower bandwidth video
"full" and all the files in the item - beware, this can get very big.
-r rows : overrides any (simple) search string to crawl this number of items
-d depth : crawl collections found in this collection to a depth,
(0 is none, dont even crawl this collection, 1 is normal, 2 is collections in this collection
--copydirectory path : Store a copy of the crawl in this directory (often used for a removable drive)
--directory path : override the directory set in the configuration for the root of the cache
--search json : override default search string, strict json syntax only
--related json : override default setting for crawling related items, strict json syntax only
--debugidentifier identifier : identifier to do extra debugging on, only really valuable when using an IDE
--maxfilesize bytes : any file bigger than this will be ignored
--concurrency threads : how many files or searches to be happening concurrently - use 1 for debugging, otherwise 10 is about right
--limittotaltasks tasks : a maximum number of tasks to run, will be (approximately) the number of searches, plus the number of items crawled.
--transport TRANSPORT : The names of transport to use, by default its HTTP, but can currently add IPFS, WEBTORRENT GUN, (TODO must currently be upper case - allow both)
--skipfetchfile : Dont actually transfer the files (good for debugging)
--noCache : Ignore current contents of cache and always refetch
--mdns DOMAIN : Respond with MDNS to DOMAIN.local (by default MDNS responds on archive.local)
--nomdns : Do not respond with MDNS on archive.local
--dummy : Just print the result of the options in the JSON format used for configuration
identifier : Zero or more identifiers to crawl (if none, then it will use the default query from the configuration)
Examples:
crawl.js prelinger # Gets the default crawl for the prelinger collection, (details on prelinger, then tiles for top 40 items in the collection and 6 related items)
crawl.js --level details --rows 100 prelinger # Would pull the top 100 items in prelinger (just the tiles)
crawl.js --level all commute # Fetches all the files in the commute item
Specifying level, or rows more than once will apply that result to the searches, so for example:
crawl.js --level details --rows 10 --level details prelinger # Gets the movies for the first 10 movies in prelinger
crawl.js --level details --rows 100 --level tiles --rows 100 --level tiles movies # Gets the top 100 items in movies, and then crawls any of those items that are collections
crawl.js --rows 100 --depth 2 movies # Is a shortcut to do the same thing
Running crawl with no options will run the default crawls in the configuration file with no modifications, which is good for example if running under cron.
A useful hint is to experiment with arguments, but add the \`--dummy\` argument to output a JSON description of the search task(s) to be carried out.
`;
// -h/--help: print usage and exit before any config loading or side effects.
if (opts.help) { console.log(help); process.exit(); }
function processOpts(o = opts) {
  /* Normalize parsed command-line options in place — generic, not specific to
     dweb-mirror.
     - Array-valued options always end up as arrays (possibly empty).
     - Case-sensitive options are normalized to lower/upper case.
     - JSON-valued options are parsed (exit with a message on invalid JSON),
       or set to undefined when absent.
     BUGFIX: the caller invokes processOpts(opts) but the function previously
     declared no parameter and silently ignored the argument, operating on the
     module-level `opts` global. It now accepts the options object explicitly
     (defaulting to the global, so existing behavior is unchanged). */
  // Handle arrays, always return an array, even if empty
  optsArray.forEach(key => {
    if ((typeof o[key] === "undefined") || (o[key] === "")) {
      o[key] = [];
    } else if (!Array.isArray(o[key])) {
      o[key] = [ o[key] ];
    }
  });
  // Normalize case; at this point the optsArray pass may already have made these arrays.
  optsLowerCase.forEach(key => {
    o[key] = Array.isArray(o[key])
      ? o[key].map(t => t.toLowerCase())
      : o[key].toLowerCase();
  });
  optsUpperCase.forEach(key => {
    o[key] = Array.isArray(o[key])
      ? o[key].map(t => t.toUpperCase())
      : o[key].toUpperCase();
  });
  // Look for some complete json args and unpack them
  optsJson.forEach(key => { // search, related
    if (o[key].length) {
      try {
        o[key] = canonicaljson.parse(o[key]);
      } catch (err) {
        console.log("Invalid json in argument", key, "=", o[key], err.message);
        process.exit();
      }
    } else {
      o[key] = undefined;
    }
  });
}
function optsToConfig() {
  /* Merge the processed command-line opts into the global `config`, filling in
     defaults. Synchronous: returns an Error for invalid argument combinations,
     or null on success. */
  processOpts(opts);
  // Remember whether --rows was explicitly supplied BEFORE a default is pushed,
  // so the --search conflict check below tests the user's actual arguments.
  const rowsSpecified = opts.rows.length > 0;
  // Default level is 1 level at details
  if (!opts.level.length) opts.level.push("details");
  // Default rows is in config...defaultDetailsSearch.rows if level >= detail
  if (!rowsSpecified) {
    opts.rows.push(
      (CrawlManager._levels.indexOf(opts.level[0]) >= CrawlManager._levels.indexOf("details")
        ? ((config.apps.crawl.opts.defaultDetailsSearch && config.apps.crawl.opts.defaultDetailsSearch.rows) || 0)
        : 0)
    );
  }
  // Map - if specified: config.connect.transports (plural) = opts.transport (singular but array)
  if (opts.transport.length) {
    config.setOpts({apps: {crawl: {connect: {transports: opts.transport}}}}) // Code cares about case
  }
  // Map - if specified: directories = opts.directory
  if (opts.directory.length) {
    config.setOpts({directories: opts.directory})
  }
  // Check for errors
  // Its an error not to specify directory if none are defined in defaults
  if (!config.directories.length) {
    debug("WARNING: Directory for the cache is not defined or doesnt exist - crawl will wait for disk to be available");
    //Dont make this an error, crawl will wait and server SHOULD work without disk
    // return new Error("ERROR: Directory for the cache is not defined or doesnt exist");
  }
  // BUGFIX: previously this tested `opts.search && (opts.rows || opts.depth)`.
  // After processOpts(), opts.rows is always an array and an array (even an
  // empty one) is truthy in JS — and a default row has been pushed above — so
  // every use of --search was rejected. Test what the user actually supplied.
  if (opts.search && (rowsSpecified || opts.depth)) {
    return new Error("ERROR: Cannot specify search with rows or depth arguments");
  }
  if (opts.debugidentifier.length) {
    // Global hook read elsewhere to single out one identifier for extra debugging.
    // noinspection ES6ConvertVarToLetConst
    global.debugidentifier = opts.debugidentifier }
  if (opts.verbose || opts.dummy) {
    // Dump the effective config (minus raw configOpts) so the user can inspect it.
    debug( "Config:"); debug(yaml.dump(ObjectFilter(config, (key, unusedValue) => key !== "configOpts")));
  }
  if (opts.mdns) { config.setOpts({mdns: opts.mdns})}
  if (opts.nomdns) { config.setOpts({mdns: undefined})}
  return null;
}
function _tasks() {
  /* Build the array of crawl tasks.
     - With no positional identifiers (and no string --crawl value): return the
       tasks configured in config.apps.crawl.tasks, rejecting per-identifier
       options (--depth/--search/--related) as an Error.
     - With identifiers: build one task per identifier, nesting `search` specs
       to a depth derived from --depth / repeated --level / repeated --rows.
     Note: errors are RETURNED (not thrown), mirroring the rest of this file. */
  const haveIdentifiers = (opts._.length > 0) || (typeof opts.crawl === "string");
  if (!haveIdentifiers) {
    // No positional args: the per-identifier options make no sense here.
    if (opts.depth || opts.search || opts.related) {
      return new Error("If specifying options then should also specify identifiers to crawl");
    }
    // noinspection JSUnresolvedVariable
    return config.apps.crawl.tasks; // Default or configured tasks
  }
  // "./internetarchive --crawl foo" is technically wrong but understandable:
  // treat the --crawl value as another identifier.
  if (typeof opts.crawl === "string") {
    opts._.push(opts.crawl);
  }
  opts.crawl = true; // Specifying identifiers implies --crawl
  // Recursively build the nested search spec; `remaining` counts levels left.
  // Repeated --level/--rows values apply per nesting level, the last one
  // repeating for any deeper levels.
  const buildSearch = (depthnow, remaining) => {
    if (!remaining) return undefined;
    const base = opts.search || config.apps.crawl.opts.defaultDetailsSearch;
    return Object.assign({}, base, {
      level: opts.level[Math.min(depthnow + 1, opts.level.length - 1)],
      rows: opts.rows[Math.min(depthnow, opts.rows.length - 1)],
      search: buildSearch(depthnow + 1, remaining - 1),
    });
  };
  const searchDepth = Math.max(opts.depth || 0, opts.level.length, opts.rows.length);
  return opts._.map(identifier => ({
    identifier,
    level: opts.level[0],
    related: opts.related,
    search: buildSearch(0, searchDepth),
  }));
}
function connect(cb) {
  /* Load the configured transports into this node process, then connect them,
     calling cb(err) when done. Under --dummy the configuration is only
     reported and cb is intentionally never invoked, so the surrounding
     waterfall proceeds no further. */
  const connectOpts = config.connect;
  // wrtc is not available on some platforms (esp 32 bit such as Rachel3+) so only include if requested (by webtorrent.tracker = 'wrtc' and available.
  // SEE-OTHER-ADDTRANSPORT in dweb-transports dweb-archive dweb-mirror
  // TODO-SPLIT these will need to move into local server or may be excluded by cross-origin low-bandwidth rule in chrome
  // These transports are no longer packaged in dweb-transports; include one only
  // if it will actually be used, and make sure it was added via yarn at install time.
  DwebTransports.loadIntoNode(connectOpts); // Runs loadIntoNode from each used DwebTransports.TransportXxx or default DTS.Transport
  if (opts.verbose || opts.dummy) {
    debug( "Connect configuration: %o", connectOpts);
  }
  if (opts.dummy) {
    return; // Report-only mode: skip connecting (and deliberately leave cb uncalled).
  }
  DwebTransports.connect(connectOpts, unusedErr => cb(unusedErr));
}
function crawl(cb) {
  /* Assemble crawl options plus the task list, then start the crawler,
     passing cb through to CrawlManager. Under --dummy the configuration is
     only reported and cb is intentionally never invoked. */
  const crawlopts = Object.assign(
    {},
    config.apps.crawl.opts, // Configured defaults first ...
    // ... overridden by any command-line opts that CrawlManager accepts.
    ObjectFilter(opts, (k, v) => CrawlManager.optsallowed.includes(k) && (typeof v !== "undefined")),
    {callbackDrainOnce: true, name: "main"}
  );
  const tasks = _tasks(); // Derived from opts and config.apps.crawl.tasks
  if (opts.verbose || opts.dummy) {
    debug( "Crawl configuration: crawlopts=%o tasks=%O", crawlopts, tasks);
  }
  if (opts.dummy) {
    return; // Report-only mode: do not actually crawl.
  }
  CrawlManager.startCrawl(tasks, crawlopts, cb);
}
let config; // MirrorConfig instance, assigned by the first waterfall step below
let server; // HTTP server handle from mirrorHttp, set by startServer / cleared by stopServer
function startServer(cb) {
  /* Launch the mirror's HTTP server, remember its handle in the module-level
     `server`, and — on success, when configured — announce it over MDNS.
     Calls cb(err) with any startup error from mirrorHttp. */
  mirrorHttp(config, (httpErr, httpServer) => {
    server = httpServer;
    const wantMDNS = !httpErr && Boolean(config.mdns);
    if (wantMDNS) {
      registerMDNS(config.mdns);
    }
    cb(httpErr);
  });
}
function stopServer(cb) {
  /* Shut down the HTTP server (if one is running) and tear down MDNS.
     Always calls cb(null): close failures are logged but not propagated,
     because the transports still need to be stopped afterwards. */
  if (!server) {
    cb(null);
    return;
  }
  debug("Closing server");
  const closing = server;
  server = undefined; // Its obviously not running
  closing.close((err) => {
    if (err)
      debug("Failed to stop server: %s, but it might just because it was already started", err.message);
    destroyMDNS();
    cb(null); // Dont pass on failure, still want to stop transports
  });
}
// Main sequence: load config, apply opts, init cache FS, connect transports,
// then (as requested) start server / run maintenance / crawl, and tear down
// whatever should not be left running.
waterfall([
  cb => MirrorConfig.new(undefined,
    (obj) => { if (typeof obj.directories !== "undefined") MirrorFS.setState({directories: obj.directories}) },
    (err, res) => { config = res; cb(err); }), // Load config early, so can use in opts processing
  cb => cb(optsToConfig()), // Currently synchronous returning err||null
  cb => {
    // Warn when -s is requested but the Archive UI files were not found.
    // BUGFIX: previously logged `config.archiveui.directoriest` (typo), which
    // always printed `undefined`; `directories` (the candidate list) appears
    // to be what was meant — TODO confirm property name against MirrorConfig.
    if (opts.server && ! config.archiveui.directory) { debug("ERROR unlikely to work as none of %o present", config.archiveui.directories)}
    cb(null);
  },
  cb => { // Point MirrorFS at the cache directories and the local HTTP fallback.
    MirrorFS.init({
      directories: config.directories,
      httpServer: httpOrHttps+"://localhost:"+config.apps.http.port,
      preferredStreamTransports: config.connect.preferredStreamTransports});
    cb(null); },
  cb => connect(cb), // NB: under --dummy, connect never calls cb, halting the sequence here.
  cb => { // Start server before crawler as crawler takes time before returning
    if (opts.server || opts.maintenance || opts.crawl) { startServer(cb); } else { cb(null); }},
  cb => { // Maintenance must be after server start since needs for IPFS, should be before crawl
    if (opts.maintenance) { MirrorFS.maintenance({cacheDirectories: config.directories}, cb) } else {cb(null);}},
  cb => { if (opts.crawl) { crawl(cb); } else { cb(null); }},
  cb => { // Stop express server unless explicit -s option
    if (!opts.server) {stopServer(cb); } else { cb(null); } },
  cb => { // If we aren't leaving a server running, then stop the transports
    if (!opts.server) { DwebTransports.p_stop(cb); } else { cb(null); }}
],(err) => {
  if (err) {
    debug("Failed: %s", err.message)
  } else {
    if (server) {
      debug('Completed, but server still running');
    } else {
      debug('Completed');
    }
  }
});