/
internetarchive
executable file
·318 lines (291 loc) · 16.1 KB
/
internetarchive
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#!/usr/bin/env node
// Enable verbose debug output for the dweb family of libraries; MUST be set before require(dweb-transports)
process.env.DEBUG="dweb-transports dweb-transports:* dweb-archivecontroller:* dweb-mirror:* parallel-streams:* dweb-mirror:HashStore"; // Get highest level debugging of these two libraries, must be before require(dweb-transports)
//process.env.DEBUG=process.env.DEBUG + " bittorrent-tracker:*"; // Add Webtorrent debugging
// TODO-GUN check using GUN for metadata
const debug = require('debug')("dweb-mirror:internetarchive");
const getopts = require('getopts');
const canonicaljson = require('@stratumn/canonicaljson');
const waterfall = require('async/waterfall');
const yaml = require('js-yaml'); //https://www.npmjs.com/package/js-yaml
// Other IA repos
// noinspection JSUndefinedPropertyAssignment
// DwebTransports is intentionally global: other dweb libraries expect to find it there.
global.DwebTransports = require('@internetarchive/dweb-transports');
// noinspection JSUnusedLocalSymbols
const {ObjectFilter} = require('@internetarchive/dweb-archivecontroller');
//This Repo
// noinspection JSUnusedLocalSymbols
const ArchiveItem = require('./ArchiveItemPatched');
// noinspection JSUnusedLocalSymbols
const ArchiveFile = require('./ArchiveFilePatched');
const CrawlManager = require('./CrawlManager');
const MirrorFS = require('./MirrorFS');
const MirrorConfig = require('./MirrorConfig');
const mirrorHttp = require('./mirrorHttp');
const {registerMDNS, destroyMDNS} = require('./mdns-register');
// ---- Command-line option metadata, consumed by processOpts() below ----
const optsInt = ["depth", "maxFileSize", "concurrency", "limitTotalTasks"]; // Not part of getopts, just documenting what aren't string or boolean
const optsArray = ["level", "transport", "rows", "directory"]; // Options that can be arrays
const optsLowerCase = ["level"]; // Always want lowercase
const optsUpperCase = ["transport"]; // Always want uppercase
const optsJson = ["search", "related"]; // Can be provided as JSON rather than individual sub-opts
//XXX make depth max of depth, level-1, rows
//const opts = getopts("--dummy --level details commute".split(" "),{ // Just for testing different options
// Parse argv; `opts._` will hold the positional identifiers to crawl.
const opts = getopts(process.argv.slice(2),{
alias: { l: "level", r: "rows", h: "help", v: "verbose", d: "depth", c: "crawl", s: "server", m: "maintenance",
"skipFetchFile":"skipfetchfile", "maxFileSize":"maxfilesize", "limitTotalTasks":"limittotaltasks",
"copyDirectory":"copydirectory", "MDNS":"mdns", "NOMDNS": "nomdns"},
boolean: ["h","v", "skipFetchFile", "noCache", "dummy", "NOMDNS"],
//string: ["directory", "search", "related", "depth", "debugidentifier", "maxFileSize", "concurrency", "limitTotalTasks", "transport"],
string: ["directory", "search", "related", "transport", "level", "debugidentifier"], // Not debugidentifier because undefined and "" are not the same.
//default: {transport: "HTTP"}, // Getting defaults from yaml via MirrorConfig
// Any flag not declared above and not one of the documented int opts is a user error: report and exit.
"unknown": option => { if (!optsInt.includes(option)) { console.log("Unknown option", option, ", 'internetarchive -h' for help"); process.exit()} }
});
const httpOrHttps = "http"; // This server is running on http, not https (at least currently)
//TODO make --maintenance also delete locks on any hashstores via new hashstores.maintenance call
// Usage text for -h/--help. NOTE: this is a runtime template literal (with an
// escaped backtick near the end), not a comment — edits here change program output.
const help = `
usage: internetarchive [-hvscm] [--crawl] [--server] [--maintenance] [-l level] [-r rows] [ -d depth ] [--directory path] [--search json] [--related json]
[--debugidentifier identifier] [--maxFileSize bytes] [--concurrency threads] [--limittotaltasks tasks] [--transport TRANSPORT]*
[--skipfetchfile] [--noCache] [--dummy] [identifier]*
-h : help print this text
-v : verbose tell us which config being run (default is currently pretty verbose)
-q : quiet (TODO implement this)
-s : server run http server
-c : crawl run crawler
-m : maintenance check for things like 0 length files or partials etc and ensure in IPFS
-l level : Crawl the identifiers to a certain level, valid values are:
"tile" for just enough to print a collection page, including the thumbnail image
"metadata" and the full metadata, which will be useful once local search is implemented
"details" and enough to paint a page, including for example a lower bandwidth video
"full" and all the files in the item - beware, this can get very big.
-r rows : overrides any (simple) search string to crawl this number of items
-d depth : crawl collections found in this collection to a depth,
(0 is none, dont even crawl this collection, 1 is normal, 2 is collections in this collection
--copydirectory path : Store a copy of the crawl in this directory (often used for a removable drive)
--directory path : override the directory set in the configuration for the root of the cache
--search json : override default search string, strict json syntax only
--related json : override default setting for crawling related items, strict json syntax only
--debugidentifier identifier : identifier to do extra debugging on, only really valuable when using an IDE
--maxfilesize bytes : any file bigger than this will be ignored
--concurrency threads : how many files or searches to be happening concurrently - use 1 for debugging, otherwise 10 is about right
--limittotaltasks tasks : a maximum number of tasks to run, will be (approximately) the number of searches, plus the number of items crawled.
--transport TRANSPORT : The names of transport to use, by default its HTTP, but can currently add IPFS, WEBTORRENT GUN, (TODO must currently be upper case - allow both)
--skipfetchfile : Dont actually transfer the files (good for debugging)
--noCache : Ignore current contents of cache and always refetch
--mdns DOMAIN : Respond with MDNS to DOMAIN.local (by default MDNS responds on archive.local)
--nomdns : Do not respond with MDNS on archive.local
--dummy : Just print the result of the options in the JSON format used for configuration
identifier : Zero or more identifiers to crawl (if none, then it will use the default query from the configuration)
Examples:
crawl.js prelinger # Gets the default crawl for the prelinger collection, (details on prelinger, then tiles for top 40 items in the collection and 6 related items)
crawl.js --level details --rows 100 prelinger # Would pull the top 100 items in prelinger (just the tiles)
crawl.js --level all commute # Fetches all the files in the commute item
Specifying level, or rows more than once will apply that result to the searches, so for example:
crawl.js --level details --rows 10 --level details prelinger # Gets the movies for the first 10 movies in prelinger
crawl.js --level details --rows 100 --level tiles --rows 100 --level tiles movies # Gets the top 100 items in movies, and then crawls any of those items that are collections
crawl.js --rows 100 --depth 2 movies # Is a shortcut to do the same thing
Running crawl with no options will run the default crawls in the configuration file with no modifications, which is good for example if running under cron.
A useful hint is to experiment with arguments, but add the \`--dummy\` argument to output a JSON description of the search task(s) to be carried out.
`;
// -h/--help: print usage and exit before any config loading or side effects.
if (opts.help) { console.log(help); process.exit(); }
function processOpts(o = opts) {
  /* Normalize parsed command-line options in place — generic, not specific to
     dweb-mirror.
     - Array-valued options always end up as arrays (possibly empty).
     - Case-sensitive options are normalized to lower/upper case.
     - JSON-valued options are parsed (exit with a message on invalid JSON),
       or set to undefined when absent.
     BUGFIX: the caller invokes processOpts(opts) but the function previously
     declared no parameter and silently ignored the argument, operating on the
     module-level `opts` global. It now accepts the options object explicitly
     (defaulting to the global, so existing behavior is unchanged). */
  // Handle arrays, always return an array, even if empty
  optsArray.forEach(key => {
    if ((typeof o[key] === "undefined") || (o[key] === "")) {
      o[key] = [];
    } else if (!Array.isArray(o[key])) {
      o[key] = [ o[key] ];
    }
  });
  // Normalize case; at this point the optsArray pass may already have made these arrays.
  optsLowerCase.forEach(key => {
    o[key] = Array.isArray(o[key])
      ? o[key].map(t => t.toLowerCase())
      : o[key].toLowerCase();
  });
  optsUpperCase.forEach(key => {
    o[key] = Array.isArray(o[key])
      ? o[key].map(t => t.toUpperCase())
      : o[key].toUpperCase();
  });
  // Look for some complete json args and unpack them
  optsJson.forEach(key => { // search, related
    if (o[key].length) {
      try {
        o[key] = canonicaljson.parse(o[key]);
      } catch (err) {
        console.log("Invalid json in argument", key, "=", o[key], err.message);
        process.exit();
      }
    } else {
      o[key] = undefined;
    }
  });
}
function optsToConfig() {
  /* Merge the processed command-line opts into the global `config`, filling in
     defaults. Synchronous: returns an Error for invalid argument combinations,
     or null on success. */
  processOpts(opts);
  // Remember whether --rows was explicitly supplied BEFORE a default is pushed,
  // so the --search conflict check below tests the user's actual arguments.
  const rowsSpecified = opts.rows.length > 0;
  // Default level is 1 level at details
  if (!opts.level.length) opts.level.push("details");
  // Default rows is in config...defaultDetailsSearch.rows if level >= detail
  if (!rowsSpecified) {
    opts.rows.push(
      (CrawlManager._levels.indexOf(opts.level[0]) >= CrawlManager._levels.indexOf("details")
        ? ((config.apps.crawl.opts.defaultDetailsSearch && config.apps.crawl.opts.defaultDetailsSearch.rows) || 0)
        : 0)
    );
  }
  // Map - if specified: config.connect.transports (plural) = opts.transport (singular but array)
  if (opts.transport.length) {
    config.setOpts({apps: {crawl: {connect: {transports: opts.transport}}}}) // Code cares about case
  }
  // Map - if specified: directories = opts.directory
  if (opts.directory.length) {
    config.setOpts({directories: opts.directory})
  }
  // Check for errors
  // Its an error not to specify directory if none are defined in defaults
  if (!config.directories.length) {
    debug("WARNING: Directory for the cache is not defined or doesnt exist - crawl will wait for disk to be available");
    //Dont make this an error, crawl will wait and server SHOULD work without disk
    // return new Error("ERROR: Directory for the cache is not defined or doesnt exist");
  }
  // BUGFIX: previously this tested `opts.search && (opts.rows || opts.depth)`.
  // After processOpts(), opts.rows is always an array and an array (even an
  // empty one) is truthy in JS — and a default row has been pushed above — so
  // every use of --search was rejected. Test what the user actually supplied.
  if (opts.search && (rowsSpecified || opts.depth)) {
    return new Error("ERROR: Cannot specify search with rows or depth arguments");
  }
  if (opts.debugidentifier.length) {
    // Global hook read elsewhere to single out one identifier for extra debugging.
    // noinspection ES6ConvertVarToLetConst
    global.debugidentifier = opts.debugidentifier }
  if (opts.verbose || opts.dummy) {
    // Dump the effective config (minus raw configOpts) so the user can inspect it.
    debug( "Config:"); debug(yaml.dump(ObjectFilter(config, (key, unusedValue) => key !== "configOpts")));
  }
  if (opts.mdns) { config.setOpts({mdns: opts.mdns})}
  if (opts.nomdns) { config.setOpts({mdns: undefined})}
  return null;
}
function _tasks() {
  /* Build the array of crawl tasks.
     - With no positional identifiers (and no string --crawl value): return the
       tasks configured in config.apps.crawl.tasks, rejecting per-identifier
       options (--depth/--search/--related) as an Error.
     - With identifiers: build one task per identifier, nesting `search` specs
       to a depth derived from --depth / repeated --level / repeated --rows.
     Note: errors are RETURNED (not thrown), mirroring the rest of this file. */
  const haveIdentifiers = (opts._.length > 0) || (typeof opts.crawl === "string");
  if (!haveIdentifiers) {
    // No positional args: the per-identifier options make no sense here.
    if (opts.depth || opts.search || opts.related) {
      return new Error("If specifying options then should also specify identifiers to crawl");
    }
    // noinspection JSUnresolvedVariable
    return config.apps.crawl.tasks; // Default or configured tasks
  }
  // "./internetarchive --crawl foo" is technically wrong but understandable:
  // treat the --crawl value as another identifier.
  if (typeof opts.crawl === "string") {
    opts._.push(opts.crawl);
  }
  opts.crawl = true; // Specifying identifiers implies --crawl
  // Recursively build the nested search spec; `remaining` counts levels left.
  // Repeated --level/--rows values apply per nesting level, the last one
  // repeating for any deeper levels.
  const buildSearch = (depthnow, remaining) => {
    if (!remaining) return undefined;
    const base = opts.search || config.apps.crawl.opts.defaultDetailsSearch;
    return Object.assign({}, base, {
      level: opts.level[Math.min(depthnow + 1, opts.level.length - 1)],
      rows: opts.rows[Math.min(depthnow, opts.rows.length - 1)],
      search: buildSearch(depthnow + 1, remaining - 1),
    });
  };
  const searchDepth = Math.max(opts.depth || 0, opts.level.length, opts.rows.length);
  return opts._.map(identifier => ({
    identifier,
    level: opts.level[0],
    related: opts.related,
    search: buildSearch(0, searchDepth),
  }));
}
function connect(cb) {
  /* Load the configured transports into this node process, then connect them,
     calling cb(err) when done. Under --dummy the configuration is only
     reported and cb is intentionally never invoked, so the surrounding
     waterfall proceeds no further. */
  const connectOpts = config.connect;
  // wrtc is not available on some platforms (esp 32 bit such as Rachel3+) so only include if requested (by webtorrent.tracker = 'wrtc' and available.
  // SEE-OTHER-ADDTRANSPORT in dweb-transports dweb-archive dweb-mirror
  // TODO-SPLIT these will need to move into local server or may be excluded by cross-origin low-bandwidth rule in chrome
  // These transports are no longer packaged in dweb-transports; include one only
  // if it will actually be used, and make sure it was added via yarn at install time.
  DwebTransports.loadIntoNode(connectOpts); // Runs loadIntoNode from each used DwebTransports.TransportXxx or default DTS.Transport
  if (opts.verbose || opts.dummy) {
    debug( "Connect configuration: %o", connectOpts);
  }
  if (opts.dummy) {
    return; // Report-only mode: skip connecting (and deliberately leave cb uncalled).
  }
  DwebTransports.connect(connectOpts, unusedErr => cb(unusedErr));
}
function crawl(cb) {
  /* Assemble crawl options plus the task list, then start the crawler,
     passing cb through to CrawlManager. Under --dummy the configuration is
     only reported and cb is intentionally never invoked. */
  const crawlopts = Object.assign(
    {},
    config.apps.crawl.opts, // Configured defaults first ...
    // ... overridden by any command-line opts that CrawlManager accepts.
    ObjectFilter(opts, (k, v) => CrawlManager.optsallowed.includes(k) && (typeof v !== "undefined")),
    {callbackDrainOnce: true, name: "main"}
  );
  const tasks = _tasks(); // Derived from opts and config.apps.crawl.tasks
  if (opts.verbose || opts.dummy) {
    debug( "Crawl configuration: crawlopts=%o tasks=%O", crawlopts, tasks);
  }
  if (opts.dummy) {
    return; // Report-only mode: do not actually crawl.
  }
  CrawlManager.startCrawl(tasks, crawlopts, cb);
}
let config; // MirrorConfig instance, assigned by the first waterfall step below
let server; // HTTP server handle from mirrorHttp, set by startServer / cleared by stopServer
function startServer(cb) {
  /* Launch the mirror's HTTP server, remember its handle in the module-level
     `server`, and — on success, when configured — announce it over MDNS.
     Calls cb(err) with any startup error from mirrorHttp. */
  mirrorHttp(config, (httpErr, httpServer) => {
    server = httpServer;
    const wantMDNS = !httpErr && Boolean(config.mdns);
    if (wantMDNS) {
      registerMDNS(config.mdns);
    }
    cb(httpErr);
  });
}
function stopServer(cb) {
  /* Shut down the HTTP server (if one is running) and tear down MDNS.
     Always calls cb(null): close failures are logged but not propagated,
     because the transports still need to be stopped afterwards. */
  if (!server) {
    cb(null);
    return;
  }
  debug("Closing server");
  const closing = server;
  server = undefined; // Its obviously not running
  closing.close((err) => {
    if (err)
      debug("Failed to stop server: %s, but it might just because it was already started", err.message);
    destroyMDNS();
    cb(null); // Dont pass on failure, still want to stop transports
  });
}
// Main sequence: load config, apply opts, init cache FS, connect transports,
// then (as requested) start server / run maintenance / crawl, and tear down
// whatever should not be left running.
waterfall([
  cb => MirrorConfig.new(undefined,
    (obj) => { if (typeof obj.directories !== "undefined") MirrorFS.setState({directories: obj.directories}) },
    (err, res) => { config = res; cb(err); }), // Load config early, so can use in opts processing
  cb => cb(optsToConfig()), // Currently synchronous returning err||null
  cb => {
    // Warn when -s is requested but the Archive UI files were not found.
    // BUGFIX: previously logged `config.archiveui.directoriest` (typo), which
    // always printed `undefined`; `directories` (the candidate list) appears
    // to be what was meant — TODO confirm property name against MirrorConfig.
    if (opts.server && ! config.archiveui.directory) { debug("ERROR unlikely to work as none of %o present", config.archiveui.directories)}
    cb(null);
  },
  cb => { // Point MirrorFS at the cache directories and the local HTTP fallback.
    MirrorFS.init({
      directories: config.directories,
      httpServer: httpOrHttps+"://localhost:"+config.apps.http.port,
      preferredStreamTransports: config.connect.preferredStreamTransports});
    cb(null); },
  cb => connect(cb), // NB: under --dummy, connect never calls cb, halting the sequence here.
  cb => { // Start server before crawler as crawler takes time before returning
    if (opts.server || opts.maintenance || opts.crawl) { startServer(cb); } else { cb(null); }},
  cb => { // Maintenance must be after server start since needs for IPFS, should be before crawl
    if (opts.maintenance) { MirrorFS.maintenance({cacheDirectories: config.directories}, cb) } else {cb(null);}},
  cb => { if (opts.crawl) { crawl(cb); } else { cb(null); }},
  cb => { // Stop express server unless explicit -s option
    if (!opts.server) {stopServer(cb); } else { cb(null); } },
  cb => { // If we aren't leaving a server running, then stop the transports
    if (!opts.server) { DwebTransports.p_stop(cb); } else { cb(null); }}
],(err) => {
  if (err) {
    debug("Failed: %s", err.message)
  } else {
    if (server) {
      debug('Completed, but server still running');
    } else {
      debug('Completed');
    }
  }
});