Skip to content

Commit

Permalink
Merge pull request #89 from tw4l/use-existing-cdx-pages
Browse files Browse the repository at this point in the history
Add option to use existing CDXJ rather than indexing from WARCs
  • Loading branch information
matteocargnelutti committed Mar 7, 2024
2 parents 84780df + 3e785fc commit 3201423
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 2 deletions.
8 changes: 8 additions & 0 deletions README.md
Expand Up @@ -100,6 +100,14 @@ If not provided, **js-wacz** is going to attempt to detect pages in WARC records
js-wacz create -f "collection/*.warc.gz" --pages collection/pages.jsonl
```

### --cdxj

Allows passing a directory of existing CDXJ files to use instead of indexing from WARCs. Must be used in combination with `--pages`.

```bash
js-wacz create -f "collection/*.warc.gz" --pages collection/pages.jsonl --cdxj collection/indexes/
```

### --url

If provided, will be used as the [`mainPageUrl` attribute for `datapackage.json`](https://specs.webrecorder.net/wacz/1.1.1/#datapackage-json).
Expand Down
40 changes: 40 additions & 0 deletions bin/cli.js
@@ -1,6 +1,8 @@
#! /usr/bin/env node

import { createReadStream } from 'fs'
import fs from 'fs/promises'
import { resolve } from 'path'
import * as readline from 'node:readline/promises'

import log from 'loglevel'
Expand Down Expand Up @@ -59,6 +61,10 @@ program.command('create')
.option(
'--log-level <string>',
'Can be "silent", "trace", "debug", "info", "warn", "error"', 'info')
.option('--cdxj <string>',
'Path to a directory containing CDXJ indices to merge into final WACZ CDXJ. ' +
'If not provided, js-wacz will reindex from WARCS. Must be used in combination ' +
'with --pages, since using this option will skip reading the WARC files.')
.action(async (name, options, command) => {
/** @type {Object} */
const values = options._optionValues
Expand Down Expand Up @@ -93,6 +99,11 @@ program.command('create')
return
}

if (values?.cdxj && !values?.pages) {
console.error('Error: --cdxj option must be used in combination with --pages.')
return
}

// Pass options to WACZ
try {
archive = new WACZ({
Expand Down Expand Up @@ -133,6 +144,35 @@ program.command('create')
}
}

// Ingest user-provided CDXJ files, if any.
// Every file in the given directory ending in `.cdx` or `.cdxj` is read line by line,
// and each non-blank line is handed to `archive.addCDXJ()` with a trailing newline.
if (values?.cdxj) {
  try {
    const dirPath = values.cdxj
    const allowedExts = ['cdx', 'cdxj']

    for (const entry of await fs.readdir(dirPath)) {
      const cdxjFile = resolve(dirPath, entry)

      // The extension is whatever follows the last dot of the resolved path.
      const ext = cdxjFile.split('.').pop()
      if (!allowedExts.includes(ext)) {
        log.warn(`CDXJ: Skipping file ${cdxjFile}, not a CDXJ file`)
        continue
      }

      log.info(`CDXJ: Reading entries from ${cdxjFile}`)
      const rl = readline.createInterface({ input: createReadStream(cdxjFile) })

      for await (const line of rl) {
        // A blank line is not an index entry: adding it would insert a bare "\n"
        // record into the archive's CDXJ tree.
        if (line.trim() === '') {
          continue
        }

        archive.addCDXJ(line + '\n')
      }
    }
  } catch (err) {
    log.trace(err)
    log.error('An error occurred while processing user-provided CDXJ indices.')
  }
}

// Main process
try {
await archive.process()
Expand Down
29 changes: 27 additions & 2 deletions index.js
Expand Up @@ -76,6 +76,12 @@ export class WACZ {
*/
detectPages = true

/**
* From WACZOptions.indexFromWARCs.
* While `true` (the default), the `indexWARCs()` step runs during processing.
* Becomes `false` when `indexFromWARCs: false` is passed in the options, or as soon as `addCDXJ()` is called.
* @type {boolean}
*/
indexFromWARCs = true

/**
* From WACZOptions.url.
* @type {?string}
Expand Down Expand Up @@ -270,6 +276,10 @@ export class WACZ {
this.detectPages = false
}

if (options?.indexFromWARCs === false) {
this.indexFromWARCs = false
}

if (options?.url) {
try {
new URL(options.url) // eslint-disable-line
Expand Down Expand Up @@ -337,8 +347,10 @@ export class WACZ {
info('Initializing indexer')
this.initWorkerPool()

info('Indexing WARCS')
await this.indexWARCs()
if (this.indexFromWARCs) {
info('Indexing WARCS')
await this.indexWARCs()
}

info('Harvesting sorted indexes from trees')
this.harvestArraysFromTrees()
Expand Down Expand Up @@ -792,6 +804,19 @@ export class WACZ {
return page
}

/**
 * Allows to manually add a CDXJ entry to `this.cdxTree`.
 * Calling this method automatically turns indexing from WARCs off.
 *
 * `this.stateCheck()` runs first, before any state is modified.
 * The entry is inserted via `this.cdxTree.setIfNotPresent()`, using the CDXJ string itself as the key.
 *
 * @param {string} cdxj - CDXJ entry as string
 * @returns {void}
 */
addCDXJ = (cdxj) => {
  this.stateCheck()
  this.indexFromWARCs = false

  this.cdxTree.setIfNotPresent(cdxj, true)
}

/**
* Adds a file to the output ZIP stream.
* Automatically keeps trace of file in `this.resources` so it can be referenced in datapackage.json.
Expand Down
25 changes: 25 additions & 0 deletions index.test.js
Expand Up @@ -74,6 +74,20 @@ test('WACZ constructor accounts for options.detectPages if valid.', async (_t) =
assert.equal(archive.detectPages, false)
})

test('WACZ constructor ignores options.indexFromWARCs if invalid.', async (_t) => {
  // None of these values is a strict `false`, so the default (`true`) must be kept for each.
  const invalidValues = ['foo', {}, Buffer.alloc(0), 12, () => {}]

  invalidValues.forEach((indexFromWARCs) => {
    const { indexFromWARCs: actual } = new WACZ({ input: FIXTURE_INPUT, indexFromWARCs })
    assert.equal(actual, true)
  })
})

test('WACZ constructor accounts for options.indexFromWARCs if valid.', async (_t) => {
  // An explicit `false` is the only value expected to switch WARC indexing off.
  const options = { input: FIXTURE_INPUT, indexFromWARCs: false }
  const { indexFromWARCs } = new WACZ(options)

  assert.equal(indexFromWARCs, false)
})

test('WACZ constructor ignores options.url if invalid.', async (_t) => {
const scenarios = ['foo', {}, Buffer.alloc(0), 12, () => {}]

Expand Down Expand Up @@ -178,6 +192,17 @@ test('addPage adds entry to pagesTree and turns detectPages off.', async (_t) =>
assert.equal(archive.pagesTree.length, 1)
})

test('addCDXJ adds entry to cdxTree and turns indexFromWARCs off.', async (_t) => {
  // Sample CDXJ record used as the single entry to insert.
  const entry = 'net,webrecorder)/ 20240307070734 {"url":"https://webrecorder.net/","mime":"text/html","status":200,"digest":"16966a2a2909825ad1d9a6f1b2f4833c8fe43428cb9920d0f974bd7b3d73c31d","length":3941,"offset":0,"filename":"rec-8bc4bd095683-20240307070734658-0.warc.gz"}'
  const archive = new WACZ({ input: FIXTURE_INPUT })

  // Fresh instance: indexing from WARCs is on, and the CDXJ tree is empty.
  assert.equal(archive.indexFromWARCs, true)
  assert.equal(archive.cdxTree.length, 0)

  archive.addCDXJ(entry)

  // After one insertion: indexing from WARCs is off, and the tree holds exactly one entry.
  assert.equal(archive.indexFromWARCs, false)
  assert.equal(archive.cdxTree.length, 1)
})

// Note: if `TEST_SIGNING_URL` / `TEST_SIGNING_TOKEN` are present, this will also test the signing feature.
test('WACZ.process runs the entire process and writes a valid .wacz to disk, accounting for options.', async (_t) => {
//
Expand Down

0 comments on commit 3201423

Please sign in to comment.