Add a button to create an archive
bcomnes committed Mar 21, 2023
1 parent 811ca79 commit 71d0764
Showing 16 changed files with 349 additions and 41 deletions.
7 changes: 0 additions & 7 deletions app.js
@@ -7,10 +7,6 @@ const __dirname = desm(import.meta.url)
const hid = hyperid()

export default async function App (fastify, opts) {
// Place here your custom code!

// Do not touch the following lines

const testPattern = /.*(test|spec).js/
// This loads all plugins defined in plugins
// those should be support plugins that are reused
@@ -33,9 +29,6 @@ export default async function App (fastify, opts) {
ignorePattern: testPattern,
options: Object.assign({}, opts)
})

// await fastify.after()
// console.log(fastify.printRoutes())
}

export const options = {
36 changes: 36 additions & 0 deletions migrations/013.do.readability-archive.sql
@@ -0,0 +1,36 @@
create table archives (
id uuid primary key default gen_random_uuid(),
owner_id uuid not null,
bookmark_id uuid not null,
created_at timestamptz not null default now(),
updated_at timestamptz,
url text not null,
title text,
site_name text,
html_content text,
length bigint,
excerpt text,
byline text,
direction text,
language text,
ready boolean not null default false,
error text,

constraint fk_owner
foreign key(owner_id)
references users(id)
on delete cascade,

constraint fk_bookmark
foreign key(bookmark_id)
references bookmarks(id)
on delete cascade
);

create index idx_archives_owner ON archives(owner_id);
create index idx_archives_bookmark ON archives(bookmark_id);

create trigger set_timestamp_archives
before update on archives
for each row
execute procedure trigger_set_timestamp();
4 changes: 4 additions & 0 deletions migrations/013.undo.readability-archive.sql
@@ -0,0 +1,4 @@
drop trigger if exists set_timestamp_archives on archives;
drop index if exists idx_archives_bookmark;
drop index if exists idx_archives_owner;
drop table if exists archives;
2 changes: 2 additions & 0 deletions package.json
@@ -29,12 +29,14 @@
"@fastify/static": "^6.4.0",
"@fastify/swagger": "^8.3.1",
"@fastify/swagger-ui": "^1.3.0",
"@mozilla/readability": "^0.4.2",
"@nearform/sql": "^1.5.0",
"@siteup/cli": "^2.2.4",
"abstract-cache-redis": "^2.0.0",
"classnames": "^2.3.1",
"clean-deep": "^3.4.0",
"desm": "^1.2.0",
"dompurify": "^3.0.1",
"dotenv": "^16.0.1",
"fast-json-body": "^1.1.0",
"fastify": "^4.0.1",
29 changes: 29 additions & 0 deletions plugins/cache.js
@@ -120,6 +120,35 @@ export default fp(async function (fastify, opts) {
return siteMetaCache.set(key, value)
}
})

// For caching server-extracted readability archives
const archiveCache = new LRU({
max: 50,
ttl: 1000 * 60 * 5, // 5 mins
updateAgeOnGet: false,
ttlAutopurge: true
})

function getArchiveCacheKey ({
url
}) {
assert(url, 'url required')
return [
'readability',
url
].join(':')
}

fastify.decorate('archiveCache', {
get ({ url } = {}) {
const key = getArchiveCacheKey({ url })
return archiveCache.get(key)
},
set ({ url } = {}, value) {
const key = getArchiveCacheKey({ url })
return archiveCache.set(key, value)
}
})
}, {
name: 'cache',
dependencies: ['redis']
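A rough usage sketch of the new archiveCache decorator (not part of this commit; the url and the stored object are assumptions). Entries are keyed by URL only, so a value stored with a given { url } comes back from get with the same argument until the 5 minute TTL lapses.

// Illustrative only: url and the article object are placeholders.
const url = 'https://example.com/article'
fastify.archiveCache.set({ url }, { title: 'Example', content: '<p>…</p>' })
const hit = fastify.archiveCache.get({ url }) // returns the stored object until the TTL expires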
44 changes: 44 additions & 0 deletions plugins/extract-archive.js
@@ -0,0 +1,44 @@
import fp from 'fastify-plugin'
import { JSDOM } from 'jsdom'
import { Readability } from '@mozilla/readability'
import createDOMPurify from 'dompurify'

/**
* This plugin adds readability-extract fetching helpers
*/
export default fp(async function (fastify, opts) {
fastify.decorate('extractArchive', async function extractArchive ({
url,
initialHTML // optionally pass HTML here if it has already been fetched
}) {
const endTimer = fastify.metrics.archiveSeconds.startTimer()
try {
const cacheKey = { url }

const cachedRBArchive = fastify.archiveCache.get(cacheKey)

if (cachedRBArchive) {
return cachedRBArchive
}

const html = initialHTML ?? await fastify.fetchHTML({ url })

const { document } = (new JSDOM(html, { url })).window
const reader = new Readability(document)
const article = reader.parse()

const dpWindow = new JSDOM('').window
const DOMPurify = createDOMPurify(dpWindow)
article.content = DOMPurify.sanitize(article.content)

fastify.archiveCache.set(cacheKey, article)

return article
} finally {
endTimer()
}
})
}, {
name: 'extract-archive',
dependencies: ['env', 'prom', 'cache', 'fetch-html']
})
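A hedged sketch of a call site for the new extractArchive decorator (the route below is illustrative and not part of this commit; the /extract path and request shape are assumptions):

// Illustrative only: extract a readability archive for a submitted URL.
fastify.post('/extract', async function (request, reply) {
  const { url } = request.body
  const article = await fastify.extractArchive({ url })
  // article follows @mozilla/readability's parse() shape: title, content, excerpt, byline, length, etc.
  return { title: article.title, excerpt: article.excerpt, length: article.length }
})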
46 changes: 46 additions & 0 deletions plugins/fetch-html.js
@@ -0,0 +1,46 @@
import fp from 'fastify-plugin'
import { request as undiciRequest } from 'undici'

// Sorry newspapers, no cheating
const GOOGLE_BOT_UA = 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36'

const uaHacks = {
'twitter.com': GOOGLE_BOT_UA,
'mobile.twitter.com': GOOGLE_BOT_UA
}

/**
* This plugin adds a function to fetch html
*/
export default fp(async function (fastify, opts) {
fastify.decorate('fetchHTML', async function fetchHTML ({
url
}) {
const requestURL = new URL(url)

const ua = uaHacks[requestURL.hostname] ?? `Breadcrum / ${fastify.pkg.version}`

const response = await undiciRequest(requestURL, {
headers: {
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'user-agent': ua
},
maxRedirections: 3,
autoSelectFamily: true,
headersTimeout: 15000,
bodyTimeout: 15000
})

if (response.statusCode > 299) {
const text = await response.body.text()
throw new Error(`Fetch HTML error (${response.statusCode}): ` + text)
}

const html = await response.body.text()

return html
})
}, {
name: 'fetch-html',
dependencies: ['env']
})
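The decorator serves the Googlebot user agent only to the hosts listed in uaHacks and identifies as Breadcrum everywhere else. A sketch of combining it with extractArchive so the page is fetched only once (illustrative, not part of this commit; the url is an assumption):

// Illustrative only, inside an async plugin or route handler.
const url = 'https://example.com/article'
const html = await fastify.fetchHTML({ url })
// Reuse the fetched HTML so extractArchive skips its own network round trip.
const article = await fastify.extractArchive({ url, initialHTML: html })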
10 changes: 10 additions & 0 deletions plugins/prom.js
@@ -69,6 +69,16 @@ export default fp(async function (fastify, opts) {
help: 'The time it takes for site meta extraction'
})

fastify.metrics.archiveSeconds = new fastify.metrics.client.Histogram({
name: 'breadcrum_archive_seconds',
help: 'The time it takes for readability archive extraction'
})

fastify.metrics.archiveCounter = new fastify.metrics.client.Counter({
name: 'breadcrum_archive_created_total',
help: 'The number of times a readability archive is created'
})

const promServer = Fastify({
logger: true
})
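The archiveCounter metric is declared but not incremented anywhere in this diff; a likely call site (an assumption, not shown in this commit) is wherever an archive row is created:

// Assumed usage only: bump the counter when a new archive is created.
fastify.metrics.archiveCounter.inc()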
35 changes: 3 additions & 32 deletions plugins/site-meta.js
@@ -1,16 +1,7 @@
import fp from 'fastify-plugin'
import { request as undiciRequest } from 'undici'
import { JSDOM } from 'jsdom'
import { extractMeta } from '@breadcrum/extract-meta'

// Sorry newspapers, no cheating
const GOOGLE_BOT_UA = 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36'

const uaHacks = {
'twitter.com': GOOGLE_BOT_UA,
'mobile.twitter.com': GOOGLE_BOT_UA
}

/**
* This plugin adds site-metadata fetching helpers
*/
@@ -20,8 +11,6 @@ export default fp(async function (fastify, opts) {
}) {
const endTimer = fastify.metrics.siteMetaSeconds.startTimer()
try {
const requestURL = new URL(url)

const cacheKey = { url }

const cachedMeta = fastify.siteMetaCache.get(cacheKey)
@@ -30,37 +19,19 @@ export default fp(async function (fastify, opts) {
return cachedMeta
}

const ua = uaHacks[requestURL.hostname] ?? `Breadcrum / ${fastify.pkg.version}`

const response = await undiciRequest(requestURL, {
headers: {
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'user-agent': ua
},
maxRedirections: 3,
autoSelectFamily: true,
headersTimeout: 15000,
bodyTimeout: 15000
})

if (response.statusCode > 299) {
const text = await response.body.text()
throw new Error(`site metadata error (${response.statusCode}): ` + text)
}

const html = await response.body.text()
const html = await fastify.fetchHTML({ url })

const { document } = (new JSDOM(html, { url })).window
const metadata = extractMeta(document)

fastify.siteMetaCache.set(cacheKey, metadata)

return metadata
return { ...metadata, html }
} finally {
endTimer()
}
})
}, {
name: 'site-metadata',
dependencies: ['env', 'prom', 'cache', 'prom']
dependencies: ['env', 'prom', 'cache', 'fetch-html']
})
21 changes: 21 additions & 0 deletions routes/api/archives/archive-props.js
@@ -0,0 +1,21 @@
export const createArchiveProp = {
createArchive: {
anyOf: [
{
type: 'object',
properties: {
url: { type: 'string', format: 'uri' }
},
required: [
'url'
]
},
{
type: 'boolean'
},
{
type: 'null'
}
]
}
}
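The anyOf schema accepts an object carrying a URL, a bare boolean, or null. Example payload shapes it is meant to validate (illustrative values, not part of this commit):

// Illustrative payload shapes for a request body that includes createArchive.
const withUrl = { createArchive: { url: 'https://example.com/article' } } // archive a specific URL
const asFlag = { createArchive: true } // presumably archives the bookmark's own URL
const omitted = { createArchive: null } // no archive requested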
20 changes: 20 additions & 0 deletions routes/api/archives/archive-query-create.js
@@ -0,0 +1,20 @@
/* eslint-disable camelcase */
import SQL from '@nearform/sql'

export async function createArchive ({
client,
userID,
bookmarkId,
bookmarkTitle,
url
}) {
const createArchiveQuery = SQL`
INSERT INTO archives (owner_id, bookmark_id, url, title)
VALUES (${userID}, ${bookmarkId}, ${url}, ${bookmarkTitle})
returning id, url, title;
`

const archiveResults = await client.query(createArchiveQuery)
const archive = archiveResults.rows[0]
return archive
}
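A sketch of calling createArchive from inside a bookmark-creation transaction, as used conceptually by the route layer (illustrative; client, request, and bookmark are assumptions and the surrounding route is not part of this file):

// Illustrative only: insert the archive row using an existing pg transaction client.
const archive = await createArchive({
  client, // pg client checked out for the transaction
  userID: request.user.id, // assumed auth decoration
  bookmarkId: bookmark.id,
  bookmarkTitle: bookmark.title,
  url: bookmark.url
})
// archive.id can then be handed to resolveArchive (routes/api/archives/resolve-archive.js below).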
3 changes: 3 additions & 0 deletions routes/api/archives/index.js
@@ -0,0 +1,3 @@
export default async function archiveRoutes (fastify, opts) {
await Promise.all([])
}
58 changes: 58 additions & 0 deletions routes/api/archives/resolve-archive.js
@@ -0,0 +1,58 @@
import SQL from '@nearform/sql'

export async function resolveArchive ({
fastify,
pg, // optional tx client
log, // optional request logging instance
userID,
bookmarkTitle,
archiveID,
url,
initialHTML
}) {
pg = pg ?? fastify.pg
log = log ?? fastify.log

try {
const article = await fastify.extractArchive({
url,
initialHTML
})

// log.info({ article })

const archiveData = []

archiveData.push(SQL`ready = true`)
archiveData.push(SQL`url = ${url}`)
if ('title' in article) archiveData.push(SQL`title = ${article.title}`)
if ('siteName' in article) archiveData.push(SQL`site_name = ${article.siteName}`)
if ('content' in article) archiveData.push(SQL`html_content = ${article.content}`)
if ('length' in article) archiveData.push(SQL`length = ${article.length}`)
if ('excerpt' in article) archiveData.push(SQL`excerpt = ${article.excerpt}`)
if ('byline' in article) archiveData.push(SQL`byline = ${article.byline}`)
if ('dir' in article) archiveData.push(SQL`direction = ${article.dir}`)
if ('lang' in article) archiveData.push(SQL`language = ${article.lang}`)

const query = SQL`
update archives
set ${SQL.glue(archiveData, ' , ')}
where id = ${archiveID}
and owner_id = ${userID};
`

const archiveResult = await pg.query(query)
archiveResult.rows.pop()

log.info(`Archive ${archiveID} for ${url} is ready.`)
} catch (err) {
log.error(`Error resolving archive ${archiveID}`)
log.error(err)
const errorQuery = SQL`
update archives
set error = ${err.stack}
where id = ${archiveID}
and owner_id = ${userID};`
await pg.query(errorQuery)
}
}
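resolveArchive handles its own errors by writing them back to the archive row, so it can run detached from the request that created the row. A hedged sketch of that fire-and-forget pattern (request, bookmark, archive, and html are assumptions, not part of this commit):

// Illustrative only: kick off archive resolution without blocking the response.
resolveArchive({
  fastify,
  log: request.log,
  userID: request.user.id, // assumed auth decoration
  bookmarkTitle: bookmark.title,
  archiveID: archive.id,
  url: bookmark.url,
  initialHTML: html // optional; reuse HTML already fetched for site metadata
}).catch(err => request.log.error(err, 'unexpected archive resolution failure'))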
