Add a button to create an archive
bcomnes committed Mar 21, 2023
1 parent 811ca79 commit 71d0764
Showing 16 changed files with 349 additions and 41 deletions.
7 changes: 0 additions & 7 deletions app.js
@@ -7,10 +7,6 @@ const __dirname = desm(import.meta.url)
const hid = hyperid()

export default async function App (fastify, opts) {
// Place here your custom code!

// Do not touch the following lines

const testPattern = /.*(test|spec).js/
// This loads all plugins defined in plugins
// those should be support plugins that are reused
@@ -33,9 +29,6 @@ export default async function App (fastify, opts) {
ignorePattern: testPattern,
options: Object.assign({}, opts)
})

// await fastify.after()
// console.log(fastify.printRoutes())
}

export const options = {
36 changes: 36 additions & 0 deletions migrations/013.do.readability-archive.sql
@@ -0,0 +1,36 @@
create table archives (
id uuid primary key default gen_random_uuid(),
owner_id uuid not null,
bookmark_id uuid not null,
created_at timestamptz not null default now(),
updated_at timestamptz,
url text not null,
title text,
site_name text,
html_content text,
length bigint,
excerpt text,
byline text,
direction text,
language text,
ready boolean not null default false,
error text,

constraint fk_owner
foreign key(owner_id)
references users(id)
on delete cascade,

constraint fk_bookmark
foreign key(bookmark_id)
references bookmarks(id)
on delete cascade
);

create index idx_archives_owner ON archives(owner_id);
create index idx_archives_bookmark ON archives(bookmark_id);

create trigger set_timestamp_archives
before update on archives
for each row
execute procedure trigger_set_timestamp();
4 changes: 4 additions & 0 deletions migrations/013.undo.readability-archive.sql
@@ -0,0 +1,4 @@
drop trigger if exists set_timestamp_archives on archives;
drop index if exists idx_archives_bookmark;
drop index if exists idx_archives_owner;
drop table if exists archives;
2 changes: 2 additions & 0 deletions package.json
@@ -29,12 +29,14 @@
"@fastify/static": "^6.4.0",
"@fastify/swagger": "^8.3.1",
"@fastify/swagger-ui": "^1.3.0",
"@mozilla/readability": "^0.4.2",
"@nearform/sql": "^1.5.0",
"@siteup/cli": "^2.2.4",
"abstract-cache-redis": "^2.0.0",
"classnames": "^2.3.1",
"clean-deep": "^3.4.0",
"desm": "^1.2.0",
"dompurify": "^3.0.1",
"dotenv": "^16.0.1",
"fast-json-body": "^1.1.0",
"fastify": "^4.0.1",
29 changes: 29 additions & 0 deletions plugins/cache.js
@@ -120,6 +120,35 @@ export default fp(async function (fastify, opts) {
return siteMetaCache.set(key, value)
}
})

// For caching server-extracted readability archives
const archiveCache = new LRU({
max: 50,
ttl: 1000 * 60 * 5, // 5 mins
updateAgeOnGet: false,
ttlAutopurge: true
})

function getArchiveCacheKey ({
url
}) {
assert(url, 'url required')
return [
'readability',
url
].join(':')
}

fastify.decorate('archiveCache', {
get ({ url } = {}) {
const key = getArchiveCacheKey({ url })
return archiveCache.get(key)
},
set ({ url } = {}, value) {
const key = getArchiveCacheKey({ url })
return archiveCache.set(key, value)
}
})
}, {
name: 'cache',
dependencies: ['redis']
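A rough usage sketch of the new archiveCache decorator (not part of this commit; the url and the stored object are assumptions). Entries are keyed by URL only, so a value stored with a given { url } comes back from get with the same argument until the 5 minute TTL lapses.

// Illustrative only: url and the article object are placeholders.
const url = 'https://example.com/article'
fastify.archiveCache.set({ url }, { title: 'Example', content: '<p>…</p>' })
const hit = fastify.archiveCache.get({ url }) // returns the stored object until the TTL expires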
44 changes: 44 additions & 0 deletions plugins/extract-archive.js
@@ -0,0 +1,44 @@
import fp from 'fastify-plugin'
import { JSDOM } from 'jsdom'
import { Readability } from '@mozilla/readability'
import createDOMPurify from 'dompurify'

/**
* This plugin adds readability-extract fetching helpers
*/
export default fp(async function (fastify, opts) {
fastify.decorate('extractArchive', async function extractArchive ({
url,
initialHTML // optionally pass HTML here if it has already been fetched
}) {
const endTimer = fastify.metrics.archiveSeconds.startTimer()
try {
const cacheKey = { url }

const cachedRBArchive = fastify.archiveCache.get(cacheKey)

if (cachedRBArchive) {
return cachedRBArchive
}

const html = initialHTML ?? await fastify.fetchHTML({ url })

const { document } = (new JSDOM(html, { url })).window
const reader = new Readability(document)
const article = reader.parse()

const dpWindow = new JSDOM('').window
const DOMPurify = createDOMPurify(dpWindow)
article.content = DOMPurify.sanitize(article.content)

fastify.archiveCache.set(cacheKey, article)

return article
} finally {
endTimer()
}
})
}, {
name: 'extract-archive',
dependencies: ['env', 'prom', 'cache', 'fetch-html']
})
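A hedged sketch of a call site for the new extractArchive decorator (the route below is illustrative and not part of this commit; the /extract path and request shape are assumptions):

// Illustrative only: extract a readability archive for a submitted URL.
fastify.post('/extract', async function (request, reply) {
  const { url } = request.body
  const article = await fastify.extractArchive({ url })
  // article follows @mozilla/readability's parse() shape: title, content, excerpt, byline, length, etc.
  return { title: article.title, excerpt: article.excerpt, length: article.length }
})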
46 changes: 46 additions & 0 deletions plugins/fetch-html.js
@@ -0,0 +1,46 @@
import fp from 'fastify-plugin'
import { request as undiciRequest } from 'undici'

// Sorry newspapers, no cheating
const GOOGLE_BOT_UA = 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36'

const uaHacks = {
'twitter.com': GOOGLE_BOT_UA,
'mobile.twitter.com': GOOGLE_BOT_UA
}

/**
* This plugin adds a function to fetch html
*/
export default fp(async function (fastify, opts) {
fastify.decorate('fetchHTML', async function fetchHTML ({
url
}) {
const requestURL = new URL(url)

const ua = uaHacks[requestURL.hostname] ?? `Breadcrum / ${fastify.pkg.version}`

const response = await undiciRequest(requestURL, {
headers: {
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'user-agent': ua
},
maxRedirections: 3,
autoSelectFamily: true,
headersTimeout: 15000,
bodyTimeout: 15000
})

if (response.statusCode > 299) {
const text = await response.body.text()
throw new Error(`Fetch HTML error (${response.statusCode}): ` + text)
}

const html = await response.body.text()

return html
})
}, {
name: 'fetch-html',
dependencies: ['env']
})
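The decorator serves the Googlebot user agent only to the hosts listed in uaHacks and identifies as Breadcrum everywhere else. A sketch of combining it with extractArchive so the page is fetched only once (illustrative, not part of this commit; the url is an assumption):

// Illustrative only, inside an async plugin or route handler.
const url = 'https://example.com/article'
const html = await fastify.fetchHTML({ url })
// Reuse the fetched HTML so extractArchive skips its own network round trip.
const article = await fastify.extractArchive({ url, initialHTML: html })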
10 changes: 10 additions & 0 deletions plugins/prom.js
@@ -69,6 +69,16 @@ export default fp(async function (fastify, opts) {
help: 'The time it takes for site meta extraction'
})

fastify.metrics.archiveSeconds = new fastify.metrics.client.Histogram({
name: 'breadcrum_archive_seconds',
help: 'The time it takes for readability archive extraction'
})

fastify.metrics.archiveCounter = new fastify.metrics.client.Counter({
name: 'breadcrum_archive_created_total',
help: 'The number of times a readability archive is created'
})

const promServer = Fastify({
logger: true
})
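The archiveCounter metric is declared but not incremented anywhere in this diff; a likely call site (an assumption, not shown in this commit) is wherever an archive row is created:

// Assumed usage only: bump the counter when a new archive is created.
fastify.metrics.archiveCounter.inc()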
35 changes: 3 additions & 32 deletions plugins/site-meta.js
@@ -1,16 +1,7 @@
import fp from 'fastify-plugin'
import { request as undiciRequest } from 'undici'
import { JSDOM } from 'jsdom'
import { extractMeta } from '@breadcrum/extract-meta'

// Sorry newspapers, no cheating
const GOOGLE_BOT_UA = 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36'

const uaHacks = {
'twitter.com': GOOGLE_BOT_UA,
'mobile.twitter.com': GOOGLE_BOT_UA
}

/**
* This plugin adds site-metadata fetching helpers
*/
@@ -20,8 +11,6 @@ export default fp(async function (fastify, opts) {
}) {
const endTimer = fastify.metrics.siteMetaSeconds.startTimer()
try {
const requestURL = new URL(url)

const cacheKey = { url }

const cachedMeta = fastify.siteMetaCache.get(cacheKey)
@@ -30,37 +19,19 @@ export default fp(async function (fastify, opts) {
return cachedMeta
}

const ua = uaHacks[requestURL.hostname] ?? `Breadcrum / ${fastify.pkg.version}`

const response = await undiciRequest(requestURL, {
headers: {
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'user-agent': ua
},
maxRedirections: 3,
autoSelectFamily: true,
headersTimeout: 15000,
bodyTimeout: 15000
})

if (response.statusCode > 299) {
const text = await response.body.text()
throw new Error(`site metadata error (${response.statusCode}): ` + text)
}

const html = await response.body.text()
const html = await fastify.fetchHTML({ url })

const { document } = (new JSDOM(html, { url })).window
const metadata = extractMeta(document)

fastify.siteMetaCache.set(cacheKey, metadata)

return metadata
return { ...metadata, html }
} finally {
endTimer()
}
})
}, {
name: 'site-metadata',
dependencies: ['env', 'prom', 'cache', 'prom']
dependencies: ['env', 'prom', 'cache', 'fetch-html']
})
21 changes: 21 additions & 0 deletions routes/api/archives/archive-props.js
@@ -0,0 +1,21 @@
export const createArchiveProp = {
createArchive: {
anyOf: [
{
type: 'object',
properties: {
url: { type: 'string', format: 'uri' }
},
required: [
'url'
]
},
{
type: 'boolean'
},
{
type: 'null'
}
]
}
}
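The anyOf schema accepts an object carrying a URL, a bare boolean, or null. Example payload shapes it is meant to validate (illustrative values, not part of this commit):

// Illustrative payload shapes for a request body that includes createArchive.
const withUrl = { createArchive: { url: 'https://example.com/article' } } // archive a specific URL
const asFlag = { createArchive: true } // presumably archives the bookmark's own URL
const omitted = { createArchive: null } // no archive requested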
20 changes: 20 additions & 0 deletions routes/api/archives/archive-query-create.js
@@ -0,0 +1,20 @@
/* eslint-disable camelcase */
import SQL from '@nearform/sql'

export async function createArchive ({
client,
userID,
bookmarkId,
bookmarkTitle,
url
}) {
const createArchiveQuery = SQL`
INSERT INTO archives (owner_id, bookmark_id, url, title)
VALUES (${userID}, ${bookmarkId}, ${url}, ${bookmarkTitle})
returning id, url, title;
`

const archiveResults = await client.query(createArchiveQuery)
const archive = archiveResults.rows[0]
return archive
}
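A sketch of calling createArchive from inside a bookmark-creation transaction, as used conceptually by the route layer (illustrative; client, request, and bookmark are assumptions and the surrounding route is not part of this file):

// Illustrative only: insert the archive row using an existing pg transaction client.
const archive = await createArchive({
  client, // pg client checked out for the transaction
  userID: request.user.id, // assumed auth decoration
  bookmarkId: bookmark.id,
  bookmarkTitle: bookmark.title,
  url: bookmark.url
})
// archive.id can then be handed to resolveArchive (routes/api/archives/resolve-archive.js below).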
3 changes: 3 additions & 0 deletions routes/api/archives/index.js
@@ -0,0 +1,3 @@
export default async function archiveRoutes (fastify, opts) {
await Promise.all([])
}
58 changes: 58 additions & 0 deletions routes/api/archives/resolve-archive.js
@@ -0,0 +1,58 @@
import SQL from '@nearform/sql'

export async function resolveArchive ({
fastify,
pg, // optional tx client
log, // optional request logging instance
userID,
bookmarkTitle,
archiveID,
url,
initialHTML
}) {
pg = pg ?? fastify.pg
log = log ?? fastify.log

try {
const article = await fastify.extractArchive({
url,
initialHTML
})

// log.info({ article })

const archiveData = []

archiveData.push(SQL`ready = true`)
archiveData.push(SQL`url = ${url}`)
if ('title' in article) archiveData.push(SQL`title = ${article.title}`)
if ('siteName' in article) archiveData.push(SQL`site_name = ${article.siteName}`)
if ('content' in article) archiveData.push(SQL`html_content = ${article.content}`)
if ('length' in article) archiveData.push(SQL`length = ${article.length}`)
if ('excerpt' in article) archiveData.push(SQL`excerpt = ${article.excerpt}`)
if ('byline' in article) archiveData.push(SQL`byline = ${article.byline}`)
if ('dir' in article) archiveData.push(SQL`direction = ${article.dir}`)
if ('lang' in article) archiveData.push(SQL`language = ${article.lang}`)

const query = SQL`
update archives
set ${SQL.glue(archiveData, ' , ')}
where id = ${archiveID}
and owner_id = ${userID};
`

const archiveResult = await pg.query(query)
archiveResult.rows.pop()

log.info(`Archive ${archiveID} for ${url} is ready.`)
} catch (err) {
log.error(`Error resolving archive ${archiveID}`)
log.error(err)
const errorQuery = SQL`
update archives
set error = ${err.stack}
where id = ${archiveID}
and owner_id = ${userID};`
await pg.query(errorQuery)
}
}
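resolveArchive handles its own errors by writing them back to the archive row, so it can run detached from the request that created the row. A hedged sketch of that fire-and-forget pattern (request, bookmark, archive, and html are assumptions, not part of this commit):

// Illustrative only: kick off archive resolution without blocking the response.
resolveArchive({
  fastify,
  log: request.log,
  userID: request.user.id, // assumed auth decoration
  bookmarkTitle: bookmark.title,
  archiveID: archive.id,
  url: bookmark.url,
  initialHTML: html // optional; reuse HTML already fetched for site metadata
}).catch(err => request.log.error(err, 'unexpected archive resolution failure'))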
