Skip to content

Commit

Permalink
Add full text search
Browse files Browse the repository at this point in the history
Add full text search for bookmarks, episodes, archives, and users.
  • Loading branch information
bcomnes committed Sep 17, 2023
1 parent 8223c30 commit 02dcc26
Show file tree
Hide file tree
Showing 61 changed files with 1,913 additions and 188 deletions.
2 changes: 1 addition & 1 deletion app.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ export const options = {
trustProxy: true,
genReqId: function (req) { return hid() },
logger: {
mixin () {
mixin () { // TODO: move this to the log ingestor somehow?
return {
service: 'breadcrum.net',
ddsource: 'nodejs',
Expand Down
118 changes: 118 additions & 0 deletions migrations/018.do.fts.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
-- Bookmarks FTS
-- Weighted full-text document per bookmark: the user's note ranks highest,
-- then the title, then the summary and URL.
alter table bookmarks
add column tsv tsvector;

create index bookmarks_tsv_idx on bookmarks using gin(tsv);

-- Backfill existing rows (runs before the trigger is installed).
update bookmarks set tsv =
setweight(to_tsvector('english', coalesce(note,'')), 'A') ||
setweight(to_tsvector('english', coalesce(title,'')), 'B') ||
setweight(to_tsvector('english', coalesce(summary,'')), 'C') ||
setweight(to_tsvector('english', coalesce(url,'')), 'C');

-- Trigger function keeping tsv in sync on writes.
-- Uses lowercase `new` consistently (the original mixed `new`/`NEW`;
-- Postgres treats them identically, but the file convention is lowercase).
create function bookmarks_tsv_update() returns trigger as $$
begin
new.tsv :=
  setweight(to_tsvector('english', coalesce(new.note,'')), 'A') ||
  setweight(to_tsvector('english', coalesce(new.title,'')), 'B') ||
  setweight(to_tsvector('english', coalesce(new.summary,'')), 'C') ||
  setweight(to_tsvector('english', coalesce(new.url,'')), 'C');
return new;
end
$$ language plpgsql;

-- `update of <cols>` skips recomputing the tsvector when unrelated
-- columns change, instead of firing on every update.
create trigger bookmarks_tsv_trigger
before insert or update of note, title, summary, url
on bookmarks for each row execute function bookmarks_tsv_update();

-- Archives FTS
-- Weighted full-text document per archive: title ranks highest, then the
-- extracted text content, then the excerpt, with site metadata and URL last.
alter table archives
add column tsv tsvector;

create index archives_tsv_idx on archives using gin(tsv);

-- Backfill existing rows (runs before the trigger is installed).
update archives set tsv =
setweight(to_tsvector('english', coalesce(title,'')), 'A') ||
setweight(to_tsvector('english', coalesce(text_content,'')), 'B') ||
setweight(to_tsvector('english', coalesce(excerpt,'')), 'C') ||
setweight(to_tsvector('english', coalesce(site_name,'')), 'D') ||
setweight(to_tsvector('english', coalesce(byline,'')), 'D') ||
setweight(to_tsvector('english', coalesce(url,'')), 'D');

-- Trigger function keeping tsv in sync on writes.
-- Uses lowercase `new` consistently (the original mixed `new`/`NEW`).
create function archives_tsv_update() returns trigger as $$
begin
new.tsv :=
  setweight(to_tsvector('english', coalesce(new.title,'')), 'A') ||
  setweight(to_tsvector('english', coalesce(new.text_content,'')), 'B') ||
  setweight(to_tsvector('english', coalesce(new.excerpt,'')), 'C') ||
  setweight(to_tsvector('english', coalesce(new.site_name,'')), 'D') ||
  setweight(to_tsvector('english', coalesce(new.byline,'')), 'D') ||
  setweight(to_tsvector('english', coalesce(new.url,'')), 'D');
return new;
end
$$ language plpgsql;

-- `update of <cols>` skips recomputing the tsvector when unrelated
-- columns change, instead of firing on every update.
create trigger archives_tsv_trigger
before insert or update of title, text_content, excerpt, site_name, byline, url
on archives for each row execute function archives_tsv_update();

-- Episodes FTS
-- Weighted full-text document per episode: title ranks highest, then the
-- text content, then the author, with filename and URL last.
alter table episodes
add column tsv tsvector;

create index episodes_tsv_idx on episodes using gin(tsv);

-- Backfill existing rows (runs before the trigger is installed).
update episodes set tsv =
setweight(to_tsvector('english', coalesce(title,'')), 'A') ||
setweight(to_tsvector('english', coalesce(text_content,'')), 'B') ||
setweight(to_tsvector('english', coalesce(author_name,'')), 'C') ||
setweight(to_tsvector('english', coalesce(filename,'')), 'D') ||
setweight(to_tsvector('english', coalesce(url,'')), 'D');

-- Trigger function keeping tsv in sync on writes.
-- Uses lowercase `new` consistently (the original mixed `new`/`NEW`).
create function episodes_tsv_update() returns trigger as $$
begin
new.tsv :=
  setweight(to_tsvector('english', coalesce(new.title,'')), 'A') ||
  setweight(to_tsvector('english', coalesce(new.text_content,'')), 'B') ||
  setweight(to_tsvector('english', coalesce(new.author_name,'')), 'C') ||
  setweight(to_tsvector('english', coalesce(new.filename,'')), 'D') ||
  setweight(to_tsvector('english', coalesce(new.url,'')), 'D');
return new;
end
$$ language plpgsql;

-- `update of <cols>` skips recomputing the tsvector when unrelated
-- columns change, instead of firing on every update.
create trigger episodes_tsv_trigger
before insert or update of title, text_content, author_name, filename, url
on episodes for each row execute function episodes_tsv_update();

-- Users FTS
-- Weighted full-text document per user: username and admin notes rank
-- highest, email addresses second.
alter table users
add column tsv tsvector;

create index users_tsv_idx on users using gin(tsv);

-- Backfill existing rows (runs before the trigger is installed).
update users set tsv =
setweight(to_tsvector('english', coalesce(username,'')), 'A') ||
setweight(to_tsvector('english', coalesce(disabled_reason,'')), 'A') ||
setweight(to_tsvector('english', coalesce(internal_note,'')), 'A') ||
setweight(to_tsvector('english', coalesce(email,'')), 'B') ||
setweight(to_tsvector('english', coalesce(pending_email_update,'')), 'B');

-- Trigger function keeping tsv in sync on writes.
-- Uses lowercase `new` consistently (the original mixed `new`/`NEW`).
create function users_tsv_update() returns trigger as $$
begin
new.tsv :=
  setweight(to_tsvector('english', coalesce(new.username,'')), 'A') ||
  setweight(to_tsvector('english', coalesce(new.disabled_reason,'')), 'A') ||
  setweight(to_tsvector('english', coalesce(new.internal_note,'')), 'A') ||
  setweight(to_tsvector('english', coalesce(new.email,'')), 'B') ||
  setweight(to_tsvector('english', coalesce(new.pending_email_update,'')), 'B');
return new;
end
$$ language plpgsql;

-- `update of <cols>` skips recomputing the tsvector when unrelated
-- columns change, instead of firing on every update.
create trigger users_tsv_trigger
before insert or update of username, disabled_reason, internal_note, email, pending_email_update
on users for each row execute function users_tsv_update();
23 changes: 23 additions & 0 deletions migrations/018.undo.fts.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- FTS down migration: remove all full text search artifacts.
-- Grouped by statement kind rather than by table; triggers are dropped
-- before the functions they call so no cascade is needed, and `if exists`
-- keeps the script idempotent.

-- Drop the sync triggers first (they depend on the functions below).
drop trigger if exists bookmarks_tsv_trigger on bookmarks;
drop trigger if exists archives_tsv_trigger on archives;
drop trigger if exists episodes_tsv_trigger on episodes;
drop trigger if exists users_tsv_trigger on users;

-- Then the trigger functions.
drop function if exists bookmarks_tsv_update();
drop function if exists archives_tsv_update();
drop function if exists episodes_tsv_update();
drop function if exists users_tsv_update();

-- Then the GIN indexes.
drop index if exists bookmarks_tsv_idx;
drop index if exists archives_tsv_idx;
drop index if exists episodes_tsv_idx;
drop index if exists users_tsv_idx;

-- Finally the tsvector columns themselves.
alter table bookmarks drop column if exists tsv;
alter table archives drop column if exists tsv;
alter table episodes drop column if exists tsv;
alter table users drop column if exists tsv;
6 changes: 5 additions & 1 deletion plugins/helmet.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,13 @@ import fp from 'fastify-plugin'
export default fp(async function (fastify, opts) {
// This is also customized in the ./static.js plugin
fastify.register(import('@fastify/helmet'), {
crossOriginResourcePolicy: { policy: 'cross-origin' },
crossOriginEmbedderPolicy: { policy: 'credentialless' },
contentSecurityPolicy: {
directives: {
'upgrade-insecure-requests': fastify.config.ENV !== 'production' ? null : []
'upgrade-insecure-requests': fastify.config.ENV !== 'production' ? null : [],
'media-src': '*',
'img-src': '*'
}
}
})
Expand Down
50 changes: 0 additions & 50 deletions plugins/static.js
Original file line number Diff line number Diff line change
Expand Up @@ -36,56 +36,6 @@ export default fp(async function (fastify, opts) {
...staticOpts
})
}, { prefix: '/admin' })

// Feed Routes modified CSP
const upgradeInsecureRequests = fastify.config.ENV !== 'production' ? null : []

const episodesHeaders = {
contentSecurityPolicy: {
directives: {
'media-src': '*',
'img-src': '*',
'upgrade-insecure-requests': upgradeInsecureRequests
}
}
}
fastify.register(async function (fastify, opts) {
fastify.register(import('@fastify/helmet'), episodesHeaders)
fastify.register(import('@fastify/static'), {
root: path.join(__dirname, '../public/feeds'),
prefix: '/',
...staticOpts
})
}, { prefix: '/feeds' })
// Episodes routes
fastify.register(async function (fastify, opts) {
fastify.register(import('@fastify/helmet'), episodesHeaders)
fastify.register(import('@fastify/static'), {
root: path.join(__dirname, '../public/episodes'),
prefix: '/',
...staticOpts
})
}, { prefix: '/episodes' })

// Archives Routes Modified CSP, COEP, CORP
fastify.register(async function (fastify, opts) {
fastify.register(import('@fastify/helmet'), {
crossOriginResourcePolicy: { policy: 'cross-origin' },
crossOriginEmbedderPolicy: { policy: 'credentialless' },
contentSecurityPolicy: {
directives: {
'media-src': '*',
'img-src': '*',
'upgrade-insecure-requests': upgradeInsecureRequests
}
}
})
fastify.register(import('@fastify/static'), {
root: path.join(__dirname, '../public/archives/view'),
prefix: '/',
...staticOpts
})
}, { prefix: '/archives/view' })
}, {
name: 'static',
dependencies: ['compress', 'auth', 'jwt', 'helmet']
Expand Down
5 changes: 5 additions & 0 deletions routes/api/admin/search/users/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
export default async function searchAdminUsersRoutes (fastify, opts) {
  // Placeholder: no admin user-search routes are registered yet.
  // Route registration promises will be collected here and awaited together.
  const registrations = []
  await Promise.all(registrations)
}
73 changes: 63 additions & 10 deletions routes/api/archives/archive-query-get.js
Original file line number Diff line number Diff line change
@@ -1,22 +1,48 @@
import SQL from '@nearform/sql'

export function getArchivesQuery ({
/**
* @typedef {import('@nearform/sql').SqlStatement} SqlStatement
*/

/**
* Generates an SQL query for fetching archive properties based on various filters.
*
* @export
* @param {Object} options - The options object containing filter and query properties.
* @param {boolean} [options.fullArchives] - Whether to include full HTML content of the archive.
* @param {string|number} options.ownerId - The owner ID to filter the archives and bookmarks.
* @param {boolean} [options.sensitive] - Whether to include sensitive bookmarks.
* @param {boolean} [options.toread] - Whether to include bookmarks marked "to read."
* @param {boolean} [options.starred] - Whether to include starred bookmarks.
* @param {boolean} [options.ready] - Whether the archive is ready.
* @param {string|number} [options.archiveId] - Specific archive ID to fetch.
* @param {string|number} [options.bookmarkId] - Specific bookmark ID associated with an archive to fetch.
* @param {string|number} [options.before] - Timestamp to fetch archives created before this time.
 * @param {string} [options.query] - Text search query used for ranking.
 * @param {boolean} [options.includeRank] - Whether to include a rank column computed from the text search query.
*
* @returns {SqlStatement} The generated SQL query.
*/
export function archivePropsQuery ({
fullArchives,
ownerId,
archiveId,
bookmarkId,
before,
sensitive,
toread,
starred,
ready,
perPage,
fullArchives
archiveId,
bookmarkId,
before,
query,
includeRank
}) {
const archivesQuery = SQL`
return SQL`
select
ar.id,
ar.created_at,
ar.updated_at,
${includeRank ? SQL`ts_rank(ar.tsv, websearch_to_tsquery('english', ${query})) AS rank,` : SQL``}
ar.url,
ar.title,
coalesce (ar.title, bm.title) as display_title,
Expand Down Expand Up @@ -47,13 +73,40 @@ export function getArchivesQuery ({
on ar.bookmark_id = bm.id
where ar.owner_id = ${ownerId}
and bm.owner_id = ${ownerId}
${archiveId ? SQL`and ar.id = ${archiveId}` : SQL``}
${bookmarkId ? SQL`and ar.bookmark_id = ${bookmarkId}` : SQL``}
${before ? SQL`and ar.created_at < ${before}` : SQL``}
${!sensitive ? SQL`and sensitive = false` : SQL``}
${toread ? SQL`and toread = true` : SQL``}
${starred ? SQL`and starred = true` : SQL``}
${ready != null ? SQL`and ready = ${ready}` : SQL``}
${archiveId ? SQL`and ar.id = ${archiveId}` : SQL``}
${bookmarkId ? SQL`and ar.bookmark_id = ${bookmarkId}` : SQL``}
${before ? SQL`and ar.created_at < ${before}` : SQL``}
`
}

export function getArchivesQuery ({
ownerId,
archiveId,
bookmarkId,
before,
sensitive,
toread,
starred,
ready,
perPage,
fullArchives
}) {
const archivesQuery = SQL`
${archivePropsQuery({
fullArchives,
ownerId,
sensitive,
toread,
starred,
ready,
archiveId,
bookmarkId,
before
})}
order by ar.created_at desc, ar.url desc, bm.title desc
${perPage != null ? SQL`fetch first ${perPage} rows only` : SQL``}
`
Expand Down

0 comments on commit 02dcc26

Please sign in to comment.