feat(prerenderer): extract lastmod from article:modified_time

nuxt-modules · Nov 13, 2023 · 3ec3667 · 3ec3667
1 parent 8bb8adc
commit 3ec3667
Show file tree

Hide file tree

Showing 4 changed files with 95 additions and 46 deletions.
diff --git a/src/prerender.ts b/src/prerender.ts
@@ -1,6 +1,6 @@
 import { mkdir, writeFile } from 'node:fs/promises'
 import { join } from 'node:path'
-import { parseURL, withBase, withoutLeadingSlash } from 'ufo'
+import { withBase, withoutLeadingSlash } from 'ufo'
 import { assertSiteConfig } from 'nuxt-site-config-kit'
 import { useNuxt } from '@nuxt/kit'
 import type { Nuxt } from '@nuxt/schema'
@@ -9,8 +9,8 @@ import chalk from 'chalk'
 import { dirname } from 'pathe'
 import { build } from 'nitropack'
 import { defu } from 'defu'
-import { extractImages } from './util/extractImages'
-import type { ModuleRuntimeConfig, ResolvedSitemapUrl, SitemapUrl } from './runtime/types'
+import { extractSitemapMetaFromHtml } from './util/extractSitemapMetaFromHtml'
+import type { ModuleRuntimeConfig, SitemapUrl } from './runtime/types'
 
 function formatPrerenderRoute(route: PrerenderRoute) {
   let str = `  ├─ ${route.route} (${route.generateTimeMS}ms)`
@@ -66,23 +66,12 @@ export function setupPrerenderHandler(options: ModuleRuntimeConfig, nuxt: Nuxt =
           route._sitemap._sitemap = iso || code
         }
       }
-      // do a loose regex match, get all alternative link lines
-      // this is not tested
-      const alternatives = (html.match(/<link[^>]+rel="alternate"[^>]+>/g) || [])
-        .map((a) => {
-          // extract the href, lang and type from the link
-          const href = a.match(/href="([^"]+)"/)?.[1]
-          const hreflang = a.match(/hreflang="([^"]+)"/)?.[1]
-          return { hreflang, href: parseURL(href).pathname }
-        })
-        .filter(a => a.hreflang && a.href) as ResolvedSitemapUrl['alternatives']
-      if (alternatives?.length && (alternatives.length > 1 || alternatives?.[0].hreflang !== 'x-default'))
-        route._sitemap.alternatives = alternatives
-
-      if (options.discoverImages) {
-        route._sitemap.images = <Required<ResolvedSitemapUrl>['images']>[...extractImages(html)]
-          .map(loc => ({ loc }))
-      }
+      route._sitemap = defu(extractSitemapMetaFromHtml(html, {
+        images: options.discoverImages,
+        // TODO configurable?
+        lastmod: true,
+        alternatives: true,
+      }), route._sitemap) as SitemapUrl
     })
     nitro.hooks.hook('prerender:done', async () => {
       // force templates to be rebuilt

diff --git a/src/util/extractImages.ts b/src/util/extractImages.ts
diff --git a/src/util/extractSitemapMetaFromHtml.ts b/src/util/extractSitemapMetaFromHtml.ts
@@ -0,0 +1,55 @@
+import { withSiteUrl } from 'nuxt-site-config-kit'
+import { parseURL } from 'ufo'
+import type { ResolvedSitemapUrl, SitemapUrl } from '../runtime/types'
+
+export function extractSitemapMetaFromHtml(html: string, options?: { images?: boolean; lastmod?: boolean; alternatives?: boolean }) { // regex-scrapes sitemap fields (images, lastmod, alternatives) from an HTML string; each one is opt-in via options
+  options = options || { images: true, lastmod: true, alternatives: true } // no options object at all -> all three extractors are enabled
+  const payload: Partial<SitemapUrl> = {} // a key is only set when its extractor actually found something
+  if (options?.images) {
+    const images = new Set<string>() // Set de-duplicates repeated src values
+    const mainRegex = /<main[^>]*>([\s\S]*?)<\/main>/ // non-global + lazy: only the first <main>...</main> section is searched
+    const mainMatch = mainRegex.exec(html)
+    if (mainMatch?.[1] && mainMatch[1].includes('<img')) {
+      // collect the double-quoted src attribute of every <img> inside <main>
+      const imgRegex = /<img[^>]+src="([^">]+)"/g
+      let match
+      // eslint-disable-next-line no-cond-assign
+      while ((match = imgRegex.exec(mainMatch[1])) !== null) {
+        // guard against zero-width matches looping forever. NOTE(review): this pattern always consumes "<img", so the guard looks unreachable - confirm
+        if (match.index === imgRegex.lastIndex)
+          imgRegex.lastIndex++
+        let url = match[1]
+        // src starting with "/" is passed through withSiteUrl; anything else is kept as-is. NOTE(review): protocol-relative "//host/..." also starts with "/" - confirm intended
+        if (url.startsWith('/'))
+          url = withSiteUrl(url)
+        images.add(url)
+      }
+    }
+    if (images.size > 0)
+      payload.images = [...images].map(i => ({ loc: i }))
+  }
+
+  if (options?.lastmod) {
+    // lastmod is read from <meta property="article:modified_time">; both attribute orders (property then content, content then property) are tried
+    const articleModifiedTime = html.match(/<meta[^>]+property="article:modified_time"[^>]+content="([^"]+)"/)?.[1]
+      || html.match(/<meta[^>]+content="([^"]+)"[^>]+property="article:modified_time"/)?.[1]
+    if (articleModifiedTime)
+      payload.lastmod = articleModifiedTime
+  }
+
+  if (options?.alternatives) {
+    // loose regex match that grabs every <link ... rel="alternate" ...> tag
+    // NOTE: this branch is flagged by its author as untested
+    const alternatives = (html.match(/<link[^>]+rel="alternate"[^>]+>/g) || [])
+      .map((a) => {
+        // pull href and hreflang out of the tag; href is then reduced to its pathname
+        const href = a.match(/href="([^"]+)"/)?.[1]
+        const hreflang = a.match(/hreflang="([^"]+)"/)?.[1]
+        return { hreflang, href: parseURL(href).pathname }
+      })
+      .filter(a => a.hreflang && a.href) as ResolvedSitemapUrl['alternatives'] // drop links missing hreflang or with an empty pathname
+    if (alternatives?.length && (alternatives.length > 1 || alternatives?.[0].hreflang !== 'x-default')) // a lone x-default alternate is not recorded
+      payload.alternatives = alternatives
+  }
+  return payload
+}
diff --git a/test/unit/extractSitemapMetaFromHtml.test.ts b/test/unit/extractSitemapMetaFromHtml.test.ts
@@ -0,0 +1,31 @@
+import { describe, expect, it } from 'vitest'
+import { extractSitemapMetaFromHtml } from '../../src/util/extractSitemapMetaFromHtml'
+
+describe('extractSitemapMetaFromHtml', () => {
+  it('lastmod', async () => {
+    // case 1: property attribute before content, non-self-closing <meta> tags; called without options, so every extractor runs
+    const output = extractSitemapMetaFromHtml(`
+    <head>
+      <meta property="article:published_time" content="2021-04-01T00:00:00Z">
+      <meta property="article:modified_time" content="2021-04-02T00:00:00Z">
+    </head>
+`)
+    expect(output).toMatchInlineSnapshot(`
+      {
+        "lastmod": "2021-04-02T00:00:00Z",
+      }
+    `)
+    // case 2: content attribute before property, self-closing <meta/> tags; published_time must still be ignored
+    const output2 = extractSitemapMetaFromHtml(`
+    <head>
+      <meta content="2021-04-01T00:00:00Z" property="article:published_time"/>
+      <meta content="2021-04-02T00:00:00Z" property="article:modified_time"/>
+    </head>
+`)
+    expect(output2).toMatchInlineSnapshot(`
+      {
+        "lastmod": "2021-04-02T00:00:00Z",
+      }
+    `)
+  })
+})