Skip to content

Commit

Permalink
feat(prerenderer): extract lastmod from article:modified_time
Browse files Browse the repository at this point in the history
  • Loading branch information
harlan-zw committed Nov 13, 2023
1 parent 8bb8adc commit 3ec3667
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 46 deletions.
29 changes: 9 additions & 20 deletions src/prerender.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { mkdir, writeFile } from 'node:fs/promises'
import { join } from 'node:path'
import { parseURL, withBase, withoutLeadingSlash } from 'ufo'
import { withBase, withoutLeadingSlash } from 'ufo'
import { assertSiteConfig } from 'nuxt-site-config-kit'
import { useNuxt } from '@nuxt/kit'
import type { Nuxt } from '@nuxt/schema'
Expand All @@ -9,8 +9,8 @@ import chalk from 'chalk'
import { dirname } from 'pathe'
import { build } from 'nitropack'
import { defu } from 'defu'
import { extractImages } from './util/extractImages'
import type { ModuleRuntimeConfig, ResolvedSitemapUrl, SitemapUrl } from './runtime/types'
import { extractSitemapMetaFromHtml } from './util/extractSitemapMetaFromHtml'
import type { ModuleRuntimeConfig, SitemapUrl } from './runtime/types'

function formatPrerenderRoute(route: PrerenderRoute) {
let str = ` ├─ ${route.route} (${route.generateTimeMS}ms)`
Expand Down Expand Up @@ -66,23 +66,12 @@ export function setupPrerenderHandler(options: ModuleRuntimeConfig, nuxt: Nuxt =
route._sitemap._sitemap = iso || code
}
}
// do a loose regex match, get all alternative link lines
// this is not tested
const alternatives = (html.match(/<link[^>]+rel="alternate"[^>]+>/g) || [])
.map((a) => {
// extract the href, lang and type from the link
const href = a.match(/href="([^"]+)"/)?.[1]
const hreflang = a.match(/hreflang="([^"]+)"/)?.[1]
return { hreflang, href: parseURL(href).pathname }
})
.filter(a => a.hreflang && a.href) as ResolvedSitemapUrl['alternatives']
if (alternatives?.length && (alternatives.length > 1 || alternatives?.[0].hreflang !== 'x-default'))
route._sitemap.alternatives = alternatives

if (options.discoverImages) {
route._sitemap.images = <Required<ResolvedSitemapUrl>['images']>[...extractImages(html)]
.map(loc => ({ loc }))
}
route._sitemap = defu(extractSitemapMetaFromHtml(html, {
images: options.discoverImages,
// TODO configurable?
lastmod: true,
alternatives: true,
}), route._sitemap) as SitemapUrl
})
nitro.hooks.hook('prerender:done', async () => {
// force templates to be rebuilt
Expand Down
26 changes: 0 additions & 26 deletions src/util/extractImages.ts

This file was deleted.

55 changes: 55 additions & 0 deletions src/util/extractSitemapMetaFromHtml.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import { withSiteUrl } from 'nuxt-site-config-kit'
import { parseURL } from 'ufo'
import type { ResolvedSitemapUrl, SitemapUrl } from '../runtime/types'

/**
 * Extracts sitemap entry metadata from a rendered HTML document using loose regex matching.
 *
 * Each extractor only runs when its flag in `options` is truthy. When `options` is omitted
 * entirely, all three extractors are enabled; a partially filled `options` object is used as-is
 * (missing flags are treated as disabled).
 *
 * - `images`: collects the `src` of every `<img>` found inside the first `<main>` element.
 *   Values starting with `/` are passed through `withSiteUrl`. Duplicates are removed.
 * - `lastmod`: reads the `content` of a `<meta property="article:modified_time">` tag, with the
 *   `property` and `content` attributes in either order.
 * - `alternatives`: collects `hreflang` / `href` pairs from `<link rel="alternate">` tags, keeping
 *   only the pathname of each `href`. A lone `x-default` entry is ignored.
 *
 * @param html - The HTML source to scan.
 * @param options - Flags selecting which pieces of metadata to extract.
 * @returns An object holding only the keys for which something was found.
 */
export function extractSitemapMetaFromHtml(html: string, options?: { images?: boolean; lastmod?: boolean; alternatives?: boolean }) {
  const enabled = options || { images: true, lastmod: true, alternatives: true }
  const payload: Partial<SitemapUrl> = {}

  if (enabled.images) {
    const images = new Set<string>()
    // only images within the first <main> element are considered
    const mainMatch = /<main[^>]*>([\s\S]*?)<\/main>/.exec(html)
    if (mainMatch?.[1] && mainMatch[1].includes('<img')) {
      // Require whitespace directly before `src` so that attributes merely ending in "src"
      // (e.g. `data-src`) are not mistaken for the real `src`. The lazy prefix makes the
      // first real `src` attribute of the tag win.
      const imgRegex = /<img\b[^>]*?\ssrc="([^">]+)"/g
      for (const match of mainMatch[1].matchAll(imgRegex)) {
        let url = match[1]
        // if the match is relative
        if (url.startsWith('/'))
          url = withSiteUrl(url)
        images.add(url)
      }
    }
    if (images.size > 0)
      payload.images = [...images].map(i => ({ loc: i }))
  }

  if (enabled.lastmod) {
    // the attribute order is not fixed, try `property` first and then `content` first
    const articleModifiedTime = html.match(/<meta[^>]+property="article:modified_time"[^>]+content="([^"]+)"/)?.[1]
      || html.match(/<meta[^>]+content="([^"]+)"[^>]+property="article:modified_time"/)?.[1]
    if (articleModifiedTime)
      payload.lastmod = articleModifiedTime
  }

  if (enabled.alternatives) {
    // do a loose regex match, get all alternative link lines
    // this is not tested
    const alternatives = (html.match(/<link[^>]+rel="alternate"[^>]+>/g) || [])
      .map((a) => {
        // extract the href and hreflang from the link
        const href = a.match(/href="([^"]+)"/)?.[1]
        const hreflang = a.match(/hreflang="([^"]+)"/)?.[1]
        return { hreflang, href: parseURL(href).pathname }
      })
      // links missing either attribute (e.g. feed links without hreflang) are dropped
      .filter(a => a.hreflang && a.href) as ResolvedSitemapUrl['alternatives']
    // a single x-default entry on its own carries no alternative-language information
    if (alternatives?.length && (alternatives.length > 1 || alternatives?.[0].hreflang !== 'x-default'))
      payload.alternatives = alternatives
  }
  return payload
}
31 changes: 31 additions & 0 deletions test/unit/extractSitemapMetaFromHtml.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import { describe, expect, it } from 'vitest'
import { extractSitemapMetaFromHtml } from '../../src/util/extractSitemapMetaFromHtml'

// Unit tests for extractSitemapMetaFromHtml. The function is called without an options
// argument here, so its built-in default options apply.
describe('extractSitemapMetaFromHtml', () => {
it('lastmod', async () => {
// case 1: `property` attribute before `content`, non-self-closing tags.
// Only article:modified_time should be picked up, article:published_time must be ignored.
const output = extractSitemapMetaFromHtml(`
<head>
<meta property="article:published_time" content="2021-04-01T00:00:00Z">
<meta property="article:modified_time" content="2021-04-02T00:00:00Z">
</head>
`)
expect(output).toMatchInlineSnapshot(`
{
"lastmod": "2021-04-02T00:00:00Z",
}
`)
// case 2: same meta tags with the attribute order reversed (`content` before `property`)
// and self-closing syntax; the result is expected to be identical.
const output2 = extractSitemapMetaFromHtml(`
<head>
<meta content="2021-04-01T00:00:00Z" property="article:published_time"/>
<meta content="2021-04-02T00:00:00Z" property="article:modified_time"/>
</head>
`)
expect(output2).toMatchInlineSnapshot(`
{
"lastmod": "2021-04-02T00:00:00Z",
}
`)
})
})

0 comments on commit 3ec3667

Please sign in to comment.