From d8209ac91d477764a5eb4d294705ba2ea299d6e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eric=20Meadows-J=C3=B6nsson?= Date: Sun, 31 May 2026 19:09:04 +0200 Subject: [PATCH] Rewrite hexdocs.pm canonical links to per-package subdomains ExDoc emits a <link rel="canonical"> tag (when the package sets the :canonical option) pointing at the old path-based URL, https://hexdocs.pm/<package>/... . Now that docs are served from per-package subdomains, that canonical points away from where the page actually lives, splitting SEO signal. Rewrite the canonical tag at ingestion time in the file rewriter so it points at https://<package>.hexdocs.pm/... , reusing package_to_subdomain for the underscore-to-hyphen mapping and upgrading http to https. The bare apex, apex files such as sitemap.xml, and canonical links that already use a subdomain are left untouched. Body links and other tags are intentionally not rewritten: a permanent redirect from the old URLs preserves link equity via 301, so canonical is the only tag where the rewrite changes SEO behavior. 
--- lib/hexdocs/file_rewriter.ex | 16 +++++ test/hexdocs/file_rewriter_test.exs | 104 +++++++++++++++++++++++++++- 2 files changed, 119 insertions(+), 1 deletion(-) diff --git a/lib/hexdocs/file_rewriter.ex b/lib/hexdocs/file_rewriter.ex index 5dd73e2..3e65054 100644 --- a/lib/hexdocs/file_rewriter.ex +++ b/lib/hexdocs/file_rewriter.ex @@ -16,9 +16,25 @@ defmodule Hexdocs.FileRewriter do |> add_elixir_org_link(path) |> add_analytics(path) |> remove_noindex(path) + |> rewrite_canonical_links(path) |> add_nofollow(path) end + @canonical_tag_re ~r{]*\brel=["']canonical["'][^>]*>}i + @hexdocs_link_re ~r{https?://hexdocs\.pm/([a-z][a-z0-9_]*)(?![a-zA-Z0-9_.-])} + + defp rewrite_canonical_links(content, path) do + if String.ends_with?(path, ".html") do + Regex.replace(@canonical_tag_re, content, fn tag -> + Regex.replace(@hexdocs_link_re, tag, fn _match, package -> + "https://#{Hexdocs.Utils.package_to_subdomain(package)}.hexdocs.pm" + end) + end) + else + content + end + end + defp add_elixir_org_link(content, path) do if String.ends_with?(path, ".html") and not String.contains?(content, @link_addition) do String.replace(content, @link_hooks, &(&1 <> " for the " <> @link_addition)) diff --git a/test/hexdocs/file_rewriter_test.exs b/test/hexdocs/file_rewriter_test.exs index 174a050..f31ae25 100644 --- a/test/hexdocs/file_rewriter_test.exs +++ b/test/hexdocs/file_rewriter_test.exs @@ -17,6 +17,108 @@ defmodule Hexdocs.FileRewriterTest do assert FileRewriter.run("index.html", ~s||) == "" end + describe "rewrite hexdocs.pm canonical links to subdomains" do + test "rewrites a canonical link" do + assert FileRewriter.run( + "index.html", + ~s|| + ) == + ~s|| + end + + test "preserves version, query and fragment in the tail" do + assert FileRewriter.run( + "index.html", + ~s|| + ) == + ~s|| + end + + test "maps underscores in the package name to hyphens" do + assert FileRewriter.run( + "index.html", + ~s|| + ) == + ~s|| + end + + test "rewrites the hex package" do + assert 
FileRewriter.run( + "index.html", + ~s|| + ) == + ~s|| + end + + test "rewrites http links to https subdomains" do + assert FileRewriter.run( + "index.html", + ~s|| + ) == + ~s|| + end + + test "handles href before rel in the canonical tag" do + assert FileRewriter.run( + "index.html", + ~s|| + ) == + ~s|| + end + + test "does not rewrite body links or text" do + for input <- [ + ~s|Jason|, + ~s|
visit https://hexdocs.pm/jason/readme.html
| + ] do + assert FileRewriter.run("index.html", input) == input + end + end + + test "does not rewrite other link tags" do + input = ~s|| + assert FileRewriter.run("index.html", input) == input + end + + test "leaves the bare apex untouched" do + for input <- [ + ~s||, + ~s|| + ] do + assert FileRewriter.run("index.html", input) == input + end + end + + test "leaves apex files untouched" do + for input <- [ + ~s||, + ~s|| + ] do + assert FileRewriter.run("index.html", input) == input + end + end + + test "does not touch canonical links that already use a subdomain" do + for input <- [ + ~s||, + ~s|| + ] do + assert FileRewriter.run("index.html", input) == input + end + end + + test "is idempotent" do + input = ~s|| + once = FileRewriter.run("index.html", input) + assert FileRewriter.run("index.html", once) == once + end + + test "does not modify non-html files" do + input = ~s|| + assert FileRewriter.run("index.js", input) == input + end + end + describe "add_nofollow" do test "adds rel=nofollow to external links" do assert FileRewriter.run("index.html", ~s|example|) == @@ -42,7 +144,7 @@ defmodule Hexdocs.FileRewriterTest do test "does not add nofollow to official ecosystem links" do for url <- [ "https://hex.pm/packages/foo", - "https://hexdocs.pm/foo", + "https://hexdocs.pm", "https://elixir-lang.org", "https://www.erlang.org", "https://preview.hexdocs.pm/foo"