From d8209ac91d477764a5eb4d294705ba2ea299d6e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eric=20Meadows-J=C3=B6nsson?=
 <eric.meadows.jonsson@gmail.com>
Date: Sun, 31 May 2026 19:09:04 +0200
Subject: [PATCH] Rewrite hexdocs.pm canonical links to per-package subdomains

ExDoc emits a <link rel="canonical"> tag (when the package sets the
:canonical option) pointing at the old path-based URL,
https://hexdocs.pm/<package>/... . Now that docs are served from
per-package subdomains, that canonical points away from where the page
actually lives, splitting SEO signal.

Rewrite the canonical tag at ingestion time in the file rewriter so it
points at https://<package>.hexdocs.pm/... , reusing package_to_subdomain
for the underscore-to-hyphen mapping and upgrading http to https. The
bare apex, apex files such as sitemap.xml, and canonical links that
already use a subdomain are left untouched.

Body links and other tags are intentionally not rewritten: a permanent
redirect from the old URLs preserves link equity via 301, so canonical
is the only tag where the rewrite changes SEO behavior.
---
 lib/hexdocs/file_rewriter.ex        |  16 +++++
 test/hexdocs/file_rewriter_test.exs | 104 +++++++++++++++++++++++++++-
 2 files changed, 119 insertions(+), 1 deletion(-)
diff --git a/lib/hexdocs/file_rewriter.ex b/lib/hexdocs/file_rewriter.ex
index 5dd73e2..3e65054 100644
--- a/lib/hexdocs/file_rewriter.ex
+++ b/lib/hexdocs/file_rewriter.ex
@@ -16,9 +16,37 @@ defmodule Hexdocs.FileRewriter do
     |> add_elixir_org_link(path)
     |> add_analytics(path)
     |> remove_noindex(path)
+    |> rewrite_canonical_links(path)
     |> add_nofollow(path)
   end
 
+  # Matches one complete `<link ...>` tag carrying `rel="canonical"`, with the
+  # attributes in any order, either quote style around the value, and any case.
+  @canonical_tag_re ~r{<link[^>]*\brel=["']canonical["'][^>]*>}i
+
+  # Matches an apex `http(s)://hexdocs.pm/<package>` prefix and captures the
+  # package name. The negative lookahead refuses a first path segment that goes
+  # on with `.` or `-`, so apex files such as `sitemap.xml` never match.
+  @hexdocs_link_re ~r{https?://hexdocs\.pm/([a-z][a-z0-9_]*)(?![a-zA-Z0-9_.-])}
+
+  # Rewrites apex hexdocs.pm package URLs inside canonical link tags of `.html`
+  # files to the package's subdomain; content of any other file is returned as is.
+  defp rewrite_canonical_links(content, path) do
+    if String.ends_with?(path, ".html") do
+      Regex.replace(@canonical_tag_re, content, &point_at_subdomain/1)
+    else
+      content
+    end
+  end
+
+  # Replaces every apex package prefix in `tag` with `https://<subdomain>.hexdocs.pm`;
+  # whatever follows the package segment in the URL is left in place.
+  defp point_at_subdomain(tag) do
+    Regex.replace(@hexdocs_link_re, tag, fn _apex_prefix, package ->
+      "https://#{Hexdocs.Utils.package_to_subdomain(package)}.hexdocs.pm"
+    end)
+  end
+
   defp add_elixir_org_link(content, path) do
     if String.ends_with?(path, ".html") and not String.contains?(content, @link_addition) do
       String.replace(content, @link_hooks, &(&1 <> " for the " <> @link_addition))
diff --git a/test/hexdocs/file_rewriter_test.exs b/test/hexdocs/file_rewriter_test.exs
index 174a050..f31ae25 100644
--- a/test/hexdocs/file_rewriter_test.exs
+++ b/test/hexdocs/file_rewriter_test.exs
@@ -17,6 +17,126 @@ defmodule Hexdocs.FileRewriterTest do
     assert FileRewriter.run("index.html", ~s|<meta name="robots" content="noindex">|) == ""
   end
 
+  # Only canonical <link> tags that point at an apex package URL are rewritten;
+  # every other input below is expected to come back unchanged.
+  describe "rewrite hexdocs.pm canonical links to subdomains" do
+    test "rewrites a canonical link" do
+      assert FileRewriter.run(
+               "index.html",
+               ~s|<link rel="canonical" href="https://hexdocs.pm/jason/Jason.html"/>|
+             ) ==
+               ~s|<link rel="canonical" href="https://jason.hexdocs.pm/Jason.html"/>|
+    end
+
+    test "preserves version, query and fragment in the tail" do
+      assert FileRewriter.run(
+               "index.html",
+               ~s|<link rel="canonical" href="https://hexdocs.pm/jason/1.4.0/Jason.html?foo=bar#decode/2"/>|
+             ) ==
+               ~s|<link rel="canonical" href="https://jason.hexdocs.pm/1.4.0/Jason.html?foo=bar#decode/2"/>|
+    end
+
+    test "maps underscores in the package name to hyphens" do
+      assert FileRewriter.run(
+               "index.html",
+               ~s|<link rel="canonical" href="https://hexdocs.pm/phoenix_html/Phoenix.HTML.html"/>|
+             ) ==
+               ~s|<link rel="canonical" href="https://phoenix-html.hexdocs.pm/Phoenix.HTML.html"/>|
+    end
+
+    test "rewrites the hex package" do
+      assert FileRewriter.run(
+               "index.html",
+               ~s|<link rel="canonical" href="https://hexdocs.pm/hex/usage.html"/>|
+             ) ==
+               ~s|<link rel="canonical" href="https://hex.hexdocs.pm/usage.html"/>|
+    end
+
+    test "rewrites http links to https subdomains" do
+      assert FileRewriter.run(
+               "index.html",
+               ~s|<link rel="canonical" href="http://hexdocs.pm/jason/Jason.html"/>|
+             ) ==
+               ~s|<link rel="canonical" href="https://jason.hexdocs.pm/Jason.html"/>|
+    end
+
+    test "handles href before rel in the canonical tag" do
+      assert FileRewriter.run(
+               "index.html",
+               ~s|<link href="https://hexdocs.pm/jason/Jason.html" rel="canonical">|
+             ) ==
+               ~s|<link href="https://jason.hexdocs.pm/Jason.html" rel="canonical">|
+    end
+
+    test "matches the canonical tag case-insensitively" do
+      assert FileRewriter.run(
+               "index.html",
+               ~s|<LINK REL="canonical" HREF="https://hexdocs.pm/jason/Jason.html">|
+             ) ==
+               ~s|<LINK REL="canonical" HREF="https://jason.hexdocs.pm/Jason.html">|
+    end
+
+    test "handles single-quoted attributes" do
+      assert FileRewriter.run(
+               "index.html",
+               ~s|<link rel='canonical' href='https://hexdocs.pm/jason/Jason.html'>|
+             ) ==
+               ~s|<link rel='canonical' href='https://jason.hexdocs.pm/Jason.html'>|
+    end
+
+    test "does not rewrite body links or text" do
+      for input <- [
+            ~s|<a href="https://hexdocs.pm/jason/Jason.html">Jason</a>|,
+            ~s|<pre><code>visit https://hexdocs.pm/jason/readme.html</code></pre>|
+          ] do
+        assert FileRewriter.run("index.html", input) == input
+      end
+    end
+
+    test "does not rewrite other link tags" do
+      input = ~s|<link rel="stylesheet" href="https://hexdocs.pm/jason/app.css">|
+      assert FileRewriter.run("index.html", input) == input
+    end
+
+    test "leaves the bare apex untouched" do
+      for input <- [
+            ~s|<link rel="canonical" href="https://hexdocs.pm"/>|,
+            ~s|<link rel="canonical" href="https://hexdocs.pm/"/>|
+          ] do
+        assert FileRewriter.run("index.html", input) == input
+      end
+    end
+
+    test "leaves apex files untouched" do
+      for input <- [
+            ~s|<link rel="canonical" href="https://hexdocs.pm/sitemap.xml"/>|,
+            ~s|<link rel="canonical" href="https://hexdocs.pm/foo.html"/>|
+          ] do
+        assert FileRewriter.run("index.html", input) == input
+      end
+    end
+
+    test "does not touch canonical links that already use a subdomain" do
+      for input <- [
+            ~s|<link rel="canonical" href="https://jason.hexdocs.pm/Jason.html"/>|,
+            ~s|<link rel="canonical" href="https://preview.hexdocs.pm/foo/Foo.html"/>|
+          ] do
+        assert FileRewriter.run("index.html", input) == input
+      end
+    end
+
+    test "is idempotent" do
+      input = ~s|<link rel="canonical" href="https://hexdocs.pm/jason/Jason.html"/>|
+      once = FileRewriter.run("index.html", input)
+      assert FileRewriter.run("index.html", once) == once
+    end
+
+    test "does not modify non-html files" do
+      input = ~s|<link rel="canonical" href="https://hexdocs.pm/jason/Jason.html"/>|
+      assert FileRewriter.run("index.js", input) == input
+    end
+  end
+
   describe "add_nofollow" do
     test "adds rel=nofollow to external links" do
       assert FileRewriter.run("index.html", ~s|<a href="https://example.com">example</a>|) ==
@@ -42,7 +162,7 @@ defmodule Hexdocs.FileRewriterTest do
     test "does not add nofollow to official ecosystem links" do
       for url <- [
             "https://hex.pm/packages/foo",
-            "https://hexdocs.pm/foo",
+            "https://hexdocs.pm",
             "https://elixir-lang.org",
             "https://www.erlang.org",
             "https://preview.hexdocs.pm/foo"