diff --git a/lib/hexdocs/file_rewriter.ex b/lib/hexdocs/file_rewriter.ex index 5dd73e2..3e65054 100644 --- a/lib/hexdocs/file_rewriter.ex +++ b/lib/hexdocs/file_rewriter.ex @@ -16,9 +16,25 @@ defmodule Hexdocs.FileRewriter do |> add_elixir_org_link(path) |> add_analytics(path) |> remove_noindex(path) + |> rewrite_canonical_links(path) |> add_nofollow(path) end + @canonical_tag_re ~r{]*\brel=["']canonical["'][^>]*>}i + @hexdocs_link_re ~r{https?://hexdocs\.pm/([a-z][a-z0-9_]*)(?![a-zA-Z0-9_.-])} + + defp rewrite_canonical_links(content, path) do + if String.ends_with?(path, ".html") do + Regex.replace(@canonical_tag_re, content, fn tag -> + Regex.replace(@hexdocs_link_re, tag, fn _match, package -> + "https://#{Hexdocs.Utils.package_to_subdomain(package)}.hexdocs.pm" + end) + end) + else + content + end + end + defp add_elixir_org_link(content, path) do if String.ends_with?(path, ".html") and not String.contains?(content, @link_addition) do String.replace(content, @link_hooks, &(&1 <> " for the " <> @link_addition)) diff --git a/test/hexdocs/file_rewriter_test.exs b/test/hexdocs/file_rewriter_test.exs index 174a050..f31ae25 100644 --- a/test/hexdocs/file_rewriter_test.exs +++ b/test/hexdocs/file_rewriter_test.exs @@ -17,6 +17,108 @@ defmodule Hexdocs.FileRewriterTest do assert FileRewriter.run("index.html", ~s||) == "" end + describe "rewrite hexdocs.pm canonical links to subdomains" do + test "rewrites a canonical link" do + assert FileRewriter.run( + "index.html", + ~s|| + ) == + ~s|| + end + + test "preserves version, query and fragment in the tail" do + assert FileRewriter.run( + "index.html", + ~s|| + ) == + ~s|| + end + + test "maps underscores in the package name to hyphens" do + assert FileRewriter.run( + "index.html", + ~s|| + ) == + ~s|| + end + + test "rewrites the hex package" do + assert FileRewriter.run( + "index.html", + ~s|| + ) == + ~s|| + end + + test "rewrites http links to https subdomains" do + assert FileRewriter.run( + "index.html", + ~s|| + ) == + ~s|| + end + + test "handles href before rel in the canonical tag" do + assert FileRewriter.run( + "index.html", + ~s|| + ) == + ~s|| + end + + test "does not rewrite body links or text" do + for input <- [ + ~s|Jason|, + ~s|
visit https://hexdocs.pm/jason/readme.html
| + ] do + assert FileRewriter.run("index.html", input) == input + end + end + + test "does not rewrite other link tags" do + input = ~s|| + assert FileRewriter.run("index.html", input) == input + end + + test "leaves the bare apex untouched" do + for input <- [ + ~s||, + ~s|| + ] do + assert FileRewriter.run("index.html", input) == input + end + end + + test "leaves apex files untouched" do + for input <- [ + ~s||, + ~s|| + ] do + assert FileRewriter.run("index.html", input) == input + end + end + + test "does not touch canonical links that already use a subdomain" do + for input <- [ + ~s||, + ~s|| + ] do + assert FileRewriter.run("index.html", input) == input + end + end + + test "is idempotent" do + input = ~s|| + once = FileRewriter.run("index.html", input) + assert FileRewriter.run("index.html", once) == once + end + + test "does not modify non-html files" do + input = ~s|| + assert FileRewriter.run("index.js", input) == input + end + end + describe "add_nofollow" do test "adds rel=nofollow to external links" do assert FileRewriter.run("index.html", ~s|example|) == @@ -42,7 +144,7 @@ defmodule Hexdocs.FileRewriterTest do test "does not add nofollow to official ecosystem links" do for url <- [ "https://hex.pm/packages/foo", - "https://hexdocs.pm/foo", + "https://hexdocs.pm", "https://elixir-lang.org", "https://www.erlang.org", "https://preview.hexdocs.pm/foo"