Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions lib/hexdocs/file_rewriter.ex
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,25 @@ defmodule Hexdocs.FileRewriter do
|> add_elixir_org_link(path)
|> add_analytics(path)
|> remove_noindex(path)
|> rewrite_canonical_links(path)
|> add_nofollow(path)
end

# Matches a whole `<link ...>` tag carrying `rel="canonical"` (single or double
# quotes, case-insensitive), regardless of where `rel` sits among the attributes.
@canonical_tag_re ~r{<link[^>]*\brel=["']canonical["'][^>]*>}i

# Matches `http(s)://hexdocs.pm/<name>` and captures `<name>`. The negative
# lookahead rejects a candidate that is directly followed by a letter, digit,
# `_`, `.` or `-`, so a first path segment such as `foo.html` never matches.
@hexdocs_link_re ~r{https?://hexdocs\.pm/([a-z][a-z0-9_]*)(?![a-zA-Z0-9_.-])}

# Rewrites apex `hexdocs.pm/<name>` URLs that appear inside canonical `<link>`
# tags to `https://<subdomain>.hexdocs.pm`, where `<subdomain>` is whatever
# `Hexdocs.Utils.package_to_subdomain/1` returns for the captured name.
#
# Only files whose path ends in ".html" are processed; any other content is
# returned unchanged. Text outside canonical `<link>` tags is never touched.
defp rewrite_canonical_links(content, path) do
  case String.ends_with?(path, ".html") do
    true ->
      # Receives the full URL match plus the captured name; only the scheme,
      # host and name are replaced, so the rest of the URL is kept as-is.
      to_subdomain_url = fn _whole_match, package ->
        "https://#{Hexdocs.Utils.package_to_subdomain(package)}.hexdocs.pm"
      end

      # The outer pass isolates each canonical tag; the inner pass rewrites
      # the hexdocs.pm URLs found within that tag only.
      Regex.replace(
        @canonical_tag_re,
        content,
        &Regex.replace(@hexdocs_link_re, &1, to_subdomain_url)
      )

    false ->
      content
  end
end

defp add_elixir_org_link(content, path) do
if String.ends_with?(path, ".html") and not String.contains?(content, @link_addition) do
String.replace(content, @link_hooks, &(&1 <> " for the " <> @link_addition))
Expand Down
104 changes: 103 additions & 1 deletion test/hexdocs/file_rewriter_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,108 @@ defmodule Hexdocs.FileRewriterTest do
assert FileRewriter.run("index.html", ~s|<meta name="robots" content="noindex">|) == ""
end

describe "rewrite hexdocs.pm canonical links to subdomains" do
  # --- URLs that are expected to be rewritten ---

  test "rewrites a canonical link" do
    source = ~s|<link rel="canonical" href="https://hexdocs.pm/jason/Jason.html"/>|
    expected = ~s|<link rel="canonical" href="https://jason.hexdocs.pm/Jason.html"/>|

    assert FileRewriter.run("index.html", source) == expected
  end

  test "preserves version, query and fragment in the tail" do
    source =
      ~s|<link rel="canonical" href="https://hexdocs.pm/jason/1.4.0/Jason.html?foo=bar#decode/2"/>|

    expected =
      ~s|<link rel="canonical" href="https://jason.hexdocs.pm/1.4.0/Jason.html?foo=bar#decode/2"/>|

    assert FileRewriter.run("index.html", source) == expected
  end

  test "maps underscores in the package name to hyphens" do
    source = ~s|<link rel="canonical" href="https://hexdocs.pm/phoenix_html/Phoenix.HTML.html"/>|
    expected = ~s|<link rel="canonical" href="https://phoenix-html.hexdocs.pm/Phoenix.HTML.html"/>|

    assert FileRewriter.run("index.html", source) == expected
  end

  test "rewrites the hex package" do
    source = ~s|<link rel="canonical" href="https://hexdocs.pm/hex/usage.html"/>|
    expected = ~s|<link rel="canonical" href="https://hex.hexdocs.pm/usage.html"/>|

    assert FileRewriter.run("index.html", source) == expected
  end

  test "rewrites http links to https subdomains" do
    source = ~s|<link rel="canonical" href="http://hexdocs.pm/jason/Jason.html"/>|
    expected = ~s|<link rel="canonical" href="https://jason.hexdocs.pm/Jason.html"/>|

    assert FileRewriter.run("index.html", source) == expected
  end

  test "handles href before rel in the canonical tag" do
    source = ~s|<link href="https://hexdocs.pm/jason/Jason.html" rel="canonical">|
    expected = ~s|<link href="https://jason.hexdocs.pm/Jason.html" rel="canonical">|

    assert FileRewriter.run("index.html", source) == expected
  end

  # --- Inputs that must come back byte-for-byte unchanged ---

  test "does not rewrite body links or text" do
    untouched = [
      ~s|<a href="https://hexdocs.pm/jason/Jason.html">Jason</a>|,
      ~s|<pre><code>visit https://hexdocs.pm/jason/readme.html</code></pre>|
    ]

    Enum.each(untouched, fn html ->
      assert FileRewriter.run("index.html", html) == html
    end)
  end

  test "does not rewrite other link tags" do
    stylesheet = ~s|<link rel="stylesheet" href="https://hexdocs.pm/jason/app.css">|

    assert FileRewriter.run("index.html", stylesheet) == stylesheet
  end

  test "leaves the bare apex untouched" do
    untouched = [
      ~s|<link rel="canonical" href="https://hexdocs.pm"/>|,
      ~s|<link rel="canonical" href="https://hexdocs.pm/"/>|
    ]

    Enum.each(untouched, fn html ->
      assert FileRewriter.run("index.html", html) == html
    end)
  end

  test "leaves apex files untouched" do
    # The first path segment contains a dot, so it is not treated as a package name.
    untouched = [
      ~s|<link rel="canonical" href="https://hexdocs.pm/sitemap.xml"/>|,
      ~s|<link rel="canonical" href="https://hexdocs.pm/foo.html"/>|
    ]

    Enum.each(untouched, fn html ->
      assert FileRewriter.run("index.html", html) == html
    end)
  end

  test "does not touch canonical links that already use a subdomain" do
    untouched = [
      ~s|<link rel="canonical" href="https://jason.hexdocs.pm/Jason.html"/>|,
      ~s|<link rel="canonical" href="https://preview.hexdocs.pm/foo/Foo.html"/>|
    ]

    Enum.each(untouched, fn html ->
      assert FileRewriter.run("index.html", html) == html
    end)
  end

  # --- General properties ---

  test "is idempotent" do
    source = ~s|<link rel="canonical" href="https://hexdocs.pm/jason/Jason.html"/>|
    first_pass = FileRewriter.run("index.html", source)
    second_pass = FileRewriter.run("index.html", first_pass)

    assert second_pass == first_pass
  end

  test "does not modify non-html files" do
    source = ~s|<link rel="canonical" href="https://hexdocs.pm/jason/Jason.html"/>|

    assert FileRewriter.run("index.js", source) == source
  end
end

describe "add_nofollow" do
test "adds rel=nofollow to external links" do
assert FileRewriter.run("index.html", ~s|<a href="https://example.com">example</a>|) ==
Expand All @@ -42,7 +144,7 @@ defmodule Hexdocs.FileRewriterTest do
test "does not add nofollow to official ecosystem links" do
for url <- [
"https://hex.pm/packages/foo",
"https://hexdocs.pm/foo",
"https://hexdocs.pm",
"https://elixir-lang.org",
"https://www.erlang.org",
"https://preview.hexdocs.pm/foo"
Expand Down