Skip to content

Commit

Permalink
Ensure URL fragments to named anchors also validate (#363)
Browse files Browse the repository at this point in the history
Ensure URL fragments to named anchors also validate
  • Loading branch information
pvlugter committed Sep 15, 2019
2 parents bf1b17d + 28e907a commit dc86f75
Show file tree
Hide file tree
Showing 9 changed files with 145 additions and 88 deletions.
3 changes: 2 additions & 1 deletion build.sbt
Expand Up @@ -46,7 +46,8 @@ lazy val core = project
name := "paradox",
libraryDependencies ++= Library.pegdown,
libraryDependencies ++= Seq(
Library.st4
Library.st4,
Library.jsoup
),
parallelExecution in Test := false
)
Expand Down
90 changes: 55 additions & 35 deletions core/src/main/scala/com/lightbend/paradox/ParadoxProcessor.scala
Expand Up @@ -20,14 +20,14 @@ import com.lightbend.paradox.template.PageTemplate
import com.lightbend.paradox.markdown._
import com.lightbend.paradox.tree.Tree.{ Forest, Location }
import java.io.{ File, FileOutputStream, OutputStreamWriter }
import java.net.{ HttpURLConnection, URI }
import java.nio.charset.StandardCharsets

import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.pegdown.ast._

import scala.annotation.tailrec
import scala.collection.JavaConverters._
import scala.io.Source
import scala.util.control.NonFatal
import scala.util.matching.Regex

Expand Down Expand Up @@ -137,49 +137,65 @@ class ParadoxProcessor(reader: Reader = new Reader, writer: Writer = new Writer)
validate(Some(root.location))
}

def ignore(path: String) = ignorePaths.exists(_.pattern.matcher(path).matches())

linkCapturer.allLinks.foreach {
case CapturedLinkWithSources(CapturedAbsoluteLink(uri), sources) if validateAbsolute && !ignore(uri.toString) =>
validateExternalLink(uri, reportErrorOnSources(errorCollector, sources), logger)
case CapturedLinkWithSources(CapturedRelativeLink(path, fragment), sources) if !ignore(path) =>
fullSite.get(path) match {
case Some(file) =>
fragment.foreach { f =>
validateFragment(path, Source.fromFile(file, "UTF-8").mkString, f, reportErrorOnSources(errorCollector, sources))
}
case None =>
reportErrorOnSources(errorCollector, sources)(s"Could not find path [$path] in site")
}
case _ =>
// Ignore
}
linkCapturer.allLinks
.filterNot(l => ignorePaths.exists(_.pattern.matcher(l.link.toString).matches()))
.foreach {
case c @ CapturedLink(uri, fragments) if c.isInternal =>
fullSite.get(uri.getPath) match {
case Some(file) =>
if (c.hasFragments) {
validateFragments(uri.getPath, Jsoup.parse(file, "UTF-8"), fragments, errorCollector)
}
case None =>
reportErrorOnSources(errorCollector, c.allSources)(s"Could not find path [${uri.getPath}] in site")
}
case absolute if validateAbsolute =>
validateExternalLink(absolute, errorCollector, logger)
case _ =>
// Ignore
}

errorCollector.logErrors(logger)
errorCollector.errorCount
}

private def validateExternalLink(uri: URI, reportError: String => Unit, logger: ParadoxLogger) = {
logger.info(s"Validating external link: $uri")
val conn = uri.toURL.openConnection().asInstanceOf[HttpURLConnection]
conn.addRequestProperty("User-Agent", "Paradox Link Validator <https://github.com/lightbend/paradox>")
private def validateExternalLink(capturedLink: CapturedLink, errorContext: ErrorContext, logger: ParadoxLogger) = {
logger.info(s"Validating external link: ${capturedLink.link}")

def reportError = reportErrorOnSources(errorContext, capturedLink.allSources)(_)
val url = capturedLink.link.toString

try {
if (conn.getResponseCode / 100 == 3) {
reportError(s"Received a ${conn.getResponseCode} redirect on external link, location redirected to is [${conn.getHeaderField("Location")}]")
} else if (conn.getResponseCode != 200) {
reportError(s"Error validating external link, status code was ${conn.getResponseCode}")
val response = Jsoup.connect(url)
.userAgent("Paradox Link Validator <https://github.com/lightbend/paradox>")
.followRedirects(false)
.ignoreHttpErrors(true)
.ignoreContentType(true)
.execute()

// jsoup doesn't offer any simple way to clean up, the only way to close is to get the body stream and close it,
// but if you've already read the response body, that will throw an exception, and there's no way to check if
// you've already tried to read the response body, so we can't do that in a finally block, we have to do it
// explicitly every time we don't want to consume the stream.
def close() = response.bodyStream().close()

if (response.statusCode() / 100 == 3) {
close()
reportError(s"Received a ${response.statusCode()} ${response.statusMessage()} on external link, location redirected to is [${response.header("Location")}]")
} else if (response.statusCode() != 200) {
close()
reportError(s"Error validating external link, status was ${response.statusCode()} ${response.statusMessage()}")
} else {
if (uri.getFragment != null) {
val content = Source.fromInputStream(conn.getInputStream).mkString
validateFragment(uri.toString, content, uri.getFragment, reportError)
if (capturedLink.hasFragments) {
validateFragments(url, response.parse(), capturedLink.fragments, errorContext)
} else {
close()
}
}
} catch {
case NonFatal(e) =>
reportError(s"Exception occurred when validating external link: $e")
logger.debug(e)
} finally {
conn.disconnect()
}
}

Expand All @@ -189,9 +205,13 @@ class ParadoxProcessor(reader: Reader = new Reader, writer: Writer = new Writer)
}
}

private def validateFragment(path: String, content: String, fragment: String, reportError: String => Unit) = {
if (!content.contains("id=\"" + fragment + "\"")) {
reportError(s"Could not find anchor id [$fragment] in page [$path]")
private def validateFragments(path: String, content: Document, fragments: List[CapturedLinkFragment], errorContext: ErrorContext): Unit = {
fragments.foreach {
case CapturedLinkFragment(Some(fragment), sources) =>
if (content.getElementById(fragment) == null && content.select(s"a[name=$fragment]").isEmpty) {
reportErrorOnSources(errorContext, sources)(s"Could not find anchor [$fragment] in page [$path]")
}
case _ =>
}
}

Expand Down
Expand Up @@ -89,10 +89,10 @@ class LinkCapturer {
}
}

def allLinks: List[CapturedLinkWithSources] = {
def allLinks: List[CapturedLink] = {
// First, resolve the links, discarding links that we can't resolve.
links.collect {

case (page, node, uri) if isPageInSite(page, uri) =>
case Link(page, node, uri, fragment) if isPageInSite(page, uri) =>
val path = node match {
// Javadoc links may use the frames style, and may reference index.html, if so, need to drop it.
case d: DirectiveNode if d.name == "javadoc" && uri.getQuery != null =>
Expand All @@ -107,19 +107,22 @@ class LinkCapturer {
val pathWithIndex = if (path.endsWith("/")) uri.getPath + "index.html"
else path

val relativePath = URI.create(page.path).resolve(pathWithIndex).getPath

(CapturedRelativeLink(relativePath, Option(uri.getFragment)), page, node)

case (page, node, uri) if uri.getAuthority != null =>
(CapturedAbsoluteLink(uri), page, node)

}.groupBy(_._1).map {
case (link, links) =>
CapturedLinkWithSources(link, links.map {
case (_, page, node) => (page.file, node)
})
}.toList
val resolvedUri = URI.create(page.path).resolve(pathWithIndex)
Link(page, node, resolvedUri, fragment)

case link @ Link(_, _, uri, _) if uri.getAuthority != null => link
}.groupBy(_.uri)
.toList
.map {
case (uri, links) =>
val fragments = links.groupBy(_.fragment)
.toList
.map {
case (fragment, links) =>
CapturedLinkFragment(fragment, links.map(l => (l.page.file, l.node)))
}
CapturedLink(uri, fragments)
}
}

private def isPageInSite(page: Page, uri: URI): Boolean = {
Expand All @@ -136,18 +139,29 @@ class LinkCapturer {
}

private var nodeOverride: Option[Node] = None
private var links: List[(Page, Node, URI)] = Nil

private case class Link(page: Page, node: Node, uri: URI, fragment: Option[String])

private var links: List[Link] = Nil

def capture(page: Page, node: Node, rendering: LinkRenderer.Rendering): LinkRenderer.Rendering = {
links = (page, nodeOverride.getOrElse(node), URI.create(rendering.href)) :: links
val fullUri = URI.create(rendering.href)
val (uri, fragment) = if (fullUri.getFragment == null) (fullUri, None)
else (new URI(fullUri.getScheme, fullUri.getAuthority, fullUri.getPath, fullUri.getQuery, null), Some(fullUri.getFragment))
links = Link(page, nodeOverride.getOrElse(node), uri, fragment) :: links
rendering
}
}

case class CapturedLinkWithSources(link: CapturedLink, sources: List[(File, Node)])
sealed trait CapturedLink
case class CapturedAbsoluteLink(link: URI) extends CapturedLink
case class CapturedRelativeLink(path: String, fragment: Option[String]) extends CapturedLink
case class CapturedLink(link: URI, fragments: List[CapturedLinkFragment]) {
def allSources = fragments.flatMap(_.sources)

def isInternal = link.getAuthority == null && link.getPath != null

def hasFragments = fragments.size > 1 || fragments.headOption.flatMap(_.fragment).nonEmpty
}

case class CapturedLinkFragment(fragment: Option[String], sources: List[(File, Node)])

private class LinkCapturerRenderer(capturer: LinkCapturer, renderer: LinkRenderer, page: Page) extends LinkRenderer {
private def capture(node: Node, rendering: LinkRenderer.Rendering) = capturer.capture(page, node, rendering)
Expand Down
8 changes: 8 additions & 0 deletions plugin/src/sbt-test/paradox/validation/build.sbt
@@ -1,2 +1,10 @@
lazy val root = (project in file("."))
.enablePlugins(ParadoxPlugin)

lazy val additionalInternalMappings = settingKey[List[(File, String)]]("Additional mappings to add to paradox mappings")
lazy val additionalMappings = settingKey[List[(File, String)]]("Additional mappings to add to paradox mappings")

additionalInternalMappings := Nil
additionalMappings := Nil
(Compile / paradox / mappings) ++= additionalInternalMappings.value
(Compile / paradoxValidateInternalLinks / mappings) ++= additionalMappings.value
@@ -1,8 +1,12 @@
[docs site path link](test.html#frag)
[docs site path link name frag](test.html#namefrag)

[outside docs path link](../outside.html#frag)

https://www.lightbend.com/thispagedoesnotexist
This URL should 404 (or otherwise fail)
https://github.com/lightbend/thisrepodoesnotexist
This URL should 301 (or otherwise fail)
https://lightbend.com

Including this in build means we depend on github.com being up. Probably ok.
https://github.com/
36 changes: 20 additions & 16 deletions plugin/src/sbt-test/paradox/validation/test
Expand Up @@ -5,12 +5,11 @@
-> paradoxValidateInternalLinks

# Add the path, but without the fragment, so validation still fails
> set (Compile / paradox / mappings) += (file("withoutfragment.html") -> ("test.html"))
> set additionalInternalMappings := List(file("withoutfragment.html") -> ("test.html"))
-> paradoxValidateInternalLinks

# Now add the path with the fragment, validation should succeed
> reload
> set (Compile / paradox / mappings) += (file("withfragment.html") -> ("test.html"))
> set additionalInternalMappings := List(file("withfragment.html") -> ("test.html"))
> paradoxValidateInternalLinks

# Now set a base path of /, validation should fail due to /../outside.html not existing
Expand All @@ -22,27 +21,32 @@
-> paradoxValidateInternalLinks

# Now add an /outside.html, but without a fragment, validation should fail
> set (Compile / paradoxValidateInternalLinks / mappings) += (file("withoutfragment.html") -> ("/outside.html"))
> set additionalMappings := List(file("withoutfragment.html") -> ("/outside.html"))
-> paradoxValidateInternalLinks

# Now add an /outside.html, with a fragment, validation should succeed
> reload
> set paradoxValidationSiteBasePath := Some("/docs")
> set (Compile / paradox / mappings) += (file("withfragment.html") -> ("test.html"))
> set (Compile / paradoxValidateInternalLinks / mappings) += (file("withfragment.html") -> ("/outside.html"))
> set additionalMappings := List(file("withfragment.html") -> ("/outside.html"))
> paradoxValidateInternalLinks

# Also try setting the internal link with an absolute path
> reload
> set paradoxValidationSiteBasePath := Some("/docs")
> set (Compile / paradoxValidateInternalLinks / mappings) += (file("withfragment.html") -> ("/docs/test.html"))
> set (Compile / paradoxValidateInternalLinks / mappings) += (file("withfragment.html") -> ("/outside.html"))
> set additionalInternalMappings := Nil
> set additionalMappings += (file("withfragment.html") -> ("/docs/test.html"))
> paradoxValidateInternalLinks

# Finally, try validating external links, first check that it fails
-> paradoxValidateLinks
# Now ignore the link that fails
> set paradoxValidationIgnorePaths += ("https://www.lightbend.com.*").r
# The following test will fail if GitHub is down. That's fairly unlikely if we're running a CI build, since CI
# depends on GitHub being up to run in the first place, and also being up to report the status of the build, etc.

# Now make sure it fails when just the 404ing link is ignored
> set paradoxValidationIgnorePaths := List(".*thisrepodoesnotexist.*".r)
-> paradoxValidateLinks

# Now make sure it fails when just the 401ing link is ignored
> set paradoxValidationIgnorePaths := List("https://lightbend.com".r)
-> paradoxValidateLinks

# Now ignore both failing links, and it should pass. Note though that passing depends on GitHub being up,
# since there is a valid link that points to GitHub. If we're running in CI, GitHub being up is an alright
# assumption to make, given that GitHub triggered the build, and will have to be up to receive the status
# report
> set paradoxValidationIgnorePaths := List("https://lightbend.com".r, (".*thisrepodoesnotexist.*").r)
> paradoxValidateLinks
1 change: 1 addition & 0 deletions plugin/src/sbt-test/paradox/validation/withfragment.html
@@ -1,5 +1,6 @@
<html>
<body>
<a id="frag"></a>
<a name="namefrag"></a>
</body>
</html>
2 changes: 2 additions & 0 deletions project/Dependencies.scala
Expand Up @@ -25,6 +25,7 @@ object Version {
val sbtWeb = "1.4.4"
val scalatest = "3.0.8"
val st4 = "4.1"
val jsoup = "1.12.1"
}

object Library {
Expand All @@ -38,4 +39,5 @@ object Library {
val sbtWeb = "com.typesafe.sbt" % "sbt-web" % Version.sbtWeb
val scalatest = "org.scalatest" %% "scalatest" % Version.scalatest
val st4 = "org.antlr" % "ST4" % Version.st4
val jsoup = "org.jsoup" % "jsoup" % Version.jsoup
}

0 comments on commit dc86f75

Please sign in to comment.