Skip to content

Commit

Permalink
Support hexadecimal numeric HTML entities, fixes #13
Browse files Browse the repository at this point in the history
  • Loading branch information
Mario Hros committed May 24, 2021
1 parent a9ab4df commit 51179b0
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 3 deletions.
5 changes: 5 additions & 0 deletions go.mod
@@ -0,0 +1,5 @@
module github.com/k3a/html2text

go 1.16

require github.com/smartystreets/goconvey v1.6.4 // indirect
13 changes: 13 additions & 0 deletions go.sum
@@ -0,0 +1,13 @@
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8=
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM=
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s=
github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
16 changes: 13 additions & 3 deletions html2text.go
Expand Up @@ -17,16 +17,26 @@ var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)
var linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`)
var badLinkHrefRE = regexp.MustCompile(`javascript:`)
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
var numericEntityRE = regexp.MustCompile(`^#([0-9]+)$`)
var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)

func parseHTMLEntity(entName string) (string, bool) {
if r, ok := entity[entName]; ok {
return string(r), true
}

if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {
digits := match[1]
n, err := strconv.Atoi(digits)
var (
err error
n int64
digits = match[1]
)

if digits != "" && (digits[0] == 'x' || digits[0] == 'X') {
n, err = strconv.ParseInt(digits[1:], 16, 64)
} else {
n, err = strconv.ParseInt(digits, 10, 64)
}

if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) {
return string(rune(n)), true
}
Expand Down
1 change: 1 addition & 0 deletions html2text_test.go
Expand Up @@ -61,6 +61,7 @@ func TestHTML2Text(t *testing.T) {
So(HTML2Text(`fish & chips`), ShouldEqual, "fish & chips")
So(HTML2Text(`"I'm sorry, Dave. I'm afraid I can't do that." – HAL, 2001: A Space Odyssey`), ShouldEqual, "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey")
So(HTML2Text(`Google ®`), ShouldEqual, "Google ®")
So(HTML2Text(`&#8268; decimal and hex entities supported &#x204D;`), ShouldEqual, "⁌ decimal and hex entities supported ⁍")
})

Convey("Large Entity", func() {
Expand Down

0 comments on commit 51179b0

Please sign in to comment.