Skip to content

Commit

Permalink
Move XML encoding to StringUtils
Browse files Browse the repository at this point in the history
  • Loading branch information
inukshuk committed Dec 12, 2017
1 parent 5d0792f commit 9fd5cb2
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 20 deletions.
4 changes: 2 additions & 2 deletions lib/anystyle/feature.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
module AnyStyle
class Feature
include StringUtils

@available = {}

class << self
Expand All @@ -14,8 +16,6 @@ def key
end
end

include UnicodeUtils

# The name of this feature: whatever the receiver's class returns
# from its class-level +key+ method.
def name
  klass = self.class
  klass.key
end
Expand Down
19 changes: 2 additions & 17 deletions lib/anystyle/parser/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ module AnyStyle
module Parser

class Parser
include StringUtils

@formats = [:bibtex, :hash, :normalized, :citeproc, :xml, :tags, :raw].freeze

Expand All @@ -14,9 +15,7 @@ class Parser
:threads => 4,
:separator => /[[:space:]]+|\b(\d[^[:space:]]*:)/,
:tagged_separator => /[[:space:]]+|(<\/?[^>]+>)/,
:strip => /[^[:alnum:]]/,
:format => :normalized,
:xml_entities => Hash[*%w{ &amp; & &lt; < &gt; > &apos; ' &quot; " }],
:training_data => File.expand_path('../../../../res/train.txt', __FILE__)
}.freeze

Expand Down Expand Up @@ -150,7 +149,7 @@ def prepare(input, tagged = false)
# Expands the passed-in token string by appending a space separated list
# of all features for the token.
def expand(token, sequence = [], offset = 0, label = nil)
f = features_for(token, strip(token), offset, sequence)
f = features_for(token, scrub(token), offset, sequence)
f.unshift(token)
f.push(label) unless label.nil?
f.join(' ')
Expand Down Expand Up @@ -278,20 +277,6 @@ def features_for(*arguments)
features.map { |f| f.elicit(*arguments) }
end

# Returns a copy of +token+ with every match of the pattern stored
# under options[:strip] removed.
def strip(token)
  pattern = options[:strip]
  token.gsub(pattern, '')
end

# Returns a copy of +string+ in which the entities &amp;, &gt; and &lt;
# are replaced by their entries in options[:xml_entities]. Other
# entities are left untouched; the string is processed in a single pass.
def decode_xml_text(string)
  string.gsub(/&(amp|gt|lt);/) { |entity| options[:xml_entities][entity] }
end

# Returns a copy of +string+, in its current encoding, escaped for use
# as XML text content via String#encode's :xml => :text option.
def encode_xml_text(string)
  string.encode(string.encoding, xml: :text)
end

def format_bibtex(labels)
b = BibTeX::Bibliography.new
format_normalized(labels).each do |hash|
Expand Down
14 changes: 13 additions & 1 deletion lib/anystyle/utils.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def maybe_require(mod)
end
end

module UnicodeUtils
module StringUtils
# Repairs invalid byte sequences in +string+ (String#scrub) and then
# deletes every match of +blacklist+ from the result.
#
# The default blacklist matches characters that are not alphanumeric
# (\p{^Alnum}) as well as modifier letters (\p{Lm}).
def scrub(string, blacklist: /[\p{^Alnum}\p{Lm}]/)
  repaired = string.scrub
  repaired.gsub(blacklist, '')
end
Expand All @@ -22,6 +22,18 @@ def transliterate(string, form: :nfkd)
# Produces a canonical form of +string+: the string is transliterated,
# then scrubbed, and finally converted to lower case.
def canonize(string)
  transliterated = transliterate(string)
  scrub(transliterated).downcase
end

# Frozen lookup table from XML entity to the literal character
# it stands for.
XML_ENTITIES = {
  '&amp;'  => '&',
  '&lt;'   => '<',
  '&gt;'   => '>',
  '&apos;' => "'",
  '&quot;' => '"'
}.freeze

# Returns a copy of +string+ in which the entities &amp;, &gt; and &lt;
# are replaced by their XML_ENTITIES entries. Only these three entities
# are matched; anything else (including &apos; and &quot;) is left as is.
def decode_xml_text(string)
  string.gsub(/&(amp|gt|lt);/) do |match|
    XML_ENTITIES[match]
  end
end

# Returns a copy of +string+ escaped for use as XML text content,
# using String#encode's xml: :text option. The result keeps the
# encoding of the input string.
def encode_xml_text(string)
  target_encoding = string.encoding
  string.encode(target_encoding, xml: :text)
end
end

extend Utils
Expand Down

0 comments on commit 9fd5cb2

Please sign in to comment.