diff --git a/lib/anystyle/document.rb b/lib/anystyle/document.rb index 3b01960..cf27fa6 100644 --- a/lib/anystyle/document.rb +++ b/lib/anystyle/document.rb @@ -4,13 +4,15 @@ class << self include PdfUtils def parse(string, delimiter: /\n/, tagged: false) + current_label = nil new(string.split(delimiter).map { |line| label, line = line.split(/\s*:/, 2) if tagged - Wapiti::Token.new line, label: label.to_s + current_label = label || current_label + Wapiti::Token.new line, label: current_label.to_s }) end - def open(path, format: File.extname(path), tagged: false) + def open(path, format: File.extname(path), tagged: false, **options) raise ArgumentError, "cannot open tainted path: '#{path}'" if path.tainted? raise ArgumentError, @@ -20,8 +22,8 @@ def open(path, format: File.extname(path), tagged: false) case format.downcase when '.pdf' - meta = pdf_meta path - info = pdf_info path + meta = pdf_meta path if options[:parse_meta] + info = pdf_info path if options[:parse_info] input = pdf_to_text path when '.ttx' tagged = true diff --git a/lib/anystyle/finder.rb b/lib/anystyle/finder.rb index 6015bc8..7efa0b5 100644 --- a/lib/anystyle/finder.rb +++ b/lib/anystyle/finder.rb @@ -15,5 +15,15 @@ class Finder < ParserCore def initialize(options = {}) super(options) end + + def expand(dataset) + dataset.each do |doc| + doc.each_with_index do |(line, page, pn), ln| + line.observations = features.map { |f| + f.observe line.value, page: page, doc: doc, pn: pn, ln: ln + } + end + end + end end end diff --git a/lib/anystyle/parser.rb b/lib/anystyle/parser.rb index 501a822..9a57f4d 100644 --- a/lib/anystyle/parser.rb +++ b/lib/anystyle/parser.rb @@ -69,14 +69,7 @@ def normalize(item) end def expand(dataset) - dataset.each do |seq| - seq.tokens.each_with_index do |tok, idx| - alpha = scrub tok.value - tok.observations = features.map { |f| - f.observe tok.value, alpha, idx, seq - } - end - end + raise NotImplementedError end def prepare(input, **opts) @@ -86,6 +79,8 @@ def prepare(input, **opts) case input when Wapiti::Dataset expand input + when Wapiti::Sequence + expand Wapiti::Dataset.new([input]) when String if !input.tainted? && input.length < 1024 && File.exists?(input) expand Wapiti::Dataset.open(input, opts) @@ -155,6 +150,21 @@ def initialize(options = {}) ] end + def expand(dataset) + dataset.each do |seq| + seq.tokens.each_with_index do |tok, idx| + alpha = scrub tok.value + tok.observations = features.map { |f| + f.observe tok.value, alpha, idx, seq + } + end + end + end + + def format_hash(dataset) + dataset.map { |seq| normalize(seq.to_h(symbolize_keys: true)) } + end + def parse(input, format: options[:format]) case format when :wapiti @@ -166,10 +176,5 @@ def parse(input, format: options[:format]) raise ArgumentError, "format not supported: #{format}" end end - - - def format_hash(dataset) - dataset.map { |seq| normalize(seq.to_h(symbolize_keys: true)) } - end end end