Skip to content

Commit

Permalink
Prepare document features
Browse files Browse the repository at this point in the history
  • Loading branch information
inukshuk committed Mar 16, 2018
1 parent cca5710 commit 125b3f3
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 17 deletions.
10 changes: 6 additions & 4 deletions lib/anystyle/document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ class << self
include PdfUtils

# Builds a new instance from +string+, creating one Wapiti::Token per line.
#
# @param string [String] the raw text to split into lines
# @param delimiter [Regexp, String] separator handed to String#split
# @param tagged [Boolean] when true, each line is split once on /\s*:/
#   into a leading label and the remaining text
# @return the object produced by +new+ for the list of tokens
def parse(string, delimiter: /\n/, tagged: false)
  current_label = nil
  new(string.split(delimiter).map { |line|
    label, line = line.split(/\s*:/, 2) if tagged
    # A line that starts with the separator yields an empty label. An empty
    # string is truthy in Ruby, so `label || current_label` would reset the
    # label instead of carrying the previous one forward.
    current_label = label unless label.nil? || label.empty?
    # NOTE(review): a tagged line without any ':' still ends up with the whole
    # line as its label and nil as its text -- confirm whether that can occur.
    Wapiti::Token.new line, label: current_label.to_s
  })
end

def open(path, format: File.extname(path), tagged: false)
def open(path, format: File.extname(path), tagged: false, **options)
raise ArgumentError,
"cannot open tainted path: '#{path}'" if path.tainted?
raise ArgumentError,
Expand All @@ -20,8 +22,8 @@ def open(path, format: File.extname(path), tagged: false)

case format.downcase
when '.pdf'
meta = pdf_meta path
info = pdf_info path
meta = pdf_meta path if options[:parse_meta]
info = pdf_info path if options[:parse_info]
input = pdf_to_text path
when '.ttx'
tagged = true
Expand Down
10 changes: 10 additions & 0 deletions lib/anystyle/finder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,15 @@ class Finder < ParserCore
# Creates the instance by delegating to the superclass initializer,
# forwarding +options+ (an empty Hash when omitted) unchanged.
def initialize(options = {})
  super
end

# Attaches feature observations to every line of every document in +dataset+.
#
# Each element yielded by a document is destructured into a line, its page and
# +pn+ (presumably the page number -- confirm against the document class).
# Every feature is asked to observe the line's value together with that
# context and the element's index within the document.
#
# @param dataset the collection of documents to iterate
# @return the result of +dataset.each+
def expand(dataset)
  dataset.each { |doc|
    doc.each_with_index { |(entry, page, pn), line_index|
      observed = features.map do |feature|
        feature.observe(entry.value, page: page, doc: doc, pn: pn, ln: line_index)
      end
      entry.observations = observed
    }
  }
end
end
end
31 changes: 18 additions & 13 deletions lib/anystyle/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,7 @@ def normalize(item)
end

# Abstract hook for filling in the observations of a dataset.
#
# This version does no work of its own and never touches +dataset+; it only
# signals that a concrete implementation is required (presumably supplied by
# the classes that override #expand).
#
# @param dataset the dataset to expand (unused here)
# @raise [NotImplementedError] always
def expand(dataset)
  raise NotImplementedError, "#{self.class.name}#expand is not implemented"
end

def prepare(input, **opts)
Expand All @@ -86,6 +79,8 @@ def prepare(input, **opts)
case input
when Wapiti::Dataset
expand input
when Wapiti::Sequence
expand Wapiti::Dataset.new([input])
when String
if !input.tainted? && input.length < 1024 && File.exists?(input)
expand Wapiti::Dataset.open(input, opts)
Expand Down Expand Up @@ -155,6 +150,21 @@ def initialize(options = {})
]
end

# Computes the feature observations for every token of every sequence.
#
# For each token the value is passed through +scrub+, and each feature is then
# asked to observe the raw value, the scrubbed value, the token's index within
# the sequence and the sequence itself. The results replace the token's
# observations.
#
# @param dataset the collection of sequences to iterate
# @return the result of +dataset.each+
def expand(dataset)
  dataset.each { |sequence|
    sequence.tokens.each_with_index { |token, position|
      scrubbed = scrub(token.value)
      observed = features.map do |feature|
        feature.observe(token.value, scrubbed, position, sequence)
      end
      token.observations = observed
    }
  }
end

# Converts each sequence of +dataset+ to a Hash with symbolized keys and
# passes it through +normalize+.
#
# @param dataset the collection of sequences to convert
# @return the mapped collection of normalized hashes
def format_hash(dataset)
  dataset.map do |sequence|
    attributes = sequence.to_h(symbolize_keys: true)
    normalize(attributes)
  end
end

def parse(input, format: options[:format])
case format
when :wapiti
Expand All @@ -166,10 +176,5 @@ def parse(input, format: options[:format])
raise ArgumentError, "format not supported: #{format}"
end
end


# Maps every sequence in +dataset+ to its normalized Hash form: the sequence
# is converted with symbolized keys and the result is handed to +normalize+.
#
# @param dataset the collection of sequences to convert
# @return the mapped collection of normalized hashes
def format_hash(dataset)
  dataset.map do |seq|
    symbolized = seq.to_h(symbolize_keys: true)
    normalize(symbolized)
  end
end
end
end

0 comments on commit 125b3f3

Please sign in to comment.