Skip to content
Permalink
master
Go to file
 
 
Cannot retrieve contributors at this time
177 lines (143 sloc) 4.83 KB
# encoding: UTF-8
$:.push(File.dirname($0))
require 'utility-functions'
require 'json'
# Print usage and quit when exactly one argument is given. File mode needs at
# least a format and an input file; zero arguments selects the stdin/JSON mode
# handled further down. (ARGV.size == 1 is the same test as the former
# "unless size >= 2 or size == 0".)
#
# The usage text lists the formats the output `case` actually recognizes
# (scrivener, taskpaper, dokuwiki) and describes what really happens when the
# output name or all arguments are omitted.
if ARGV.size == 1
  puts <<-USAGE
Usage: ruby taskpaper-extract.rb <scrivener|taskpaper|dokuwiki> <infile> [outfile]

For example: ruby taskpaper-extract.rb scrivener litreview.taskpaper litreview
(Scrivener outputs to a directory, with each tag in a separate textfile, taskpaper and dokuwiki to a single hierarchical textfile)
Without an output name specified, the output name is derived from the input file name (".taskpaper" removed, ".out" appended).
With no arguments at all, the input is read from standard input and the tags are printed to stdout as JSON.
  USAGE
  exit
end
# Format-specific patterns. These defaults are for TaskPaper-formatted files;
# modify them for org-mode or any other format.

# Captures the run of leading tabs on a line; each tab is one indent level.
Indent_pattern = /^(\t*)/
# Captures the citation key of a line that begins with [@key]. Only level-0
# lines are checked against it, and the key found there is attached to every
# following line until the next level-0 line (a level-0 line without a key
# resets it to the empty string).
Citekey_pattern = /^\[@(.+?)\]/
# Load the whole input into `a`: from ARGF when no arguments were given,
# otherwise from the file named by the second argument.
# `try` comes from utility-functions; it presumably yields a falsy value when
# the block raises -- confirm against that helper.
a = ARGV.empty? ? ARGF.read : try { File.read(ARGV[1]) }

# Only the file branch is checked for failure, exactly as before.
if !ARGV.empty? && !a
  puts "Could not read input file"
  exit
end
# Pair every input line with its indent level: lines[i] = [level, line content].
# Indent_pattern matches zero or more leading tabs, so its first capture group
# is always present; the length of that capture is the indent level.
lines = a.lines.map { |raw| [raw[Indent_pattern, 1].size, raw] }

# linecontext[i] will hold [context text, citekey] for lines[i]; it is filled
# in by the pass that follows.
linecontext = []
# Build the "line context" of every line: the line itself followed by all
# directly subsequent lines that are indented deeper than it.
ckey = '' # holds the current citation key
all_tagged = '' # holds all tagged text, to later check for lines that have not been tagged
ckey_pattern = /^\[@(.+?)\]/
lines.each_with_index do |(level, content), i|
  # A level-0 line either starts a new citation key or clears the current one.
  ckey = content[Citekey_pattern, 1] || '' if level.zero?

  text = content.dup
  # Walk forward while the following lines are deeper than this one; the first
  # line at the same or a shallower level ends the context.
  (i + 1).upto(lines.size - 1) do |j|
    child_level, child = lines[j]
    break unless child_level > level
    # Drop as many leading characters as the starting line has indent levels,
    # so the deeper lines keep only their indentation relative to it.
    text << child[level..-1]
  end

  linecontext[i] = [text.strip, ckey]
end
# Recognizes a tag of the form "#name" and captures the name as `tagcapt`.
# The pattern itself is unchanged; only the in-pattern comments were corrected,
# because they described tags as starting with an at-sign while the pattern
# matches a hash sign.
tag_regexp = /
  \B                 # no word boundary here: the hash sign must not directly follow a word character
  (?<!\[)            # not directly preceded by an opening square bracket
  \#(?<tagcapt>.+?)  # a literal hash sign, then the tag name (shortest possible capture)
  \b                 # the name ends at the first word boundary
/x
# tags maps each tag name to the [context text, citekey] pairs collected for it.
tags = Hash.new
# Iterate through the lines; when a line carries one or more tags, file that
# line's context under each of them.
lines.each_with_index do |l, i|
  # scan2 comes from utility-functions; presumably it returns the named
  # captures (here :tagcapt => list of tag names) or a falsy value when the
  # line has no tag -- confirm against that helper.
  f = l[1].scan2(tag_regexp)
  next unless f

  # `context` is the very string stored in linecontext[i], so the in-place
  # cleanup below also updates that entry.
  context, citekey = linecontext[i]
  # The citekey is arbitrary text captured from the input, so it has to be
  # escaped before being embedded in a pattern. Unescaped, a key such as
  # "c++primer" would not match itself and "smith(2010" would raise RegexpError.
  citekey_marker = /\[\@#{Regexp.escape(citekey)}\]/

  f[:tagcapt].each do |x| # for each tag if multiple
    # Strip the tags, the [@citekey] marker and a trailing colon from the context.
    context.remove!(tag_regexp, citekey_marker, /\:$/)
    tags.add(x, [context.strip, citekey])
    # Remember everything that was filed so the final sweep can skip it.
    all_tagged << context
  end
end
# Final sweep: collect every line whose text did not end up under any tag and
# file it under the pseudo-tag 'not_tagged'.
lines.each_with_index do |l, i|
  next if l[1].remove(ckey_pattern).strip.empty? # nothing but a citekey (or blank)
  next if l[1].scan2(tag_regexp) # the line carries a tag itself, so it was handled above

  citekey = linecontext[i][1]
  # Escape the citekey before embedding it in a pattern: it is arbitrary input
  # text, and regexp metacharacters in it (for example "+" or an unbalanced
  # parenthesis) would otherwise prevent the match or raise RegexpError.
  cont = l[1].remove(/\[\@#{Regexp.escape(citekey)}\]/, /\:$/).strip

  # A substring hit in all_tagged means the line already appears inside some
  # tagged line's context.
  unless all_tagged.index(cont)
    tags.add('not_tagged', [cont, citekey])
  end
end
# No arguments at all: print the collected tags to stdout as JSON and stop,
# skipping the file writers that follow.
if ARGV.empty?
  $stdout.puts(JSON.generate(tags))
  exit
end
require 'fileutils'

# Output target: the third argument when given, otherwise derived from the
# input file name (".taskpaper" removed, ".out" appended). Despite the name it
# is a directory only for the scrivener format; the other formats write a
# single file.
outdir = ARGV[2]
outdir ||= ARGV[1].remove(".taskpaper") + ".out"

case ARGV[0]
when 'scrivener'
  # One text file per tag inside the directory `outdir`.
  # The directory is created and cleaned through Ruby instead of the shell:
  # the former `rm -rf '<dir>/*.txt'` quoted the glob, so it was never expanded
  # and stale files survived, and interpolating the path into a shell command
  # broke on names containing quotes.
  FileUtils.mkdir_p(outdir)
  # Same set the shell glob *.txt was meant to select (hidden entries excluded).
  # Listing the directory avoids treating characters in `outdir` as glob syntax.
  stale = Dir.entries(outdir).select { |name| name.end_with?('.txt') && !name.start_with?('.') }
  FileUtils.rm_rf(stale.map { |name| File.join(outdir, name) })

  tags.each do |tag, content|
    out = ''
    nockey = ''
    content.each do |fragments|
      # fragments is [text, citekey]; entries without a citekey are gathered
      # separately and appended after the ones that have one.
      if fragments[1] == ''
        nockey << "#{fragments[0]}\n\n"
        next
      end
      out << "#{fragments[0]} [@#{fragments[1]}]\n\n"
    end
    out << nockey if nockey.size > 0
    File.write("#{outdir}/#{tag}.txt", out)
  end
when /(taskpaper|dokuwiki)/ # TaskPaper or DokuWiki: one hierarchical text file
  # The pattern is unanchored, so any format argument containing one of these
  # words is handled here.
  out = ''
  tags.each do |tag, content|
    nockey = ''
    out << "#{tag}:\n"
    content.each do |fragments|
      # Indent every line of the text by two tabs and drop its newline
      # (String#remove comes from utility-functions).
      fragments[0] = fragments[0].lines.map {|ln| ("\t\t" + ln).remove("\n")}
      if fragments[1] == ''
        nockey << "#{fragments[0].join("\n")}\n"
        next
      end
      out << "\t[@#{fragments[1]}]:\n#{fragments[0].join("\n")}\n"
    end
    out << "\tNo citekey:\n#{nockey}" if nockey.size > 0
  end
  if ARGV[0] == 'dokuwiki'
    # gsubs! comes from utility-functions; presumably it applies each
    # [pattern, replacement] pair in order -- confirm against that helper.
    out.gsubs!(
      ["\t", ' '],
      [/(?! )(.+?)$/, ' * \1'],
      [/^ \* /, 'h2. ']
    )
    # Test the end of the name rather than any substring, so an extension that
    # merely occurs inside the path does not suppress the suffix.
    outdir = outdir + ".txt" unless outdir.end_with?(".txt")
  else
    outdir = outdir + ".taskpaper" unless outdir.end_with?(".taskpaper")
  end
  File.write(outdir, out)
else
  puts "Did not recognize output format"
  exit
end
#puts "#{tags.size} tags written to #{outdir}."
# ideas:
# - hierarchy of tags
# - if tag starts with @-, only take current line
You can’t perform that action at this time.