Skip to content
This repository has been archived by the owner on Jun 25, 2019. It is now read-only.

Commit

Permalink
Update parse source with logger.
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremyf committed Mar 9, 2011
1 parent 885c819 commit 808b4dc
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 4 deletions.
36 changes: 34 additions & 2 deletions lib/parse_source.rb
@@ -1,13 +1,27 @@
#!/usr/bin/env ruby
require 'hpricot'
require 'yaml'
require 'logger'
# LOG_FILE is not defined anywhere earlier in this script, so running the file
# directly would raise NameError on the Logger.new call below. Fall back to a
# log file beside this script, but keep any LOG_FILE the caller already set.
LOG_FILE = File.join(File.dirname(__FILE__), 'parse_source.log') unless defined?(LOG_FILE)

# Rotating logger: the second and third arguments are Logger's shift_age and
# shift_size (5 rotated files, 10 KB each).
log = Logger.new(LOG_FILE, 5, 10*1024)

# Collects [filename, exception] pairs for files that fail while being parsed.
exceptions = []

# Opening banner marking the start of a parsing run in the log.
log.info "\n"
log.info "=" * 80
log.info "=" * 80
log.info "\n"
log.info "Parsing HTML File"
log.info "\n"
log.info "=" * 80
log.info "\n"

Dir.glob(File.join(File.dirname(__FILE__), '../src/html/*.html')).each do |filename|
begin
log.info
# filename = "/Users/jeremyf/Documents/Repositories/git/ndpr-conversion/lib/../src/review-2521.html"
dictionary = {}
review_id = File.basename(filename).gsub(/review-(\d*)\.html/,'\1')
log.info "\tBegin processing review ID: #{review_id}"

if File.size(filename) > 0
doc = open(filename) { |file| Hpricot(file) }

Expand All @@ -31,21 +45,39 @@
end
(content/"div#hr:first-of-type").each {|n| original_html.sub!(n.to_original_html, '')}

log.info "\t\tCatalog ID: #{dictionary['catalog-id']}"
log.info "\t\tAuthors: #{dictionary['author']}"
log.info "\t\tReview Title: #{dictionary['review_title']}"
log.info "\t\tBibliography: #{dictionary['bibliography']}"
log.info "\t\tReviewer: #{dictionary['reviewer']}"

# dictionary['catalog_id'] = node.inner_html.strip
# dictionary['authors'] = (content/"h4").first.inner_html.strip
# dictionary['review_title'] = (content/"h2").first.inner_html.strip
# dictionary['bibliography'] = (content/"p.biblio").first.inner_html.strip
# dictionary['reviewer'] = (content/"p strong").first.inner_html.strip
# review_content = (content/"div#hr").first.following_siblings.collect{|sib| sib.to_original_html}.join("\n")
dictionary['content'] = original_html #review_content #.sub(/ *\<\/div\> *\<\/div\>$/,'')
dictionary['content'] = original_html
end
File.open(File.join(File.dirname(__FILE__), "../src/yml/review-#{review_id}.yml"), 'w+') do |file|
file.puts YAML.dump(dictionary)
end
log.info "\tEnd processing review ID: #{review_id}"
end
rescue RuntimeError => e
exceptions << [filename, e]
end
end

# Report every collected [filename, exception] pair on stdout, then mirror the
# same list into the log, one entry per failure.
puts "Errors: #{exceptions.inspect}"
log.info "\tExceptions"
exceptions.each { |failure| log.info "\t\t#{failure}" }

# Closing banner for the parsing run; each element is written as its own log entry.
rule = "=" * 80
["\n", rule, "\n", "End Parsing HTML File", "\n", rule, rule, "\n"].each do |banner_line|
  log.info banner_line
end
6 changes: 4 additions & 2 deletions lib/render.rb
Expand Up @@ -4,14 +4,15 @@
require 'yaml'

Dir.glob(File.join(File.dirname(__FILE__), "../src/yml/*.yml")).each do |filename|
puts "Processing #{filename}"
object = YAML.load_file(filename)
buffer = ERB.new(File.read(File.join(File.dirname(__FILE__), '../src/template.erb.html'))).result(binding)
target_filename = File.join(File.dirname(__FILE__), "../src/output/review-#{object['review_id']}.html")
source_filename = File.join(File.dirname(__FILE__), "../src/html/review-#{object['review_id']}.html")
File.open(target_filename, 'w+') do |file|
file.puts buffer
end
regexp_for_split = /(\<|\>|\s)+/
regexp_for_split = /./
target = buffer.split(regexp_for_split).join("\n")
source = File.read(source_filename).split(regexp_for_split).join("\n")

Expand All @@ -24,7 +25,8 @@

diff = `diff #{tmp_source_filename} #{tmp_target_filename} -EwBb`.strip
if diff.any?
puts "Review differences for Review #{object['review_id']}:\n\n#{diff}"
require 'ruby-debug'; debugger; true;
puts "Review differences for Review #{object['review_id']}:\n\n#{diff}\n\n#{diff.inspect}"
end
ensure
File.unlink(tmp_target_filename) if File.exist?(tmp_target_filename)
Expand Down

0 comments on commit 808b4dc

Please sign in to comment.