Update parse_source.rb to log its progress with Logger.

jeremyf · Mar 9, 2011 · 808b4dc · 808b4dc
1 parent 885c819
commit 808b4dc
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 4 deletions.
diff --git a/lib/parse_source.rb b/lib/parse_source.rb
@@ -1,13 +1,27 @@
 #!/usr/bin/env ruby
 require 'hpricot'
 require 'yaml'
+require 'logger'
+log     = Logger.new(LOG_FILE, 5, 10*1024)
 
 exceptions = []
+log.info "\n"
+log.info "=" * 80
+log.info "=" * 80
+log.info "\n"
+log.info "Parsing HTML File"
+log.info "\n"
+log.info "=" * 80
+log.info "\n"
+
 Dir.glob(File.join(File.dirname(__FILE__), '../src/html/*.html')).each do |filename|
   begin
+    log.info
     # filename = "/Users/jeremyf/Documents/Repositories/git/ndpr-conversion/lib/../src/review-2521.html"
     dictionary = {}
     review_id = File.basename(filename).gsub(/review-(\d*)\.html/,'\1')
+    log.info "\tBegin processing review ID: #{review_id}"
+
     if File.size(filename) > 0
       doc = open(filename) { |file| Hpricot(file) }
 
@@ -31,21 +45,39 @@
         end
         (content/"div#hr:first-of-type").each {|n| original_html.sub!(n.to_original_html, '')}
 
+        log.info "\t\tCatalog ID: #{dictionary['catalog-id']}"
+        log.info "\t\tAuthors: #{dictionary['author']}"
+        log.info "\t\tReview Title: #{dictionary['review_title']}"
+        log.info "\t\tBibliography: #{dictionary['bibliography']}"
+        log.info "\t\tReviewer: #{dictionary['reviewer']}"
+
         # dictionary['catalog_id']   = node.inner_html.strip
         # dictionary['authors']      = (content/"h4").first.inner_html.strip
         # dictionary['review_title'] = (content/"h2").first.inner_html.strip
         # dictionary['bibliography'] = (content/"p.biblio").first.inner_html.strip
         # dictionary['reviewer']     = (content/"p strong").first.inner_html.strip
         # review_content = (content/"div#hr").first.following_siblings.collect{|sib| sib.to_original_html}.join("\n")
-        dictionary['content']      = original_html #review_content #.sub(/ *\<\/div\> *\<\/div\>$/,'')
+        dictionary['content']      = original_html
       end
       File.open(File.join(File.dirname(__FILE__), "../src/yml/review-#{review_id}.yml"), 'w+') do |file|
         file.puts YAML.dump(dictionary)
       end
+      log.info "\tEnd processing review ID: #{review_id}"
     end
   rescue RuntimeError => e
     exceptions << [filename, e]
   end
 end
 
-puts "Errors: #{exceptions.inspect}"
+log.info "\tExceptions"
+exceptions.each do |exception|
+  log.info "\t\t#{exception}"
+end
+log.info "\n"
+log.info "=" * 80
+log.info "\n"
+log.info "End Parsing HTML File"
+log.info "\n"
+log.info "=" * 80
+log.info "=" * 80
+log.info "\n"
diff --git a/lib/render.rb b/lib/render.rb
@@ -4,14 +4,15 @@
 require 'yaml'
 
 Dir.glob(File.join(File.dirname(__FILE__), "../src/yml/*.yml")).each do |filename|
+  puts "Processing #{filename}"
   object = YAML.load_file(filename)
   buffer = ERB.new(File.read(File.join(File.dirname(__FILE__), '../src/template.erb.html'))).result(binding)
   target_filename = File.join(File.dirname(__FILE__), "../src/output/review-#{object['review_id']}.html")
   source_filename = File.join(File.dirname(__FILE__), "../src/html/review-#{object['review_id']}.html")
   File.open(target_filename, 'w+') do |file|
     file.puts buffer
   end
-  regexp_for_split = /(\<|\>|\s)+/
+  regexp_for_split = /./
   target = buffer.split(regexp_for_split).join("\n")
   source = File.read(source_filename).split(regexp_for_split).join("\n")
 
@@ -24,7 +25,8 @@
 
     diff = `diff #{tmp_source_filename} #{tmp_target_filename} -EwBb`.strip
     if diff.any?
-      puts "Review differences for Review #{object['review_id']}:\n\n#{diff}"
+      require 'ruby-debug'; debugger; true;
+      puts "Review differences for Review #{object['review_id']}:\n\n#{diff}\n\n#{diff.inspect}"
     end
   ensure
     File.unlink(tmp_target_filename) if File.exist?(tmp_target_filename)