Permalink
Browse files

Upload a PDF and then search through archived PDFs.

* added Grim for creating preview image and extracting text from pdf
* added Qu for simple background processing
* added Hunt for searching Documents
  • Loading branch information...
1 parent bcbf104 commit 14016b57a70b4eaf2d4edb05d9d87774a37dfdd2 @jonmagic committed Oct 10, 2011
Showing with 222 additions and 19 deletions.
  1. +1 −0 .gitignore
  2. +4 −0 Gemfile
  3. +33 −0 Gemfile.lock
  4. +2 −1 Procfile
  5. +9 −8 Rakefile
  6. +2 −0 lib/document.rb
  7. +22 −0 lib/pdf_archive.rb
  8. +22 −0 lib/pdf_uploader.rb
  9. +19 −0 lib/process_pdf.rb
  10. +22 −8 lib/views/home.erb
  11. +62 −2 spec/pdf_archive_spec.rb
  12. +24 −0 spec/process_pdf_spec.rb
View
@@ -1,3 +1,4 @@
.rvmrc
tmp
+public
.rspec
View
@@ -5,8 +5,12 @@ gem 'bson_ext'
gem 'mongo_mapper'
gem 'carrierwave', :git => 'git://github.com/jnicklas/carrierwave.git' # fixes issue with sinatra 1.3+
gem 'mm-carrierwave'
+gem 'hunt'
+gem 'grim'
+gem 'qu-redis'
group :test do
gem 'rspec'
gem 'capybara'
+ gem 'ruby-debug19', :require => 'ruby-debug'
end
View
@@ -14,6 +14,7 @@ GEM
i18n (~> 0.6)
activesupport (3.1.1)
multi_json (~> 1.0)
+ archive-tar-minitar (0.5.2)
bson (1.4.0)
bson_ext (1.4.0)
builder (3.0.0)
@@ -26,10 +27,18 @@ GEM
xpath (~> 0.1.4)
childprocess (0.2.2)
ffi (~> 1.0.6)
+ columnize (0.3.4)
diff-lcs (1.1.3)
+ fast-stemmer (1.0.0)
ffi (1.0.9)
+ grim (0.3.0)
+ hunt (0.4)
+ fast-stemmer (~> 1.0)
+ mongo_mapper (~> 0.9.0)
i18n (0.6.0)
json_pure (1.6.1)
+ linecache19 (0.5.12)
+ ruby_core_source (>= 0.1.4)
mime-types (1.16)
mm-carrierwave (0.0.2)
carrierwave (~> 0.5)
@@ -44,12 +53,21 @@ GEM
nokogiri (1.5.0)
plucky (0.3.8)
mongo (~> 1.3)
+ qu (0.1.0)
+ multi_json
+ qu-redis (0.1.0)
+ qu (= 0.1.0)
+ redis-namespace
+ simple_uuid
rack (1.3.4)
rack-protection (1.1.4)
rack
rack-test (0.6.1)
rack (>= 1.0)
rake (0.9.2)
+ redis (2.2.2)
+ redis-namespace (1.1.0)
+ redis (< 3.0.0)
rspec (2.6.0)
rspec-core (~> 2.6.0)
rspec-expectations (~> 2.6.0)
@@ -58,12 +76,23 @@ GEM
rspec-expectations (2.6.0)
diff-lcs (~> 1.1.2)
rspec-mocks (2.6.0)
+ ruby-debug-base19 (0.11.25)
+ columnize (>= 0.3.1)
+ linecache19 (>= 0.5.11)
+ ruby_core_source (>= 0.1.4)
+ ruby-debug19 (0.11.6)
+ columnize (>= 0.3.1)
+ linecache19 (>= 0.5.11)
+ ruby-debug-base19 (>= 0.11.19)
+ ruby_core_source (0.1.5)
+ archive-tar-minitar (>= 0.5.2)
rubyzip (0.9.4)
selenium-webdriver (2.8.0)
childprocess (>= 0.2.1)
ffi (>= 1.0.7)
json_pure
rubyzip
+ simple_uuid (0.2.0)
sinatra (1.3.1)
rack (>= 1.3.4, ~> 1.3)
rack-protection (>= 1.1.2, ~> 1.1)
@@ -79,8 +108,12 @@ DEPENDENCIES
bson_ext
capybara
carrierwave!
+ grim
+ hunt
mm-carrierwave
mongo_mapper
+ qu-redis
rake
rspec
+ ruby-debug19
sinatra
View
@@ -1 +1,2 @@
-web: bundle exec ruby lib/pdf_archive.rb -p 3000
+web: bundle exec ruby lib/pdf_archive.rb -p 3000
+worker: bundle exec rake qu:work QUEUE=default
View
@@ -1,19 +1,20 @@
-$: << File.expand_path('../spec', __FILE__)
-
require 'rubygems'
require 'bundler/setup'
-
require 'rspec/core/rake_task'
+task :console do
+ exec "irb -Iapp -r ./lib/pdf_archive"
+end
+
desc "Run specs"
task :spec do
RSpec::Core::RakeTask.new(:spec) do |t|
t.pattern = './spec/**/*_spec.rb'
end
end
+task :default => :spec
-task :console do
- exec "irb -Iapp -r ./lib/pdf_archive"
-end
-
-task :default => :spec
+require 'qu/tasks'
+task :environment do
+ require './lib/pdf_archive'
+end
View
@@ -1,6 +1,8 @@
class Document # MongoMapper model for an uploaded PDF plus the text extracted from it.
include MongoMapper::Document
+ plugin Hunt # search plugin; the `searches` declaration below relies on it
key :page_contents, Array # text of each page, appended by ProcessPdf
mount_uploader :pdf, PdfUploader # the uploaded file, handled by PdfUploader
+ searches :pdf_filename, :page_contents # fields made searchable (see GET /search)
end
View
@@ -21,14 +21,36 @@ def self.root
MongoMapper.connection = Mongo::Connection.new(uri.host, uri.port, {})
MongoMapper.database = database
+# Qu setup
+Qu.configure do |c|
+ c.backend.namespace = "pdfarchive:qu:#{PdfArchive.environment}"
+end
+
# CarrierWave setup
require 'carrierwave/orm/mongomapper'
# require pdf uploader and document model
require 'pdf_uploader'
require 'document'
+require 'process_pdf'
# Routes
+set :public_folder, "#{PdfArchive.root}/public"
+
get '/' do
erb :home
+end
+
+post '/' do
+ if params['pdf']
+ document = Document.create!(params)
+ Qu.enqueue(ProcessPdf, document.id)
+ end
+
+ erb :home
+end
+
+get '/search' do # Search: looks up Documents for the "q" parameter and re-renders the home view.
+ @documents = Document.search(params['q']) # the home template lists @documents when it is set
+ erb :home
end
View
@@ -1,4 +1,5 @@
class PdfUploader < CarrierWave::Uploader::Base
+ Grim::WIDTH = 100
storage :file
def cache_dir
@@ -12,4 +13,25 @@ def store_dir
"#{PdfArchive.root}/public/documents/#{model.id}"
end
end
+
+ def grim
+ @grim ||= Grim.reap(cache_path)
+ end
+
+ def page_count
+ @page_count ||= begin
+ cache_stored_file! unless cached?
+ grim.count
+ end
+ end
+
+ def create_preview
+ cache_stored_file! unless cached?
+ grim[0].save(File.join(store_dir, 'preview.jpg'))
+ end
+
+ def extract_text(index)
+ cache_stored_file! unless cached?
+ grim[index].text
+ end
end
View
@@ -0,0 +1,19 @@
+class ProcessPdf
+ def self.perform(document_id)
+ document = Document.find!(document_id)
+
+ page_count = document.pdf.page_count
+
+ if page_count > 0
+ document.pdf.create_preview
+
+ 0.upto(page_count - 1).each do |index|
+ document.page_contents << document.pdf.grim[index].text
+ end
+
+ document.save!
+ else
+ raise 'PDF has no content'
+ end
+ end
+end
View
@@ -1,18 +1,32 @@
<h1>PDF Archive</h1>
+<form action="/search" method="GET">
+ <p>
+ <input name="q" type="text" />
+ <button>Search</button>
+ </p>
+</form>
+
+<% if @documents %>
+ <h2>Search Results</h2>
+ <ul>
+ <% @documents.each do |document| %>
+ <li>
+ <a href="<%= "/documents/#{document.id}/#{document.pdf_filename}" %>">
+ <img src="<%= "/documents/#{document.id}/preview.jpg" %>" />
+ <%= document.pdf_filename %>
+ </a>
+ </li>
+ <% end %>
+ </ul>
+<% end %>
+
<h2>Upload</h2>
-<form action='/' method="POST">
+<form action='/' method="POST" enctype="multipart/form-data">
<p>
<input name="pdf" type="file" />
</p>
<p>
<button>Upload PDF</button>
</p>
-</form>
-
-<h2>Search</h2>
-<form action="/search" method="GET">
- <p>
- <input name="q" type="text" />
- </p>
</form>
View
@@ -18,12 +18,72 @@
last_response.body.should include('PDF Archive')
end
+ it "should include a form for search" do
+ last_response.body.should include('Search')
+ end
+
it "should include form for uploading pdf" do
last_response.body.should include('Upload')
end
+ end
- it "should include a form for search" do
- last_response.body.should include('Search')
+ describe "POST /" do
+ context "with a pdf" do
+ before(:each) do
+ @count = Document.count
+ post '/', params={:pdf => pdf_fixture('onepage.pdf')}
+ end
+
+ it "responds ok" do
+ last_response.should be_ok
+ end
+
+ it "creates a document" do
+ Document.count.should > @count
+ end
+ end
+
+ context "without a pdf" do
+ before(:each) do
+ @count = Document.count
+ post '/', params={:pdf => nil}
+ end
+
+ it "responds ok" do
+ last_response.should be_ok
+ end
+
+ it "does not create a document" do
+ Document.count.should == @count
+ end
+ end
+ end
+
+ describe "GET /search" do
+ before(:each) do
+ document = Document.create(:pdf => pdf_fixture('onepage.pdf'))
+ ProcessPdf.perform(document.id)
+ end
+
+ context "search by filename" do
+ it "shows document in search results" do
+ get '/search', params={:q => 'onepage.pdf'}
+ last_response.body.should include('onepage.pdf')
+ end
+ end
+
+ context "search by content" do
+ it "shows document in search results" do
+ get '/search', params={:q => 'mongomapper'}
+ last_response.body.should include('onepage.pdf')
+ end
+ end
+
+ context "search with no query" do
+ it "shows document in search results" do
+ get '/search', params={:q => ''}
+ last_response.body.should_not include('onepage.pdf')
+ end
end
end
end
View
@@ -0,0 +1,24 @@
+require 'spec_helper'
+
+describe ProcessPdf do
+ let(:document) { Document.create(:pdf => pdf_fixture('onepage.pdf')) }
+
+ describe ".perform" do
+ before(:all) do
+ ProcessPdf.perform(document.id)
+ document.reload
+ end
+
+ it "creates a preview image" do
+ File.exists?(File.join(tmp_dir, 'documents', document.id, 'preview.jpg')).should be_true
+ end
+
+ it "extracts and saves pdf text to document" do
+ document.page_contents.length.should > 0
+ end
+
+ it "creates search terms for hunt" do
+ document.searches['default'].length.should > 0
+ end
+ end
+end

0 comments on commit 14016b5

Please sign in to comment.