This repository has been archived by the owner on Jul 28, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
cloudlib.rb
executable file
·384 lines (356 loc) · 14 KB
/
cloudlib.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# This library provides the means for maintaining a database of
# documents on Amazon's S3 file store, with searchable metadata in
# Amazon's SimpleDB database. Think of it as a filing cabinet or library
# that can be extended indefinitely and accessed from anywhere in the
# world: a library that lives "in the cloud."
# In order to use this library, you need to sign up for
# Amazon's S3 and SimpleDB services:
#
# * Amazon SimpleDB: http://aws.amazon.com/simpledb/
# * Amazon S3: http://aws.amazon.com/s3/
#
# Simple usage example:
#
# require 'rubygems'
# require 'cloudlib'
# include Cloudlib
# Entry.connect('my_aws_library', 'xxx_key_id_xxx', 'xxx_secret_access_key_xxx')
# token, logic_entries = Entry.query('logic')
# logic_entries.each {|entry| puts entry.to_s}
#
# For more examples of the use of the library, see the programs cloudlib.rb
# and cloudlib-web.rb, included in the gem.
# Author:: John MacFarlane (jgm at berkeley dot edu)
# Copyright:: Copyright (c) 2008 John MacFarlane
# License:: GPL v2
require 'rubygems'
require 'readline'
require 'aws/s3' # aws-s3 gem
require 'aws_sdb' # aws-sdb gem
require 'open-uri'
require 'fileutils'
module Cloudlib
# A library entry, including content and metadata. An entry has a name
# (which is also the key of the associated S3 object) and an attributes
# hash. The name is of the form "sha1.ext", where sha1 is a SHA1 hash of
# the contents of the file, and ext is the file extension. This makes
# it impossible to have entries with duplicate contents. The attributes
# hash contains the following fields:
#
# * extension - file extension including .
# * size - size of contents (bytes)
# * date-added - date entry was added to library
# * entry_type - article, book, chapter, incollection, unpublished
# * authors - list of authors
# * editors - list of editors
# * title - title of entry
# * booktitle - title of book containing entry
# * year - publication year of entry
# * publisher - publisher of book
# * address - publication address
# * journal - journal containing entry
# * volume - volume number of journal
# * pages - page range of entry in book or journal
# * keywords - keywords
# * doi - DOI for entry
# * url - URL for entry
# * comments - miscellaneous comments
# * *_lowercase - lowercase version of *
# * *_words - lowercase version of *, split into a list of words
# * all_words - list of words in title, authors, editors, booktitle, keywords
class Entry
attr_accessor :name, :attributes
# Opens the two connections the class works with: one to the S3 file store
# and one to the SimpleDB database. Every argument that is left out falls
# back to an environment variable: CLOUDLIB_LIBRARY_NAME for library_name,
# AWS_ACCESS_KEY_ID for aws_access_key_id and AWS_SECRET_ACCESS_KEY for
# aws_secret_access_key. library_name names both the S3 bucket (entry
# contents) and the SimpleDB domain (entry metadata). When debug is true
# the SimpleDB service logs to STDERR at DEBUG level instead of WARN.
def self.connect(library_name=ENV['CLOUDLIB_LIBRARY_NAME'],
aws_access_key_id=ENV['AWS_ACCESS_KEY_ID'],
aws_secret_access_key=ENV['AWS_SECRET_ACCESS_KEY'],
debug = false)
  @@aws_access_key_id = aws_access_key_id
  @@aws_secret_access_key = aws_secret_access_key
  # Both services receive the same credentials over SSL.
  credentials = {
    :access_key_id => @@aws_access_key_id,
    :secret_access_key => @@aws_secret_access_key,
    :use_ssl => true
  }
  AWS::S3::Base.establish_connection!(credentials.dup)
  @@bucket = library_name
  sdb_log = Logger.new(STDERR)
  sdb_log.level = debug ? Logger::DEBUG : Logger::WARN
  @@db = AwsSdb::Service.new(credentials.merge(:logger => sdb_log))
end
# Builds an in-memory entry from a name and an attributes hash; nothing is
# stored remotely. When attributes is omitted the entry starts with an
# empty 'all_words' list. Use Entry.from_file to create an entry that has
# contents.
def initialize(name, attributes={'all_words' => []})
  @name, @attributes = name, attributes
end
# Sets up a new library: creates the S3 bucket, then the SimpleDB domain,
# both named after the library passed to Entry.connect. Meant to be run
# once, before any entries are added.
def self.create_library
  library = @@bucket
  AWS::S3::Bucket.create(library)
  @@db.create_domain(library)
end
# Tears the library down: force-deletes the S3 bucket, then deletes the
# SimpleDB domain. All stored contents and metadata are lost.
def self.delete_library
  library = @@bucket
  AWS::S3::Bucket.delete(library, :force => true)
  @@db.delete_domain(library)
end
# Creates an entry from the file at path, uploads the file's contents to
# S3 and stores the attributes in SimpleDB. Returns the new entry.
#
# * path       - location of the file to upload
# * filename   - name the file extension is taken from (defaults to path)
# * attributes - metadata hash for the entry; 'size' and 'date-added' are
#   written into this same hash before it is stored
#
# The entry name is the SHA1 hex digest of the file followed by the
# extension of filename.
def self.from_file(path, filename=path, attributes={'all_words' => []})
  sha1 = Digest::SHA1.file(path).hexdigest
  name = "#{sha1}#{File.extname(filename)}"
  attributes['size'] = File.size(path).to_s
  attributes['date-added'] = Date.today.to_s
  entry = Entry.new(name, attributes)
  # File.open with a block closes the handle once the upload is done (the
  # bare open(path) used previously was never closed), reads in binary
  # mode, and never treats a path starting with "|" as a command.
  File.open(path, 'rb') do |contents|
    AWS::S3::S3Object.store(name, contents, @@bucket)
  end
  # The final argument asks put_attributes to replace existing values.
  @@db.put_attributes(@@bucket, name, attributes, true)
  entry
end
# Looks up the metadata stored under name and wraps it in an Entry.
# Raises a RuntimeError ("Item not found.") when the database returns an
# empty attribute hash for that name.
def self.find_by_name(name)
  found = @@db.get_attributes(@@bucket, name)
  raise "Item not found." if found == {}
  Entry.new(name, found)
end
# Queries the database and returns a list [token, entries]. entries is
# a list of up to numitems Entry objects that match the query. If
# there are more entries than numitems, token will be nonempty, and
# can be passed in on a subsequent call for the remaining entries.
#
# The query string can contain one or more words. If a word is
# preceded by ti=, only entries that match it in the title will be
# returned. Similarly, au= searches authors, jo= journals, pu=
# publishers, ad= addresses, ed= editors, bo= booktitle (for collections),
# and ye= years. ye> and ye< may also be used. The long forms (title=,
# authors=, journal=, booktitle=, publisher=, address=, editors=, year=)
# are accepted as well.
# The form ti='word1 word2' may also be used; entries will only match
# if their titles contain both word1 and word2.
# A bare * matches everything.
def self.query(query_string, numitems=10, token=nil)
  query = query_expression(query_string)
  names, token = if token
                   @@db.query(@@bucket, query, numitems, token)
                 else
                   @@db.query(@@bucket, query, numitems)
                 end
  # One metadata lookup per matching item name.
  entries = names.map do |name|
    Entry.new(name, @@db.get_attributes(@@bucket, name))
  end
  return token, entries
end

# Internal helper for Entry.query: translates a user query string into the
# SimpleDB query expression. Performs no I/O.
#
# Each search word becomes one bracketed predicate; predicates are joined
# with "intersection". Only year predicates honour the <, = or > that was
# typed; every other field is matched with = against its *_words list.
# NOTE(review): words are interpolated into the expression unescaped, so a
# word containing a single quote yields a malformed expression -- confirm
# the escaping rules of the SimpleDB query language before adding escaping.
def self.query_expression(query_string)
  field_names = {
    'ti' => 'title', 'au' => 'authors', 'jo' => 'journal',
    'bo' => 'booktitle', 'pu' => 'publisher', 'ad' => 'address',
    'ed' => 'editors', 'ye' => 'year'
  }
  term_pattern = /((ti(?:tle)?|au(?:thors?)?|jo(?:urnal)?|bo(?:oktitle)?|pu(?:blisher)?|ad(?:dress)?|ed(?:itors?)?|ye(?:ar)?)\s*([<=>])\s*('[^']*'|"[^"]*"|\S*)|\S+)\s*/
  clauses = []
  query_string.downcase.scan(term_pattern).each do |whole, key, comparison, val|
    next if whole == '*'
    # The first two letters of the key identify the field; a bare word
    # (no key) searches the combined all_words list.
    field = key ? field_names.fetch(key[0, 2], 'all') : 'all'
    text = val ? val.gsub(/^['"](.*)['"]$/, "\\1") : whole
    # A key with an empty value (e.g. "ti=") contributes no predicate, so
    # the expression never contains a dangling "intersection".
    text.split.each do |word|
      clauses << if field == 'year' # there is no year_words field
                   "['year' #{comparison} '#{word}']"
                 else
                   "['#{field}_words' = '#{word}']"
                 end
    end
  end
  # note: query has to include year in order to sort by year
  # hence this dummy search
  clauses << "['year' starts-with '']"
  "#{clauses.join(' intersection ')} sort 'year'"
end
# Returns a human-friendly filename for the entry, of the form
# "<last names joined by _>_<title><extension>". Runs of commas, periods,
# slashes and whitespace in the title are replaced by a single underscore;
# the extension is taken from the entry name.
# An entry without authors yields "_<title><extension>" instead of raising.
def friendly_filename
  # Array() tolerates a missing authors attribute (nil).
  authornames = Array(attributes['authors']).map { |a| last_name(a) }.join('_')
  title = show_attribute('title').to_s.gsub(/[,.\/[:space:]]+/, '_')
  ext = File.extname(name)
  "#{authornames}_#{title}#{ext}"
end
# Removes the entry from the library: first the S3 object holding its
# contents, then its attributes in SimpleDB.
def delete
  AWS::S3::S3Object.delete(name, @@bucket)
  @@db.delete_attributes(@@bucket, name)
end
# Writes the entry's attributes to SimpleDB under the entry's name, with
# the replace flag set. Only metadata is saved here; contents are
# uploaded by the from_file method.
def save
  replace_existing = true
  @@db.put_attributes(@@bucket, name, attributes, replace_existing)
end
# Downloads the entry's contents and saves them at path. If a file already
# exists at path it is first copied to "<path>~" (a notice is written to
# STDERR). Returns path.
def download(path)
  if File.exist?(path)
    STDERR.puts "Backing up existing #{path} as #{path}~"
    FileUtils.copy_file(path, "#{path}~", true) # true: preserve file metadata
  end
  # open-uri stopped patching Kernel#open for URLs in Ruby 3.0; URI.open is
  # the replacement (available since Ruby 2.5). Fall back to Kernel#open on
  # older interpreters, where URI.open does not exist.
  fetch = URI.respond_to?(:open) ? URI.method(:open) : method(:open)
  # 'wb' keeps binary contents intact on platforms that translate newlines.
  File.open(path, 'wb') do |outfile|
    fetch.call(url) do |source|
      FileUtils.copy_stream(source, outfile)
    end
  end
  path
end
# Returns a BibTeX record for the entry as a string.
#
# One "name = {value}" line is emitted for each field of the entry's type
# that has a value, followed by a "file" field holding the entry name.
# The plural attribute names authors/editors are written as the BibTeX
# field names author/editor. The citation key is the authors' last names
# joined by "." followed by ":" and the year. The entry type is upcased;
# 'chapter' is written as INBOOK and a missing type as UNKNOWN.
def to_bibtex
  bibtex_names = {'authors' => 'author', 'editors' => 'editor'}
  # Fields without a value are skipped so no empty lines are emitted.
  present = fields.map { |field| field.to_s }.select { |field| attributes[field] }
  pairs = present.map do |field|
    sprintf(" %-15s = {%s}", bibtex_names.fetch(field, field), show_attribute(field))
  end
  pairs << sprintf(" %-15s = {%s}", "file", name)
  authornames = Array(attributes['authors']).map { |a| last_name(a) }.join('.')
  # show_attribute gives the year as a string; the raw attribute is a list.
  year = show_attribute('year')
  entry_type = show_attribute('entry_type')
  # show_attribute returns "" (not nil) for a missing attribute.
  entry_type = 'unknown' if entry_type.empty?
  entry_type = 'inbook' if entry_type == 'chapter'
  "@#{entry_type.upcase}{#{authornames}:#{year},\n#{pairs.join(",\n")}\n}"
end
# Formats the entry's metadata as a one-line citation: authors (if any),
# then the title with the year in parentheses (if any), then details that
# depend on entry_type: journal, volume and pages for an article;
# address and publisher for a book; those plus chapter and pages for a
# chapter; "In <editors> (eds.), <booktitle> ..." for an incollection; and
# " (unpublished)." for unpublished work. Any other type adds nothing.
def to_s
  byline = show_attribute('authors')
  byline = "#{byline}, " unless byline.empty?

  heading = show_attribute('title').to_s
  year = show_attribute('year')
  heading += year.empty? ? ". " : " (#{year}). "

  # "address: publisher", leaving out whichever part is missing
  imprint = ['address', 'publisher'].map { |key| show_attribute(key) }.reject { |part| part.empty? }.join(": ")
  chapter = show_attribute('chapter')
  pages = show_attribute('pages')
  booktitle = show_attribute('booktitle')
  editors = show_attribute('editors')
  journal = show_attribute('journal')
  volume = show_attribute('volume')

  # "<lead><text>." when text is present, otherwise nothing
  sentence = lambda { |text, lead| text.empty? ? "" : "#{lead}#{text}." }

  tail =
    case show_attribute('entry_type')
    when 'article'
      if journal.empty?
        ""
      else
        "#{journal} #{volume}" + (pages.empty? ? "." : ", #{pages}.")
      end
    when 'book'
      sentence.call(imprint, "")
    when 'chapter'
      sentence.call(imprint, "") + sentence.call(chapter, " Chapter ") + sentence.call(pages, " ")
    when 'incollection'
      editor_part = editors.empty? ? "" : editors + " (eds.), "
      imprint_part = imprint.empty? ? "" : " (#{imprint})."
      "In " + editor_part + booktitle + imprint_part +
        sentence.call(chapter, " Chapter ") + sentence.call(pages, " ")
    when 'unpublished'
      " (unpublished)."
    else
      ""
    end

  byline + heading + tail
end
# Sets the specified metadata attribute from ans, a regular string.
#
# ans is split on " and " for authors and editors, on whitespace for
# keywords, and stored as a one-element list otherwise. A nil or empty ans
# clears the attribute (sets it to nil).
#
# For every attribute except url, doi and keywords, two derived search
# attributes are kept in step: "<attribute>_lowercase" (lowercased values)
# and "<attribute>_words" (the lowercased values split into words on
# whitespace and punctuation). 'all_words' is then rebuilt from the
# lowercased keywords plus the words of title, authors, editors and
# booktitle, so clearing an attribute also removes its words from the
# search index.
def set_attribute(attribute, ans)
  derived = !['url', 'doi', 'keywords'].member?(attribute)
  lowercase_key = attribute + "_lowercase"
  words_key = attribute + "_words"
  if ans.nil? || ans.empty?
    attributes[attribute] = nil
    # Clear the derived attributes the same way as the attribute itself,
    # otherwise the entry would still match searches for the old words.
    if derived
      attributes[lowercase_key] = nil if attributes.key?(lowercase_key)
      attributes[words_key] = nil if attributes.key?(words_key)
    end
  else
    newval = case attribute
             when 'editors', 'authors'
               ans.split(" and ").map { |a| a.strip }
             when 'keywords'
               ans.split
             else
               [ans.strip]
             end
    attributes[attribute] = newval
    if derived
      lowercase = newval.map { |a| a.downcase }
      attributes[lowercase_key] = lowercase
      # Split on runs of separators and drop empty strings so that input
      # such as "roe , john" or a leading quote yields no blank words.
      attributes[words_key] = lowercase.map { |a| a.split(/[[:space:][:punct:]]+/) }.flatten.reject { |w| w.empty? }
    end
  end
  # recalculate all_words; keywords are lowercased because query strings
  # are lowercased before they are matched against all_words
  tit_auth_words = ['title', 'authors', 'editors', 'booktitle'].map { |att| attributes[att + "_words"] || [] }.flatten
  keywords = (attributes['keywords'] || []).map { |k| k.downcase }
  attributes['all_words'] = keywords + tit_auth_words
end
# Returns a string representation of an attribute; always a String.
#
# * keywords are joined with spaces
# * authors and editors are joined with " and "
# * any other attribute shows its first value
#
# A missing attribute (or an empty list) gives "". A value stored as a
# bare string rather than a list -- as from_file does for 'size' and
# 'date-added' -- is shown whole instead of being indexed by character.
def show_attribute(attribute)
  values = Array(attributes[attribute]) # nil -> [], "x" -> ["x"]
  case attribute
  when 'keywords'
    values.join(' ')
  when 'editors', 'authors'
    values.join(' and ')
  else
    values.first.to_s
  end
end
# Lists the field names (symbols) that apply to entry_type. Every list
# starts with :title, :authors, :year and ends with :keywords, :url, :doi,
# :comments; the fields in between depend on the type. '*' (the default)
# gives every known field, and an unrecognised type gives only the common
# fields.
def self.fields(entry_type='*')
  type_specific = {
    'article' => [:journal, :volume, :pages],
    'book' => [:publisher, :address],
    'chapter' => [:booktitle, :chapter, :publisher, :address, :pages],
    'incollection' => [:booktitle, :chapter, :publisher, :address, :editors, :pages],
    '*' => [:journal, :volume, :booktitle, :editors, :chapter,
            :publisher, :address, :pages]
  }
  [:title, :authors, :year] + type_specific.fetch(entry_type, []) + [:keywords, :url, :doi, :comments]
end
# Lists the field names that apply to this entry, i.e. Entry.fields for
# the entry's own entry_type attribute.
def fields
  Entry.fields(show_attribute('entry_type'))
end
# Looks up the entry's S3 object and returns a URL for it that expires
# after ten minutes.
def url
  stored = AWS::S3::S3Object.find(name, @@bucket)
  stored.url(:expires_in => 10 * 60) # seconds
end
private
# Extracts an author's last name. For "Last, First" the result is the run
# of non-space, non-comma characters directly before the first comma that
# has one; otherwise it is the final run of characters that are neither
# spaces nor tabs. Returns nil when nothing matches.
def last_name(author)
  pattern = author =~ /,/ ? /([^ ,]+),/ : /([^ \t]+)$/
  pattern =~ author
  Regexp.last_match(1)
end
end
end