Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Net::HTTP to resolve rate limiting #280

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions lib/wayback_machine_downloader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class WaybackMachineDownloader

include ArchiveAPI

VERSION = "2.3.1"
VERSION = "2.3.2"

attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
Expand Down Expand Up @@ -83,18 +83,22 @@ def match_exclude_filter file_url
def get_all_snapshots_to_consider
# Note: Passing a page index parameter allows us to get more snapshots,
# but from a less fresh index
http = Net::HTTP.new("web.archive.org", 443)
http.use_ssl = true
http.start()
print "Getting snapshot pages"
snapshot_list_to_consider = []
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil, http)
print "."
unless @exact_url
@maximum_pages.times do |page_index|
snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index, http)
break if snapshot_list.empty?
snapshot_list_to_consider += snapshot_list
print "."
end
end
http.finish()
puts " found #{snapshot_list_to_consider.length} snaphots to consider."
puts
snapshot_list_to_consider
Expand Down Expand Up @@ -206,11 +210,15 @@ def download_files
@processed_file_count = 0
@threads_count = 1 unless @threads_count != 0
@threads_count.times do
http = Net::HTTP.new("web.archive.org", 443)
http.use_ssl = true
http.start()
threads << Thread.new do
until file_queue.empty?
file_remote_info = file_queue.pop(true) rescue nil
download_file(file_remote_info) if file_remote_info
download_file(file_remote_info, http) if file_remote_info
end
http.finish()
end
end

Expand Down Expand Up @@ -243,7 +251,7 @@ def structure_dir_path dir_path
end
end

def download_file file_remote_info
def download_file (file_remote_info, http)
current_encoding = "".encoding
file_url = file_remote_info[:file_url].encode(current_encoding)
file_id = file_remote_info[:file_id]
Expand All @@ -268,8 +276,8 @@ def download_file file_remote_info
structure_dir_path dir_path
open(file_path, "wb") do |file|
begin
URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}").open("Accept-Encoding" => "plain") do |uri|
file.write(uri.read)
http.get(URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}")) do |body|
file.write(body)
end
rescue OpenURI::HTTPError => e
puts "#{file_url} # #{e}"
Expand Down
4 changes: 2 additions & 2 deletions lib/wayback_machine_downloader/archive_api.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@

module ArchiveAPI

def get_raw_list_from_api url, page_index
def get_raw_list_from_api url, page_index, http
request_url = URI("https://web.archive.org/cdx/search/xd")
params = [["output", "json"], ["url", url]]
params += parameters_for_api page_index
request_url.query = URI.encode_www_form(params)

begin
json = JSON.parse(URI(request_url).open.read)
json = JSON.parse(http.get(URI(request_url)).body)
if (json[0] <=> ["timestamp","original"]) == 0
json.shift
end
Expand Down