This repository has been archived by the owner on Aug 7, 2020. It is now read-only.
/
scraper.rb
87 lines (69 loc) · 1.96 KB
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
require "open-uri"
require "time"
require "nokogiri"
require "open_uri_redirections"
# Scrapes an Etsy listing page (search results or shop listings) into a
# plain Ruby hash.
#
# The path given to the constructor is appended to https://www.etsy.com, and
# its query string is rewritten so that the request always asks for
# `order=date_desc` and `page=1`.
class Scraper
  # Raised by #scrape when the server answers the request with HTTP 404.
  class PageNotFound < StandardError; end

  BASE_URL = "https://www.etsy.com"

  # Query parameters that are always overridden, in the order they are
  # appended to the query string.
  FORCED_PARAMS = { "order" => "date_desc", "page" => "1" }.freeze

  # @param path [String] path plus optional query string, e.g. "/search?q=mugs"
  def initialize(path)
    @path = path
  end

  # Fetches the page and extracts its title and listing cards.
  #
  # @return [Hash] with keys :title (String), :url (String, the URL that was
  #   actually requested) and :items (Array<Hash>, see #get_card)
  # @raise [PageNotFound] when the response status is 404
  # @raise [OpenURI::HTTPError] for any other HTTP error (re-raised untouched)
  def scrape
    # URI.open instead of Kernel#open: opening URLs through Kernel#open was
    # deprecated in Ruby 2.7 and removed in 3.0. The block form closes the
    # response IO as soon as Nokogiri has consumed it.
    @doc = URI.open(url, allow_redirections: :all) { |io| Nokogiri::HTML(io) }

    {
      title: get_title,
      url: url,
      items: get_items,
    }
  rescue OpenURI::HTTPError => e
    # open-uri builds the message as "<code> <reason phrase>"; match on the
    # code only so a different (or missing) reason phrase is still a 404.
    raise PageNotFound if e.message.to_s.start_with?("404")

    raise
  end

  private

  # @return [String] text of the document's <title> element(s)
  def get_title
    @doc.css("title").text
  end

  # @return [Array<Hash>] one hash per parsable listing card on the page
  def get_items
    # Multiple selectors as Etsy are in the middle of changing these.
    cards = @doc.css("#search-results .listing-card, .listings .listing-card")
    cards.map { |card| get_card(card) }.compact
  end

  # Converts one listing-card node into a hash.
  #
  # @param card [Nokogiri::XML::Node] a `.listing-card` element
  # @return [Hash, nil] hash with :id, :url, :title, :img, :time, :price, or
  #   nil when the card is a house ad or has no listing link
  # @raise [RuntimeError] when an expected element or attribute is missing;
  #   the message includes the card HTML to make selector changes easy to spot
  def get_card(card)
    return if card[:class].include?("house-ad") # Skip unparsable ads.

    link = card.at("a.listing-thumb")
    return unless link # "No longer available" items without links sometimes appear.

    {
      id: card[:id].gsub(/\D/, "").to_i,
      url: listing_url(link),
      title: card.at(".listing-thumb")[:title],
      img: image_url(card),
      time: Time.now, # Can't determine without loading each item page :/
      price: card.at(".listing-price").text.strip,
    }
  rescue NoMethodError => e
    # A nil lookup above means the markup no longer matches the selectors;
    # re-raise with the offending HTML attached for diagnosis.
    raise "Got: #{e.name}: #{e.message} with card HTML: #{card}"
  end

  # Returns the listing href with the characters Etsy leaves unescaped fixed up.
  def listing_url(link)
    # Etsy doesn't properly escape the ga_facet parameter.
    link[:href].gsub(" ", "+").gsub('"', "%22")
  end

  # Returns the card's image URL with its size token rewritten to il_570xN
  # (presumably a 570px-wide rendition - confirm against Etsy's image URLs)
  # and a protocol-relative "//" prefix expanded to "http://".
  def image_url(card)
    img_element = card.at(".image-wrap img, .listing-thumb img")
    src = img_element[:"data-src"] || img_element[:src]
    src.sub(/il_\d+x\d+/, "il_570xN").sub(%r{\A//}, "http://")
  end

  # @return [String] absolute URL that #scrape requests
  def url
    "#{BASE_URL}#{path}"
  end

  # Rebuilds the configured path so that it always orders by date and always
  # requests page 1.
  #
  # Existing `order` and `page` parameters are dropped wherever they occur in
  # the query string (including the first position, directly after "?"), and
  # the forced values are appended after the remaining parameters. The
  # remaining parameters are passed through verbatim, without re-encoding.
  #
  # @return [String] path with the rewritten query string
  def path
    base, query = @path.split("?", 2)

    kept = query.to_s.split("&").reject do |param|
      param.empty? || FORCED_PARAMS.key?(param.split("=", 2).first)
    end
    forced = FORCED_PARAMS.map { |key, value| "#{key}=#{value}" }

    "#{base}?#{(kept + forced).join('&')}"
  end
end