This repository has been archived by the owner on Jun 25, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
html_page_data.rb
159 lines (138 loc) · 4.92 KB
/
html_page_data.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
require 'cgi'
require 'iconv'
require 'net/http'
require 'net/https'
require 'nokogiri'
require 'helpers'
require 'http_encoding_helper'
# Raised when a fetched resource violates the page-data constraints
# (unacceptable Content-Type or Content-Length).
HTMLPageDataError = Class.new(StandardError)
# Fetches a remote HTML page (following redirects and meta-refresh bounce
# pages) and lazily extracts common metadata: title, keywords, description
# and image sources. Character-set detection comes from the HTTP headers or,
# failing that, a <meta http-equiv=Content-Type> tag.
class HTMLPageData
  # Browser-like request headers so servers that sniff User-Agent still respond.
  DefaultHeaders = {'User-Agent'=>'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
  'Accept-Language'=> 'en-us,en;q=0.5',
  'Accept-Charset'=> 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  'Connection' => 'close'}
  # Refuse response bodies larger than 1 MiB.
  ContentLengthLimit = 1048576
  ContentTypes = ['text/html','application/xhtml+xml']

  # Convenience constructor: builds the instance and eagerly fetches the page.
  # Returns the HTMLPageData instance.
  def self.get(url, headers = {})
    p = self.new(url, headers)
    p.response
    p
  end

  # url     - URL string; "http://" is prepended when no scheme is present.
  # headers - extra HTTP headers merged over DefaultHeaders.
  # Raises ArgumentError when the URL cannot be made absolute.
  def initialize(url, headers = {})
    @url = URI.parse(url)
    @url = URI.parse "http://#{url}" if @url.scheme.nil?
    raise ArgumentError, 'The url has to be absolute' unless @url.absolute?
    init_headers(headers)
  end

  def host
    @url.host
  end

  # Page title from <title> or <meta name=title>; when several match, the last
  # node wins. Returns "" when no title can be found.
  def title
    if @title.nil? && document
      document.css("title", "meta[name=title]").each do |n|
        content = n.get_attribute("content")
        @title = content.nil? ? clean_text(n.inner_html) : clean_text(content)
      end
    end
    @title ||= ""
  end

  # Array of cleaned keywords from <meta name=keywords>; [] when absent.
  def keywords
    if @keywords.nil? && document
      document.css("meta[name=keywords]").each do |n|
        content = n.get_attribute("content")
        @keywords = content.blank? ? [] : content.split(",").collect { |k| clean_text(k.strip) }
      end
    end
    @keywords ||= []
  end

  # Cleaned <meta name=description> content; "" when absent.
  def description
    if @description.nil? && document
      document.css("meta[name=description]").each do |n|
        content = n.get_attribute("content")
        @description = content.blank? ? "" : clean_text(content.strip)
      end
    end
    @description ||= ""
  end

  # Unique list of absolute image URLs referenced by <img> tags; relative
  # src values are resolved against the page URL.
  def image_sources
    if @images.nil? && document
      @images = []
      document.css("img").each do |n|
        src = n.get_attribute("src")
        unless src.blank?
          img_src = URI.parse(src)
          img_src = @url.merge(img_src).to_s unless img_src.absolute?
          @images << img_src.to_s
        end
      end
      @images = @images.uniq
    end
    @images
  end

  # Memoized Net::HTTPResponse for the page.
  def response
    @response ||= fetch(@url.to_s)
  end

  # Memoized Nokogiri document parsed from the (decoded) response body.
  def document
    @document = Nokogiri::HTML(response.plain_body) if @document.nil? && response
    @document
  end

  # Upper-cased charset name, from the Content-Type header's parameters or,
  # failing that, a <meta http-equiv=Content-Type> tag. nil when undetectable.
  def document_encoding
    return @document_encoding if @document_encoding
    response.type_params.each_pair do |k, v|
      @document_encoding = v.upcase if k =~ /charset/i
    end
    unless @document_encoding
      document.css("meta[http-equiv=Content-Type]").each do |n|
        attr = n.get_attribute("content")
        # [a-z0-9\-_] (not 1-9) so charsets containing '0' (e.g. windows-1250)
        # are captured in full; guard against content without a charset param.
        charset = attr && attr.slice(/charset=[a-z0-9\-_]+/i)
        @document_encoding = charset.split("=")[1].upcase if charset
      end
    end
    @document_encoding
  end

  ######
  private
  ######

  # Performs the GET request for uri_str, following up to +limit+ redirects.
  # Raises ArgumentError when the redirect chain is too deep and
  # HTMLPageDataError when the Content-Type or Content-Length is unacceptable.
  def fetch(uri_str, limit = 10)
    raise ArgumentError, 'HTTP redirect too deep' if limit == 0
    url = URI.parse(uri_str)
    response = nil
    http = Net::HTTP.new(url.host, url.port)
    # Redirects may point at https URLs; without this they were fetched in
    # plaintext against the TLS port and failed.
    http.use_ssl = (url.scheme == 'https')
    http.start do |conn|
      conn.request_get(url.request_uri, @headers) do |res|
        response = res
        if res.is_a?(Net::HTTPSuccess)
          raise HTMLPageDataError.new("Invalid Content-Type #{res['Content-Type']}") if !self.class::ContentTypes.include? res.content_type
          raise HTMLPageDataError.new("Invalid Content-Length") if res['Content-Length'] && res['Content-Length'].to_i > self.class::ContentLengthLimit
          # Stream the body now, while the connection is still open.
          res.read_body
        end
      end
    end
    case response
    when Net::HTTPSuccess
      handle_special_cases(response, limit)
    when Net::HTTPRedirection
      location_uri = URI.parse(response['location'])
      if location_uri.absolute?
        new_uri = location_uri
      else
        # Relative Location header: resolve it against the original URL.
        new_uri = @url.clone
        new_uri.path = location_uri.path.start_with?('/') ? location_uri.path : "/#{location_uri.path}"
      end
      fetch(new_uri.to_s, limit - 1)
    else
      response.error!
    end
  end

  # Merges caller headers over the browser-like defaults and restricts the
  # Accept header to (x)html, preferring other types only as a fallback.
  def init_headers(headers)
    #faking some header so as to act like a normal browser unless otherwise given
    @headers = self.class::DefaultHeaders.merge(headers)
    #enforece the html or xhtml types only
    #@headers['Accept'] = self.class::ContentTypes.join(",")
    @headers['Accept'] = "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8"
  end

  # Follows <noscript> meta-refresh bounce pages (as used by e.g. Yahoo and
  # Google image search) to the real target URL; passes other responses through.
  def handle_special_cases(response, limit)
    matches = response.plain_body.scan(/<noscript>.*<meta[^>]*HTTP-EQUIV=["']refresh["'][^>]*content=["']\d;url=([^"']+)["'][^>]*>/i)
    return response if matches.length == 0
    url = (matches.collect { |m| m[0] })[0]
    fetch(url, limit - 1)
  end

  # Unescapes HTML entities, strips CR/LF and surrounding whitespace, and
  # converts the text to UTF-8 when the document uses another encoding.
  def clean_text(html)
    if html
      html = CGI::unescapeHTML(html).gsub(/(\r|\n)/, "").strip
      # The converted string must be reassigned; previously the Iconv result
      # was discarded and the unconverted text returned.
      html = Iconv.conv('UTF-8', document_encoding, html) if document_encoding && document_encoding != "UTF-8"
    end
    html
  end
end