Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

moved from hpricot to nokogiri, refactored a bit

  • Loading branch information...
commit 43806265613a6e8c505ee2dffa15f5b7a1e8e1ab 1 parent b5ac561
@rkononov rkononov authored
View
26 ruby_ng/web_crawler/page_processor.rb
@@ -1,14 +1,17 @@
-require 'net/http'
-require 'uri'
require 'open-uri'
-require 'hpricot'
+require 'nokogiri'
require 'iron_cache'
require 'iron_mq'
+def make_absolute( href, root )
+ return unless href
+ puts "Making absolute:#{href} with root:#{root}"
+ URI.parse(root).merge(URI.parse(href)).to_s rescue nil
+end
def process_images(doc)
#get all images
- images = doc/"img"
+ images = doc.css("img")
#get image with highest height on page
largest_image = doc.search("img").sort_by { |img| img["height"].to_i }[-1]
largest_image = largest_image ? largest_image['src'] : 'none'
@@ -18,7 +21,7 @@ def process_images(doc)
def process_links(doc)
#get all links
- links = doc/"a"
+ links = doc.css("a")
end
def process_css(doc)
@@ -27,12 +30,12 @@ def process_css(doc)
end
def process_words(doc)
- #converting to plain text
- text = doc.to_plain_text
+ #converting to plain text and removing tags
+ text = doc.text
#splitting by words
words = text.split(/[^a-zA-Z]/)
#removing empty strings
- words.delete_if{|e| e==""}
+ words.delete_if{|e| e.empty?}
#creating hash
freqs = Hash.new(0)
#calculating stats
@@ -42,14 +45,15 @@ def process_words(doc)
def process_page(url)
puts "Processing page #{url}"
- doc = Hpricot(open(url))
+ doc = Nokogiri(open(url))
images, largest_image, list_of_images = process_images(doc)
- links = process_links(doc)
+ #processing links and making them absolute
+ links = process_links(doc).map{|link| make_absolute( link['href'], url )}.compact
css = process_css(doc)
words_stat = process_words(doc)
puts "Number of images on page:#{images.count}"
puts "Number of css on page:#{css.count}"
- puts "Number of links on page:#{images.count}"
+ puts "Number of links on page:#{links.count}"
puts "Largest image on page:#{largest_image}"
puts "Words frequency:#{words_stat.inspect}"
#putting all in cache
View
1  ruby_ng/web_crawler/page_processor.worker
@@ -1,4 +1,3 @@
-gem 'hpricot'
gem 'iron_worker_ng'
gem 'iron_cache'
gem 'iron_mq'
View
4 ruby_ng/web_crawler/run_crawler.rb
@@ -19,4 +19,6 @@ def params
cache = IronCache::Client.new({"token" => params['iw_token'], "project_id" => params['iw_project_id']})
cache.items.put('pages_count', 0)
#launching worker
-ng_client.tasks.create("WebCrawler", params)
+puts "Launching crawler"
+ng_client.tasks.create("WebCrawler", params)
+puts "Crawler launched! now open http://hud.iron.io"
View
2  ruby_ng/web_crawler/url_utils.rb
@@ -72,7 +72,7 @@ def update_url_if_redirected(url, url_object)
def parse_url(url_object)
doc = nil
begin
- doc = Hpricot(url_object)
+ doc = Nokogiri::HTML(url_object)
rescue
puts 'Could not parse url: ' + url_object.base_uri.to_s
end
View
4 ruby_ng/web_crawler/web_crawler.rb
@@ -1,7 +1,5 @@
-require 'net/http'
-require 'uri'
require 'open-uri'
-require 'hpricot'
+require 'nokogiri'
require 'iron_worker_ng'
require 'iron_cache'
require 'iron_mq'
View
1  ruby_ng/web_crawler/web_crawler.worker
@@ -1,4 +1,3 @@
-gem 'hpricot'
gem 'iron_worker_ng'
gem 'iron_cache'
gem 'iron_mq'
Please sign in to comment.
Something went wrong with that request. Please try again.