web crawling/processing draft

commit 190f5d562bc999cc99ec9639eafb681e29138dc1 1 parent 4e0836e
authored by @rkononov
ruby_ng/web_crawler/README.md
@@ -0,0 +1,17 @@
+# WebCrawler Worker
+
+This is an example of a web crawler that collects all the links on a given site and follows them (recursively queuing new workers where possible) to find further links, limited to a maximum depth and to the given domain.
+
+## Getting Started
+
+### Configure the crawler
+- url = 'http://sample.com' # URL of the domain you want to crawl
+- page_limit = 1000 # maximum number of links to collect
+- depth = 3 # maximum crawl depth
+- max_workers = 2 # maximum number of concurrent workers; workers are fully recursive, so where possible a worker queues another worker
+- iw_token = your IronWorker token
+- iw_project_id = your IronWorker project id
+
+### Start crawler
+- upload the crawler: `iron_worker upload web_spider`
+- run it: `ruby run_crawler.rb` (a rough sketch of this script follows below)
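
run_crawler.rb is not part of this commit (it arrives in the follow-up), but as a rough, hypothetical sketch it only needs to queue the first WebCrawler task with the parameters listed above via the IronWorkerNG client (the token and project id placeholders are assumptions):

```ruby
# Hypothetical run_crawler.rb sketch -- the real script arrived in a later commit.
# It simply queues the first WebCrawler task with the configuration described above.
require 'iron_worker_ng'

token      = 'YOUR_IRON_TOKEN'       # iw_token
project_id = 'YOUR_IRON_PROJECT_ID'  # iw_project_id

client = IronWorkerNG::Client.new(:token => token, :project_id => project_id)

client.tasks.create('WebCrawler',
                    :url           => 'http://sample.com',
                    :page_limit    => 1000,
                    :depth         => 3,
                    :max_workers   => 2,
                    :iw_token      => token,
                    :iw_project_id => project_id)
```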
ruby_ng/web_crawler/url_utils.rb
@@ -0,0 +1,55 @@
+# Helpers for resolving and comparing the URLs collected by the crawler.
+module UrlUtils
+  def relative?(url)
+    !url.match(/^http/)
+  end
+
+  def make_absolute(potential_base, relative_url)
+    if relative_url.match(/^\//)
+      create_absolute_url_from_base(potential_base, relative_url)
+    else
+      create_absolute_url_from_context(potential_base, relative_url)
+    end
+  end
+
+  def urls_on_same_domain?(url1, url2)
+    get_domain(url1) == get_domain(url2)
+  end
+
+  def get_domain(url)
+    remove_extra_paths(url)
+  end
+
+  private
+
+  # Root-relative URL: append it to the scheme and host of the base.
+  def create_absolute_url_from_base(potential_base, relative_url)
+    remove_extra_paths(potential_base) + relative_url
+  end
+
+  # Strip everything after the first slash that follows the scheme,
+  # e.g. 'http://sample.com/a/b' becomes 'http://sample.com'.
+  def remove_extra_paths(potential_base)
+    index_to_start_slash_search = potential_base.index('://') + 3
+    index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
+    if index_of_first_relevant_slash != nil
+      return potential_base[0, index_of_first_relevant_slash]
+    end
+    potential_base
+  end
+
+  # Document-relative URL: resolve it against the directory of the base URL.
+  def create_absolute_url_from_context(potential_base, relative_url)
+    if potential_base.match(/\/$/)
+      absolute_url = potential_base + relative_url
+    else
+      last_index_of_slash = potential_base.rindex('/')
+      if potential_base[last_index_of_slash - 2, 2] == ':/'
+        absolute_url = potential_base + '/' + relative_url
+      else
+        last_index_of_dot = potential_base.rindex('.')
+        if last_index_of_dot < last_index_of_slash
+          absolute_url = potential_base + '/' + relative_url
+        else
+          absolute_url = potential_base[0, last_index_of_slash + 1] + relative_url
+        end
+      end
+    end
+    absolute_url
+  end
+end
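
For reference, this is how the helpers above behave on a few made-up URLs (illustrative values only, not part of the commit):

```ruby
# Illustrative check of the UrlUtils helpers (example URLs are assumptions).
load 'url_utils.rb'
include UrlUtils

relative?('about.html')                                    #=> true
relative?('http://sample.com/about.html')                  #=> false
make_absolute('http://sample.com/docs/', 'intro.html')     #=> "http://sample.com/docs/intro.html"
make_absolute('http://sample.com/docs/index.html', '/faq') #=> "http://sample.com/faq"
urls_on_same_domain?('http://sample.com/a', 'http://sample.com/b') #=> true
```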
ruby_ng/web_crawler/web_spider.rb
@@ -0,0 +1,122 @@
+require 'net/http'
+require 'uri'
+require 'open-uri'
+require 'cgi'
+require 'hpricot'
+require 'iron_worker_ng'
+require 'iron_cache'
+
+load 'url_utils.rb'
+
+include UrlUtils
+
+# Mark a page as processed in IronCache so other workers skip it.
+def process_page(url)
+  @icache.items.put(CGI::escape(url), "processed")
+  puts "Processing page #{url}"
+end
+
+def crawl_domain(url, depth)
+  url_object = open_url(url)
+  return if url_object == nil
+  parsed_url = parse_url(url_object)
+  return if parsed_url == nil
+
+  puts "Scanning URL:#{url}"
+  page_urls = find_urls_on_page(parsed_url, url)
+  puts "FOUND links:#{page_urls.count}"
+
+  page_urls.each_with_index do |page_url, index|
+    if urls_on_same_domain?(url, page_url)
+      pages_count_item = @icache.items.get('pages_count')
+      pages_count = pages_count_item ? pages_count_item.value : 0
+      puts "Pages scanned:#{pages_count}"
+      puts "Page url #{page_url},index:#{index}"
+      @icache.items.put('pages_count', pages_count + 1)
+      return if pages_count >= params['page_limit']
+
+      puts "current depth:#{depth}"
+      page_from_cache = @icache.items.get(CGI::escape(page_url))
+      if page_from_cache.nil?
+        process_page(page_url)
+        queue_worker(depth, page_url) if depth > 1
+      else
+        puts "Link #{page_url} already processed, bypassing"
+        #page_from_cache.delete
+      end
+    end
+  end
+end
+
+# Queue another WebCrawler task if we are still under max_workers,
+# otherwise keep crawling recursively inside the current worker.
+def queue_worker(depth, page_url)
+  workers_count = @icache.items.get('workers_count')
+  count = workers_count ? workers_count.value : 0
+  puts "Number of workers:#{count}"
+
+  if count < params['max_workers'] - 1
+    @icache.items.put('workers_count', count + 1)
+    p = {:url => page_url,
+         :page_limit => params["page_limit"],
+         :depth => depth - 1,
+         :max_workers => params["max_workers"],
+         :iw_token => params["iw_token"],
+         :iw_project_id => params["iw_project_id"]}
+    @client.tasks.create("WebCrawler", p)
+  else
+    crawl_domain(page_url, depth - 1)
+  end
+end
+
+private
+
+def open_url(url)
+  url_object = nil
+  begin
+    url_object = open(url)
+  rescue
+    puts "Unable to open url: " + url
+  end
+  url_object
+end
+
+def update_url_if_redirected(url, url_object)
+  if url != url_object.base_uri.to_s
+    return url_object.base_uri.to_s
+  end
+  url
+end
+
+def parse_url(url_object)
+  doc = nil
+  begin
+    doc = Hpricot(url_object)
+  rescue
+    puts 'Could not parse url: ' + url_object.base_uri.to_s
+  end
+  puts 'Crawling url ' + url_object.base_uri.to_s
+  doc
+end
+
+def find_urls_on_page(parsed_url, current_url)
+  urls_list = []
+  begin
+    parsed_url.search('a[@href]').map do |x|
+      new_url = x['href'].split('#')[0]
+      unless new_url == nil
+        if relative?(new_url)
+          new_url = make_absolute(current_url, new_url)
+        end
+        urls_list.push(new_url)
+      end
+    end
+  rescue
+    puts "could not find links"
+  end
+  urls_list
+end
+
+@icache = IronCache::Client.new({"token" => params['iw_token'], "project_id" => params['iw_project_id']})
+@client = IronWorkerNG::Client.new(:token => params['iw_token'], :project_id => params['iw_project_id'])
+
+crawl_domain(params['url'], params['depth'] || 1)
+
+# This task is finishing: decrement the active-worker counter.
+workers_count = @icache.items.get('workers_count')
+count = workers_count ? workers_count.value : 0
+@icache.items.put('workers_count', count - 1) if count > 0
ruby_ng/web_crawler/web_spider.worker
@@ -0,0 +1,7 @@
+gem 'hpricot'
+gem 'iron_worker_ng'
+gem 'iron_cache'
+file 'url_utils.rb'
+runtime "ruby"
+exec "web_spider.rb"
+name 'WebCrawler'

3 comments on commit 190f5d5

@frommww
Owner
  • The web_spider name is a bit weird to me; I had to think about it for a bit. Any reason we don't use web_crawler?

  • The return if in crawl_domain forces you to concentrate to follow the logic. Is there another way to do this?

  • Could use some line spacing around blocks within the procedures to make them easier to scan.

  • @icache and @client should probably be closer in naming, maybe?

  • Where's run_crawler?

  • Can we put the page in IronCache, then put the item id onto MQ and have those be processed?

  • Want to include comments about using multiple queues and having multiple workers process a page (look for images, look for products).

  • We could point the web_crawler at meetup.com/sfrails and have the page processor grab the image.

@rkononov
Collaborator

OK, thanks, will refactor.

@rkononov
Collaborator
  • Renamed to 'web crawler' and 'page processor'.
  • I don't see another way to do this; we could ask @manveru to review.
  • Added (actually removed all the secondary stuff in url_utils).
  • Renamed.
  • It was a draft commit; run_crawler was in the second one. Please use this URL to review: https://github.com/iron-io/iron_worker_examples/pull/51/files
  • Done.
  • Made separate workers: one crawler and one page processor. They use iron_cache and iron_mq (right now a single iron_mq queue; I don't see why we need more than one). A rough sketch of that split follows below.
  • Done.
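
As a rough illustration of that crawler/page-processor split (not the actual code from PR #51; the queue name and payload shape are assumptions), the crawler could push the IronCache key of each fetched page onto a single IronMQ queue, and the page processor could pop and handle it:

```ruby
# Hypothetical sketch of the crawler/page-processor split -- not the code from PR #51.
# Assumes the standard iron_mq and iron_cache Ruby clients and a single queue named 'pages'.
require 'iron_mq'
require 'iron_cache'
require 'cgi'

ironmq  = IronMQ::Client.new(:token => 'YOUR_IRON_TOKEN', :project_id => 'YOUR_IRON_PROJECT_ID')
icache  = IronCache::Client.new({"token" => 'YOUR_IRON_TOKEN', "project_id" => 'YOUR_IRON_PROJECT_ID'})
pages_q = ironmq.queue('pages')

# Crawler side: after caching the page under its escaped URL, enqueue the cache key
# so a page processor can pick it up.
pages_q.post(CGI::escape('http://sample.com/some/page'))

# Page processor side: pop one message, look the page up in IronCache,
# process it (extract images, products, etc.), then delete the message.
msg = pages_q.get
if msg
  cached = icache.items.get(msg.body)
  # ... extract images, products, etc. from cached.value ...
  msg.delete
end
```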