Skip to content

Commit

Permalink
Fetchers, scrapers, and html data in bzip2ed format
Browse files Browse the repository at this point in the history
  • Loading branch information
Ted Han committed Mar 28, 2010
0 parents commit 8057036
Show file tree
Hide file tree
Showing 16 changed files with 265,535 additions and 0 deletions.
8 changes: 8 additions & 0 deletions fetch/get_article_pages.rb
@@ -0,0 +1,8 @@
# Downloads the "articles" page of every game URL listed in sorted_pages.txt
# and stores it as html/articles/<system>/<id>.html (relative to the
# directory above this script).
require 'rubygems'
require 'fileutils'

here = File.dirname(__FILE__)
game_url = %r{http://www\.gamerankings\.com/([^/]+)/(\d+)}

# NOTE(review): the URL list and the wget log are resolved against the current
# working directory, whereas get_main_pages.rb uses ../url_lists/ -- confirm
# which location is intended.
File.foreach("sorted_pages.txt") do |line|
  link = line.strip

  # Named `platform` rather than `system` so the local does not shadow
  # Kernel#system, which is called below.
  _match, platform, id = link.match(game_url).to_a
  unless platform && id
    warn "Skipping unrecognised URL: #{link.inspect}" unless link.empty?
    next
  end

  target_dir = "#{here}/../html/articles/#{platform}"
  FileUtils.mkdir_p(target_dir) # no-op when the directory already exists

  # Argument-list form of system: no shell is involved, so nothing in the URL
  # can be interpreted as shell syntax.
  fetched = system(
    "wget",
    "--output-document=#{target_dir}/#{id}.html",
    "--append-output=article_page.log",
    link.sub(/\/index\.html$/, "/articles.html")
  )
  warn "wget failed for #{link}" unless fetched
end
1 change: 1 addition & 0 deletions fetch/get_indexes.sh
@@ -0,0 +1 @@
# Fetch every URL listed in index_list.txt.
wget --input-file=index_list.txt
8 changes: 8 additions & 0 deletions fetch/get_main_pages.rb
@@ -0,0 +1,8 @@
# Downloads the main page of every game URL listed in
# ../url_lists/sorted_pages.txt and stores it as
# ../html/main/<system>/<id>.html (paths relative to this script).
require 'rubygems'
require 'fileutils'

here = File.dirname(__FILE__)
game_url = %r{http://www\.gamerankings\.com/([^/]+)/(\d+)}

File.foreach("#{here}/../url_lists/sorted_pages.txt") do |line|
  link = line.strip

  # Named `platform` rather than `system` so the local does not shadow
  # Kernel#system, which is called below.
  _match, platform, id = link.match(game_url).to_a
  unless platform && id
    warn "Skipping unrecognised URL: #{link.inspect}" unless link.empty?
    next
  end

  # One variable for both the mkdir and the wget target, so the download
  # always lands in the directory that was just created.
  target_dir = "#{here}/../html/main/#{platform}"
  FileUtils.mkdir_p(target_dir) # no-op when the directory already exists

  # Argument-list form of system: no shell is involved, so nothing in the URL
  # can be interpreted as shell syntax.
  fetched = system(
    "wget",
    "--output-document=#{target_dir}/#{id}.html",
    "--append-output=#{here}/../url_lists/main_page.log",
    link
  )
  warn "wget failed for #{link}" unless fetched
end
Binary file added html.bz2
Binary file not shown.
12 changes: 12 additions & 0 deletions lib/game.rb
@@ -0,0 +1,12 @@
# DataMapper resource for a single game.
#
# A game owns many reviews and, through them, is linked to the outlets that
# published those reviews.
class Game
  include DataMapper::Resource

  property :id,           Serial
  property :name,         String, :length => 255
  property :release_date, Date
  property :overall_rank, String # kept as text, not as a number
  property :ratio,        Float

  has n, :reviews
  has n, :review_outlets, :through => :reviews
end
9 changes: 9 additions & 0 deletions lib/gamerankings.rb
@@ -0,0 +1,9 @@
# Loads DataMapper, the three model files that live next to this file, and
# configures the default repository.
require 'rubygems'
require 'dm-core'
require 'dm-validations'

lib_dir = File.dirname(__FILE__)
%w[game review review_outlet].each do |model|
  require "#{lib_dir}/#{model}.rb"
end

DataMapper.setup(:default, "mysql://localhost/gamerankings")
15 changes: 15 additions & 0 deletions lib/review.rb
@@ -0,0 +1,15 @@
# DataMapper resource for one review of a game by one outlet.
#
# The key is composite: the pair (game_id, review_outlet_id).
class Review
  include DataMapper::Resource

  # Composite key columns.
  property :game_id,          Integer, :key => true
  property :review_outlet_id, Integer, :key => true

  # Review details.
  property :date,       Date
  property :rating,     String
  property :scale,      String
  property :ratio,      Float
  property :discounted, Boolean, :default => true
  property :system,     String

  belongs_to :review_outlet
  belongs_to :game
end
9 changes: 9 additions & 0 deletions lib/review_outlet.rb
@@ -0,0 +1,9 @@
# DataMapper resource for a publication that reviews games.
#
# An outlet owns many reviews and, through them, is linked to the games it
# has reviewed.
class ReviewOutlet
  include DataMapper::Resource

  property :id,   Serial
  property :name, String, :required => true # an outlet must have a name

  has n, :reviews
  has n, :games, :through => :reviews
end
16 changes: 16 additions & 0 deletions process/extract_urls_from_indexes.rb
@@ -0,0 +1,16 @@
# Collects the game links from every downloaded index page and writes them,
# prefixed with the site's base URL and one per line, to page_list.txt in the
# current working directory.
require 'rubygems'
require 'nokogiri'
require 'open-uri'

url_base = "http://www.gamerankings.com"

here = File.dirname(__FILE__)
# The html/ tree sits one level above the script directories, matching the
# "#{here}/../html/..." paths the fetch scripts write to.
index_home = "#{here}/../html/indexes"
index_files = Dir.entries(index_home).reject { |entry| entry =~ /^\.\.?$/ }

# Block forms of File.open close every handle, including on error.
File.open("page_list.txt", "w") do |page_file|
  index_files.each do |filename|
    page = File.open("#{index_home}/#{filename}") { |io| Nokogiri::HTML(io) }

    page.css("div#content div.body table a").each do |link|
      href = link['href']
      next unless href # anchors without an href carry no URL

      page_file << "#{url_base}#{href}\n"
    end
  end
end
75 changes: 75 additions & 0 deletions process/process_article_pages.rb
@@ -0,0 +1,75 @@
# Imports review data from the downloaded "articles" pages
# (html/articles/<system>/<game id>.html) into the Review and ReviewOutlet
# tables.
#
# WARNING: both tables are dropped and recreated on every run (auto_migrate!).
require 'rubygems'
require 'date'
require 'nokogiri'
require 'open-uri'

# Expanded so the require and the paths below do not depend on the directory
# the script is started from.
here = File.expand_path(File.dirname(__FILE__))
require "#{here}/../lib/gamerankings"

# Converts a review date written as MM/DD/YY into a Date.
#
# Two-digit years starting with 0 or 1 are read as 20xx, years starting with
# 8 or 9 as 19xx. Building the Date explicitly avoids handing an ambiguous
# "MM/DD/YYYY" string to Date.parse, which reads it as DD/MM/YYYY on
# Ruby 1.9 and later.
#
# @param text [String] stripped text of the date cell
# @return [Date]
# @raise [RuntimeError] when the text does not start with a recognised date
# @raise [ArgumentError] when month/day do not form a valid calendar date
def parse_review_date(text)
  match = text.match(%r{^(\d{2})/(\d{2})/(\d{2})})
  raise "DATE FAIL on #{text}" unless match

  month, day, short_year = match.captures
  year =
    case short_year
    when /^[10]/ then "20#{short_year}"
    when /^[89]/ then "19#{short_year}"
    # There's one bad record for Indiana Jones and the Emperor's Tomb for Xbox
    # which has its date set to the year 36. It's either a typo or someone
    # having a bit too much fun with the exploits of Indiana Jones.
    when "36" then "2003"
    else raise "DATE FAIL on #{text}"
    end

  Date.new(year.to_i, month.to_i, day.to_i)
end

# Splits the text of a rating cell into its rating and scale.
#
# "8.5 out of 10" gives ["8.5", "10"]; a letter grade such as "B+" gives
# ["B+", "Letter Grade"].
#
# @param rating_string [String] stripped text of the rating cell
# @return [Array(String, String)] rating and scale
# @raise [ArgumentError] when an "out of" rating lacks a number or a scale
# @raise [StandardError] when the text is neither numeric nor a letter grade
def parse_rating(rating_string)
  if rating_string =~ /out of/
    _all, rating, scale = rating_string.match(/^([\d.]+) out of (\d+)$/).to_a
    raise ArgumentError, "Rating or Scale didn't exist in '#{rating_string}'" unless rating && scale
    [rating, scale]
  elsif (grade = rating_string[/[A-Fa-f][+-]?/])
    [grade, "Letter Grade"]
  else
    raise StandardError, "can't parse '#{rating_string}' into a rating"
  end
end

Review.auto_migrate!
ReviewOutlet.auto_migrate!

# The html/ tree sits one level above the script directories.
systems_path = "#{here}/../html/articles"
systems = Dir.entries(systems_path).reject { |dir| dir =~ /^\.\.?$/ }

systems.each do |platform|
  pages = Dir.entries("#{systems_path}/#{platform}").reject { |dir| dir =~ /^\.\.?$/ }
  pages.each do |page_file|
    puts page_file
    # Block form closes the file as soon as the document has been parsed.
    page = File.open("#{systems_path}/#{platform}/#{page_file}") { |io| Nokogiri::HTML(io) }
    game_id = page_file.match(/^\d+/).to_s

    review_table = page.css("div#main_col table.release").first
    unless review_table
      warn "No review table in #{platform}/#{page_file}, skipping"
      next
    end

    # Drop the header row; every remaining row is one review.
    rows = review_table.css("tr").reject { |tr| tr["class"] == "head" }
    rows.each do |row|
      site_node, date_node, rating_node, ratio_node = row.css("td")

      site_links = site_node.css("a")
      id = site_links.first["href"].match(/\/sites\/(\d+)/).to_a.last
      outlet = site_links.text.strip
      # A trailing asterisk on the outlet cell sets the discounted flag.
      discounted = site_node.text.strip =~ /\*$/ ? true : false
      date = parse_review_date(date_node.text.strip)
      rating, scale = parse_rating(rating_node.text.strip)
      ratio = ratio_node.text.strip.sub(/%/, "")

      unless ReviewOutlet.first(:id => id, :name => outlet)
        puts "Couldn't find #{outlet}, creating record"
        ReviewOutlet.create(:id => id, :name => outlet)
      end

      if (review = Review.first(:review_outlet_id => id, :game_id => game_id))
        puts review
      else
        review = Review.new(
          :review_outlet_id => id,
          :game_id => game_id,
          :date => date,
          :rating => rating,
          :scale => scale,
          :ratio => ratio,
          :discounted => discounted,
          # Review declares a system column; fill it from the directory name.
          :system => platform
        )
        raise review.errors.inspect unless review.save
      end
    end
  end
end
44 changes: 44 additions & 0 deletions process/process_main_pages.rb
@@ -0,0 +1,44 @@
# Imports one Game record per downloaded main page
# (html/main/<system>/<game id>.html): title, score percentage, overall rank
# and release date.
require 'rubygems'
require 'date'
require 'nokogiri'
require 'open-uri'

# Expanded so the require and the paths below do not depend on the directory
# the script is started from.
here = File.expand_path(File.dirname(__FILE__))
require "#{here}/../lib/gamerankings"

# The html/ tree sits one level above the script directories.
systems_path = "#{here}/../html/main"
systems = Dir.entries(systems_path).reject { |dir| dir =~ /^\.\.?$/ }

systems.each do |platform|
  pages = Dir.entries("#{systems_path}/#{platform}").reject { |dir| dir =~ /^\.\.?$/ }
  pages.each do |page_file|
    # Block form closes the file as soon as the document has been parsed.
    page = File.open("#{systems_path}/#{platform}/#{page_file}") { |io| Nokogiri::HTML(io) }

    # The first pod holds two cells: the score percentage and the rank info.
    score_pod = page.css("div#main_col div.content2 > div.pod").first
    percent, rank_info = score_pod.css("div.body > table > tr > td") if score_pod
    unless percent && rank_info
      warn "No score/rank cells in #{platform}/#{page_file}, skipping"
      next
    end

    title = page.css("h1").first.text

    # Turn the "Label: value" lines of the rank cell into [label, value] pairs.
    rank_pairs = rank_info.text.split("\n").map do |line|
      line.gsub(/\s+/, " ").split(/:/).map { |part| part.strip }
    end
    rank_row = rank_pairs.assoc("Overall Rank")
    overall_rank = rank_row && rank_row.last # nil when the page lists no rank
    percent = percent.text.sub("%", '').to_f

    # A missing "Release Date:" yields "", which Date.parse rejects, so it
    # takes the same nil fallback as an unparseable date.
    release_string = page.text[/Release Date:([^\n]+)/, 1].to_s.strip
    begin
      release_date = Date.parse(release_string)
    rescue ArgumentError
      puts "Couldn't parse release date #{release_string}"
      release_date = nil
    end

    game = Game.new(
      :id => page_file.match(/\d+/).to_s,
      :name => title,
      :ratio => percent,
      :release_date => release_date,
      :overall_rank => overall_rank
    )

    raise game.errors.inspect unless game.save
  end
end

0 comments on commit 8057036

Please sign in to comment.