Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fetchers, scrapers, and html data in bzip2ed format
- Loading branch information
Ted Han
committed
Mar 28, 2010
0 parents
commit 8057036
Showing
16 changed files
with
265,535 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
require 'rubygems'
require 'fileutils'

# Downloads the "articles" page of every game listed in sorted_pages.txt.
#
# Each line of sorted_pages.txt (read from the current working directory) is
# expected to contain a URL of the form
#   http://www.gamerankings.com/<system>/<id>/...
# The page is saved as <script dir>/../html/articles/<system>/<id>.html and
# wget's own output is appended to article_page.log in the working directory.
here = File.dirname(__FILE__)
game_url = /http:\/\/www\.gamerankings\.com\/([^\/]+)\/(\d+)/

# Block form so the list file is closed even if something below raises.
File.open("sorted_pages.txt") do |page_list|
  page_list.each_line do |line|
    link = line.strip
    next if link.empty?

    matched = link.match(game_url)
    unless matched
      # Without a system and an id there is no sensible output path.
      warn "Skipping unrecognised URL: #{link}"
      next
    end
    # Named `platform` rather than `system` so Kernel#system is not shadowed.
    platform, id = matched[1], matched[2]

    target_dir = "#{here}/../html/articles/#{platform}"
    # mkdir_p is a no-op when the directory already exists.
    FileUtils.mkdir_p(target_dir)

    # Argument-list form bypasses the shell, so characters such as & or ?
    # inside a URL cannot be interpreted as shell syntax.
    Kernel.system(
      "wget",
      "--output-document=#{target_dir}/#{id}.html",
      "--append-output=article_page.log",
      link.sub(/\/index\.html\z/, "/articles.html")
    )
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Download every URL listed in index_list.txt into the current directory.
wget --input-file=index_list.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
require 'rubygems'
require 'fileutils'

# Downloads the main page of every game listed in ../url_lists/sorted_pages.txt.
#
# Each line of the list is expected to contain a URL of the form
#   http://www.gamerankings.com/<system>/<id>/...
# The page is saved as <script dir>/../html/main/<system>/<id>.html and wget's
# own output is appended to ../url_lists/main_page.log.
here = File.dirname(__FILE__)
game_url = /http:\/\/www\.gamerankings\.com\/([^\/]+)\/(\d+)/
log_file = "#{here}/../url_lists/main_page.log"

# Block form so the list file is closed even if something below raises.
File.open("#{here}/../url_lists/sorted_pages.txt") do |page_list|
  page_list.each_line do |line|
    link = line.strip
    next if link.empty?

    matched = link.match(game_url)
    unless matched
      # Without a system and an id there is no sensible output path.
      warn "Skipping unrecognised URL: #{link}"
      next
    end
    # Named `platform` rather than `system` so Kernel#system is not shadowed.
    platform, id = matched[1], matched[2]

    # One path is used both for creating the directory and for the download
    # target, so wget always writes into a directory that exists.
    target_dir = "#{here}/../html/main/#{platform}"
    FileUtils.mkdir_p(target_dir)

    # Argument-list form bypasses the shell, so characters such as & or ?
    # inside a URL cannot be interpreted as shell syntax.
    Kernel.system(
      "wget",
      "--output-document=#{target_dir}/#{id}.html",
      "--append-output=#{log_file}",
      link
    )
  end
end
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# DataMapper resource holding one record per game. | ||
class Game | ||
include DataMapper::Resource | ||
|
||
# Serial primary key. | ||
property :id, Serial | ||
# Game title; the String length limit is raised to 255. | ||
property :name, String, :length => 255 | ||
property :release_date, Date | ||
# Declared as a String rather than a numeric type. | ||
property :overall_rank, String | ||
property :ratio, Float | ||
|
||
# A game has many reviews, and reaches its review outlets through them. | ||
has n, :reviews | ||
has n, :review_outlets, :through => :reviews | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
require 'rubygems'
require 'dm-core'
require 'dm-validations'

# Loads the Game, Review and ReviewOutlet models that sit next to this file,
# then points DataMapper's default repository at the local MySQL database.
#
# The directory is made absolute so the requires below resolve no matter what
# the current working directory is; a bare relative path such as
# "lib/game.rb" is only found when "." happens to be on $LOAD_PATH.
here = File.expand_path(File.dirname(__FILE__))
require here + "/game.rb"
require here + "/review.rb"
require here + "/review_outlet.rb"

DataMapper.setup(:default, "mysql://localhost/gamerankings")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# DataMapper resource holding one review of a game by a review outlet. | ||
class Review | ||
include DataMapper::Resource | ||
|
||
# game_id and review_outlet_id are both flagged as keys, so together they | ||
# form a composite primary key. | ||
property :game_id, Integer, :key => true | ||
property :review_outlet_id, Integer, :key => true | ||
property :date, Date | ||
# rating and scale are kept as Strings; ratio is the only numeric score. | ||
property :rating, String | ||
property :scale, String | ||
property :ratio, Float | ||
# Defaults to true when no value is supplied. | ||
property :discounted, Boolean, :default => true | ||
property :system, String | ||
|
||
# Each review belongs to exactly one outlet and one game. | ||
belongs_to :review_outlet | ||
belongs_to :game | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# DataMapper resource holding one record per review outlet. | ||
class ReviewOutlet | ||
include DataMapper::Resource | ||
|
||
# Serial primary key. | ||
property :id, Serial | ||
# The name is mandatory. | ||
property :name, String, :required => true | ||
|
||
# An outlet has many reviews, and reaches the reviewed games through them. | ||
has n, :reviews | ||
has n, :games, :through => :reviews | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
require 'nokogiri'
require 'open-uri'

# Builds page_list.txt (in the current working directory) from the saved index
# pages: every link found inside the content table of each file under
# <script dir>/html/indexes is written out as an absolute gamerankings.com URL,
# one per line.
url_base = "http://www.gamerankings.com"

here = File.dirname(__FILE__)
index_home = "#{here}/html/indexes"
# Dir.entries returns a plain array and leaves no directory handle open.
# Listing the inputs first also means a missing index directory fails before
# page_list.txt is truncated.
indexes = Dir.entries(index_home).reject { |d| d =~ /^\.\.?$/ }

# Block form guarantees the output is flushed and closed, even on an error.
File.open("page_list.txt", "w") do |page_file|
  indexes.each do |filename|
    page = File.open("#{index_home}/#{filename}") { |f| Nokogiri::HTML(f) }
    page.css("div#content div.body table a").each do |link|
      href = link['href']
      # Anchors without an href carry no page to fetch.
      next unless href
      page_file << "#{url_base}#{href}\n"
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
require 'rubygems'
require 'nokogiri'
require 'open-uri'
# Absolute directory, joined with an explicit "/" so the library path is
# ".../lib/gamerankings" and resolves from any working directory.
here = File.expand_path(File.dirname(__FILE__))
require here + '/lib/gamerankings'

# Parses every saved "articles" page under <script dir>/html/articles/<system>/
# and stores one Review per table row, creating ReviewOutlet records on demand.
# The page's file name is expected to start with the numeric game id.

# Rules for expanding a two-digit year, tried in order. Each entry is
# [pattern, callable turning the two-digit year into a four-digit year].
YEAR_RULES = [
  [/^(\d{2}\/\d{2}\/)([10]\d)/, lambda { |yy| "20#{yy}" }],
  [/^(\d{2}\/\d{2}\/)([89]\d)/, lambda { |yy| "19#{yy}" }],
  # There's one bad record for Indiana Jones and the Emperor's Tomb for Xbox
  # which has its date set to the year 36. It's either a typo or someone
  # having a bit too much fun with the exploits of Indiana Jones.
  [/^(\d{2}\/\d{2}\/)(36)/,     lambda { |_yy| "2003" }]
]

# Expands the two-digit year of an mm/dd/yy date string to four digits.
# Only the matched date prefix is rewritten; surrounding text is kept.
# Raises RuntimeError when the text matches none of the known year ranges.
def expand_review_year(text)
  YEAR_RULES.each do |pattern, expand|
    matched = pattern.match(text)
    next unless matched
    return matched.pre_match + matched[1] + expand.call(matched[2]) + matched.post_match
  end
  raise "DATE FAIL on #{text}"
end

# Splits a rating cell into [rating, scale].
# "8.5 out of 10" yields ["8.5", "10"]; a letter grade such as "B+" yields
# ["B+", "Letter Grade"].
# Raises ArgumentError for a malformed "out of" rating and StandardError when
# the text is neither form.
def parse_rating(rating_string)
  if rating_string =~ /out of/
    matched = rating_string.match(/^([\d.]+) out of (\d+)$/)
    raise ArgumentError, "Rating or Scale didn't exist in '#{rating_string}'" unless matched
    [matched[1], matched[2]]
  elsif (grade = rating_string[/[A-Fa-f][+-]?/])
    [grade, "Letter Grade"]
  else
    raise StandardError, "can't parse '#{rating_string}' into a rating"
  end
end

# auto_migrate! rebuilds the tables, so every run starts from empty
# Review and ReviewOutlet tables.
Review.auto_migrate!
ReviewOutlet.auto_migrate!

systems_path = "#{here}/html/articles"
platforms = Dir.entries(systems_path).reject { |dir| dir =~ /^\.\.?$/ }

platforms.each do |platform|
  pages = Dir.entries("#{systems_path}/#{platform}").reject { |dir| dir =~ /^\.\.?$/ }
  pages.each do |page_file|
    puts page_file
    path = "#{systems_path}/#{platform}/#{page_file}"
    page = File.open(path) { |f| Nokogiri::HTML(f) }
    game_id = page_file.match(/^\d+/).to_s

    table = page.css("div#main_col table.release").first
    raise "No release table found in #{path}" unless table
    # Header rows carry class="head"; everything else is a review.
    rows = table.css("tr").reject { |tr| tr["class"] == "head" }

    rows.each do |row|
      site_node, date_node, rating_node, ratio_node = row.css("td")

      id = site_node.css("a").first["href"].match(/\/sites\/(\d+)/).to_a.last
      outlet = site_node.css("a").text.strip
      # A trailing asterisk after the outlet name marks the review as discounted.
      discounted = site_node.text.strip =~ /\*$/ ? true : false
      date = expand_review_year(date_node.text.strip)
      rating, scale = parse_rating(rating_node.text.strip)
      ratio = ratio_node.text.strip.sub(/%/, "")

      unless ReviewOutlet.first(:id => id, :name => outlet)
        puts "Couldn't find #{outlet}, creating record"
        ReviewOutlet.create(:id => id, :name => outlet)
      end

      if review = Review.first(:review_outlet_id => id, :game_id => game_id)
        puts review
      else
        # NOTE(review): the platform directory name is not stored on the
        # review; confirm whether it should populate a column on Review.
        review = Review.new(
          :review_outlet_id => id,
          :game_id          => game_id,
          :date             => date,
          :rating           => rating,
          :scale            => scale,
          :ratio            => ratio,
          :discounted       => discounted
        )
        raise review.errors.inspect unless review.save
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
require 'rubygems'
require 'date'
require 'nokogiri'
require 'open-uri'
# Absolute directory, joined with an explicit "/" so the library path is
# ".../lib/gamerankings" and resolves from any working directory.
here = File.expand_path(File.dirname(__FILE__))
require here + '/lib/gamerankings'

# Parses every saved main page under <script dir>/html/main/<system>/ and
# stores one Game per page. The first run of digits in the file name is used
# as the game id.
systems_path = "#{here}/html/main"
platforms = Dir.entries(systems_path).reject { |dir| dir =~ /^\.\.?$/ }

platforms.each do |platform|
  pages = Dir.entries("#{systems_path}/#{platform}").reject { |dir| dir =~ /^\.\.?$/ }
  pages.each do |page_file|
    page = File.open("#{systems_path}/#{platform}/#{page_file}") { |f| Nokogiri::HTML(f) }

    # The first pod's table holds the overall percentage and the rank details.
    pods = page.css("div#main_col div.content2 > div.pod")
    percent_node, rank_node = pods[0].css("div.body > table > tr > td")

    title = page.css("h1").first.text

    # Turn the rank cell into [label, value] pairs, e.g. ["Overall Rank", "12"].
    rank_info = rank_node.text.split("\n").map do |line|
      line.gsub(/\s+/, " ").split(/:/).map { |part| part.strip }
    end
    rank_pair = rank_info.assoc("Overall Rank")
    # Left nil when the page lists no overall rank.
    overall_rank = rank_pair && rank_pair.last
    percent = percent_node.text.sub("%", '').to_f

    release_match = page.text.match(/Release Date:([^\n]+)/)
    release_string = release_match && release_match[1].strip
    release_date = nil
    if release_string
      begin
        release_date = Date.parse(release_string)
      rescue ArgumentError
        puts "Couldn't parse release date #{release_string}"
      end
    else
      puts "Couldn't find a release date in #{page_file}"
    end

    game = Game.new(
      :id           => page_file.match(/\d+/).to_s,
      :name         => title,
      :ratio        => percent,
      :release_date => release_date,
      :overall_rank => overall_rank
    )

    raise game.errors.inspect unless game.save
  end
end
Oops, something went wrong.