Permalink
Browse files

Fetchers, scrapers, and html data in bzip2ed format

  • Loading branch information...
0 parents commit 805703645aad74068ed6a986d4d3cdc90ee8ef89 Ted Han committed Mar 28, 2010
8 fetch/get_article_pages.rb
@@ -0,0 +1,8 @@
+require 'rubygems'
+here = File.dirname(__FILE__)
+page_list = File.open("sorted_pages.txt")
+page_list.each_line do |link|
+ junk, system, id = link.match(/http:\/\/www\.gamerankings\.com\/([^\/]+)\/(\d+)/).to_a
+ FileUtils.mkdir_p("#{here}/../html/articles/#{system}") unless File.exists?("#{here}/../html/articles/#{system}")
+ `wget --output-document=#{here}/../html/articles/#{system}/#{id}.html --append-output=article_page.log #{link.sub(/\/index.html$/,"/articles.html")}`
+end
1 fetch/get_indexes.sh
@@ -0,0 +1 @@
+wget -i index_list.txt
8 fetch/get_main_pages.rb
@@ -0,0 +1,8 @@
+require 'rubygems'
+here = File.dirname(__FILE__)
+page_list = File.open("#{here}/../url_lists/sorted_pages.txt")
+page_list.each_line do |link|
+ junk, system, id = link.match(/http:\/\/www\.gamerankings\.com\/([^\/]+)\/(\d+)/).to_a
+ FileUtils.mkdir_p("#{here}/../html/main/#{system}") unless File.exists?("./html/main/#{system}")
+ `wget --output-document=#{here}/html/main/#{system}/#{id}.html --append-output=#{here}/../url_lists/main_page.log #{link}`
+end
BIN html.bz2
Binary file not shown.
12 lib/game.rb
@@ -0,0 +1,12 @@
+class Game # DataMapper resource for one game; rows are built by process/process_main_pages.rb
+ include DataMapper::Resource
+
+ property :id, Serial # the loader sets this explicitly from the digits in the page file name
+ property :name, String, :length => 255 # text of the page's first <h1>
+ property :release_date, Date # nil when the loader cannot parse the scraped date
+ property :overall_rank, String # value scraped from the "Overall Rank" line, kept as text
+ property :ratio, Float # scraped percentage with the "%" removed
+
+ has n, :reviews
+ has n, :review_outlets, :through => :reviews # outlets reached through the reviews join
+end
9 lib/gamerankings.rb
@@ -0,0 +1,9 @@
+require 'rubygems'
+require 'dm-core'
+require 'dm-validations'
+here = File.dirname(__FILE__)
+require here + "/game.rb"
+require here + "/review.rb"
+require here + "/review_outlet.rb"
+
+DataMapper.setup(:default, "mysql://localhost/gamerankings")
15 lib/review.rb
@@ -0,0 +1,15 @@
+class Review # DataMapper resource for one outlet's review of one game
+ include DataMapper::Resource
+
+ property :game_id, Integer, :key => true # first half of the composite key
+ property :review_outlet_id, Integer, :key => true # second half of the composite key
+ property :date, Date
+ property :rating, String # raw score text: the number before "out of", or a letter grade
+ property :scale, String # the number after "out of", or the literal "Letter Grade"
+ property :ratio, Float # scraped percentage with the "%" removed
+ property :discounted, Boolean, :default => true # loader derives it from a trailing "*" in the outlet cell
+ property :system, String # presumably the platform directory name under html/ — TODO confirm
+
+ belongs_to :review_outlet
+ belongs_to :game
+end
9 lib/review_outlet.rb
@@ -0,0 +1,9 @@
+class ReviewOutlet # DataMapper resource for a publication that reviews games
+ include DataMapper::Resource
+
+ property :id, Serial # the loader sets this explicitly from the /sites/<id> link on the page
+ property :name, String, :required => true # link text of the outlet cell
+
+ has n, :reviews
+ has n, :games, :through => :reviews # games reached through the reviews join
+end
16 process/extract_urls_from_indexes.rb
@@ -0,0 +1,16 @@
+require 'nokogiri'
+require 'open-uri'
+
+page_file = File.open("page_list.txt", "w")
+url_base = "http://www.gamerankings.com"
+
+here = File.dirname(__FILE__)
+index_home = "#{here}/html/indexes"
+index_dir = Dir.open(index_home)
+indexes = index_dir.reject{|d| d =~ /^\.\.?$/}
+indexes.each do |filename|
+ page = Nokogiri::HTML open("#{index_home}/#{filename}")
+ page.css("div#content div.body table a").each do |link|
+ page_file << "#{url_base}#{link.attributes['href'].text}\n"
+ end
+end
75 process/process_article_pages.rb
@@ -0,0 +1,75 @@
+require 'rubygems'
+require 'nokogiri'
+require 'open-uri'
+here = File.dirname(__FILE__)
+require here + 'lib/gamerankings'
+
+Review.auto_migrate!
+ReviewOutlet.auto_migrate!
+
+systems_path = "#{here}/html/articles"
+systems = Dir.open(systems_path).reject{|dir| dir =~ /^\.\.?$/}
+
+systems.each do |system|
+ pages = Dir.open("#{systems_path}/#{system}").reject{|dir| dir =~ /^\.\.?$/}
+ pages.each do |page_file|
+ puts page_file
+ page = Nokogiri::HTML(open("#{systems_path}/#{system}/#{page_file}"))
+ rows = page.css("div#main_col table.release").first.css("tr").reject do |tr|
+ tr.attributes["class"] and tr.attributes["class"].text=="head"
+ end
+ rows.each do |row|
+ site_node, date_node, rating_node, ratio_node = row.css("td")
+
+ id = site_node.css("a").first.attributes["href"].text.match(/\/sites\/(\d+)/).to_a.last
+ outlet = site_node.css("a").text.strip
+ discounted = site_node.text.strip =~ /\*$/ ? true : false
+ if date_node.text.strip =~ /^(\d{2}\/\d{2}\/)([10]\d)/
+ date = date_node.text.strip.sub(/^(\d{2}\/\d{2}\/)([10]\d)/,"#{$1}20#{$2}")
+ elsif date_node.text.strip =~ /^(\d{2}\/\d{2}\/)([89]\d)/
+ date = date_node.text.strip.sub(/^(\d{2}\/\d{2}\/)([89]\d)/,"#{$1}19#{$2}")
+ elsif date_node.text.strip =~ /^(\d{2}\/\d{2}\/)(36)/
+ # there's one bad record for Indiana Jones and the Emporer's Tomb for XBox which
+ # has it's release date set to the year 36. It's either a typo or someone having
+ # a bit too much fun with the exploits of Indiana Jones.
+ date = date_node.text.strip.sub(/^(\d{2}\/\d{2}\/)(36)/,"#{$1}2003")
+ else
+ raise "DATE FAIL on #{date_node.text.strip}"
+ end
+ rating_string = rating_node.text.strip
+ if rating_string =~ /out of/
+ full_match, rating, scale = rating_string.match(/^([\d.]+) out of (\d+)$/).to_a
+ raise ArgumentError, "Rating or Scale didn't exist in '#{rating_string}'" unless rating and scale
+ elsif rating_string =~ /[A-Fa-f][+-]?/
+ rating = rating_string.match(/[A-Fa-f][+-]?/).to_a.first
+ scale = "Letter Grade"
+ else
+ raise StandardError, "can't parse '#{rating_string}' into a rating"
+ end
+ ratio = ratio_node.text.strip.sub(/%/,"")
+
+ unless review_outlet = ReviewOutlet.first(:id=>id, :name=>outlet)
+ puts "Couldn't find #{outlet}, creating record"
+ review_outlet = ReviewOutlet.create(:id=>id, :name=>outlet)
+ end
+
+ if review = Review.first(:review_outlet_id => id, :game_id => page_file.match(/^\d+/).to_s)
+ puts review
+ else
+ review = Review.new(
+ :review_outlet_id => id,
+ :game_id => page_file.match(/^\d+/).to_s,
+ :date => date,
+ :rating => rating,
+ :scale => scale,
+ :ratio => ratio,
+ :discounted => discounted
+ )
+ unless review.save
+ raise review.errors.inspect
+ end
+ end
+# puts [id, outlet, date, rating, scale, ratio].inspect
+ end
+ end
+end
44 process/process_main_pages.rb
@@ -0,0 +1,44 @@
+require 'rubygems'
+require 'nokogiri'
+require 'open-uri'
+here = File.dirname(__FILE__)
+require here + 'lib/gamerankings'
+
+systems_path = "#{here}/html/main"
+systems = Dir.open(systems_path).reject{|dir| dir =~ /^\.\.?$/}
+
+systems.each do |system|
+ pages = Dir.open("#{systems_path}/#{system}").reject{|dir| dir =~ /^\.\.?$/}
+ pages.each do |page_file|
+ page = Nokogiri::HTML(open("#{systems_path}/#{system}/#{page_file}"))
+
+ pods = page.css("div#main_col div.content2 > div.pod")
+ percent, rank_info = pods[0].css("div.body > table > tr > td")
+
+ title = page.css("h1").first.text
+ crumbs = page.css("div.crumbs a").map{|a|a.text}
+ rank_info = rank_info.text.split("\n").map{ |l| l.gsub(/\s+/," ").split(/:/).map{|t|t.strip} }
+ overall_rank = rank_info.assoc("Overall Rank").last
+ percent = percent.text.sub("%",'').to_f
+
+ release_string = page.text.match(/Release Date:([^\n]+)/).to_a.last.strip
+ begin
+ release_date = Date.parse(release_string)
+ rescue ArgumentError
+ puts "Couldn't parse release date #{release_string}"
+ release_date = nil
+ end
+
+ game = Game.new(
+ :id => page_file.match(/\d+/).to_s,
+ :name => title,
+ :ratio => percent,
+ :release_date => release_date,
+ :overall_rank => overall_rank
+ )
+
+ unless game.save
+ raise game.errors.inspect
+ end
+ end
+end
121,857 url_lists/article_page.log
121,857 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
221 url_lists/index_list.txt
@@ -0,0 +1,221 @@
+http://www.gamerankings.com/browse.html?page=0
+http://www.gamerankings.com/browse.html?page=1
+http://www.gamerankings.com/browse.html?page=2
+http://www.gamerankings.com/browse.html?page=3
+http://www.gamerankings.com/browse.html?page=4
+http://www.gamerankings.com/browse.html?page=5
+http://www.gamerankings.com/browse.html?page=6
+http://www.gamerankings.com/browse.html?page=7
+http://www.gamerankings.com/browse.html?page=8
+http://www.gamerankings.com/browse.html?page=9
+http://www.gamerankings.com/browse.html?page=10
+http://www.gamerankings.com/browse.html?page=11
+http://www.gamerankings.com/browse.html?page=12
+http://www.gamerankings.com/browse.html?page=13
+http://www.gamerankings.com/browse.html?page=14
+http://www.gamerankings.com/browse.html?page=15
+http://www.gamerankings.com/browse.html?page=16
+http://www.gamerankings.com/browse.html?page=17
+http://www.gamerankings.com/browse.html?page=18
+http://www.gamerankings.com/browse.html?page=19
+http://www.gamerankings.com/browse.html?page=20
+http://www.gamerankings.com/browse.html?page=21
+http://www.gamerankings.com/browse.html?page=22
+http://www.gamerankings.com/browse.html?page=23
+http://www.gamerankings.com/browse.html?page=24
+http://www.gamerankings.com/browse.html?page=25
+http://www.gamerankings.com/browse.html?page=26
+http://www.gamerankings.com/browse.html?page=27
+http://www.gamerankings.com/browse.html?page=28
+http://www.gamerankings.com/browse.html?page=29
+http://www.gamerankings.com/browse.html?page=30
+http://www.gamerankings.com/browse.html?page=31
+http://www.gamerankings.com/browse.html?page=32
+http://www.gamerankings.com/browse.html?page=33
+http://www.gamerankings.com/browse.html?page=34
+http://www.gamerankings.com/browse.html?page=35
+http://www.gamerankings.com/browse.html?page=36
+http://www.gamerankings.com/browse.html?page=37
+http://www.gamerankings.com/browse.html?page=38
+http://www.gamerankings.com/browse.html?page=39
+http://www.gamerankings.com/browse.html?page=40
+http://www.gamerankings.com/browse.html?page=41
+http://www.gamerankings.com/browse.html?page=42
+http://www.gamerankings.com/browse.html?page=43
+http://www.gamerankings.com/browse.html?page=44
+http://www.gamerankings.com/browse.html?page=45
+http://www.gamerankings.com/browse.html?page=46
+http://www.gamerankings.com/browse.html?page=47
+http://www.gamerankings.com/browse.html?page=48
+http://www.gamerankings.com/browse.html?page=49
+http://www.gamerankings.com/browse.html?page=50
+http://www.gamerankings.com/browse.html?page=51
+http://www.gamerankings.com/browse.html?page=52
+http://www.gamerankings.com/browse.html?page=53
+http://www.gamerankings.com/browse.html?page=54
+http://www.gamerankings.com/browse.html?page=55
+http://www.gamerankings.com/browse.html?page=56
+http://www.gamerankings.com/browse.html?page=57
+http://www.gamerankings.com/browse.html?page=58
+http://www.gamerankings.com/browse.html?page=59
+http://www.gamerankings.com/browse.html?page=60
+http://www.gamerankings.com/browse.html?page=61
+http://www.gamerankings.com/browse.html?page=62
+http://www.gamerankings.com/browse.html?page=63
+http://www.gamerankings.com/browse.html?page=64
+http://www.gamerankings.com/browse.html?page=65
+http://www.gamerankings.com/browse.html?page=66
+http://www.gamerankings.com/browse.html?page=67
+http://www.gamerankings.com/browse.html?page=68
+http://www.gamerankings.com/browse.html?page=69
+http://www.gamerankings.com/browse.html?page=70
+http://www.gamerankings.com/browse.html?page=71
+http://www.gamerankings.com/browse.html?page=72
+http://www.gamerankings.com/browse.html?page=73
+http://www.gamerankings.com/browse.html?page=74
+http://www.gamerankings.com/browse.html?page=75
+http://www.gamerankings.com/browse.html?page=76
+http://www.gamerankings.com/browse.html?page=77
+http://www.gamerankings.com/browse.html?page=78
+http://www.gamerankings.com/browse.html?page=79
+http://www.gamerankings.com/browse.html?page=80
+http://www.gamerankings.com/browse.html?page=81
+http://www.gamerankings.com/browse.html?page=82
+http://www.gamerankings.com/browse.html?page=83
+http://www.gamerankings.com/browse.html?page=84
+http://www.gamerankings.com/browse.html?page=85
+http://www.gamerankings.com/browse.html?page=86
+http://www.gamerankings.com/browse.html?page=87
+http://www.gamerankings.com/browse.html?page=88
+http://www.gamerankings.com/browse.html?page=89
+http://www.gamerankings.com/browse.html?page=90
+http://www.gamerankings.com/browse.html?page=91
+http://www.gamerankings.com/browse.html?page=92
+http://www.gamerankings.com/browse.html?page=93
+http://www.gamerankings.com/browse.html?page=94
+http://www.gamerankings.com/browse.html?page=95
+http://www.gamerankings.com/browse.html?page=96
+http://www.gamerankings.com/browse.html?page=97
+http://www.gamerankings.com/browse.html?page=98
+http://www.gamerankings.com/browse.html?page=99
+http://www.gamerankings.com/browse.html?page=100
+http://www.gamerankings.com/browse.html?page=101
+http://www.gamerankings.com/browse.html?page=102
+http://www.gamerankings.com/browse.html?page=103
+http://www.gamerankings.com/browse.html?page=104
+http://www.gamerankings.com/browse.html?page=105
+http://www.gamerankings.com/browse.html?page=106
+http://www.gamerankings.com/browse.html?page=107
+http://www.gamerankings.com/browse.html?page=108
+http://www.gamerankings.com/browse.html?page=109
+http://www.gamerankings.com/browse.html?page=110
+http://www.gamerankings.com/browse.html?page=111
+http://www.gamerankings.com/browse.html?page=112
+http://www.gamerankings.com/browse.html?page=113
+http://www.gamerankings.com/browse.html?page=114
+http://www.gamerankings.com/browse.html?page=115
+http://www.gamerankings.com/browse.html?page=116
+http://www.gamerankings.com/browse.html?page=117
+http://www.gamerankings.com/browse.html?page=118
+http://www.gamerankings.com/browse.html?page=119
+http://www.gamerankings.com/browse.html?page=120
+http://www.gamerankings.com/browse.html?page=121
+http://www.gamerankings.com/browse.html?page=122
+http://www.gamerankings.com/browse.html?page=123
+http://www.gamerankings.com/browse.html?page=124
+http://www.gamerankings.com/browse.html?page=125
+http://www.gamerankings.com/browse.html?page=126
+http://www.gamerankings.com/browse.html?page=127
+http://www.gamerankings.com/browse.html?page=128
+http://www.gamerankings.com/browse.html?page=129
+http://www.gamerankings.com/browse.html?page=130
+http://www.gamerankings.com/browse.html?page=131
+http://www.gamerankings.com/browse.html?page=132
+http://www.gamerankings.com/browse.html?page=133
+http://www.gamerankings.com/browse.html?page=134
+http://www.gamerankings.com/browse.html?page=135
+http://www.gamerankings.com/browse.html?page=136
+http://www.gamerankings.com/browse.html?page=137
+http://www.gamerankings.com/browse.html?page=138
+http://www.gamerankings.com/browse.html?page=139
+http://www.gamerankings.com/browse.html?page=140
+http://www.gamerankings.com/browse.html?page=141
+http://www.gamerankings.com/browse.html?page=142
+http://www.gamerankings.com/browse.html?page=143
+http://www.gamerankings.com/browse.html?page=144
+http://www.gamerankings.com/browse.html?page=145
+http://www.gamerankings.com/browse.html?page=146
+http://www.gamerankings.com/browse.html?page=147
+http://www.gamerankings.com/browse.html?page=148
+http://www.gamerankings.com/browse.html?page=149
+http://www.gamerankings.com/browse.html?page=150
+http://www.gamerankings.com/browse.html?page=151
+http://www.gamerankings.com/browse.html?page=152
+http://www.gamerankings.com/browse.html?page=153
+http://www.gamerankings.com/browse.html?page=154
+http://www.gamerankings.com/browse.html?page=155
+http://www.gamerankings.com/browse.html?page=156
+http://www.gamerankings.com/browse.html?page=157
+http://www.gamerankings.com/browse.html?page=158
+http://www.gamerankings.com/browse.html?page=159
+http://www.gamerankings.com/browse.html?page=160
+http://www.gamerankings.com/browse.html?page=161
+http://www.gamerankings.com/browse.html?page=162
+http://www.gamerankings.com/browse.html?page=163
+http://www.gamerankings.com/browse.html?page=164
+http://www.gamerankings.com/browse.html?page=165
+http://www.gamerankings.com/browse.html?page=166
+http://www.gamerankings.com/browse.html?page=167
+http://www.gamerankings.com/browse.html?page=168
+http://www.gamerankings.com/browse.html?page=169
+http://www.gamerankings.com/browse.html?page=170
+http://www.gamerankings.com/browse.html?page=171
+http://www.gamerankings.com/browse.html?page=172
+http://www.gamerankings.com/browse.html?page=173
+http://www.gamerankings.com/browse.html?page=174
+http://www.gamerankings.com/browse.html?page=175
+http://www.gamerankings.com/browse.html?page=176
+http://www.gamerankings.com/browse.html?page=177
+http://www.gamerankings.com/browse.html?page=178
+http://www.gamerankings.com/browse.html?page=179
+http://www.gamerankings.com/browse.html?page=180
+http://www.gamerankings.com/browse.html?page=181
+http://www.gamerankings.com/browse.html?page=182
+http://www.gamerankings.com/browse.html?page=183
+http://www.gamerankings.com/browse.html?page=184
+http://www.gamerankings.com/browse.html?page=185
+http://www.gamerankings.com/browse.html?page=186
+http://www.gamerankings.com/browse.html?page=187
+http://www.gamerankings.com/browse.html?page=188
+http://www.gamerankings.com/browse.html?page=189
+http://www.gamerankings.com/browse.html?page=190
+http://www.gamerankings.com/browse.html?page=191
+http://www.gamerankings.com/browse.html?page=192
+http://www.gamerankings.com/browse.html?page=193
+http://www.gamerankings.com/browse.html?page=194
+http://www.gamerankings.com/browse.html?page=195
+http://www.gamerankings.com/browse.html?page=196
+http://www.gamerankings.com/browse.html?page=197
+http://www.gamerankings.com/browse.html?page=198
+http://www.gamerankings.com/browse.html?page=199
+http://www.gamerankings.com/browse.html?page=200
+http://www.gamerankings.com/browse.html?page=201
+http://www.gamerankings.com/browse.html?page=202
+http://www.gamerankings.com/browse.html?page=203
+http://www.gamerankings.com/browse.html?page=204
+http://www.gamerankings.com/browse.html?page=205
+http://www.gamerankings.com/browse.html?page=206
+http://www.gamerankings.com/browse.html?page=207
+http://www.gamerankings.com/browse.html?page=208
+http://www.gamerankings.com/browse.html?page=209
+http://www.gamerankings.com/browse.html?page=210
+http://www.gamerankings.com/browse.html?page=211
+http://www.gamerankings.com/browse.html?page=212
+http://www.gamerankings.com/browse.html?page=213
+http://www.gamerankings.com/browse.html?page=214
+http://www.gamerankings.com/browse.html?page=215
+http://www.gamerankings.com/browse.html?page=216
+http://www.gamerankings.com/browse.html?page=217
+http://www.gamerankings.com/browse.html?page=218
+http://www.gamerankings.com/browse.html?page=219
+http://www.gamerankings.com/browse.html?page=220
121,220 url_lists/main_page.log
121,220 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
11,020 url_lists/page_list.txt
11,020 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
11,020 url_lists/sorted_pages.txt
11,020 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.

0 comments on commit 8057036

Please sign in to comment.