Skip to content

Commit

Permalink
Added a gitignore; put the IANA downloader in the Rakefile.
Browse files Browse the repository at this point in the history
  • Loading branch information
halostatue committed Feb 28, 2009
1 parent b864713 commit ffa1b47
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 115 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
*.swp
html
doc
pkg
publish
coverage
238 changes: 123 additions & 115 deletions Rakefile
Expand Up @@ -42,8 +42,7 @@ Hoe.new PKG_NAME, PKG_VERSION do |p|

p.clean_globs << "coverage"

p.spec_extras[:extra_rdoc_files] = MANIFEST.grep(/txt$/) -
["Manifest.txt"]
p.spec_extras[:extra_rdoc_files] = MANIFEST.grep(/txt$/) - ["Manifest.txt"]
end

desc "Build a MIME::Types .tar.gz distribution."
Expand Down Expand Up @@ -72,7 +71,7 @@ file PKG_TAR => [ :test ] do |t|
begin
unless File.directory?(File.dirname(t.name))
require 'fileutils'
File.mkdir_p File.dirname(t.name)
FileUtils.mkdir_p File.dirname(t.name)
end
tf = File.open(t.name, 'wb')
gz = Zlib::GzipWriter.new(tf)
Expand Down Expand Up @@ -118,138 +117,147 @@ task :build_manifest do |t|
end

desc "Download the current MIME type registrations from IANA."
task :download_from_iana do |t|
#!/usr/bin/ruby -w
task :iana, :save, :destination do |t, args|
save_type = args.save || :text
save_type = save_type.to_sym

case save_type
when :text, :both, :html
nil
else
raise "Unknown save type provided. Must be one of text, both, or html."
end

require 'rubygems'
require 'open-uri'
require 'nokogiri'
require 'cgi'

class IANAParser
include Comparable

INDEX = %q(http://www.iana.org/assignments/media-types/)
CONTACT_PEOPLE = %r{http://www.iana.org/assignments/contact-people.html?#(.*)}
RFC_EDITOR = %r{http://www.rfc-editor.org/rfc/rfc(\d+).txt}
IETF_RFC = %r{http://www.ietf.org/rfc/rfc(\d+).txt}
IETF_RFC_TOOLS = %r{http://tools.ietf.org/html/rfc(\d+)}

class << self
def load_index
@types ||= {}

Nokogiri::HTML(open(INDEX) { |f| f.read }).xpath('//p/a').each do |tag|
href_match = %r{^/assignments/media-types/(.+)/$}.match(tag['href'])
next if href_match.nil?
type = href_match.captures[0]
@types[tag.content] = IANAParser.new(tag.content, type)
destination = args.destination || "type-lists"

require 'open-uri'
require 'nokogiri'
require 'cgi'

class IANAParser
include Comparable

INDEX = %q(http://www.iana.org/assignments/media-types/)
CONTACT_PEOPLE = %r{http://www.iana.org/assignments/contact-people.html?#(.*)}
RFC_EDITOR = %r{http://www.rfc-editor.org/rfc/rfc(\d+).txt}
IETF_RFC = %r{http://www.ietf.org/rfc/rfc(\d+).txt}
IETF_RFC_TOOLS = %r{http://tools.ietf.org/html/rfc(\d+)}

class << self
def load_index
@types ||= {}

Nokogiri::HTML(open(INDEX) { |f| f.read }).xpath('//p/a').each do |tag|
href_match = %r{^/assignments/media-types/(.+)/$}.match(tag['href'])
next if href_match.nil?
type = href_match.captures[0]
@types[tag.content] = IANAParser.new(tag.content, type)
end
end
end

attr_reader :types
end
attr_reader :types
end

def initialize(name, type)
@name = name
@type = type
@url = File.join(INDEX, @type)
end
def initialize(name, type)
@name = name
@type = type
@url = File.join(INDEX, @type)
end

attr_reader :name
attr_reader :type
attr_reader :url
attr_reader :html
attr_reader :name
attr_reader :type
attr_reader :url
attr_reader :html

def download(name = nil)
if name
@html = Nokogiri::HTML(open(name) { |f| f.read })
else
@html = Nokogiri::HTML(open(@url) { |f| f.read })
def download(name = nil)
@html = Nokogiri::HTML(open(name || @url) { |f| f.read })
end
end

def save_html
File.open("#@name.html", "wb") { |w| w.write @html }
end
def save_html
File.open("#@name.html", "wb") { |w| w.write @html }
end

def <=>(o)
self.name <=> o.name
end
def <=>(o)
self.name <=> o.name
end

def parse
nodes = html.xpath("//table//table//tr")

# How many <td> children does the first node have?
node_count = nodes.first.children.select { |node| node.elem? }.size

@mime_types = nodes.map do |node|
next if node == nodes.first
elems = node.children.select { |n| n.elem? }
next if elems.size.zero?
raise "size mismatch #{elems.size} != #{node_count}" if node_count != elems.size

case elems.size
when 3
subtype_index = 1
refnode_index = 2
when 4
subtype_index = 1
refnode_index = 3
else
raise "Unknown element size."
end
def parse
nodes = html.xpath("//table//table//tr")

# How many <td> children does the first node have?
node_count = nodes.first.children.select { |node| node.elem? }.size

@mime_types = nodes.map do |node|
next if node == nodes.first
elems = node.children.select { |n| n.elem? }
next if elems.size.zero?
raise "size mismatch #{elems.size} != #{node_count}" if node_count != elems.size

case elems.size
when 3
subtype_index = 1
refnode_index = 2
when 4
subtype_index = 1
refnode_index = 3
else
raise "Unknown element size."
end

subtype = elems[subtype_index].content.chomp.strip
refnodes = elems[refnode_index].children.select { |n| n.elem? }.map { |ref|
case ref['href']
when CONTACT_PEOPLE
tag = CGI::unescape($1).chomp.strip
if tag == ref.content
subtype = elems[subtype_index].content.chomp.strip
refnodes = elems[refnode_index].children.select { |n| n.elem? }.map { |ref|
case ref['href']
when CONTACT_PEOPLE
tag = CGI::unescape($1).chomp.strip
if tag == ref.content
"[#{ref.content}]"
else
else
"[#{ref.content}=#{tag}]"
end
when RFC_EDITOR, IETF_RFC, IETF_RFC_TOOLS
end
when RFC_EDITOR, IETF_RFC, IETF_RFC_TOOLS
"RFC#$1"
when %r{(https?://.*)}
when %r{(https?://.*)}
"{#{ref.content}=#$1}"
else
ref
end
}
refs = refnodes.join(',')
else
ref
end
}
refs = refnodes.join(',')

"#@type/#{subtype} 'IANA,#{refs}"
end.compact
end.compact

@mime_types
end
@mime_types
end

def save_text
File.open("#@name.txt", "wb") { |w| w.write @mime_types.join("\n") }
def save_text
File.open("#@name.txt", "wb") { |w| w.write @mime_types.join("\n") }
end
end
end

puts "Downloading index of MIME types from #{IANAParser::INDEX}."
IANAParser.load_index

IANAParser.types.values.sort.each do |parser|
next if parser.name == "example" or parser.name == "mime"
puts "Downloading #{parser.name} from #{parser.url}"
parser.download
puts "Saving #{parser.name}.html"
parser.save_html
puts "Parsing #{parser.name}"
parser.parse
puts "Saving #{parser.name}.txt"
parser.save_text
end
puts "Downloading index of MIME types from #{IANAParser::INDEX}."
IANAParser.load_index

require 'fileutils'
FileUtils.mkdir_p destination
Dir.chdir destination do
IANAParser.types.values.sort.each do |parser|
next if parser.name == "example" or parser.name == "mime"
puts "Downloading #{parser.name} from #{parser.url}"
parser.download

if :html == save_type || :both == save_type
puts "Saving #{parser.name}.html"
parser.save_html
end

puts "Parsing #{parser.name} HTML"
parser.parse

# foo = IANAParser.types['application']
# foo.download("application.html")
# foo.parse
# foo = IANAParser.types['image']
# foo.download("image.html")
# foo.parse
if :text == save_type || :both == save_type
puts "Saving #{parser.name}.txt"
parser.save_text
end
end
end
end

0 comments on commit ffa1b47

Please sign in to comment.