Skip to content

Commit

Permalink
add producer and director normalizer
Browse files Browse the repository at this point in the history
  • Loading branch information
inukshuk committed Mar 14, 2014
1 parent fbcb9e3 commit c7a5ca4
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 12 deletions.
77 changes: 66 additions & 11 deletions lib/anystyle/parser/normalizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -109,15 +109,39 @@ def normalize_editor(hash)
def normalize_translator(hash)
translators = hash[:translator]

editors.gsub!(/^\W+|\W+$/, '')
translators.gsub!(/^\W+|\W+$/, '')
translators.gsub!(/[^[:alpha:]]*trans(lated)?[^[:alpha:]]*/i, '')
translators.gsub!(/\bby\b/i, '')

hash[:translator] = normalize_names(translators)
hash
end

def normalize_director(hash)
directors = hash[:director]

directors.gsub!(/^\W+|\W+$/, '')
directors.gsub!(/[^[:alpha:]]*direct(or|ed)?[^[:alpha:]]*/i, '')
directors.gsub!(/\bby\b/i, '')

hash[:director] = normalize_names(directors)
hash
end

def normalize_producer(hash)
producers = hash[:producer]

producers.gsub!(/^\W+|\W+$/, '')
producers.gsub!(/[^[:alpha:]]*produc(er|ed)?[^[:alpha:]]*/i, '')
producers.gsub!(/\bby\b/i, '')

hash[:director] = normalize_names(producers)
hash
end

def normalize_names(names)
names.gsub!(/\s*(\.\.\.|…)\s*/, '')

Namae.parse!(names).map { |name|
unless name.given.nil? || name.family.nil?
name.given.gsub!(/\b([[:upper:]])(\s|$)/, '\1.\2')
Expand All @@ -135,11 +159,11 @@ def normalize_names(names)
Namae.options[:prefer_comma_as_separator] = true

def normalize_title(hash)
title, container = hash[:title]
title, source = hash[:title]

unless container.nil?
hash[:container] = container
normalize(:container, hash)
unless source.nil?
hash[:source] = source
normalize(:source, hash)
end

extract_edition(title, hash)
Expand Down Expand Up @@ -202,18 +226,18 @@ def normalize_journal(hash)
hash
end

def normalize_container(hash)
container, *dangling = hash[:container]
unmatched(:container, hash, dangling) unless dangling.empty?
def normalize_source(hash)
source, *dangling = hash[:source]
unmatched(:source, hash, dangling) unless dangling.empty?

case container
case source
when /dissertation abstracts/i
container.gsub!(/\s*section \w: ([[:alnum:]\s]+).*$/i, '')
source.gsub!(/\s*section \w: ([[:alnum:]\s]+).*$/i, '')
hash[:category] = $1 unless $1.nil?
hash[:type] = :phdthesis
end

hash[:container] = container
hash[:source] = source
hash
end

Expand All @@ -227,6 +251,11 @@ def normalize_date(hash)

if date =~ /(\d{4})/
hash[:year] = $1.to_i

if hash.key?(:month) && date =~ /(\d{1,2})\b/
hash[:day] = $1.to_i
end

hash.delete(:date)
end

Expand Down Expand Up @@ -260,6 +289,24 @@ def normalize_volume(hash)
hash
end

def normalize_publisher(hash)
normalize :publisher, hash

case hash[:publisher]
when /^producers?$/i
hash[:publisher] = hash[:producer]

when /^authors?$/i
hash[:publisher] = hash[:author]

when /^editor?$/i
hash[:publisher] = hash[:editor]

end

hash
end

def normalize_pages(hash)
pages, *dangling = hash[:pages]
unmatched(:pages, hash, dangling) unless dangling.empty?
Expand Down Expand Up @@ -316,6 +363,14 @@ def normalize_url(hash)
hash
end

def normalize_medium(hash)
medium, *dangling = hash[:medium]
unmatched(:medium, hash, dangling) unless dangling.empty?

hash[:medium] = medium.split(/\W+/).reject(&:empty?).join('-')
hash
end

private

def unmatched(label, hash, tokens)
Expand Down
4 changes: 3 additions & 1 deletion lib/anystyle/parser/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -180,11 +180,13 @@ def classify(hash)
text = hash.values.flatten.join

case
when keys.include?(:medium)
hash[:type] = hash[:medium]
when keys.include?(:journal)
hash[:type] = :article
when text =~ /proceedings/i
hash[:type] = :inproceedings
when keys.include?(:booktitle), keys.include?(:container)
when keys.include?(:booktitle), keys.include?(:source)
hash[:type] = :incollection
when keys.include?(:publisher)
hash[:type] = :book
Expand Down
16 changes: 16 additions & 0 deletions resources/train.txt
Original file line number Diff line number Diff line change
Expand Up @@ -598,3 +598,19 @@
<author>Fenner M</author> <date>(2013)</date> <title>What can article-level metrics do for you?.</title> <url>Available: http://dx.doi.org/10.1371/journal.pbio.1001687.</url>
<author>Fenner M</author> <date>(2013)</date> <title>Letter from the Guest Content Editor: Altmetrics Have Come of Age.</title> <journal>Information Standards Quarterly.</journal> <url>Available: http://dx.doi.org/10.3789/isqv25no2.2013.01.</url>
<author>Fenner M</author> <date>(2008)</date> <title>Targeted therapies for patients with germ cell tumors..</title> <url>Available: http://dx.doi.org/10.1517/13543784.17.4.511.</url>
<author>Boyer, Paul S.</author> <title>Purity in Print: Book Censorship in America from the Gilded Age to the Computer Age.</title> <edition>2nd ed.</edition> <location>Madison:</location> <publisher>University of Wisconsin Press,</publisher> <date>2002.</date>
<title>"Charles R. Van Hise."</title> <container>In Wikipedia.</container> <retrieved>Last modified May 9, 2013.</retrieved> <url>http://en.wikipedia.org/wiki/Charles_R._Van_Hise.</url>
<author>Child, Julia, Louisette Bertholle, and Simone Beck.</author> <title>Mastering the Art of French Cooking.</title> <location>New York:</location> <publisher>Knopf,</publisher> <date>1961.</date>
<author>CIA World Factbook.</author> <location>Washington, DC:</location> <publisher>Central Intelligence Agency,</publisher> <date>2009.</date>
<author>Dunlavy, Colleen.</author> <title>"Why Did American Businesses Get So Big?"</title> <booktitle>In Major Problems in American Business History,</booktitle> <editor>edited by Regina Blaszczyk and Philip Scranton.</editor> <location>New York:</location> <publisher>Houghton-Mifflin,</publisher> <date>2006.</date> <pages>257-263.</pages>
<author>Johnson, Kirk.</author> <title>"Health Care Is Spread Thin on Alaskan Frontier."</title> <journal>New York Times,</journal> <date>May 28, 2013.</date> <url>http://www.nytimes.com/2013/05/29/us/health-care-in-vast-alaska-frontier-is-spread-thin.html?hpw&_r=0.</url>
<author>Lindberg, Sara M.</author> <title>"Gender-Role Identity Development During Adolescence: Individual, Familial, and Social Contextual Predictors of Gender Intensification."</title> <tech>Ph.D. diss.,</tech> <institution>University of Wisconsin-Madison,</institution> <date>2008.</date>
<author>Marshall, Nancy</author> <title>Rose. Review of Joseph Crawhill, 1861-1913, One of the Glasgow Boys.</title> <journal>Victorian Studies</journal> <volume>42</volume> <date>(1999/2000):</date> <pages>358-60.</pages>
<author>Marwell, Gerald, and Pamela Oliver.</author> <title>The Critical Mass in Collective Action.</title> <location>Cambridge:</location> <publisher>Cambridge University Press,</publisher> <date>2007.</date>
<author>Marshall, Tyler.</author> <title>"200th Birthday of Grimms Celebrated."</title> <journal>Los Angeles Times,</journal> <date>15 March 1985,</date> <section>sec. 1A,</section> <pages>p. 3.</pages>
<author>Neville, Leonora.</author> <title>Authority in Byzantine Provincial Society, 950-1100.</title> <location>Cambridge:</location> <publisher>Cambridge University Press,</publisher> <date>2004.</date> <note>Reprint, Cambridge: Cambridge University Press, 2008.</note>
<author>Shakespeare, William.</author> <title>Othello.</title> <publisher>Milwaukee Repertory Theater,</publisher> <location>Milwaukee,</location> <date>April 2012.</date>
<director>Soderbergh, Steven, director.</director> <title>Che.</title> <medium>DVD.</medium> <location>New York:</location> <publisher>Criterion Collection,</publisher> <date>2008.</date>
<author>United Nations.</author> <title>"Human Rights."</title> <retrieved>Accessed May 29, 2013.</retrieved> <url>http://www.un.org/en/globalissues/humanrights/.</url>
<author>Wandel, Lee Palmer.</author> <title>"Setting the Lutheran Eucharist."</title> <journal>Journal of Early Modern History</journal> <volume>17</volume> <date>(1998):</date> <pages>124-55.</pages> <doi>doi: 10.1163/157006598X00135.</doi>
<author>Zukofsky, Louis.</author> <title>"Sincerity and Objectification."</title> <journal>Poetry</journal> <title>37</title> <date>(February 1931):</date> <pages>269.</pages> <note>Quoted in Bonnie Costello, Marianne Moore: Imaginary Possessions (Cambridge: Harvard University Press, 1981), 78.</note>
8 changes: 8 additions & 0 deletions spec/anystyle/parser/normalizer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,14 @@ module Parser
h[:year].should == 1997
h[:month].should == 10
end

it 'extracts days if month and year are present' do
h = n.normalize_date(:date => '(15 May 1984)')

h[:year].should == 1984
h[:month].should == 5
h[:day].should == 15
end
end

end
Expand Down

0 comments on commit c7a5ca4

Please sign in to comment.