diff --git a/lib/anystyle/parser/normalizer.rb b/lib/anystyle/parser/normalizer.rb
index ff52fc7..e086173 100644
--- a/lib/anystyle/parser/normalizer.rb
+++ b/lib/anystyle/parser/normalizer.rb
@@ -109,7 +109,7 @@ def normalize_editor(hash)
       def normalize_translator(hash)
         translators = hash[:translator]
 
-        editors.gsub!(/^\W+|\W+$/, '')
+        translators.gsub!(/^\W+|\W+$/, '')
         translators.gsub!(/[^[:alpha:]]*trans(lated)?[^[:alpha:]]*/i, '')
         translators.gsub!(/\bby\b/i, '')
 
@@ -117,7 +117,31 @@ def normalize_translator(hash)
         hash
       end
 
+      def normalize_director(hash)
+        directors = hash[:director]
+
+        directors.gsub!(/^\W+|\W+$/, '')
+        directors.gsub!(/[^[:alpha:]]*direct(or|ed)?[^[:alpha:]]*/i, '')
+        directors.gsub!(/\bby\b/i, '')
+
+        hash[:director] = normalize_names(directors)
+        hash
+      end
+
+      def normalize_producer(hash)
+        producers = hash[:producer]
+
+        producers.gsub!(/^\W+|\W+$/, '')
+        producers.gsub!(/[^[:alpha:]]*produc(er|ed)?[^[:alpha:]]*/i, '')
+        producers.gsub!(/\bby\b/i, '')
+
+        hash[:producer] = normalize_names(producers) # store under :producer; :director would be clobbered
+        hash
+      end
+
       def normalize_names(names)
+        names.gsub!(/\s*(\.\.\.|…)\s*/, '')
+
         Namae.parse!(names).map { |name|
           unless name.given.nil? || name.family.nil?
             name.given.gsub!(/\b([[:upper:]])(\s|$)/, '\1.\2')
@@ -135,11 +159,11 @@ def normalize_names(names)
       Namae.options[:prefer_comma_as_separator] = true
 
       def normalize_title(hash)
-        title, container = hash[:title]
+        title, source = hash[:title]
 
-        unless container.nil?
-          hash[:container] = container
-          normalize(:container, hash)
+        unless source.nil?
+          hash[:source] = source
+          normalize(:source, hash)
         end
 
         extract_edition(title, hash)
@@ -202,18 +226,18 @@ def normalize_journal(hash)
         hash
       end
 
-      def normalize_container(hash)
-        container, *dangling = hash[:container]
-        unmatched(:container, hash, dangling) unless dangling.empty?
+      def normalize_source(hash)
+        source, *dangling = hash[:source]
+        unmatched(:source, hash, dangling) unless dangling.empty?
 
-        case container
+        case source
         when /dissertation abstracts/i
-          container.gsub!(/\s*section \w: ([[:alnum:]\s]+).*$/i, '')
+          source.gsub!(/\s*section \w: ([[:alnum:]\s]+).*$/i, '')
           hash[:category] = $1 unless $1.nil?
           hash[:type] = :phdthesis
         end
 
-        hash[:container] = container
+        hash[:source] = source
         hash
       end
 
@@ -227,6 +251,11 @@ def normalize_date(hash)
 
         if date =~ /(\d{4})/
           hash[:year] = $1.to_i
+
+          if hash.key?(:month) && date =~ /\b(\d{1,2})\b/ # bounded on both sides so the year's last digits never count as a day
+            hash[:day] = $1.to_i
+          end
+
           hash.delete(:date)
         end
 
@@ -260,6 +289,24 @@ def normalize_volume(hash)
         hash
       end
 
+      def normalize_publisher(hash)
+        normalize :publisher, hash
+
+        case hash[:publisher]
+        when /^producers?$/i
+          hash[:publisher] = hash[:producer]
+
+        when /^authors?$/i
+          hash[:publisher] = hash[:author]
+
+        when /^editors?$/i # optional plural "s", matching the producer/author branches
+          hash[:publisher] = hash[:editor]
+
+        end
+
+        hash
+      end
+
       def normalize_pages(hash)
         pages, *dangling = hash[:pages]
         unmatched(:pages, hash, dangling) unless dangling.empty?
@@ -316,6 +363,14 @@ def normalize_url(hash)
         hash
       end
 
+      def normalize_medium(hash)
+        medium, *dangling = hash[:medium]
+        unmatched(:medium, hash, dangling) unless dangling.empty?
+ + hash[:medium] = medium.split(/\W+/).reject(&:empty?).join('-') + hash + end + private def unmatched(label, hash, tokens) diff --git a/lib/anystyle/parser/parser.rb b/lib/anystyle/parser/parser.rb index 6073e09..fde5bc6 100644 --- a/lib/anystyle/parser/parser.rb +++ b/lib/anystyle/parser/parser.rb @@ -180,11 +180,13 @@ def classify(hash) text = hash.values.flatten.join case + when keys.include?(:medium) + hash[:type] = hash[:medium] when keys.include?(:journal) hash[:type] = :article when text =~ /proceedings/i hash[:type] = :inproceedings - when keys.include?(:booktitle), keys.include?(:container) + when keys.include?(:booktitle), keys.include?(:source) hash[:type] = :incollection when keys.include?(:publisher) hash[:type] = :book diff --git a/resources/train.txt b/resources/train.txt index a07a2a2..0821344 100644 --- a/resources/train.txt +++ b/resources/train.txt @@ -598,3 +598,19 @@ Fenner M (2013) What can article-level metrics do for you?. Available: http://dx.doi.org/10.1371/journal.pbio.1001687. Fenner M (2013) Letter from the Guest Content Editor: Altmetrics Have Come of Age. Information Standards Quarterly. Available: http://dx.doi.org/10.3789/isqv25no2.2013.01. Fenner M (2008) Targeted therapies for patients with germ cell tumors.. Available: http://dx.doi.org/10.1517/13543784.17.4.511. +Boyer, Paul S. Purity in Print: Book Censorship in America from the Gilded Age to the Computer Age. 2nd ed. Madison: University of Wisconsin Press, 2002. +"Charles R. Van Hise." In Wikipedia. Last modified May 9, 2013. http://en.wikipedia.org/wiki/Charles_R._Van_Hise. +Child, Julia, Louisette Bertholle, and Simone Beck. Mastering the Art of French Cooking. New York: Knopf, 1961. +CIA World Factbook. Washington, DC: Central Intelligence Agency, 2009. +Dunlavy, Colleen. "Why Did American Businesses Get So Big?" In Major Problems in American Business History, edited by Regina Blaszczyk and Philip Scranton. New York: Houghton-Mifflin, 2006. 257-263. +Johnson, Kirk. 
"Health Care Is Spread Thin on Alaskan Frontier." New York Times, May 28, 2013. http://www.nytimes.com/2013/05/29/us/health-care-in-vast-alaska-frontier-is-spread-thin.html?hpw&_r=0. +Lindberg, Sara M. "Gender-Role Identity Development During Adolescence: Individual, Familial, and Social Contextual Predictors of Gender Intensification." Ph.D. diss., University of Wisconsin-Madison, 2008. +Marshall, Nancy Rose. Review of Joseph Crawhill, 1861-1913, One of the Glasgow Boys. Victorian Studies 42 (1999/2000): 358-60. +Marwell, Gerald, and Pamela Oliver. The Critical Mass in Collective Action. Cambridge: Cambridge University Press, 2007. +Marshall, Tyler. "200th Birthday of Grimms Celebrated." Los Angeles Times, 15 March 1985,
sec. 1A,
p. 3. +Neville, Leonora. Authority in Byzantine Provincial Society, 950-1100. Cambridge: Cambridge University Press, 2004. Reprint, Cambridge: Cambridge University Press, 2008. +Shakespeare, William. Othello. Milwaukee Repertory Theater, Milwaukee, April 2012. +Soderbergh, Steven, director. Che. DVD. New York: Criterion Collection, 2008. +United Nations. "Human Rights." Accessed May 29, 2013. http://www.un.org/en/globalissues/humanrights/. +Wandel, Lee Palmer. "Setting the Lutheran Eucharist." Journal of Early Modern History 17 (1998): 124-55. doi: 10.1163/157006598X00135. +Zukofsky, Louis. "Sincerity and Objectification." Poetry 37 (February 1931): 269. Quoted in Bonnie Costello, Marianne Moore: Imaginary Possessions (Cambridge: Harvard University Press, 1981), 78. diff --git a/spec/anystyle/parser/normalizer_spec.rb b/spec/anystyle/parser/normalizer_spec.rb index 6117fc9..c05a72f 100644 --- a/spec/anystyle/parser/normalizer_spec.rb +++ b/spec/anystyle/parser/normalizer_spec.rb @@ -88,6 +88,14 @@ module Parser h[:year].should == 1997 h[:month].should == 10 end + + it 'extracts days if month and year are present' do + h = n.normalize_date(:date => '(15 May 1984)') + + h[:year].should == 1984 + h[:month].should == 5 + h[:day].should == 15 + end end end