Skip to content

Commit

Permalink
Improve name normalizer
Browse files Browse the repository at this point in the history
  • Loading branch information
inukshuk committed May 30, 2018
1 parent 497287b commit 2e7a856
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 9 deletions.
13 changes: 10 additions & 3 deletions lib/anystyle/normalizer/names.rb
Expand Up @@ -43,7 +43,7 @@ def repeater?(value)
def strip(value)
value
.gsub(/^[Ii]n:?\s+/, '')
.gsub(/\b[EÉeé]d(s?\.|itors?|ited|iteurs?|ité)(\s+(by|par)\s+|\b|$)?/, '')
.gsub(/\b[EÉeé]d(s?\.|itors?\.?|ited|iteurs?|ité)(\s+(by|par)\s+|\b|$)/, '')
.gsub(/\b([Hh](rsg|gg?)\.|Herausgeber)\s+/, '')
.gsub(/\b[Hh]erausgegeben von\s+/, '')
.gsub(/\b((d|ein)er )?[Üü]ber(s\.|setzt|setzung|tragen|tragung) v(\.|on)\s+/, '')
Expand All @@ -52,14 +52,21 @@ def strip(value)
.gsub(/\b([Dd]ir(\.|ected))(\s+by)?\s+/, '')
.gsub(/\b([Pp]rod(\.|uce[rd]))(\s+by)?\s+/, '')
.gsub(/\b([Pp]erf(\.|orme[rd]))(\s+by)?\s+/, '')
.gsub(/\*/, '')
.gsub(/\([^\)]*\)?/, '')
.gsub(/\[[^\]]*\)?/, '')
.gsub(/[;:]/, ',')
.strip
.gsub(/^\p{^L}+|\s+\p{^L}+$/, '')
.gsub(/[\s,]+$/, '')
.gsub(/,{2,}/, ',')
.gsub(/\s+\./, '.')
end

def parse(value)
raise ArgumentError if value.empty?

others = value.sub!(
/(\bet\s+(al|coll)\b|\bu\.\s*a\.|(\band|\&)\s+others).*$/, ''
/(,\s+)?((\&\s+)?\bet\s+(al|coll)\b|\bu\.\s*a\b|(\band|\&)\s+others).*$/, ''
) || value.sub!(/\.\.\.|…/, '')

# Add surname/initial punctuation separator for Vancouver-style names
Expand Down
17 changes: 11 additions & 6 deletions spec/anystyle/normalizer/names_spec.rb
Expand Up @@ -80,14 +80,14 @@ def n(author, **opts)
{ family: 'Kelly', given: 'J.W.' }
]],
['Bouchard J-P.', [{ family: 'Bouchard', given: 'J.-P.' }]],
['Edgar A. Poe; Herman Melville', [poe, melville]],
['- Edgar A. Poe; Herman Melville', [poe, melville]],
['Poe, Edgar A., Melville, Herman', [poe, melville]],
['Aeschlimann Magnin, E.', [{ family: 'Aeschlimann Magnin', given: 'E.' }]],
['Yang, Q., Mudambi, R., & Meyer, K. E.', [
{ family: 'Yang', given: 'Q.' },
{ family: 'Mudambi', given: 'R.' },
{ family: 'Meyer', given: 'K.E.' }
]]
]],
].each do |input, output|
expect(n(input)).to eq(output)
end
Expand All @@ -99,6 +99,7 @@ def n(author, **opts)
expect(n('In: D. Knuth (ed.)')).to eq([knuth])
expect(n('in: D. Knuth ed.')).to eq([knuth])
expect(n('in D. Knuth (ed)')).to eq([knuth])
expect(n('In D. Knuth, editor')).to eq([knuth])
end

it "does not strip 'ed' etc. from names" do
Expand Down Expand Up @@ -126,16 +127,20 @@ def n(author, **opts)

# Every input ends in a marker standing for further, unnamed authors; each
# one is expected to normalize to the single parsed name followed by `others`.
it "strips and resolves 'et al' / others" do
  [
    'J Doe et al',
    'J Doe et al.',
    'Doe, J., et al.',
    'J Doe u.a.',
    'J Doe u. a.',
    'J Doe and others',
    'J Doe & others',
    'J Doe et coll.',
    'J Doe ...'
  ].each do |input|
    expect(n(input)).to eq([doe, others])
  end
end
end

# Exercises #strip directly: maps each raw input string to the cleaned
# string the normalizer is expected to return.
it "#strip" do
  {
    'Piveteau, J. (ed.).' => 'Piveteau, J.',
    'In D. Knuth, editor.' => 'D. Knuth'
  }.each do |raw, cleaned|
    expect(subject.strip(raw)).to eq(cleaned)
  end
end

describe "Parsed Core Data" do
before :all do
@data = resource('parser/core.xml')
Expand All @@ -148,8 +153,8 @@ def n(author, **opts)
let(:lit) { @data.select { |name| !name[:literal].nil? } }
let(:nam) { @data.select { |name| name[:literal].nil? } }

it "accepts more than 95% of names" do
expect(nam.length.to_f / @data.length).to be > 0.95
it "accepts more than 98% of names" do
expect(nam.length.to_f / @data.length).to be > 0.98
end
end
end
Expand Down

0 comments on commit 2e7a856

Please sign in to comment.