Skip to content

Commit

Permalink
Merge e1ce931 into 973924a
Browse files Browse the repository at this point in the history
  • Loading branch information
inukshuk committed Nov 29, 2017
2 parents 973924a + e1ce931 commit f931750
Show file tree
Hide file tree
Showing 31 changed files with 8,059 additions and 7,257 deletions.
4 changes: 2 additions & 2 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ group :development, :test do
gem 'rspec', '~>3.0'
gem 'language_detector', github: 'feedbackmine/language_detector'
gem 'lmdb'
gem 'redis'
gem 'redis-namespace'
end

group :coverage do
Expand All @@ -27,7 +29,5 @@ group :extra do
gem 'autotest-fsevent', :require => false
gem 'yard'
gem 'ZenTest'
gem 'redis'
gem 'hiredis'
gem 'redis-namespace'
end
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ Anystyle-Parser
===============
[![Build Status](https://travis-ci.org/inukshuk/anystyle-parser.svg?branch=master)](https://travis-ci.org/inukshuk/anystyle-parser)
[![Coverage Status](https://coveralls.io/repos/github/inukshuk/anystyle-parser/badge.svg?branch=master)](https://coveralls.io/github/inukshuk/anystyle-parser?branch=master)
[![Flattr us](http://api.flattr.com/button/flattr-badge-large.png)](https://flattr.com/submit/auto?user_id=inukshuk&url=https://anystyle.io&title=AnyStyle&description=Parses%20scholarly%20references%20in%20no%20time!&tags=programming,bibliography,parser,machine%20learning,api&language=en_GB&category=software)

Anystyle-Parser is a very fast and smart parser for academic references. It
is inspired by [ParsCit](http://aye.comp.nus.edu.sg/parsCit/) and
Expand Down
28 changes: 28 additions & 0 deletions lib/anystyle/feature.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
module Anystyle
  # Abstract base class for token features.
  #
  # Every named class that inherits from Feature (directly or through another
  # feature class) is recorded in a registry that lives on Feature itself,
  # keyed by the class's feature name.
  class Feature
    # Registry of feature classes, keyed by feature name. This is a class
    # instance variable of Feature only; subclasses do not get their own copy.
    @available = {}

    class << self
      # Ruby hook, invoked on the immediate parent whenever a subclass is
      # created. Records the new class in Feature's registry.
      #
      # The registry is looked up on Feature explicitly: when a subclass of a
      # subclass is defined, `self` is the intermediate class, whose own
      # @available is unset. Classes without a name (e.g. Class.new(Feature))
      # are not registered, because they have no key to be stored under.
      def inherited(feature)
        super
        key = feature.feature_name
        Feature.instance_variable_get(:@available)[key] = feature unless key.nil?
      end

      # Returns @feature_name when it is set, otherwise the downcased class
      # name. Returns nil for an anonymous class without @feature_name.
      def feature_name
        @feature_name || (name && name.downcase)
      end
    end

    # The feature name of this instance's class.
    def name
      self.class.feature_name
    end

    # TODO sequence features should be called just once
    # Whether this feature is a sequence feature; false unless overridden.
    def sequence?
      false
    end

    # Computes the feature value for a token. Must be overridden by
    # subclasses; the base implementation always raises.
    #
    # Raises NotImplementedError.
    def elicit(token, alpha, offset, sequence)
      raise NotImplementedError
    end
  end
end
43 changes: 43 additions & 0 deletions lib/anystyle/feature/affix.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
module Anystyle
  class Feature
    # Produces the leading (prefix) or trailing (suffix) character n-grams of
    # a token, for n = 1 up to +size+.
    class Affix < Feature
      attr_reader :size

      # size   - number of affixes to emit (and the longest affix length).
      # prefix - when false, suffix mode is used.
      # suffix - when true, suffix mode is used.
      def initialize(size: 4, prefix: true, suffix: false)
        @size = size
        @suffix = suffix || !prefix
      end

      # Returns an array of +size+ affix strings, shortest first. Tokens
      # shorter than +size+ repeat their longest available affix.
      def elicit(token, *args)
        build(extract(token)) { |chars| join(chars) }
      end

      # The characters the affixes are built from, at most +size+ of them.
      # In suffix mode they are returned in reverse (last character first).
      def extract(token)
        letters = token.chars
        letters = letters.reverse if suffix?
        letters.take(size)
      end

      # Joins characters into a string, restoring the original reading order
      # in suffix mode.
      def join(chars)
        ordered = suffix? ? chars.reverse : chars
        ordered.join('')
      end

      # Yields the first 1, 2, ..., +size+ characters and collects the
      # block's results.
      def build(chars)
        1.upto(size).map { |length| yield chars.take(length) }
      end

      def suffix?
        @suffix ? true : false
      end

      def prefix?
        !suffix?
      end
    end
  end
end
26 changes: 26 additions & 0 deletions lib/anystyle/feature/caps.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
module Anystyle
  class Feature
    # Classifies the capitalisation of a token's alphabetic part (+alpha+).
    class Caps < Feature
      # Ordered [pattern, label] pairs; the first pattern that matches wins.
      #
      # Unicode-category alternatives that are currently disabled:
      #   /^\p{Lu}+$/ => :caps
      #   /^\p{Lt}/   => :title
      #   /^\p{Ll}/   => :lower
      #   /^\p{Lu}/   => :single # :upper
      SHAPES = [
        [/^[[:upper:]]$/, :single],
        [/^[[:upper:]][[:lower:]]/, :initial],
        [/^[[:upper:]]+$/, :all]
      ].freeze

      # Returns :single (one upper-case letter), :initial (upper-case letter
      # followed by a lower-case letter), :all (only upper-case letters) or
      # :other when none of the patterns match.
      def elicit(token, alpha, offset, sequence)
        _, label = SHAPES.find { |pattern, _| pattern === alpha }
        label || :other
      end
    end
  end
end
30 changes: 30 additions & 0 deletions lib/anystyle/feature/category.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
module Anystyle
  class Feature
    # Reports the character category of one character of the token, selected
    # by +index+ (the last character by default).
    class Category < Feature
      # Ordered pattern => label pairs; the first matching pattern wins.
      CLASSES = {
        /\p{Lu}/ => :upper,
        /\p{Ll}/ => :lower,
        /\p{N}/ => :numeric
      }.freeze

      attr_reader :index

      # TODO support multiple indices?
      def initialize(index: -1)
        @index = index
      end

      # Categorizes the character found at +index+ in the token.
      def elicit(token, *args)
        categorize(token.chars[index])
      end

      # TODO use more unicode categories
      # Returns :upper, :lower or :numeric for upper-case letters, lower-case
      # letters and numbers; any other value (including nil, when the token
      # has no character at +index+) is returned unchanged.
      def categorize(char)
        CLASSES.each do |pattern, label|
          return label if pattern === char
        end
        char
      end
    end
  end
end
15 changes: 15 additions & 0 deletions lib/anystyle/feature/dictionary.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
module Anystyle
  class Feature
    # Looks a token up in the dictionary object supplied at construction.
    class Dictionary < Feature
      attr_reader :dictionary

      # dictionary - object that is sent #tags with the lookup key.
      def initialize(dictionary:)
        @dictionary = dictionary
      end

      # Returns whatever the dictionary's #tags yields for the downcased
      # alphabetic part of the token.
      def elicit(token, alpha, offset, sequence)
        key = alpha.downcase
        dictionary.tags(key)
      end
    end
  end
end
13 changes: 13 additions & 0 deletions lib/anystyle/feature/downcase.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
module Anystyle
  class Feature
    # Emits the lower-cased alphabetic part of a token.
    class Downcase < Feature
      # Returns +alpha+ downcased, or the symbol :EMPTY when +alpha+ is empty.
      def elicit(token, alpha, offset, sequence)
        return :EMPTY if alpha.empty?

        alpha.downcase
      end
    end
  end
end
18 changes: 18 additions & 0 deletions lib/anystyle/feature/editor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
module Anystyle
  class Feature
    # Sequence feature: flags whether any token of the sequence is an editor
    # marker.
    class Editor < Feature
      # Returns :editors when at least one token of +sequence+ satisfies
      # #match?, otherwise :'no-editors'. The token itself is not inspected.
      def elicit(token, alpha, offset, sequence)
        if sequence.any? { |candidate| match?(candidate) }
          :editors
        else
          :'no-editors'
        end
      end

      # TODO improve patterns / disambiguate edition?
      # Returns the match position (truthy) when the whole token is one of the
      # editor markers, ignoring case; nil otherwise.
      def match?(token)
        token =~ /^(ed|editor|editors|eds|edited|hrsg)$/i
      end

      # This feature depends on the whole sequence.
      def sequence?
        true
      end
    end
  end
end
20 changes: 20 additions & 0 deletions lib/anystyle/feature/locator.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
module Anystyle
  class Feature
    # Detects locator-like tokens (retrieval notes, ISBN, DOI, URLs).
    class Locator < Feature
      # Ordered pattern => label pairs; the first matching pattern wins.
      MARKERS = {
        /retrieved/i => :retrieved,
        /isbn/i => :isbn,
        /^doi:/i => :doi,
        /^url|http|www\.[\w\.]+/i => :url
      }.freeze

      # Returns :retrieved, :isbn, :doi or :url for the first pattern the
      # token matches, or :none when no pattern matches.
      def elicit(token, alpha, offset, sequence)
        MARKERS.each do |pattern, label|
          return label if pattern === token
        end
        :none
      end
    end
  end
end
37 changes: 37 additions & 0 deletions lib/anystyle/feature/number.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
module Anystyle
  class Feature
    # Classifies the numeric shape of a token.
    class Number < Feature
      # TODO check/improve patterns
      # Ordered [label, patterns] rules. Rules are tried top to bottom and a
      # rule applies when any one of its patterns matches the token.
      RULES = [
        [:volume, [/\d\(\d+([—–-]\d+)?\)/]],
        [:year, [/^\(\d{4}\)[^[:alnum:]]*$/, /^(1\d{3}|20\d{2})[\.,;:]?$/]],
        [:'year-range', [/\d{4}\s*[—–-]+\s*\d{4}/]],
        [:page, [/\d+\s*[—–-]+\s*\d+/, /^[^[:alnum:]]*pp?\.\d*[^[:alnum:]]*$/, /^((pp?|s)\.?|pages?)$/i]],
        [:single, [/^\d$/]],
        [:double, [/^\d{2}$/]],
        [:triple, [/^\d{3}$/]],
        [:digits, [/^\d+$/]],
        [:serial, [/^\d+[\d-]+$/]],
        [:negative, [/^-\d+$/]],
        [:ordinal, [/\d+(th|st|nd|rd)[^[:alnum:]]*/i]],
        [:numeric, [/\d/]]
      ].freeze

      # Returns the label of the first rule matching the token, or :none.
      def elicit(token, *args)
        RULES.each do |label, patterns|
          return label if patterns.any? { |pattern| pattern === token }
        end
        :none
      end
    end
  end
end
9 changes: 9 additions & 0 deletions lib/anystyle/feature/offset.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
module Anystyle
  class Feature
    # Relative position of a token inside its sequence, scaled to tenths.
    class Offset < Feature
      # Returns offset / sequence.length, multiplied by ten and rounded to
      # the nearest integer.
      def elicit(token, alpha, offset, sequence)
        fraction = offset.to_f / sequence.length
        scaled = fraction * 10
        scaled.round
      end
    end
  end
end
25 changes: 25 additions & 0 deletions lib/anystyle/feature/pubtype.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
module Anystyle
  class Feature
    # Guesses the publication type from cues in the token sequence.
    class PubType < Feature
      # TODO sequence or token feature?
      # TODO improve or remove?
      #
      # Returns
      #   :dissertation - the joined sequence contains "dissertation abstract";
      #   :proceedings  - the joined sequence contains "proceeding";
      #   :collection   - the current token is "in", the next token starts
      #                   with an upper-case letter and the previous token
      #                   ends with a quote mark, period, semicolon or comma;
      #   :other        - otherwise.
      def elicit(token, alpha, offset, sequence)
        s = sequence.join(' ')

        # Only look at a previous token when one exists: at offset 0 a plain
        # sequence[offset-1] would wrap around to the LAST token of the
        # sequence and tag a leading "In" as :collection by mistake.
        previous = offset > 0 ? sequence[offset - 1] : nil

        case
        when s =~ /dissertation abstract/i
          :dissertation
        when s =~ /proceeding/i
          :proceedings
        when alpha =~ /^in$/i && sequence[offset+1].to_s =~ /^[[:upper:]]/ && previous.to_s =~ /["'”’´‘“`\.;,]$/
          :collection
        else
          :other
        end
      end

      #def sequence?
      #  true
      #end
    end
  end
end
54 changes: 54 additions & 0 deletions lib/anystyle/feature/punctuation.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
module Anystyle
  class Feature
    # Classifies the punctuation found at the edges of (or inside) a token.
    class Punctuation < Feature
      # TODO Fix order
      # TODO Use unicode category patterns
      # Ordered [pattern, label] pairs; the first matching pattern wins, so
      # the order of the entries is significant.
      MARKS = [
        [/^["'”’´‘“`]/, :quote],
        [/["'”’´‘“`][!\?\.]$/, :'terminal-unquote'],
        [/["'”’´‘“`][,;:-]$/, :'internal-unquote'],
        [/["'”’´‘“`]$/, :unquote],
        [/^[\[\{].*[\}\]][!\?\.,;:-]?$/, :braces],
        [/^<.*>[!\?\.,;:-]?$/, :tags],
        [/^[\(].*[\)][!\?\.]$/, :'terminal-parens'],
        [/^\(.*\)[,;:-]$/, :'internal-parens'],
        [/^\(.*\)$/, :parens],
        [/^[\[\{]/, :'opening-brace'],
        [/[\}\]][!\?\.,;:-]?$/, :'closing-brace'],
        [/^</, :'opening-tag'],
        [/>[!\?\.,;:-]?$/, :'closing-tag'],
        [/^\(/, :'opening-parens'],
        [/\)[,;:-]$/, :'internal-closing-parens'],
        [/^\)$/, :'closing-parens'],
        [/[,;:-]$/, :internal],
        [/[!\?\."']$/, :terminal],
        [/^\d{2,5}\(\d{2,5}\).?$/, :volume],
        [/-+/, :hyphen]
      ].freeze

      # Returns the label of the first pattern matching the token, or :others
      # when no pattern matches.
      def elicit(token, alpha, offset, sequence)
        _, label = MARKS.find { |pattern, _| pattern === token }
        label || :others
      end
    end
  end
end
23 changes: 19 additions & 4 deletions lib/anystyle/parser.rb
Original file line number Diff line number Diff line change
@@ -1,17 +1,32 @@

# TODO remove
require 'singleton'

# TODO make optional
require 'bibtex'

require 'builder'
require 'wapiti'
require 'namae'

# TODO hoist
require 'anystyle/parser/version'
require 'anystyle/parser/errors'

require 'anystyle/parser/utility'
require 'anystyle/parser/dictionary'
require 'anystyle/parser/features'

require 'anystyle/feature'
require 'anystyle/feature/affix'
require 'anystyle/feature/caps'
require 'anystyle/feature/category'
require 'anystyle/feature/dictionary'
require 'anystyle/feature/downcase'
require 'anystyle/feature/editor'
require 'anystyle/feature/locator'
require 'anystyle/feature/number'
require 'anystyle/feature/offset'
require 'anystyle/feature/pubtype'
require 'anystyle/feature/punctuation'

require 'anystyle/parser/parser'
require 'anystyle/parser/normalizer'

require 'anystyle/parser/utility'

0 comments on commit f931750

Please sign in to comment.