-
Notifications
You must be signed in to change notification settings - Fork 86
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
31 changed files
with
8,059 additions
and
7,257 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
module Anystyle
  # Abstract base class for features. Concrete features subclass it and
  # override #elicit; every named subclass is recorded in a registry
  # held by Feature itself.
  class Feature
    # Registry of feature classes, keyed by feature name.
    @available = {}

    class << self
      # Returns the registry hash stored on Feature, regardless of which
      # subclass the method is called on.
      def available
        Feature.instance_variable_get(:@available)
      end

      # Hook invoked by Ruby whenever a class inherits from Feature (or
      # from one of its descendants). The registry is always looked up on
      # Feature, because a descendant's own @available is nil. Anonymous
      # classes have no name and are skipped instead of raising.
      def inherited(feature)
        super
        key = feature.feature_name
        Feature.available[key] = feature unless key.nil?
      end

      # The explicit @feature_name if one is set, otherwise the
      # downcased class name (including any namespace); nil for an
      # anonymous class.
      def feature_name
        @feature_name || (name && name.downcase)
      end
    end

    # The feature name of this instance's class.
    def name
      self.class.feature_name
    end

    # TODO sequence features should be called just once
    # Whether the feature depends on the whole sequence rather than on a
    # single token; false unless a subclass overrides it.
    def sequence?
      false
    end

    # Computes the feature value; subclasses must override this.
    # Raises NotImplementedError when called on the base class.
    def elicit(token, alpha, offset, sequence)
      raise NotImplementedError
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
module Anystyle
  class Feature
    # Emits the leading (prefix) or trailing (suffix) character runs of
    # a token, for every length from 1 up to +size+.
    class Affix < Feature
      attr_reader :size

      # size   - longest affix to produce (default 4)
      # prefix - passing false selects suffix mode
      # suffix - passing true selects suffix mode
      def initialize(size: 4, prefix: true, suffix: false)
        @size = size
        @suffix = suffix || !prefix
      end

      # Returns an Array of +size+ strings: the affixes of length
      # 1..size. When the token is shorter than +size+, the longest
      # available affix is repeated for the remaining slots.
      def elicit(token, *args)
        build(extract(token)) { |chars| join(chars) }
      end

      # Up to +size+ characters of the token. In suffix mode they are
      # taken from the end and returned in reversed order.
      def extract(token)
        chars = token.chars
        chars = chars.reverse if suffix?
        chars.take(size)
      end

      # Joins characters into a string, undoing the reversal that
      # #extract applies in suffix mode.
      def join(chars)
        (suffix? ? chars.reverse : chars).join('')
      end

      # Yields the first n characters for each n in 1..size and collects
      # the block results.
      def build(chars)
        (1..size).map { |n| yield chars.take(n) }
      end

      def suffix?
        !!@suffix
      end

      def prefix?
        !suffix?
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
module Anystyle
  class Feature
    # Classifies the capitalization of +alpha+.
    class Caps < Feature
      # Returns :single for exactly one upper-case letter, :initial when
      # an upper-case letter is followed by a lower-case one, :all when
      # every character is upper-case, and :other otherwise. Only
      # +alpha+ is inspected; the other arguments are unused.
      def elicit(token, alpha, offset, sequence)
        case alpha
        when /^[[:upper:]]$/
          :single
        when /^[[:upper:]][[:lower:]]/
          :initial
        when /^[[:upper:]]+$/
          :all
        #when /^\p{Lu}+$/
        #  :caps
        #when /^\p{Lt}/
        #  :title
        #when /^\p{Ll}/
        #  :lower
        #when /^\p{Lu}/
        #  :single # :upper
        else
          :other
        end
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
module Anystyle
  class Feature
    # Reports the character category of one character of the token,
    # selected by +index+ (the last character by default).
    class Category < Feature
      attr_reader :index

      # TODO support multiple indices?
      # index - position of the character to inspect; negative values
      #         count from the end of the token (default -1).
      def initialize(index: -1)
        @index = index
      end

      # Returns the category of the character at +index+. An empty
      # token, or an index outside the token, yields nil.
      def elicit(token, *args)
        categorize(token.chars[index])
      end

      # TODO use more unicode categories
      # Maps a character to :upper, :lower or :numeric; any other value
      # (including nil) is returned unchanged.
      def categorize(char)
        case char
        when /\p{Lu}/
          :upper
        when /\p{Ll}/
          :lower
        when /\p{N}/
          :numeric
        else
          char
        end
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
module Anystyle
  class Feature
    # Looks a token up in a dictionary object supplied at construction.
    class Dictionary < Feature
      attr_reader :dictionary

      # dictionary - required; any object responding to #tags.
      def initialize(dictionary:)
        @dictionary = dictionary
      end

      # Returns whatever dictionary.tags gives for the downcased
      # +alpha+. The remaining arguments are unused.
      def elicit(token, alpha, offset, sequence)
        dictionary.tags(alpha.downcase)
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
module Anystyle
  class Feature
    # Emits the lower-cased form of +alpha+.
    class Downcase < Feature
      # Returns the symbol :EMPTY when +alpha+ is empty, otherwise
      # +alpha+ downcased. The other arguments are unused.
      def elicit(token, alpha, offset, sequence)
        return :EMPTY if alpha.empty?

        alpha.downcase
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
module Anystyle
  class Feature
    # Sequence-level feature: does any token in the sequence look like
    # an editor marker?
    class Editor < Feature
      # Returns :editors when at least one token of +sequence+ matches,
      # :'no-editors' otherwise. +token+, +alpha+ and +offset+ are
      # unused.
      def elicit(token, alpha, offset, sequence)
        sequence.any? { |candidate| match?(candidate) } ? :editors : :'no-editors'
      end

      # TODO improve patterns / disambiguate edition?
      # True when the whole token is one of the listed editor keywords
      # (case-insensitive). Tokens with attached punctuation, such as
      # "eds.", do not match.
      def match?(token)
        !(token =~ /^(ed|editor|editors|eds|edited|hrsg)$/i).nil?
      end

      def sequence?
        true
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
module Anystyle
  class Feature
    # Flags tokens that look like part of a locator (retrieval note,
    # ISBN, DOI or URL).
    class Locator < Feature
      # Returns :retrieved, :isbn, :doi, :url or :none; the first
      # matching pattern wins. Only +token+ is inspected.
      def elicit(token, alpha, offset, sequence)
        case token
        when /retrieved/i
          :retrieved
        when /isbn/i
          :isbn
        when /^doi:/i
          :doi
        # Alternation binds loosest: this is "^url" OR "http" anywhere
        # OR "www." followed by word characters or dots.
        when /^url|http|www\.[\w\.]+/i
          :url
        else
          :none
        end
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
module Anystyle
  class Feature
    # Classifies the numeric shape of a token (volume, year, page range,
    # digit count, ...).
    class Number < Feature
      # TODO check/improve patterns
      # Returns one of :volume, :year, :'year-range', :page, :single,
      # :double, :triple, :digits, :serial, :negative, :ordinal,
      # :numeric or :none. Patterns are tried top to bottom and the
      # first match wins, so the order matters (years before plain digit
      # counts, for example). Only +token+ is inspected.
      def elicit(token, *args)
        case token
        when /\d\(\d+([—–-]\d+)?\)/
          :volume
        when /^\(\d{4}\)[^[:alnum:]]*$/, /^(1\d{3}|20\d{2})[\.,;:]?$/
          :year
        when /\d{4}\s*[—–-]+\s*\d{4}/
          :'year-range'
        when /\d+\s*[—–-]+\s*\d+/, /^[^[:alnum:]]*pp?\.\d*[^[:alnum:]]*$/, /^((pp?|s)\.?|pages?)$/i
          :page
        when /^\d$/
          :single
        when /^\d{2}$/
          :double
        when /^\d{3}$/
          :triple
        when /^\d+$/
          :digits
        when /^\d+[\d-]+$/
          :serial
        when /^-\d+$/
          :negative
        when /\d+(th|st|nd|rd)[^[:alnum:]]*/i
          :ordinal
        when /\d/
          :numeric
        else
          :none
        end
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
module Anystyle
  class Feature
    # Emits the token's relative position in the sequence, scaled to
    # tenths.
    class Offset < Feature
      # Returns (offset / sequence.length) * 10, rounded to the nearest
      # Integer. An empty sequence yields 0; without the guard the
      # division is 0.0/0 = NaN and Float#round raises FloatDomainError.
      def elicit(token, alpha, offset, sequence)
        length = sequence.length
        return 0 if length.zero?

        ((offset.to_f / length) * 10).round
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
module Anystyle
  class Feature
    # Guesses a publication type from the sequence text and the token's
    # immediate neighbours.
    class PubType < Feature
      # TODO sequence or token feature?
      # TODO improve or remove?
      # Returns :dissertation or :proceedings when the space-joined
      # sequence contains the corresponding phrase; :collection when
      # +alpha+ is "in", the next token starts with an upper-case letter
      # and the previous token ends in a quote or punctuation mark;
      # :other otherwise.
      def elicit(token, alpha, offset, sequence)
        text = sequence.join(' ')

        # A negative index would wrap around to the end of the sequence,
        # so the first token is treated as having no predecessor.
        previous = offset > 0 ? sequence[offset - 1] : nil
        following = sequence[offset + 1]

        case
        when text =~ /dissertation abstract/i
          :dissertation
        when text =~ /proceeding/i
          :proceedings
        when alpha =~ /^in$/i && following.to_s =~ /^[[:upper:]]/ && previous.to_s =~ /["'”’´‘“`\.;,]$/
          :collection
        else
          :other
        end
      end

      #def sequence?
      #  true
      #end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
module Anystyle
  class Feature
    # Classifies the quoting, bracketing and trailing punctuation of a
    # token.
    class Punctuation < Feature
      # TODO Fix order
      # TODO Use unicode category patterns
      # Returns a symbol naming the first pattern that matches +token+.
      # Patterns are tried top to bottom, so earlier branches shadow
      # later ones (a trailing quote is reported as :unquote before the
      # :terminal branch is reached, for example). Tokens matching
      # nothing yield :others. Only +token+ is inspected.
      def elicit(token, alpha, offset, sequence)
        case token
        when /^["'”’´‘“`]/
          :quote
        when /["'”’´‘“`][!\?\.]$/
          :'terminal-unquote'
        when /["'”’´‘“`][,;:-]$/
          :'internal-unquote'
        when /["'”’´‘“`]$/
          :unquote
        when /^[\[\{].*[\}\]][!\?\.,;:-]?$/
          :braces
        when /^<.*>[!\?\.,;:-]?$/
          :tags
        when /^[\(].*[\)][!\?\.]$/
          :'terminal-parens'
        when /^\(.*\)[,;:-]$/
          :'internal-parens'
        when /^\(.*\)$/
          :parens
        when /^[\[\{]/
          :'opening-brace'
        when /[\}\]][!\?\.,;:-]?$/
          :'closing-brace'
        when /^</
          :'opening-tag'
        when />[!\?\.,;:-]?$/
          :'closing-tag'
        when /^\(/
          :'opening-parens'
        when /\)[,;:-]$/
          :'internal-closing-parens'
        when /^\)$/
          :'closing-parens'
        when /[,;:-]$/
          :internal
        when /[!\?\."']$/
          :terminal
        when /^\d{2,5}\(\d{2,5}\).?$/
          :volume
        when /-+/
          :hyphen
        else
          :others
        end
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,32 @@ | ||
|
||
# TODO remove | ||
require 'singleton' | ||
|
||
# TODO make optional | ||
require 'bibtex' | ||
|
||
require 'builder' | ||
require 'wapiti' | ||
require 'namae' | ||
|
||
# TODO hoist | ||
require 'anystyle/parser/version' | ||
require 'anystyle/parser/errors' | ||
|
||
require 'anystyle/parser/utility' | ||
require 'anystyle/parser/dictionary' | ||
require 'anystyle/parser/features' | ||
|
||
require 'anystyle/feature' | ||
require 'anystyle/feature/affix' | ||
require 'anystyle/feature/caps' | ||
require 'anystyle/feature/category' | ||
require 'anystyle/feature/dictionary' | ||
require 'anystyle/feature/downcase' | ||
require 'anystyle/feature/editor' | ||
require 'anystyle/feature/locator' | ||
require 'anystyle/feature/number' | ||
require 'anystyle/feature/offset' | ||
require 'anystyle/feature/pubtype' | ||
require 'anystyle/feature/punctuation' | ||
|
||
require 'anystyle/parser/parser' | ||
require 'anystyle/parser/normalizer' | ||
|
||
require 'anystyle/parser/utility' |
Oops, something went wrong.