Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for rapidfuzz #77

Merged
merged 6 commits into from
Jun 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 6 additions & 4 deletions .drone.star
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ def main(ctx):
steps=[
dict(
name="install task",
image="alpine:latest",
image="debian:latest",
commands=[
"apk add --no-cache wget",
"apt update",
"apt install -y wget",
"wget https://taskfile.dev/install.sh",
"sh install.sh -- latest",
"rm install.sh",
Expand All @@ -34,14 +35,15 @@ def main(ctx):
def step(env, python):
result = dict(
name="{} (py{})".format(env, python),
image="python:{}-alpine".format(python),
image="python:{}-buster".format(python),
depends_on=["install task"],
environment=dict(
# set coverage database file name to avoid conflicts between steps
COVERAGE_FILE=".coverage.{}.{}".format(env, python),
),
commands=[
"apk add curl git gcc libc-dev",
"apt update",
"apt install -y curl git build-essential",
"./bin/task PYTHON_BIN=python3 VENVS=/opt/py{python}/ -f {env}:run".format(
python=python,
env=env,
Expand Down
8 changes: 3 additions & 5 deletions Taskfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@ vars:
ISORT_ENV: "{{.VENVS}}isort"
TWINE_ENV: "{{.VENVS}}twine"

TESTS_PATH: tests/

tasks:
venv:create:
status:
Expand All @@ -21,7 +19,7 @@ tasks:
- "{{.ENV}}/bin/python3 -m pip install -U pip setuptools wheel"
pip:install:
sources:
- pyproject.toml
- setup.py
- "{{.ENV}}/bin/activate"
deps:
- task: venv:create
Expand Down Expand Up @@ -74,7 +72,7 @@ tasks:
ENV: "{{.PYTEST_PURE_ENV}}"
EXTRA: test
cmds:
- "{{.PYTEST_PURE_ENV}}/bin/pytest -m 'not external' {{.ARGS}} {{.TESTS_PATH}}"
- "{{.PYTEST_PURE_ENV}}/bin/pytest -m 'not external' {{.CLI_ARGS}}"

pytest-external:run:
deps:
Expand All @@ -83,7 +81,7 @@ tasks:
ENV: "{{.PYTEST_EXT_ENV}}"
EXTRA: test,benchmark
cmds:
- "{{.PYTEST_EXT_ENV}}/bin/pytest {{.ARGS}} {{.TESTS_PATH}}"
- "{{.PYTEST_EXT_ENV}}/bin/pytest {{.CLI_ARGS}}"

isort:run:
sources:
Expand Down
6 changes: 6 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
'numpy', # for SmithWaterman and other
'python-Levenshtein', # for Jaro and Levenshtein
'pyxDamerauLevenshtein', # for DamerauLevenshtein
'rapidfuzz>=2.0.0', # for Jaro, Levenshtein and other
],

# needed for benchmarking, optimization and testing
Expand All @@ -22,6 +23,7 @@
'numpy',
'python-Levenshtein',
'pyxDamerauLevenshtein',
'rapidfuzz>=2.0.0',
# slow
'distance',
'pylev',
Expand All @@ -43,17 +45,21 @@
],
'Hamming': [
'python-Levenshtein', # only same length and strings
'rapidfuzz>=2.0.0', # only same length, any iterators of hashable elements
'jellyfish', # only strings, any length
'distance', # only same length, any iterators
'abydos', # any iterators
],
'Jaro': [
'rapidfuzz>=2.0.0', # any iterators of hashable elements
'python-Levenshtein', # only text
],
'JaroWinkler': [
'rapidfuzz>=2.0.0', # any iterators of hashable elements
'jellyfish', # only text
],
'Levenshtein': [
'rapidfuzz>=2.0.0', # any iterators of hashable elements
'python-Levenshtein', # only text
# yeah, the other libs are slower than textdistance
],
Expand Down
45 changes: 31 additions & 14 deletions tests/test_external.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,15 @@

libraries = prototype.clone()

# numpy throws a bunch of warnings about abydos using `np.int` instead of `int`.
ABYDOS_WARNINGS = (
'ignore:`np.int` is a deprecated alias',
'ignore:`np.float` is a deprecated alias',
'ignore:Using or importing the ABCs',
)


@pytest.mark.filterwarnings(*ABYDOS_WARNINGS)
@pytest.mark.external
@pytest.mark.parametrize('alg', libraries.get_algorithms())
@hypothesis.settings(deadline=None)
Expand All @@ -37,34 +45,40 @@ def test_compare(left, right, alg):
assert isclose(int_result, ext_result), str(lib)


@pytest.mark.filterwarnings(*ABYDOS_WARNINGS)
@pytest.mark.external
@pytest.mark.parametrize('alg', libraries.get_algorithms())
@hypothesis.given(
left=hypothesis.strategies.text(min_size=1),
right=hypothesis.strategies.text(min_size=1),
)
def test_qval(left, right, alg):
@pytest.mark.parametrize('qval', (None, 1, 2, 3))
def test_qval(left, right, alg, qval):
for lib in libraries.get_libs(alg):
conditions = lib.conditions or {}
internal_func = getattr(textdistance, alg)(external=False, **conditions)
external_func = lib.get_function()
# algorithm doesn't support q-grams
if not hasattr(internal_func, 'qval'):
continue
for qval in (None, 1, 2, 3):
internal_func.qval = qval
# skip if the qval that is already set is not supported by the lib
s1, s2 = internal_func._get_sequences(left, right)
if not lib.check_conditions(internal_func, s1, s2):
continue

# test
int_result = internal_func(left, right)
s1, s2 = lib.prepare(s1, s2)
ext_result = external_func(s1, s2)
assert isclose(int_result, ext_result), str(lib)

internal_func.qval = qval
# skip if the qval that is already set is not supported by the lib
s1, s2 = internal_func._get_sequences(left, right)
if not lib.check_conditions(internal_func, s1, s2):
continue
quick_answer = internal_func.quick_answer(s1, s2)
if quick_answer is not None:
continue

# test
int_result = internal_func(left, right)
s1, s2 = lib.prepare(s1, s2)
ext_result = external_func(s1, s2)
assert isclose(int_result, ext_result), f'{lib}({repr(s1)}, {repr(s2)})'


@pytest.mark.filterwarnings(*ABYDOS_WARNINGS)
@pytest.mark.external
@pytest.mark.parametrize('alg', libraries.get_algorithms())
@hypothesis.given(
Expand All @@ -79,10 +93,13 @@ def test_list_of_numbers(left, right, alg):
if external_func is None:
raise RuntimeError('cannot import {}'.format(str(lib)))

quick_answer = internal_func.quick_answer(left, right)
if quick_answer is not None:
continue
if not lib.check_conditions(internal_func, left, right):
continue

int_result = internal_func(left, right)
s1, s2 = lib.prepare(left, right)
ext_result = external_func(s1, s2)
assert isclose(int_result, ext_result), str(lib)
assert isclose(int_result, ext_result), f'{lib}({repr(s1)}, {repr(s2)})'
9 changes: 8 additions & 1 deletion textdistance/algorithms/edit_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class Hamming(_Base):

https://en.wikipedia.org/wiki/Hamming_distance
"""

def __init__(self, qval=1, test_func=None, truncate=False, external=True):
self.qval = qval
self.test_func = test_func or self._ident
Expand Down Expand Up @@ -62,6 +63,7 @@ class Levenshtein(_Base):
https://en.wikipedia.org/wiki/Levenshtein_distance
TODO: https://gist.github.com/kylebgorman/1081951/9b38b7743a3cb5167ab2c6608ac8eea7fc629dca
"""

def __init__(self, qval=1, test_func=None, external=True):
self.qval = qval
self.test_func = test_func or self._ident
Expand Down Expand Up @@ -130,6 +132,7 @@ class DamerauLevenshtein(_Base):

https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
"""

def __init__(self, qval=1, test_func=None, external=True):
self.qval = qval
self.test_func = test_func or self._ident
Expand Down Expand Up @@ -229,6 +232,7 @@ class JaroWinkler(_BaseSimilarity):
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro.js
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro-winkler.js
"""

def __init__(self, long_tolerance=False, winklerize=True, qval=1, external=True):
self.qval = qval
self.long_tolerance = long_tolerance
Expand Down Expand Up @@ -302,7 +306,7 @@ def __call__(self, s1, s2, prefix_weight=0.1):
# adjust for up to first 4 chars in common
j = min(min_len, 4)
i = 0
while i < j and s1[i] == s2[i] and s1[i]:
while i < j and s1[i] == s2[i]:
i += 1
if i:
weight += i * prefix_weight * (1.0 - weight)
Expand Down Expand Up @@ -422,6 +426,7 @@ class SmithWaterman(_BaseSimilarity):
https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/smith-waterman.js
"""

def __init__(self, gap_cost=1.0, sim_func=None, qval=1, external=True):
self.qval = qval
self.gap_cost = gap_cost
Expand Down Expand Up @@ -464,6 +469,7 @@ class Gotoh(NeedlemanWunsch):
penalties:
https://www.cs.umd.edu/class/spring2003/cmsc838t/papers/gotoh1982.pdf
"""

def __init__(self, gap_open=1, gap_ext=0.4, sim_func=None, qval=1, external=True):
self.qval = qval
self.gap_open = gap_open
Expand Down Expand Up @@ -687,6 +693,7 @@ class MLIPNS(_BaseSimilarity):
http://www.sial.iias.spb.su/files/386-386-1-PB.pdf
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/mlipns.js
"""

def __init__(self, threshold=0.25, maxmismatches=2, qval=1, external=True):
self.qval = qval
self.threshold = threshold
Expand Down
16 changes: 16 additions & 0 deletions textdistance/libraries.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
"Levenshtein",
"hamming"
],
[
"rapidfuzz.distance.Hamming",
"distance"
],
[
"jellyfish",
"hamming_distance"
Expand All @@ -32,6 +36,10 @@
]
],
"Jaro": [
[
"rapidfuzz.distance.Jaro",
"similarity"
],
[
"Levenshtein",
"jaro"
Expand All @@ -46,12 +54,20 @@
]
],
"JaroWinkler": [
[
"rapidfuzz.distance.JaroWinkler",
"similarity"
],
[
"jellyfish",
"jaro_winkler_similarity"
]
],
"Levenshtein": [
[
"rapidfuzz.distance.Levenshtein",
"distance"
],
[
"Levenshtein",
"distance"
Expand Down
5 changes: 5 additions & 0 deletions textdistance/libraries.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,17 @@ class SameLengthTextLibrary(SameLengthLibrary, TextLibrary):
prototype.register('Hamming', SameLengthLibrary('distance', 'hamming'))
prototype.register('Hamming', SameLengthTextLibrary('Levenshtein', 'hamming'))
prototype.register('Hamming', TextLibrary('jellyfish', 'hamming_distance'))
prototype.register('Hamming', SameLengthLibrary('rapidfuzz.distance.Hamming', 'distance'))

prototype.register('Jaro', TextLibrary('jellyfish', 'jaro_similarity'))
prototype.register('Jaro', LibraryBase('rapidfuzz.distance.Jaro', 'similarity'))
# prototype.register('Jaro', TextLibrary('Levenshtein', 'jaro'))
# prototype.register('Jaro', TextLibrary('py_stringmatching.similarity_measure.jaro', 'jaro'))

# prototype.register('JaroWinkler', LibraryBase('py_stringmatching.similarity_measure.jaro_winkler', 'jaro_winkler'))
prototype.register('JaroWinkler', TextLibrary('jellyfish', 'jaro_winkler_similarity', conditions=dict(winklerize=True)))
prototype.register('JaroWinkler', LibraryBase('rapidfuzz.distance.JaroWinkler', 'similarity',
conditions=dict(winklerize=True)))
# https://github.com/life4/textdistance/issues/39
# prototype.register('JaroWinkler', TextLibrary('Levenshtein', 'jaro_winkler', conditions=dict(winklerize=True)))

Expand All @@ -174,4 +178,5 @@ class SameLengthTextLibrary(SameLengthLibrary, TextLibrary):
prototype.register('Levenshtein', LibraryBase('pylev', 'levenshtein'))
prototype.register('Levenshtein', TextLibrary('jellyfish', 'levenshtein_distance'))
prototype.register('Levenshtein', TextLibrary('Levenshtein', 'distance'))
prototype.register('Levenshtein', LibraryBase('rapidfuzz.distance.Levenshtein', 'distance'))
# prototype.register('Levenshtein', TextLibrary('py_stringmatching.similarity_measure.levenshtein', 'levenshtein'))