Skip to content

Commit

Permalink
libs structure & optimization improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
orsinium committed Apr 13, 2018
1 parent 285d809 commit dd6ad5f
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 31 deletions.
3 changes: 2 additions & 1 deletion run_tests.py
Expand Up @@ -6,9 +6,10 @@
import unittest

import textdistance
from textdistance.libraries import not_optimized_libraries as libraries
from textdistance.libraries import prototype


# Private deep copy of the shared registry: changes made to `libraries` here
# cannot leak back into `textdistance.libraries.prototype`.
libraries = prototype.clone()
# CONSTRAINTS = os.getenv('WITH_CONSTRAINTS', 'yes') == 'yes'
# NUMPY = os.getenv('WITH_NUMPY', 'yes') == 'yes'
# No default value here: this lookup raises KeyError when WITH_CONSTRAINTS is
# not set in the environment (unlike the commented-out getenv variants above).
CONSTRAINTS = os.environ['WITH_CONSTRAINTS'] == 'yes'
Expand Down
5 changes: 2 additions & 3 deletions textdistance/algorithms/base.py
@@ -1,8 +1,9 @@
from collections import Counter
from ..utils import find_ngrams
from ..libraries import libraries
from ..libraries import prototype


# Module-wide registry of external implementations: a deep copy of the shared
# prototype, so the reordering below does not touch the prototype itself.
libraries = prototype.clone()
# NOTE: optimize() opens and parses LIBRARIES_FILE, so importing this module
# performs file I/O; it then drops/reorders implementations per algorithm.
libraries.optimize()


Expand Down Expand Up @@ -55,8 +56,6 @@ def external_answer(self, *sequences):
return
# try to get external libs for algorithm
libs = libraries.get_libs(self.__class__.__name__)
if not libs:
return
for lib in libs:
# if conditions not satisfied
if not lib.check_conditions(self, *sequences):
Expand Down
3 changes: 2 additions & 1 deletion textdistance/benchmark.py
Expand Up @@ -4,13 +4,14 @@

from tabulate import tabulate

from .libraries import not_optimized_libraries as libraries
from .libraries import prototype
from .libraries import LIBRARIES_FILE


# python3 -m textdistance.benchmark


# Deep copy of the shared registry. optimize() is not called on it in the
# visible part of this module, so the registration order is kept as-is.
libraries = prototype.clone()
# Record type with one field per listed name.
# NOTE(review): presumably one row of benchmark output -- confirm against the
# code that builds these tuples (not visible in this excerpt).
Lib = namedtuple('Lib', ['algorithm', 'library', 'function', 'time', 'presets'])


Expand Down
68 changes: 42 additions & 26 deletions textdistance/libraries.py
Expand Up @@ -15,25 +15,45 @@ def __init__(self):
self.libs = defaultdict(list)

def register(self, alg, lib):
    """Register a new library implementation for an algorithm.

    Args:
        alg: key the implementation is stored under. The registrations in
            this file use the algorithm's class name (e.g. ``'Hamming'``).
        lib: object describing the external implementation.

    Implementations are appended, so the list for ``alg`` keeps
    registration order until something reorders it.
    """
    # self.libs is created as defaultdict(list), so the first registration
    # for an algorithm creates its list implicitly.
    self.libs[alg].append(lib)

def optimize(self):
    """Filter and reorder the registered libs using the benchmark file.

    Reads ``LIBRARIES_FILE`` as JSON mapping an algorithm name to a list of
    ``[module_name, func_name]`` pairs. For every algorithm that has at
    least one registered lib:

    * libs whose ``[module_name, func_name]`` pair is not listed are dropped;
    * the remaining libs are ordered by the position of their pair in the
      listed pairs (the file is expected to list faster ones first).

    Algorithms in the file with no registered libs are skipped; algorithms
    missing from the file are left untouched.
    """
    # load benchmarks results
    with open(LIBRARIES_FILE, 'r') as f:
        libs_data = json.load(f)

    for alg, libs_names in libs_data.items():
        libs = self.get_libs(alg)
        if not libs:
            continue
        ranked = []
        for lib in libs:
            # JSON arrays are decoded as lists, so compare against a list
            name = [lib.module_name, lib.func_name]
            if name in libs_names:
                ranked.append((libs_names.index(name), lib))
        # sort on the position only; the sort is stable, so libs sharing a
        # position keep their registration order
        ranked.sort(key=lambda pair: pair[0])
        self.libs[alg] = [lib for _, lib in ranked]

def get_algorithms(self):
    """Get list of available algorithms.

    Returns:
        list: a new list holding every key currently present in
        ``self.libs``; mutating it does not affect the registry.
    """
    # iterating a dict yields its keys, so no explicit .keys() is needed
    return list(self.libs)

def get_libs(self, alg):
    """Get libs list for algorithm.

    Args:
        alg: algorithm key, as passed to ``register``.

    Returns:
        list: the list stored for ``alg`` (the registry's own list, not a
        copy), or a new empty list when nothing is registered for it.
        The result is always iterable, never ``None``.
    """
    # Test membership first: indexing the defaultdict directly would insert
    # an empty entry for every unknown algorithm that is looked up.
    if alg not in self.libs:
        return []
    return self.libs[alg]

def clone(self):
    """Clone library manager prototype.

    Returns:
        A new instance of the same class whose ``libs`` mapping is a deep
        copy of this one's, so registering, dropping or reordering libs on
        the clone leaves this manager unchanged.
    """
    # self.__class__ (rather than a hard-coded class name) keeps subclasses
    # cloning to their own type
    obj = self.__class__()
    obj.libs = deepcopy(self.libs)
    return obj


class LibraryBase(object):
func = NotImplemented
Expand Down Expand Up @@ -119,33 +139,29 @@ class SameLengthTextLibrary(SameLengthLibrary, TextLibrary):
pass


# NOTE(review): this region appears to interleave both sides of the commit's
# diff. The `libraries.*` / `not_optimized_libraries` statements look like the
# removed side and the `prototype.*` statements like their replacements --
# confirm against the repository before treating both as live code.
libraries = LibrariesManager()
# Shared registry that other modules copy via prototype.clone().
prototype = LibrariesManager()

# Damerau-Levenshtein implementations
libraries.register('DamerauLevenshtein', LibraryBase('abydos.distance', 'damerau_levenshtein'))
libraries.register('DamerauLevenshtein', LibraryBase('pylev', 'damerau_levenshtein'))
libraries.register('DamerauLevenshtein', LibraryBase('pyxdameraulevenshtein', 'damerau_levenshtein_distance'))
libraries.register('DamerauLevenshtein', TextLibrary('jellyfish', 'damerau_levenshtein_distance'))
prototype.register('DamerauLevenshtein', LibraryBase('abydos.distance', 'damerau_levenshtein'))
prototype.register('DamerauLevenshtein', LibraryBase('pylev', 'damerau_levenshtein'))
prototype.register('DamerauLevenshtein', LibraryBase('pyxdameraulevenshtein', 'damerau_levenshtein_distance'))
prototype.register('DamerauLevenshtein', TextLibrary('jellyfish', 'damerau_levenshtein_distance'))

# Hamming implementations
libraries.register('Hamming', LibraryBase('abydos.distance', 'hamming'))
libraries.register('Hamming', SameLengthLibrary('distance', 'hamming'))
libraries.register('Hamming', SameLengthTextLibrary('Levenshtein', 'hamming'))
libraries.register('Hamming', TextLibrary('jellyfish', 'hamming_distance'))
prototype.register('Hamming', LibraryBase('abydos.distance', 'hamming'))
prototype.register('Hamming', SameLengthLibrary('distance', 'hamming'))
prototype.register('Hamming', SameLengthTextLibrary('Levenshtein', 'hamming'))
prototype.register('Hamming', TextLibrary('jellyfish', 'hamming_distance'))

# Jaro implementations
libraries.register('Jaro', TextLibrary('jellyfish', 'jaro_distance'))
libraries.register('Jaro', TextLibrary('Levenshtein', 'jaro'))
libraries.register('Jaro', TextLibrary('py_stringmatching.similarity_measure.jaro', 'jaro'))
prototype.register('Jaro', TextLibrary('jellyfish', 'jaro_distance'))
prototype.register('Jaro', TextLibrary('Levenshtein', 'jaro'))
prototype.register('Jaro', TextLibrary('py_stringmatching.similarity_measure.jaro', 'jaro'))

# Jaro-Winkler implementations.
# NOTE(review): `conditions=dict(winklerize=True)` presumably limits these to
# algorithm instances configured with winklerize=True -- verify in
# check_conditions (not visible in this excerpt).
# libraries.register('JaroWinkler', LibraryBase('py_stringmatching.similarity_measure.jaro_winkler', 'jaro_winkler'))
libraries.register('JaroWinkler', TextLibrary('jellyfish', 'jaro_winkler', conditions=dict(winklerize=True)))
libraries.register('JaroWinkler', TextLibrary('Levenshtein', 'jaro_winkler', conditions=dict(winklerize=True)))

# Levenshtein implementations
libraries.register('Levenshtein', LibraryBase('abydos.distance', 'levenshtein'))
libraries.register('Levenshtein', LibraryBase('distance', 'levenshtein'))
libraries.register('Levenshtein', LibraryBase('pylev', 'levenshtein'))
libraries.register('Levenshtein', TextLibrary('jellyfish', 'levenshtein_distance'))
libraries.register('Levenshtein', TextLibrary('Levenshtein', 'distance'))
libraries.register('Levenshtein', TextLibrary('py_stringmatching.similarity_measure.levenshtein', 'levenshtein'))


# Separate manager holding a deep copy of everything registered on
# `libraries` up to this point.
not_optimized_libraries = LibrariesManager()
not_optimized_libraries.libs = deepcopy(libraries.libs)
# Jaro-Winkler implementations (prototype side)
prototype.register('JaroWinkler', TextLibrary('jellyfish', 'jaro_winkler', conditions=dict(winklerize=True)))
prototype.register('JaroWinkler', TextLibrary('Levenshtein', 'jaro_winkler', conditions=dict(winklerize=True)))

# Levenshtein implementations (prototype side)
prototype.register('Levenshtein', LibraryBase('abydos.distance', 'levenshtein'))
prototype.register('Levenshtein', LibraryBase('distance', 'levenshtein'))
prototype.register('Levenshtein', LibraryBase('pylev', 'levenshtein'))
prototype.register('Levenshtein', TextLibrary('jellyfish', 'levenshtein_distance'))
prototype.register('Levenshtein', TextLibrary('Levenshtein', 'distance'))
prototype.register('Levenshtein', TextLibrary('py_stringmatching.similarity_measure.levenshtein', 'levenshtein'))

0 comments on commit dd6ad5f

Please sign in to comment.