In [115]:
import heapq
import multiprocessing
from functools import partial
from string import ascii_lowercase

try:
    # The ABC aliases in `collections` were removed in Python 3.10.
    from collections.abc import Mapping, Iterable
except ImportError:
    from collections import Mapping, Iterable

import fuzzywuzzy
import numpy as np
from fuzzywuzzy import fuzz, process

from htools import BasicPipeline, item, magics

In [43]:
# slightly tweaked fuzzywuzzy library code
def extract(query, choices, processor=fuzzywuzzy.utils.full_process, scorer=fuzz.ratio, limit=5):
    """Return the highest-scoring fuzzy matches for `query` among `choices`.

    Thin wrapper around `process.extractWithoutOrder` that ranks the scored
    choices and keeps at most `limit` of them.

    Arguments:
        query: An object representing the thing we want to find.
        choices: An iterable or dictionary-like object containing choices
            to be matched against the query. Dictionary arguments of
            {key: value} pairs will attempt to match the query against
            each value.
        processor: Optional preprocessing function handed to
            `process.extractWithoutOrder`. Defaults to
            `fuzzywuzzy.utils.full_process`.
        scorer: Optional function for scoring matches between the query and
            an individual processed choice. This should be a function
            of the form f(query, choice) -> int. Defaults to `fuzz.ratio`.
        limit: Optional maximum for the number of elements returned. Defaults
            to 5. Pass None to get every scored choice.

    Returns:
        List of tuples sorted from best to worst score. The match is the
        first element of each tuple and its score is the second.
    """
    def score_of(result):
        # The score is the second element of every result tuple.
        return result[1]

    scored = process.extractWithoutOrder(query, choices, processor, scorer)
    if limit is None:
        return sorted(scored, key=score_of, reverse=True)
    return heapq.nlargest(limit, scored, key=score_of)

In [45]:
# Toy vocabulary: each word maps to an arbitrary integer id.
d = {
    'dog': 0,
    'cat': 1,
    'alley cat': 2,
    'pig': 3,
    'cow': 4,
    'cowbell': 5,
    'baby cow': 6,
}

In [116]:
def extract_unordered(query, choices, scorer=fuzz.ratio):
    """Score every choice against `query` using a pool of worker processes.

    Arguments:
        query: The string to look for. It is run through
            `fuzzywuzzy.utils.full_process` once before scoring.
        choices: Iterable of candidates. If the object has a `keys` method
            (e.g. a dict), its keys are used as the candidates. One-shot
            iterables such as generators are supported.
        scorer: Function of the form f(query, choice) -> score. Defaults to
            `fuzz.ratio`. It is sent to the worker processes, so it needs to
            be picklable.

    Returns:
        Iterator of (choice, score) pairs in the same order as the choices.
        The pairs are not ranked.
    """
    proc = fuzzywuzzy.utils.full_process
    query = proc(query)
    # Presumably BasicPipeline chains the callables left to right (process the
    # choice, then score it against the query) -- confirm against htools.
    full_scorer = BasicPipeline(proc, partial(scorer, query))
    # Materialize once: Pool.map consumes its iterable, so pairing the scores
    # with a generator afterwards would silently produce no results.
    candidates = list(choices.keys() if hasattr(choices, 'keys') else choices)
    if not candidates:
        # Nothing to score, so don't pay for spinning up a process pool.
        return iter(())
    with multiprocessing.Pool() as p:
        scores = p.map(full_scorer, candidates)
    return zip(candidates, scores)

In [117]:
def extract_parallel(query, choices, scorer=fuzz.ratio, limit=5):
    """Rank fuzzy matches for `query`, scoring the choices with `extract_unordered`.

    Arguments:
        query: The string to look for.
        choices: Candidates to score, passed through to `extract_unordered`.
        scorer: Function of the form f(query, choice) -> score. Defaults to
            `fuzz.ratio`.
        limit: Maximum number of results to return. Defaults to 5. Pass None
            to get every scored choice.

    Returns:
        List of (choice, score) pairs sorted from best to worst score.
    """
    def score_of(pair):
        return pair[1]

    scored = extract_unordered(query, choices, scorer)
    if limit is None:
        return sorted(scored, key=score_of, reverse=True)
    return heapq.nlargest(limit, scored, key=score_of)

## Testing speed

In [129]:
# Benchmark vocabulary settings. Edit these (rather than the code below) to
# reproduce the different "<KEY_LEN> chars, <N_KEYS> keys" runs.
KEY_LEN = 4       # characters per random key
N_KEYS = 30_000   # keys drawn; duplicate keys collapse, so len(d) can be smaller
SEED = 0          # fixed seed so every run benchmarks the same vocabulary

chars = list(ascii_lowercase)
rng = np.random.RandomState(SEED)
# Draw all key characters and all values in two vectorized calls instead of
# making 2 * N_KEYS tiny calls into numpy.
key_rows = rng.choice(chars, size=(N_KEYS, KEY_LEN)).tolist()
key_values = rng.randint(100, size=N_KEYS).tolist()
d = {''.join(row): value for row, value in zip(key_rows, key_values)}

32 chars, 30k keys

In [122]:
%%race -n 4 -r 4
extract('goat', d.keys(), limit=3)
extract_parallel('goat', d.keys(), limit=3)

139 ms ± 6.41 ms per loop (mean ± std. dev. of 4 runs, 4 loops each)
138 ms ± 11.3 ms per loop (mean ± std. dev. of 4 runs, 4 loops each)


16 chars, 30k keys

In [124]:
%%race -n 4 -r 4
extract('goat', d.keys(), limit=3)
extract_parallel('goat', d.keys(), limit=3)

133 ms ± 3.07 ms per loop (mean ± std. dev. of 4 runs, 4 loops each)
138 ms ± 12 ms per loop (mean ± std. dev. of 4 runs, 4 loops each)


12 chars, 30k keys

In [126]:
%%race -n 4 -r 4
extract('goat', d.keys(), limit=3)
extract_parallel('goat', d.keys(), limit=3)

129 ms ± 3.47 ms per loop (mean ± std. dev. of 4 runs, 4 loops each)
140 ms ± 10.6 ms per loop (mean ± std. dev. of 4 runs, 4 loops each)


8 chars, 30k keys

In [128]:
%%race -n 4 -r 4
extract('goat', d.keys(), limit=3)
extract_parallel('goat', d.keys(), limit=3)

125 ms ± 3.99 ms per loop (mean ± std. dev. of 4 runs, 4 loops each)
141 ms ± 10.2 ms per loop (mean ± std. dev. of 4 runs, 4 loops each)


4 chars, 30k keys

In [130]:
%%race -n 4 -r 4
extract('goat', d.keys(), limit=3)
extract_parallel('goat', d.keys(), limit=3)

119 ms ± 4.64 ms per loop (mean ± std. dev. of 4 runs, 4 loops each)
139 ms ± 12.8 ms per loop (mean ± std. dev. of 4 runs, 4 loops each)


## 100k key tests

12 chars, 100k keys

In [100]:
%%race -n 3 -r 3
extract('goat', d.keys(), limit=3)
extract_parallel('goat', d.keys(), limit=3)

460 ms ± 15.6 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)
412 ms ± 27.1 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


8 chars, 100k keys

In [94]:
%%race -n 3 -r 3
extract('goat', d.keys(), limit=3)
extract_parallel('goat', d.keys(), limit=3)

476 ms ± 49 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)
351 ms ± 1.78 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


4 chars, 100k keys

In [102]:
%%race -n 3 -r 3
extract('goat', d.keys(), limit=3)
extract_parallel('goat', d.keys(), limit=3)

473 ms ± 61.3 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)
317 ms ± 48.5 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


2 chars, 100k keys

In [105]:
%%race -n 3 -r 3
extract('goat', d.keys(), limit=3)
extract_parallel('goat', d.keys(), limit=3)

6.45 ms ± 1.52 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)
132 ms ± 8.12 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


### Conclusion

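For an enormous vocabulary (100k keys), multiprocessing helps a little (roughly 320–410 ms vs. 460–475 ms per call), but for our typical use case the default implementation should be fine. At 30k keys the default is as fast or slightly faster (119–139 ms vs. ~140 ms), and it is far faster for small vocabularies: with 2-character keys at most 26² = 676 distinct keys can exist, and the default took 6.45 ms vs. 132 ms for the parallel version.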

In [50]:
# class FuzzyKeyDict(dict):
#     """Dictionary that will try to find similar keys if a key is missing and
#     return their corresponding values. This could be useful when working with
#     embeddings, where we could try mapping missing words to a combination of
#     existing words.
    
#     Examples
#     --------
#     d = FuzzyKeyDict(limit=3, verbose=True)
#     d['dog'] = 0
#     d['cat'] = 1
#     d['alley cat'] = 2
#     d['pig'] = 3
#     d['cow'] = 4
#     d['cowbell'] = 5
#     d['baby cow'] = 6
    
#     # Keys and similarity scores are displayed because we're in verbose mode.
#     >>> res = d['house cat']
#     [('alley cat', 56), ('cat', 50), ('cowbell', 25)]
    
#     # Values correspond to d['alley cat'], d['cat'], d['cowbell'].
#     >>> res
#     [2, 1, 5]
    
#     # "cat" is in our dict so no similarity scores are printed and output is
#     # an integer, not a list. 
#     >>> d['cat']
#     1
#     """
    
#     def __init__(self, limit=3, verbose=False):
#         """
#         Parameters
#         ----------
#         limit: int
#             Number of similar keys to find when trying to retrieve the value
#             for a missing key.
#         verbose: bool
#             If True, this will print the similar keys and their similarity to
#             the queried key when trying to retrieve a missing key.
#         """
#         self.limit = limit
#         self.verbose = verbose

#     def __getitem__(self, key):
#         """
#         Returns
#         -------
#         any or list[any]: If key is present in dict, the corresponding value
#             is returned. If not, the n closest keys are identified and their
#             corresponding values are returned in a list (where n is defined
#             by the `limit` argument specified in the constructor). Values are
#             sorted in descending order by the neighboring keys' similarity to 
#             the missing key in.
#         """
#         try:
#             return super().__getitem__(key)
#         except KeyError:
#             res = process.extract(key, self.keys(), limit=self.limit,
#                                   scorer=fuzz.ratio)
#             if self.verbose: print(res)
#             return [self[k] for k, v in res]

class FuzzyKeyDict(dict):
    """Dictionary that will try to find similar keys if a key is missing and
    return their corresponding values. This could be useful when working with
    embeddings, where we could try mapping missing words to a combination of
    existing words.

    Only item access (`d[key]`) is fuzzy. Membership tests (`key in d`) are
    not overridden and remain exact.

    Examples
    --------
    >>> d = FuzzyKeyDict(limit=3)
    >>> d['dog'] = 0
    >>> d['cat'] = 1
    >>> d['alley cat'] = 2
    >>> d['pig'] = 3
    >>> d['cow'] = 4
    >>> d['cowbell'] = 5
    >>> d['baby cow'] = 6

    # "whale" is missing, so we get the values of the 3 most similar keys,
    # most similar first.
    >>> d['whale']
    [2, 5, 1]

    # Inspect which keys were matched, optionally with their similarity
    # scores.
    >>> d.similar_keys('whale')
    ['alley cat', 'cowbell', 'cat']
    >>> d.similar_keys('whale', return_distances=True)
    [('alley cat', 43), ('cowbell', 33), ('cat', 25)]

    # "cat" is in our dict so the output is its value, not a list.
    >>> d['cat']
    1
    >>> 'whale' in d
    False
    """

    def __init__(self, data=None, limit=3):
        """
        Parameters
        ----------
        data: Mapping or Iterable (optional)
            A dictionary-like object or an iterable of (key, value) pairs,
            such as a list of tuples. If provided, this will be used to
            populate the FuzzyKeyDict. Any other value (including None)
            leaves the dict empty.
        limit: int
            Number of similar keys to find when trying to retrieve the value
            for a missing key.
        """
        super().__init__()
        self.limit = limit
        # Normalize mappings to (key, value) pairs so a single loop handles
        # both supported input kinds.
        if isinstance(data, Mapping):
            data = data.items()
        if isinstance(data, Iterable):
            for k, v in data:
                self[k] = v

    def __getitem__(self, key):
        """
        Returns
        -------
        any or list[any]: If key is present in dict, the corresponding value
            is returned. If not, the n closest keys are identified and their
            corresponding values are returned in a list (where n is defined
            by the `limit` argument specified in the constructor). Values are
            sorted in descending order by the neighboring keys' similarity to
            the missing key.
        """
        # Bound once outside the comprehension: zero-argument super() is not
        # usable inside a comprehension on older Python versions.
        lookup = super().__getitem__
        try:
            return lookup(key)
        except KeyError:
            # The matched keys come from self.keys(), so plain dict lookup is
            # enough and we never re-enter the fuzzy fallback.
            return [lookup(k) for k in self.similar_keys(key)]

    def similar_keys(self, key, return_distances=False):
        """Find the existing keys that are most similar to `key`.

        Parameters
        ----------
        key: str
            Query to compare against every key in the dict using
            `fuzz.ratio`.
        return_distances: bool
            If True, return the pairs produced by `process.extract` (the key
            followed by its score) instead of only the keys. Despite the
            name, the scores are similarities: a higher score means a closer
            match.

        Returns
        -------
        list: At most `self.limit` keys, best match first, or the
            corresponding (key, score) pairs when `return_distances` is True.
        """
        pairs = process.extract(key, self.keys(), limit=self.limit,
                                scorer=fuzz.ratio)
        if return_distances:
            return pairs
        return [pair[0] for pair in pairs]

In [2]:
process.extract??

In [275]:
d = FuzzyKeyDict()
d

{}

In [276]:
basic = {x: i for i, x in enumerate('abcdefg')}
print(basic)

fd = FuzzyKeyDict(basic)
print(fd)

'a' in fd, 'z' in fd

{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6}
{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6}


(True, False)

In [277]:
fd['a'], fd['adder']

(0, [0, 3, 4])

In [279]:
tups = list(basic.items())
print(tups)

fd = FuzzyKeyDict(tups)
fd

[('a', 0), ('b', 1), ('c', 2), ('d', 3), ('e', 4), ('f', 5), ('g', 6)]


{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6}

In [281]:
fd['a'], fd['baag']

(0, [0, 1, 6])

In [228]:
# Add the toy vocabulary to `d` in a single call (same keys, values and
# insertion order as assigning the items one by one).
d.update({
    'dog': 0,
    'cat': 1,
    'alley cat': 2,
    'pig': 3,
    'cow': 4,
    'cowbell': 5,
    'baby cow': 6,
})

d

{'dog': 0,
 'cat': 1,
 'alley cat': 2,
 'pig': 3,
 'cow': 4,
 'cowbell': 5,
 'baby cow': 6}

In [229]:
d['cat']

1

In [230]:
'cat' in d.keys()

True

In [231]:
d['whale']

[2, 5, 1]

In [232]:
d.similar_keys('whale')

['alley cat', 'cowbell', 'cat']

In [233]:
d.similar_keys('whale', return_distances=True)

[('alley cat', 43), ('cowbell', 33), ('cat', 25)]

In [234]:
d['house cat']

[2, 1, 5]

In [235]:
d['piglet']

[3, 2, 0]

In [236]:
'piglet' in d

False

In [237]:
'pig' in d

True

In [238]:
d['pig']

3

 ## With list values

In [240]:
# FuzzyKeyDict.__init__ only accepts `data` and `limit`; passing the old
# `verbose` flag raises a TypeError, so start from a default instance.
d = FuzzyKeyDict()
d

{}

In [241]:
# Same toy vocabulary as before, but every value is a two-element list.
d.update({
    'dog': [0, 1],
    'cat': [1, 2],
    'alley cat': [2, 3],
    'pig': [3, 4],
    'cow': [4, 5],
    'cowbell': [5, 6],
    'baby cow': [6, 7],
})

d

{'dog': [0, 1],
 'cat': [1, 2],
 'alley cat': [2, 3],
 'pig': [3, 4],
 'cow': [4, 5],
 'cowbell': [5, 6],
 'baby cow': [6, 7]}

In [242]:
d['cat']

[1, 2]

In [243]:
d['whale']

[[2, 3], [5, 6], [1, 2]]

In [244]:
d['house cat']

[[2, 3], [1, 2], [5, 6]]

In [245]:
d['piglet']

[[3, 4], [2, 3], [0, 1]]

In [246]:
d['pig'][0] = 111
d

{'dog': [0, 1],
 'cat': [1, 2],
 'alley cat': [2, 3],
 'pig': [111, 4],
 'cow': [4, 5],
 'cowbell': [5, 6],
 'baby cow': [6, 7]}

In [247]:
d['piglet'] = 99
d

{'dog': [0, 1],
 'cat': [1, 2],
 'alley cat': [2, 3],
 'pig': [111, 4],
 'cow': [4, 5],
 'cowbell': [5, 6],
 'baby cow': [6, 7],
 'piglet': 99}

## Scratch

In [248]:
# Load the raw text used to build a larger test vocabulary below.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that file exists -- consider a configurable data directory.
with open('/Users/hmamin/corporate_ipsum.txt', 'r') as f:
    text = f.read()

In [249]:
np.random.choice(list(ascii_lowercase), 2)

array(['i', 'z'], dtype='<U1')

In [250]:
# Repeat the corpus 100 times, giving every word a random two-letter suffix
# so that most of the resulting words are distinct.
words = [
    f"{w}{''.join(np.random.choice(list(ascii_lowercase), 2))}"
    for _ in range(100)
    for w in text.split(' ')
]
len(words), len(set(words))

(32900, 27839)

In [252]:
%%timeit -n 5 -r 5
process.extract('people', words)

1.57 s ± 233 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [253]:
%%timeit -n 5 -r 5
process.extractBests('people', words)

1.42 s ± 97.2 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [254]:
%%timeit -n 5 -r 5
process.extractOne('people', words)

1.78 s ± 236 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
