In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from tldextract import extract

from htools import *

In [2]:
# Resolve the CSV location under the user's home directory, then load it.
path = Path('~/DatascienceBase/Delphi/v3-0-0/data/alpha_test/'
            'beacon_urls_851105.csv').expanduser()
df = pd.read_csv(path)

In [43]:
# Normalize every FQDN by dropping the 'http://' scheme and any 'www.' /
# 'wwwN.' fragment, then keep the sorted, de-duplicated, non-empty results.
normalized = (
    df.fqdn
    .str.replace('http://', '', regex=False)
    # Raw string + explicit regex=True: pandas >= 2.0 treats the pattern as a
    # literal by default, which would silently leave every 'www.' in place.
    .str.replace(r'www\d?\.', '', regex=True)
)
fqdns = [f for f in sorted(normalized.unique()) if f]
len(fqdns)

  


11111

In [46]:
def preprocess_fqdn(fqdn):
    """Strip the public suffix from a domain and split its labels on spaces.

    Parameters
    ----------
    fqdn: str
        Domain such as 'classroom.google.com'.

    Returns
    -------
    str: Everything before the last occurrence of the suffix reported by
        `extract`, with dots replaced by spaces and surrounding whitespace
        removed, e.g. 'classroom google'. If no suffix is found, the whole
        input is used.
    """
    suffix = extract(fqdn).suffix
    # str.rpartition('') raises ValueError, so only split when a suffix was
    # actually recognized (e.g. not for 'localhost' or bare IP addresses).
    stem = fqdn.rpartition(suffix)[0] if suffix else fqdn
    return stem.replace('.', ' ').strip()

In [47]:
# Forward map: raw FQDN -> preprocessed (suffix-free, space-separated) form.
raw2proc = dict(zip(fqdns, map(preprocess_fqdn, fqdns)))
# Reverse map: preprocessed form -> raw FQDN. When several raw FQDNs share a
# preprocessed form, the one visited last wins.
proc2raw = {proc: raw for raw, proc in raw2proc.items()}

# Some domains resolve to the same preprocessed domain. Handle later.
len(raw2proc), len(proc2raw)

(11111, 10978)

In [50]:
# Fuzzy lookup table keyed on the preprocessed domains (values are the raw
# FQDNs), scored with fuzzywuzzy's token_sort_ratio.
fd = FuzzyKeyDict(proc2raw, scorer=fuzz.token_sort_ratio)

In [80]:
def nearest_match(query, threshold=75, fuzzy=fd):
    """Map a URL or free-text query to the closest known raw FQDN.

    Parameters
    ----------
    query: str
        Either a URL-like string (contains a dot and no spaces) or a
        free-text name such as 'google classroom'.
    threshold: int
        Minimum similarity score a fuzzy match must reach to be returned.
    fuzzy: FuzzyKeyDict
        Maps preprocessed domains to raw FQDNs. The default is bound to `fd`
        at the time this function is defined.

    Returns
    -------
    str: `query` itself if it is already one of the raw FQDNs; otherwise the
        best fuzzy match if its score is at least `threshold`, else ''.
    """
    if query in fuzzy.values():
        print('Found exact match.')
        return query
    if ' ' in query or '.' not in query:
        # Free text: compare against the preprocessed keys using the scorer
        # configured on `fuzzy`.
        print('Non URL. Looking for fuzzy match with token_sort_ratio.')
        _, match, score = fuzzy.similar(query,
                                        mode='keys_values_similarities')[0]
    else:
        # URL-like: apply the same scheme/www stripping used to build the
        # FQDN list, then compare against the raw FQDNs.
        print('Non-present URL. Looking for fuzzy match with default ratio.')
        # Raw string: '\d' in a regular literal is an invalid escape sequence.
        query = re.sub(r'www\d?\.', '', query.replace('http://', ''))
        best = process.extractOne(query, fuzzy.values())
        # extractOne returns None when there is nothing to compare against.
        if best is None:
            print('No candidates available.')
            return ''
        match, score = best
    print('score:', score)
    return match if score >= threshold else ''

In [82]:
# Probe queries covering exact URLs, URL variants, free-text names and typos.
queries = [
    # URL-like queries
    'classroom.google.com',
    'www.classroom.google.com',
    'www.espn.com',
    'espn.go.com',
    # Free-text queries
    'google classroom',
    'google meet',
    'google docs',
    'clever',
    'youtube',
    # Deliberately imperfect queries
    'khanacademy.com',   # Wrong suffix
    'kahnacademy.org',   # Typo
    'kahnacademy'        # Typo and no suffix
]
for query in queries:
    print('query:', query)
    result = nearest_match(query)
    print(result)
    print(spacer())

query: classroom.google.com
Found exact match.
classroom.google.com

-------------------------------------------------------------------------------

query: www.classroom.google.com
Non-present URL. Looking for fuzzy match with default ratio.
score: 100
classroom.google.com

-------------------------------------------------------------------------------

query: www.espn.com
Non-present URL. Looking for fuzzy match with default ratio.
score: 100
espn.com

-------------------------------------------------------------------------------

query: espn.go.com
Non-present URL. Looking for fuzzy match with default ratio.
score: 95
espn.com

-------------------------------------------------------------------------------

query: google classroom
Non URL. Looking for fuzzy match with token_sort_ratio.
score: 100
classroom.google.com

-------------------------------------------------------------------------------

query: google meet
Non URL. Looking for fuzzy match with token_sort_ratio.
score: 100

## Extend LSHDict

See if I can tweak `LSHDict` so that it works on lists of strings (e.g. a domain split into tokens) rather than on single strings.

UPDATE: Seems like this would require changes sufficiently large to justify a separate implementation (if I decide it's worth it at some point). Don't try to shoehorn it into this implementation: think of this as StrLSHDict.

In [89]:
from itertools import islice

from htools.structures import _FuzzyDictBase

In [113]:
class LSHDict(_FuzzyDictBase):
    """Dictionary that returns the value corresponding to a key's nearest
    neighbor if the key isn't present in the dict. This is intended for use
    as a word2index dict when using embeddings in deep learning: e.g. if we
    have domain embeddings for the top 100k websites, some of our options for
    dealing with unknown domains are:

    1. Encode all of them as <UNK>. This loses a lot of information.
    2. Create a FuzzyKeyDict which will search for similar keys using variants
    of Levenshtein distance. Lookup is O(N) and for 100k domains, that comes
    out to 0.6 seconds per item. We might have thousands or millions of
    lookups over the course of training so this can be a significant cost.
    3. Create an LSHDict (lookups are O(1)). Indexing into the dict as usual
    (e.g. my_lsh_dict[key]) will provide the key's index if present and the
    (approximate) nearest neighbor's index otherwise. Either way, the result
    can be used to index into your embedding layer.
    4. Create an LSHDict and use the `similar` method to return n>1
    neighbors. Then pass their indices to an Embedding layer and
    compute the sum/average/weighted average of the results. This may be
    preferable to #3 in cases such as web domain lookup, where similar URLs
    are not guaranteed to represent similar sites. (This is basically
    equivalent to an EmbeddingBag layer, but in torch that doesn't store
    intermediate representations so we wouldn't be able to use our pretrained
    embeddings.)

    LSHDict does NOT support pickling as of version 6.0.6 (note: setitem seems
    to be called before init when unpickling, meaning we try to access
    self.forest in self._update_forest before it's been defined. Even if we
    change setitem so reindexing does not occur by default, it still tries to
    hash the new word and add it to the forest so unpickling will still fail).

    NOTE(review): hashing accepts token sequences as keys, but the candidate
    scoring step in `similar` goes through a fuzzywuzzy scorer, which
    presumably expects strings - fuzzy lookups with non-string keys should be
    expected to fail unless a compatible `scorer` is supplied.
    """

    def __init__(self, data, n_candidates=None, n_keys=3, ngram_size=3,
                 scorer=fuzz.ratio):
        """
        Parameters
        ----------
        data: dict or list[tuple]
            The base dictionary. Unlike FuzzyKeyDict, we require this since
            adding items one by one is computationally infeasible for large
            datasets. Just build up your dictionary first.
        n_candidates: int or None
            Number of reasonably similar keys to retrieve when trying to index
            in with a key that's missing (or when using the `similar` method).
            You can override this in `similar` but not when using
            __getitem__'s square bracket syntax. If not specified, this will
            be auto initialized to vocab size/1,000, clipped to lie in
            [20, 500]. See `similar` docstring for more on this.
        n_keys: int
            Default number of similar keys to retrieve in `similar`.
        ngram_size: int
            Size of the n-grams each key is broken into before hashing
            (forwarded to `ngrams` as `n`).
        scorer: function
            Default scoring function to use to narrow `n_candidates` keys down
            to `n_keys`. Should be a fuzzywuzzy function where scores lie in
            [0, 100] and higher values indicate high similarity.
        """
        # Peek at the first key without assuming `data` is non-empty. For a
        # list of (key, value) pairs the first element is the pair, not the
        # key, so unwrap it (mappings are recognized the same way dict()
        # recognizes them: by having a `keys` method).
        first_key = next(iter(data), None)
        if first_key is not None and not hasattr(data, 'keys'):
            first_key = first_key[0]
        if len(data) < 10_000 and (first_key is None
                                   or len(first_key) < 100):
            warnings.warn(
                'It looks like you\'re working with a relatively small '
                'amount of data. FuzzyKeyDict may be fast enough for your '
                'use case and would provide the set of strictly most similar '
                'keys rather than an approximation of that set.'
            )

        super().__init__(data)
        self.scorer = scorer
        self.hash_word = partial(self.lsh_hash_word, n=ngram_size)
        self.forest = MinHashLSHForest(num_perm=128)
        self._initialize_forest()

        # Datasketch's LSH implementation usually gives pretty decent results
        # even with numbers as low as 5-10, but increasing that by a factor of
        # 10 comes with minimal time cost: Fuzzywuzzy matching doesn't get
        # particularly slow until we get into the thousands. The fact that
        # we cap this at 500 makes this lookup asymptotically O(1) while
        # FuzzyKeyDict's is O(N).
        # int() keeps the attribute a plain Python int rather than a numpy
        # scalar.
        self.n_candidates = n_candidates or int(
            np.clip(len(self) // 1_000, 20, 500)
        )
        self.n_keys = n_keys

    def __setitem__(self, key, val):
        """Try to add keys all at once in the constructor because adding new
        keys can be extremely slow.

        Overwriting the value of an existing key leaves the forest untouched:
        the forest only stores keys, and datasketch refuses to add a key a
        second time.
        """
        # Exact membership check, done before the dict is modified.
        is_new_key = key not in self
        super().__setitem__(key, val)
        if is_new_key: self._update_forest(key, val)

    def _update_forest(self, key, val, index=True):
        """Used in __setitem__ to update our LSH Forest. Forest's index method
        seems to recompute everything so adding items to a large LSHDict will
        be incredibly slow. Luckily, our deep learning use case rarely/never
        requires us to update object2index dicts after instantiation so that's
        not as troubling as it might seem.

        Parameters
        ----------
        key: str
        val: any
            Unused here: only the key is hashed and stored in the forest.
        index: bool
            If True, reindex the forest (essentially making the key
            queryable). This should be False when initializing the forest so
            we just index once after everything's been added.
        """
        self.forest.add(key, self.hash_word(key))
        if index: self.forest.index()

    def _initialize_forest(self):
        """Called once in __init__ to add all items to LSH Forest. This is
        necessary because dict specifically calls its own __setitem__, not
        its children's.
        """
        for k, v in self.items():
            self._update_forest(k, v, False)
        # Index a single time once every key has been added.
        self.forest.index()

    @add_docstring(_FuzzyDictBase._filter_similarity_pairs)
    def similar(self, key, mode='keys_values', n_candidates=None,
                n_keys=None, scorer=None):
        """Find a list of similar keys. This is used in __getitem__ but can
        also be useful as a user-facing method if you want to get more than
        1 neighbor or you want to get similarity scores as well.

        Parameters
        ----------
        key: str
            Word/URL/etc. to find similar keys to.
        mode: str
            See section below `Returns`.
        n_candidates: int or None
            Number of similar candidates to retrieve. This uses Jaccard
            Similarity which isn't always a great metric for string
            similarity. This is also where the LSH comes in so they're not
            strictly the n best candidates, but rather a close approximation
            of that set. If None, this will fall back to self.n_candidates.
            Keep in mind this determines how many keys are passed to `scorer`,
            so very large values make lookups slower.
        n_keys: int or None
            Number of similar keys to return. If None, this will fall back to
            self.n_keys.
        scorer: function or None
            Fuzzywuzzy scoring function, e.g. fuzz.ratio or
            fuzz.partial_ratio, which will be used to score each candidate and
            select which to return. Higher scores indicate higher levels of
            similarity. If None, this will fall back to self.scorer.

        Returns
        -------
        list: List if `mode` is "keys" or "values". List of tuples otherwise.

        Raises
        ------
        KeyError: If the forest returns no candidates for `key`.
        """
        candidates = self.forest.query(self.hash_word(key),
                                       n_candidates or self.n_candidates)
        if not candidates: raise KeyError('No similar keys found.')

        # List of (key, score) where higher means more similar.
        pairs = process.extract(key, candidates,
                                limit=n_keys or self.n_keys,
                                scorer=scorer or self.scorer)
        return self._filter_similarity_pairs(pairs, mode=mode)

    @staticmethod
    @add_docstring(ngrams)
    def lsh_hash_word(word, num_perm=128, **ngram_kwargs):
        """Hash an input word and return a MinHash object that can be
        added to an LSHForest.

        Parameters
        ----------
        word: str or sequence of str
            Word to hash. A sequence of tokens (e.g. a tuple of strings) is
            hashed by its token n-grams.
        num_perm: int
            Forwarded to the MinHash constructor.
        ngram_kwargs: any
            Forwarded to `ngrams`.

        Returns
        -------
        datasketch MinHash object
        """
        mhash = MinHash(num_perm=num_perm)
        for ng in ngrams(word, **ngram_kwargs):
            # String input yields str n-grams, which are encoded directly.
            # Token sequences yield tuple/list n-grams; join the tokens so
            # n-grams of any length can be encoded (a 1-token n-gram hashes
            # exactly like the bare token).
            if not isinstance(ng, str): ng = ' '.join(ng)
            mhash.update(ng.encode('utf8'))
        return mhash

In [99]:
# Small hand-picked subset of proc2raw, re-keyed by the tuple of
# whitespace-separated tokens of each preprocessed domain.
chunks2raw = dict(
    (tuple(k.split()), v)
    for k, v in select(
        proc2raw,
        ['classroom google', 'khanacademy', 'meet google', 'google',
         'docs google']
    ).items()
)
chunks2raw

{('classroom', 'google'): 'classroom.google.com',
 ('khanacademy',): 'khanacademy.org',
 ('meet', 'google'): 'meet.google.com',
 ('google',): 'google.it',
 ('docs', 'google'): 'docs.google.com'}

In [107]:
# Unigrams of a tuple key come back as 1-tuples of tokens.
ngrams(list(chunks2raw)[0], n=1)

[('classroom',), ('google',)]

In [108]:
# With n=2, the two-token key yields a single 2-tuple.
ngrams(list(chunks2raw)[0], n=2)

[('classroom', 'google')]

In [109]:
# For a plain string, the n-grams are substrings (length 3 by default here).
ngrams('abcd')

['abc', 'bcd']

In [104]:
# A list input yields list n-grams rather than tuples.
ngrams(['abc'], n=1)

[['abc']]

In [118]:
# ngram_size=1 hashes each key by its 1-grams; n_candidates=1 retrieves a
# single candidate per fuzzy lookup.
lsh = LSHDict(chunks2raw, ngram_size=1, n_candidates=1)



In [119]:
# Show the stored items: keys are the token tuples.
lsh

{('classroom', 'google'): 'classroom.google.com',
 ('khanacademy',): 'khanacademy.org',
 ('meet', 'google'): 'meet.google.com',
 ('google',): 'google.it',
 ('docs', 'google'): 'docs.google.com'}

In [126]:
# The tokens are stored in the order ('classroom', 'google'), so this key is
# not an exact hit and indexing goes through the nearest-neighbor lookup.
# NOTE(review): the TypeError below presumably comes from the string scorer
# being handed tuples - confirm against the traceback.
key = ('google', 'classroom')
lsh[key]

TypeError: expected string or bytes-like object

In [128]:
# Query the LSH forest directly, skipping the scoring step that `similar`
# applies to the candidates.
match = lsh.forest.query(lsh.hash_word(key), lsh.n_candidates)
print(key)
print(match)

('google', 'classroom')
[('classroom', 'google')]


In [135]:
# `match` is the whole candidate list returned by the forest query, not a
# single key, which is what triggers the "unhashable type: 'list'" error.
# NOTE(review): probably meant one (candidate, score) pair per candidate,
# e.g. [(m, -1) for m in match] - confirm.
lsh._filter_similarity_pairs(((match, -1),), mode='keys_values')

TypeError: unhashable type: 'list'