In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from tldextract import extract

from htools import *

In [2]:
# Beacon URL export. The leading `~` is resolved to the current user's home
# directory before the CSV is read.
path = Path(
    '~/DatascienceBase/Delphi/v3-0-0/data/alpha_test/beacon_urls_851105.csv'
)
path = path.expanduser()
df = pd.read_csv(path)

In [3]:
# Sorted unique FQDNs, skipping any value that is empty once the 'http://'
# and 'www' fragments are stripped out of it.
fqdns = list(filter(
    lambda name: name.replace('http://', '').replace('www', ''),
    sorted(df.fqdn.unique()),
))
len(fqdns)

11182

In [4]:
def preprocess_fqdn(fqdn):
    """Turn a raw FQDN/URL string into a space-separated search key.

    The scheme, a leading ``www.`` label and the public suffix (as reported
    by ``tldextract.extract``) are removed, and the remaining dots become
    spaces, e.g. ``'http://classroom.google.com'`` -> ``'classroom google'``
    and ``'http://www.youtube.com'`` -> ``'youtube'``.

    Parameters
    ----------
    fqdn : str
        Raw domain string, optionally prefixed with ``http://`` or
        ``https://``.

    Returns
    -------
    str
        The cleaned name with labels separated by single spaces. May be an
        empty string if nothing is left after stripping.
    """
    # Resolve the suffix from the untouched input so the extractor sees the
    # full URL.
    suff = extract(fqdn).suffix

    # Strip only a *leading* scheme; a blanket replace would also touch
    # matches elsewhere in the string.
    host = fqdn
    for scheme in ('http://', 'https://'):
        if host.startswith(scheme):
            host = host[len(scheme):]
            break

    # Strip only a leading 'www.' label. Replacing every 'www' substring
    # mangles names that merely contain it (e.g. 'awwwards').
    if host.startswith('www.'):
        host = host[len('www.'):]

    # The suffix is empty for inputs without a recognised public suffix
    # (bare hostnames, IP addresses). str.rpartition('') raises ValueError,
    # so only cut the suffix when there is one.
    if suff:
        host = host.rpartition(suff)[0]

    return host.replace('.', ' ').strip()

In [5]:
# Forward map: raw FQDN -> preprocessed search key.
raw2proc = dict(zip(fqdns, map(preprocess_fqdn, fqdns)))

# Reverse map: preprocessed key -> raw FQDN. Several raw FQDNs can share one
# preprocessed key; the last one seen wins here, so this map can be smaller
# than the forward one. Handle later.
proc2raw = dict(zip(raw2proc.values(), raw2proc.keys()))

len(raw2proc), len(proc2raw)

(11182, 10985)

In [6]:
# Fuzzy-matching lookup table: keys are the preprocessed names, values are the
# raw FQDNs, and key similarity is scored with `fuzz.token_sort_ratio`.
# NOTE(review): `FuzzyKeyDict` and `fuzz` are not imported by name in the
# visible cells; presumably both come from `from htools import *`. Confirm.
fd = FuzzyKeyDict(proc2raw, scorer=fuzz.token_sort_ratio)

In [8]:
# Query with a raw URL. It is not a key of `fd` (keys are preprocessed names
# without the 'http://' prefix), so the result comes from fuzzy matching.
fd['http://www.classroom.google.com']

'http://classroom.google.com'

In [9]:
# Query words are in the opposite order from the key that preprocess_fqdn
# produces for classroom.google.com ('classroom google').
fd['google classroom']

'http://classroom.google.com'

In [10]:
# Same words, in the order preprocess_fqdn emits them for
# classroom.google.com.
fd['classroom google']

'http://classroom.google.com'

In [15]:
# Single-word query.
fd['youtube']

'http://www.youtube.com'

In [16]:
# Inspect the closest matches for a reordered two-word query; this mode
# returns (key, value, similarity) tuples.
fd.similar('google classroom', mode='keys_values_similarities')

[('classroom google', 'http://classroom.google.com', 100),
 ('classroom ozobot', 'http://classroom.ozobot.com', 75),
 ('classroom freckle', 'http://classroom.freckle.com', 73)]

In [17]:
# Same inspection for another reordered two-word query, to see which sibling
# subdomains rank just below the best match.
fd.similar('google meet', mode='keys_values_similarities')

[('meet google', 'http://meet.google.com', 100),
 ('keep google', 'http://keep.google.com', 82),
 ('messages google', 'http://messages.google.com', 77)]

In [18]:
# Single-word query: shows which near-spellings of the name score closest.
fd.similar('clever', mode='keys_values_similarities')

[('clever', 'http://clever.com', 100),
 ('cliver', 'http://cliver.site', 83),
 ('cleverism', 'http://www.cleverism.com', 80)]