# Fuzzy String Matching

### Import Modules

In [1]:
import requests
import bs4
import multiprocessing as mp
import numpy as np
import time
from difflib import SequenceMatcher

### Get Sample Data

In [2]:
# Download the SEC company list page. The timeout keeps the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() surfaces HTTP
# errors here instead of silently parsing an error page into an empty list.
sec_list_response = requests.get('https://www.sec.gov/rules/other/4-460list.htm', timeout=30)
sec_list_response.raise_for_status()
sec_list_html = sec_list_response.content
sec_list_soup = bs4.BeautifulSoup(sec_list_html, 'lxml')
# Keep every <tr> except the first and last one
# (presumably header/footer rows -- verify against the page).
companies_list = sec_list_soup.find_all('tr')[1:-1]
# Reduce each row to its stripped, lower-cased text.
companies_list_clean = [x.text.strip().lower() for x in companies_list]

In [3]:
# Preview the first five cleaned company names.
companies_list_clean[:5]

['3com corp',
 '3m company',
 'a.g. edwards inc.',
 'abbott laboratories',
 'abercrombie & fitch co.']

In [4]:
# Number of cleaned company names.
len(companies_list_clean)

947

### Acak beberapa huruf untuk uji coba fuzzy string matching

In [5]:
def switch_3_characters_randomly(name, seed = None):
    """Return ``name`` with randomly chosen characters exchanged.

    Three distinct positions are drawn with ``np.random.choice`` and the
    characters at those positions are reversed in order. Reversing three
    items leaves the middle one in place, so in effect the characters at
    the first and third drawn positions trade places.

    Args:
        name: String to perturb.
        seed: Optional seed handed to ``np.random.seed`` before drawing.
            Any value other than ``None`` (including ``0``) is applied.
            This reseeds NumPy's global legacy generator.

    Returns:
        The perturbed string, or ``name`` unchanged when it has fewer than
        three characters.
    """
    # Compare against None rather than relying on truthiness so seed=0 works.
    if seed is not None:
        np.random.seed(seed)
    # Three distinct positions cannot be drawn from a shorter string
    # (np.random.choice would raise ValueError), so return it untouched.
    if len(name) < 3:
        return name
    name_split = list(name)
    # The middle index is still drawn so the random stream is consumed
    # exactly as before; its character stays where it is.
    first, _middle, last = np.random.choice(len(name), 3, replace = False)
    name_split[first], name_split[last] = name_split[last], name_split[first]
    return ''.join(name_split)

In [6]:
# Perturb every cleaned company name to build the synthetic test set,
# then preview the first five results.
synthetic_companies_list = [
    switch_3_characters_randomly(company) for company in companies_list_clean
]
synthetic_companies_list[:5]

['3cmo corp',
 'om c3mpany',
 'a.g. edwardsni c.',
 'abbott larobatories',
 'abercrohbie & fitcm co.']

In [7]:
# Number of synthetic (perturbed) company names.
len(synthetic_companies_list)

947

### Fungsi Fuzzy String Matching

In [8]:
def match_ratio(name1, name2):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of two names.

    ``name1`` is used as the first sequence and ``name2`` as the second;
    no junk filter is applied.
    """
    return SequenceMatcher(isjunk=None, a=name1, b=name2).ratio()

In [9]:
def get_basic_fuzzy_matches(synthetic_name, threshold = 0.75, candidates = None):
    """Find candidate names whose similarity to ``synthetic_name`` exceeds a threshold.

    Args:
        synthetic_name: The (perturbed) name to look up.
        threshold: A candidate is kept only when its ratio is strictly
            greater than this value.
        candidates: Iterable of names to compare against. When ``None``
            the module-level ``companies_list_clean`` is used, which is
            what single-argument callers such as ``map`` rely on.

    Returns:
        A list of ``(synthetic_name, candidate, ratio)`` tuples, one per
        candidate above the threshold, in candidate order. If nothing
        qualifies the list holds the single placeholder
        ``(synthetic_name, None, 0)``.
    """
    if candidates is None:
        candidates = companies_list_clean

    # SequenceMatcher caches its analysis of the second sequence, so keep
    # synthetic_name fixed there and swap only the first sequence per
    # candidate. The ratios are the same as building a new matcher each time.
    matcher = SequenceMatcher(isjunk=None, b=synthetic_name)

    match_list = []
    for name1 in candidates:
        matcher.set_seq1(name1)
        ratio = matcher.ratio()
        if ratio > threshold:
            match_list.append((synthetic_name, name1, ratio))

    # An empty result is reported with an explicit "no match" placeholder.
    if not match_list:
        match_list.append((synthetic_name, None, 0))

    return match_list

In [None]:
# Run the matcher sequentially over the first 50 synthetic names,
# then preview the first five result lists.
matches = [
    get_basic_fuzzy_matches(synthetic_name)
    for synthetic_name in synthetic_companies_list[:50]
]
matches[:5]

[[('3cmo corp', '3com corp', 0.8888888888888888)],
 [('om c3mpany', '3m company', 0.8)],
 [('a.g. edwardsni c.', 'a.g. edwards inc.', 0.8823529411764706)],
 [('abbott larobatories', 'abbott laboratories', 0.8947368421052632)],
 [('abercrohbie & fitcm co.', 'abercrombie & fitch co.', 0.9130434782608695)]]

### Membuat pooled matches (mencocokkan satu string dengan beberapa string yang mirip, diproses secara paralel dengan `multiprocessing.Pool`)

In [None]:
# Run the same matching over the first 50 synthetic names in a pool of
# worker processes. pool.map blocks until every result is available and
# already returns a list, so no extra list() call is needed. The context
# manager shuts the workers down even if a task raises.
with mp.Pool() as pool:
    pooled_matches = pool.map(get_basic_fuzzy_matches, synthetic_companies_list[:50])

In [None]:
# Preview the first twenty result lists produced by the worker pool.
pooled_matches[:20]