In [34]:
import pandas as pd

# Load the fuzzy matching benchmark dataset into a DataFrame.
# Source: https://www.kaggle.com/datasets/shahrukhkhan/fuzzy-matching-benchmark
words = pd.read_csv(filepath_or_buffer='data.csv')

In [35]:
# Preview the first five rows of the benchmark frame.
words.head(n=5)

Unnamed: 0,word1,word2,score
0,inevitable,inevyitable,0.95
1,elbativeni,inevyitable,0.29
2,inevitable,elbatiyveni,0.29
3,elbativeni,elbatiyveni,0.95
4,Computing,Service,0.12


In [44]:
from fastfuzzymatch import FastFuzzyMatch
import rapidfuzz
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Build the individual components first, then hand them to the matcher.
# Embedding: TF-IDF over character n-grams of length 1 through 4.
embedding_model = TfidfVectorizer(analyzer='char', ngram_range=(1, 4))
# Neighbour search: one nearest neighbour by cosine distance, on all cores.
clustering_model = NearestNeighbors(n_neighbors=1, metric='cosine', n_jobs=-1)

ffm = FastFuzzyMatch(
    clean=True,
    merge=False,
    embedding_model=embedding_model,
    # Dimensionality reduction is left disabled in this run:
    # dimensionality_reduction_model=TruncatedSVD(n_components=100),
    clustering_model=clustering_model,
    fuzzy_model=rapidfuzz,
    fuzzy_scorer=rapidfuzz.fuzz.token_sort_ratio,
)

In [46]:
# Run the matcher with `word1` as the clean column and `word2` as the dirty
# column; both are taken from the same `words` frame.
test = ffm.find_matches(
    clean_df=words, clean_column='word1',
    dirty_df=words, dirty_column='word2',
)

# Preview the first five result rows.
test.head(n=5)

2024-07-18 17:49:42,798 - INFO - Cleaning text in column: word1
2024-07-18 17:49:42,805 - INFO - Cleaning text in column: word2
2024-07-18 17:49:42,812 - INFO - Starting similarity search.
2024-07-18 17:49:42,812 - INFO - Creating embeddings for clean and dirty data.
2024-07-18 17:49:42,990 - INFO - Clustering data.
2024-07-18 17:49:43,401 - INFO - Starting fuzzy search.
Fuzzy Matching Progress: 100%|██████████| 5649/5649 [00:00<00:00, 388668.55it/s]
2024-07-18 17:49:43,419 - INFO - Fuzzy matching completed in 0.6068644523620605 seconds


Unnamed: 0,word2,Result,score
0,inevyitable,coming inevyitable,75.862069
1,elbatiyveni,elbativeni,95.238095
2,service,service incloden,60.869565
3,ecivres,ecivresyawliar,66.666667
4,visited,visit,83.333333
