In [1]:
import pandas as pd
# Load the room-type table from the working directory; the preview rendered
# below shows an 'Expedia' and a 'Booking.com' description column.
hotel_rooms = pd.read_csv('room_type.csv')
# Bare last expression so the notebook renders the frame.
hotel_rooms

Unnamed: 0,Expedia,Booking.com
0,"Deluxe Room, 1 King Bed",Deluxe King Room
1,"Standard Room, 1 King Bed, Accessible",Standard King Roll-in Shower Accessible
2,"Grand Corner King Room, 1 King Bed",Grand Corner King Room
3,"Suite, 1 King Bed (Parlor)",King Parlor Suite
4,"High-Floor Premium Room, 1 King Bed",High-Floor Premium King Room
...,...,...
98,"Room, 1 King Bed, Accessible, Resort View (Ali...",Alii Tower Resort View With King Bed - Mobilit...
99,"Room, 1 King Bed, Accessible, View (Rainbow, B...",Rainbow Tower Ocean View With King Bed - Mobil...
100,"Room, 1 King Bed, Ocean View (Alii)",Alii Tower Ocean View With King Bed
101,"Room, 1 King Bed, Oceanfront (Rainbow)",Rainbow Tower Ocean Front with King Bed


In [2]:
import logging
from fastfuzzymatch import FastFuzzyMatch
import rapidfuzz
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Configure the third-party FastFuzzyMatch matcher with:
#   - character 1- to 4-gram TF-IDF features as the embedding,
#   - a 1-nearest-neighbour lookup using cosine distance (all cores),
#   - rapidfuzz's token_sort_ratio as the fuzzy scorer.
# NOTE(review): `clean=True` presumably enables the text-cleaning step reported
# in the log output below - confirm against the FastFuzzyMatch documentation.
ffm = FastFuzzyMatch(
    clean=True,
    embedding_model=TfidfVectorizer(analyzer='char', ngram_range=(1, 4)),
    # Optional dimensionality-reduction step, currently disabled:
    # dimensionality_reduction_model=TruncatedSVD(n_components=100),
    clustering_model=NearestNeighbors(n_neighbors=1, metric='cosine', n_jobs=-1),
    fuzzy_model=rapidfuzz,
    fuzzy_scorer=rapidfuzz.fuzz.token_sort_ratio
)   

# Match every 'Expedia' description (dirty side) against the 'Booking.com'
# descriptions (clean side); both columns come from the same frame.
test = ffm.find_matches(
    clean_df=hotel_rooms,
    clean_column='Booking.com',
    dirty_df=hotel_rooms,
    dirty_column='Expedia'
)


2024-07-18 13:14:28,481 - INFO - Cleaning text in column: Booking.com
2024-07-18 13:14:28,482 - INFO - Cleaning text in column: Expedia
2024-07-18 13:14:28,484 - INFO - Starting similarity search.
2024-07-18 13:14:28,484 - INFO - Creating embeddings for clean and dirty data.
2024-07-18 13:14:28,491 - INFO - Clustering data.
2024-07-18 13:14:28,524 - INFO - Starting fuzzy search.
Fuzzy Matching Progress: 100%|██████████| 103/103 [00:00<00:00, 81236.05it/s]
2024-07-18 13:14:28,527 - INFO - Fuzzy matching completed in 0.04312562942504883 seconds


In [13]:
# Show the match table produced by ffm.find_matches in the cell above.
# NOTE(review): judging by the rendered rows, 'Booking.com' here appears to hold
# the best-matching (cleaned) string rather than the row's original value - confirm.
test

Unnamed: 0,Expedia,Booking.com,Ratio
0,deluxe room 1 king bed,deluxe room one king bed,86.956522
1,standard room 1 king bed accessible,standard king rollin shower accessible,76.712329
2,grand corner king room 1 king bed,grand corner king room,80.000000
3,suite 1 king bed parlor,king parlor suite,85.000000
4,highfloor premium room 1 king bed,highfloor premium king room,90.000000
...,...,...,...
98,room 1 king bed accessible resort view alii ba...,alii tower resort view with king bed mobility ...,68.468468
99,room 1 king bed accessible view rainbow bathtub,rainbow tower ocean view with king bed mobilit...,62.385321
100,room 1 king bed ocean view alii,ocean view room with king bed,80.000000
101,room 1 king bed oceanfront rainbow,rainbow tower ocean front with king bed,65.753425


In [3]:
# Inspired by https://audhiaprilliant.medium.com/fuzzy-string-matching-optimization-using-tf-idf-and-knn-b07fce69b58f

# https://medium.com/@tacettincankrc/fuzzy-matching-with-large-datasets-challenges-and-solutions-901b8446dcdc

# TF-IDF based Fuzzy Matching function
import pandas as pd
import numpy as np
import re
import itertools
from typing import Union, List, Tuple
# Import module for fuzzy matching
from rapidfuzz import process, fuzz
# Import module for TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# Import module for cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
# Import module for KNN
from sklearn.neighbors import NearestNeighbors
import multiprocessing

# String pre-processing
def preprocess_string(s):
    """Glue together neighbouring single-character tokens.

    Removes a separator made of optional whitespace, one space or ``&``, and
    optional whitespace when it sits between two one-character words
    (``\\w`` characters, so letters, digits and underscore), e.g.
    ``'A & B Hotel'`` becomes ``'AB Hotel'``. Longer words are left untouched.

    Parameters
    ----------
    s : str
        Text to normalise.

    Returns
    -------
    str
        The text with those separators removed.
    """
    return re.sub(r'(?<=\b\w)\s*[ &]\s*(?=\w\b)', '', s)

# String matching - TF-IDF
def build_vectorizer(
        clean: pd.Series,
        analyzer: str = 'char',
        ngram_range: Tuple[int, int] = (1, 4),
        n_neighbors: int = 1,
        **kwargs
) -> Tuple:
    """Fit a TF-IDF vectorizer and a cosine nearest-neighbour index on ``clean``.

    Parameters
    ----------
    clean : pd.Series
        Reference strings; the values are cast to unicode before fitting.
    analyzer : str
        Passed to ``TfidfVectorizer`` as ``analyzer``.
    ngram_range : tuple of (int, int)
        Passed to ``TfidfVectorizer`` as ``ngram_range``.
    n_neighbors : int
        Default neighbour count stored on the ``NearestNeighbors`` model.
    **kwargs
        Any further keyword arguments for ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(fitted vectorizer, fitted NearestNeighbors model)``.
    """
    tfidf = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, **kwargs)
    reference_text = clean.values.astype('U')
    reference_matrix = tfidf.fit_transform(reference_text)

    # Index the reference vectors so queries can be answered by cosine distance.
    neighbour_index = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    return tfidf, neighbour_index.fit(reference_matrix)

# String matching - KNN
def tfidf_nn(
        messy,
        clean,
        n_neighbors = 1,
        **kwargs
):
    """Find the nearest clean strings for every messy string via TF-IDF + KNN.

    The vectorizer and neighbour index are fitted on ``clean`` and the messy
    strings are projected into the same space before querying.

    Parameters
    ----------
    messy : iterable of str
        Strings to look up.
    clean : pd.Series
        Reference strings to search in.
    n_neighbors : int
        Number of candidates wanted per messy string. Capped at ``len(clean)``
        because a neighbour query cannot return more entries than were fitted.
    **kwargs
        Forwarded to ``build_vectorizer`` (e.g. ``analyzer``, ``ngram_range``).

    Returns
    -------
    tuple
        ``(nearest_values, distances)``; both are indexed
        ``[messy position, neighbour rank]``.
    """
    # Asking kneighbors() for more neighbours than fitted samples raises a
    # ValueError, so never request more than the reference list contains.
    n_neighbors = min(n_neighbors, len(clean))

    # Fit clean data and transform messy data
    vectorizer, nbrs = build_vectorizer(clean, n_neighbors = n_neighbors, **kwargs)
    input_vec = vectorizer.transform(messy)

    # Determine best possible matches
    distances, indices = nbrs.kneighbors(input_vec, n_neighbors = n_neighbors)
    # Fancy-index the clean values with the neighbour index matrix.
    nearest_values = np.array(clean)[indices]
    return nearest_values, distances

# String matching - match fuzzy
def find_matches_fuzzy(
        row,
        match_candidates,
        limit = 5
):
    """Score one messy string against its candidate matches.

    The candidates are keyed by position and handed to ``process.extract``
    with ``fuzz.token_sort_ratio`` as the scorer.

    Parameters
    ----------
    row : str
        The messy string being matched.
    match_candidates : iterable
        Candidate clean strings for this row.
    limit : int
        Passed to ``process.extract`` as ``limit``.

    Returns
    -------
    list of tuple
        One ``(row, candidate, score)`` triple per extracted match, built from
        the first two elements of each match.
    """
    indexed_candidates = {
        position: candidate for position, candidate in enumerate(match_candidates)
    }
    extracted = process.extract(
        row, indexed_candidates,
        scorer = fuzz.token_sort_ratio,
        limit = limit
    )

    triples = []
    for hit in extracted:
        triples.append((row, hit[0], hit[1]))
    return triples


# String matching - TF-IDF nearest-neighbour candidates re-scored with fuzzy matching
def fuzzy_nn_match(
        messy,
        clean,
        column,
        col,
        n_neighbors = 100,
        limit = 5, **kwargs):
    """Shortlist candidates with TF-IDF/KNN, then score them with fuzzy matching.

    Parameters
    ----------
    messy : iterable of str
        Strings to match.
    clean : pd.Series
        Reference strings.
    column : str
        Name given to the output column holding the messy string.
    col : str
        Name given to the output column holding the matched clean string.
    n_neighbors : int
        Candidates requested from ``tfidf_nn`` for each messy string.
    limit : int
        Maximum matches kept per messy string by ``find_matches_fuzzy``.
    **kwargs
        Forwarded to ``tfidf_nn``.

    Returns
    -------
    pd.DataFrame
        One row per kept match, with columns ``[column, col, 'Ratio']``.
    """
    # The distances are not needed here; only the shortlisted values are scored.
    candidates_per_row, _ = tfidf_nn(messy, clean, n_neighbors, **kwargs)

    flat_matches = [
        triple
        for position, messy_value in enumerate(messy)
        for triple in find_matches_fuzzy(messy_value, candidates_per_row[position], limit)
    ]
    return pd.DataFrame(flat_matches, columns = [column, col, 'Ratio'])

# String matching - end-to-end TF-IDF + fuzzy matching for a DataFrame column
def fuzzy_tf_idf(
        df: pd.DataFrame,
        column: str,
        clean: pd.Series,
        mapping_df: pd.DataFrame,
        col: str,
        analyzer: str = 'char',
        ngram_range: Tuple[int, int] = (1, 3)
) -> pd.DataFrame:
    """Attach the best fuzzy match from ``clean`` to every row of ``df``.

    Unique, non-null values of ``df[column]`` are pre-processed with
    ``preprocess_string``, matched against the unique, non-null ``clean``
    values through ``fuzzy_nn_match`` (one nearest neighbour), and the match
    is merged back onto ``df`` using the *original* messy value as the key, so
    rows whose text was altered by pre-processing still receive their match.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the messy column.
    column : str
        Name of the messy column in ``df``.
    clean : pd.Series
        Reference strings to match against.
    mapping_df : pd.DataFrame
        Not used by this function; kept so existing callers keep working.
    col : str
        Name of the output column that receives the matched clean string.
    analyzer : str
        TF-IDF analyzer, forwarded to the vectorizer.
    ngram_range : tuple of (int, int)
        TF-IDF n-gram range, forwarded to the vectorizer. The default declared
        here, ``(1, 3)``, is what is used unless the caller overrides it.

    Returns
    -------
    pd.DataFrame
        ``df`` left-joined with the columns ``col`` and ``'Ratio'``. Rows with
        no match (including null messy values) hold NaN in both. If either
        side has no usable values, both columns are entirely NaN.
    """
    # Temporary join key holding the pre-processed form of each messy value.
    key_col = '__fuzzy_tf_idf_key__'

    clean = clean.drop_duplicates().dropna().reset_index(drop = True)
    originals = df[column].drop_duplicates().dropna().reset_index(drop = True)

    # Nothing to fit or nothing to match: the vectorizer cannot be built on
    # empty input, so report "no match" for every row instead.
    if clean.empty or originals.empty:
        return df.assign(**{col: np.nan, 'Ratio': np.nan})

    prepped = originals.astype(str).apply(preprocess_string)
    # Remember which original value produced which pre-processed string.
    keys = pd.DataFrame({column: originals, key_col: prepped})

    # Several originals can collapse to the same pre-processed string; match
    # each distinct string once so the merge below cannot multiply rows.
    result = fuzzy_nn_match(
        messy = prepped.drop_duplicates().reset_index(drop = True),
        clean = clean,
        column = key_col,
        col = col,
        n_neighbors = 1,
        analyzer = analyzer,
        ngram_range = ngram_range
    )

    # Translate the pre-processed key back to the original messy value, then
    # merge the matches onto the original data.
    lookup = keys.merge(result, how = 'left', on = key_col).drop(columns = key_col)
    df_result = df.merge(lookup, how = 'left', on = column)

    return df_result

In [4]:
import time

# Run the fuzzy string matching algorithm.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# intervals; the wall clock from time.time() can be too coarse (or jump) for a
# run that finishes in a few milliseconds.
start = time.perf_counter()
df_result = (hotel_rooms.pipe(fuzzy_tf_idf, # Function and messy data
                     column = 'Expedia', # Messy column in data
                     clean = hotel_rooms['Booking.com'], # Master data (list)
                     mapping_df = hotel_rooms, # Master data
                     col = 'Result') # Can be customized
             )
end = time.perf_counter()
# Print the computation time
print('Fuzzy string matching in {} seconds'.format(end - start))
# View the result of fuzzy string matching
df_result.head()

Fuzzy string matching in 0.00982213020324707 seconds


Unnamed: 0,Expedia,Booking.com,Result,Ratio
0,"Deluxe Room, 1 King Bed",Deluxe King Room,Deluxe Room - One King Bed,85.714286
1,"Standard Room, 1 King Bed, Accessible",Standard King Roll-in Shower Accessible,Standard King Roll-in Shower Accessible,73.684211
2,"Grand Corner King Room, 1 King Bed",Grand Corner King Room,Grand Corner King Room,78.571429
3,"Suite, 1 King Bed (Parlor)",King Parlor Suite,King Parlor Suite,55.813953
4,"High-Floor Premium Room, 1 King Bed",High-Floor Premium King Room,High-Floor Premium King Room,88.888889
