## Natural Language Processing for Fuzzy String Matching with Python

In [58]:
# Install dependencies (uncomment on a first run in a fresh environment).
# `%pip` installs into the environment of the running kernel.
# %pip install fuzzywuzzy
# %pip install python-Levenshtein

In [59]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [60]:
# Load the restaurant list and give its column a descriptive name.
# The assignment to `df.columns` raises if the file does not have exactly one column.
df = pd.read_csv("data/restaurants.csv")
df.columns = ["restaurant_name"]
df.head()

Unnamed: 0,restaurant_name
0,Aura Cafe
1,Bawarchi
2,Onesta
3,Shah Ghouse Hotel & Restaurant
4,By The Bottle


In [61]:
# Strip every character that is not an ASCII letter, digit or space so that
# punctuation (e.g. "King's" vs "Kings") does not lower the match score.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the names unchanged.
# Note: spaces around a removed character are kept as-is (no whitespace collapsing).
df['refined_restaurant_name'] = df['restaurant_name'].str.replace(
    '[^A-Za-z0-9 ]', '', regex=True
)
df.head()

Unnamed: 0,restaurant_name,refined_restaurant_name
0,Aura Cafe,Aura Cafe
1,Bawarchi,Bawarchi
2,Onesta,Onesta
3,Shah Ghouse Hotel & Restaurant,Shah Ghouse Hotel Restaurant
4,By The Bottle,By The Bottle


In [62]:
# Prompt the user for the restaurant name to search for.
# `input` already returns a str, so no conversion is needed.
restaurant_query = input("Enter Restaurant Name: ")

Enter Restaurant Name:  kings burger


In [63]:
# Candidate names to match against: the cleaned-up name column, kept as a
# Series so each candidate stays associated with its row in `df`.
restaurant_choices = df["refined_restaurant_name"]

In [64]:
# Ask fuzzywuzzy for at most 5 best matches whose score is at least 85
# (the cutoff is inclusive). Each match is a
# (matched name, score, key of the choice in `restaurant_choices`) tuple.
top_five = process.extractBests(
    restaurant_query,
    restaurant_choices,
    score_cutoff=85,
    limit=5,
)
pd.DataFrame(
    top_five,
    columns=['refined_restaurant_name', 'match_score', 'database_index'],
)

Unnamed: 0,refined_restaurant_name,match_score,database_index
0,Kings Burger,100,1370
1,Mr Kings Burger,95,32472
2,Burger King,91,38
3,King,90,5963
4,Kings,90,7294


In [65]:
# Collect the index of each match so the original (unprocessed) restaurant
# names can be looked up afterwards.
results_index = [database_index for _, _, database_index in top_five]
results_index

[1370, 32472, 38, 5963, 7294]

In [66]:
# Show the original restaurant names for the matched rows.
# `results_index` holds index *labels* taken from the choices Series, so the
# lookup must be label-based (`.loc`); position-based `.iloc` only gives the
# same rows while `df` still has its default 0..n-1 index.
df.loc[results_index, ['restaurant_name']]

Unnamed: 0,restaurant_name
1370,King's Burger
32472,Mr. Kings Burger
38,Burger King
5963,King
7294,Kings
