In [15]:
#https://www.datacamp.com/community/tutorials/fuzzy-string-python
#https://towardsdatascience.com/how-to-do-fuzzy-matching-in-python-pandas-dataframe-6ce3025834a6
#https://towardsdatascience.com/fuzzywuzzy-find-similar-strings-within-one-column-in-a-pandas-data-frame-99f6c2a0c212
import pandas as pd
import numpy as np
from fuzzywuzzy import process, fuzz
OSM = pd.read_json('amenities-vancouver.json.gz', lines=True)

In [16]:
rest_list = ['cafe', 'fast_food', 'bbq', 'restaurant', 'pub',
       'bar', 'ice_cream', 'bistro', 'juice_bar']
restaurants = OSM[OSM.amenity.isin(rest_list)]
restaurants['name'] = restaurants['name'].str.lower()
pd.options.mode.chained_assignment = None #disable weird warnings
restaurants

Unnamed: 0,lat,lon,timestamp,amenity,name,tags
0,49.260812,-123.125736,2020-03-20T18:22:12.000-07:00,cafe,starbucks,"{'brand:wikidata': 'Q37158', 'official_name': ..."
1,49.260953,-123.125704,2019-08-02T18:11:20.000-07:00,fast_food,salad loop,{'opening_hours': 'Mo-Fr 07:00-17:00; Sa 10:00...
3,49.249848,-122.959708,2011-09-06T03:52:10.000-07:00,bbq,,{}
13,49.126650,-123.182470,2020-03-30T09:08:51.000-07:00,restaurant,best bite indian cuisine,"{'addr:housenumber': '10-3891', 'phone': '+1-6..."
16,49.283192,-123.109050,2015-12-18T21:41:07.000-08:00,pub,the cambie,"{'toilets:wheelchair': 'no', 'wheelchair': 'li..."
...,...,...,...,...,...,...
17712,49.250408,-123.076261,2017-07-08T05:22:57.000-07:00,restaurant,house of dosas,"{'addr:housenumber': '1391', 'phone': '+1-604-..."
17713,49.278424,-122.806704,2013-03-26T23:45:49.000-07:00,cafe,creekside coffee,{}
17714,49.278770,-122.797628,2013-03-26T23:45:49.000-07:00,restaurant,togo sushi,{'cuisine': 'japanese'}
17716,49.282666,-122.826978,2019-09-13T13:56:49.000-07:00,pub,brown's social house,"{'addr:housenumber': '215', 'brewery': 'Guinne..."


In [17]:
grouped_names = restaurants.groupby('name', as_index=False).agg(['count'])
grouped_names = grouped_names.iloc[:, 0:1].reset_index(drop=False)
grouped_names.columns = ['name', 'count']
grouped_names = grouped_names.sort_values('count', ascending=False)
grouped_names[:20]

Unnamed: 0,name,count
2689,starbucks,217
2720,subway,177
3138,tim hortons,124
1780,mcdonald's,59
36,a&w,55
3342,white spot,26
993,freshii,24
995,freshslice pizza,24
2238,pizza hut,23
2332,quiznos,23


In [4]:
#pd.set_option('display.max_rows', 100)

# We have 71 rows without a name here, try to fill in if possible.
restaurants[restaurants.name.isnull()]

Unnamed: 0,lat,lon,timestamp,amenity,name,tags
3,49.249848,-122.959708,2011-09-06T03:52:10.000-07:00,bbq,,{}
786,49.049771,-122.319001,2019-09-02T22:08:26.000-07:00,fast_food,,"{'official_name': 'Kami Sushi Enterprises', 'a..."
1263,49.237670,-122.782747,2012-03-01T05:24:27.000-08:00,bbq,,{}
1553,49.263266,-123.110529,2014-07-11T19:59:54.000-07:00,pub,,{}
2046,49.229367,-123.004155,2017-03-22T04:32:05.000-07:00,bar,,{'addr:housenumber': '1822'}
...,...,...,...,...,...,...
16520,49.050337,-122.800687,2014-05-17T04:16:00.000-07:00,restaurant,,{'cuisine': 'sushi'}
16667,49.200029,-122.911494,2013-03-03T22:23:02.000-08:00,fast_food,,"{'level': '1', 'cuisine': 'seafood'}"
17089,49.312247,-122.925978,2019-07-15T07:01:57.000-07:00,cafe,,{}
17628,49.139018,-122.889496,2020-03-21T05:16:15.000-07:00,restaurant,,{'cuisine': 'thai'}


In [5]:
unique_name = restaurants['name'].astype(str).unique().tolist()
len(unique_name)
unique_name[unique_name == 'Nan']
#sorted(unique_name)[:20]

'starbucks'

In [6]:
unique_name

['starbucks',
 'salad loop',
 'nan',
 'best bite indian cuisine',
 'the cambie',
 'mahony and sons',
 'tim hortons',
 'oriental rice noodle',
 "nando's",
 'boston pizza',
 "mcdonald's",
 "koerner's pub",
 'waves',
 'subway',
 'legends pub',
 'kfc',
 'pizza hut (takeout)',
 'burger king',
 "triple o's",
 'faculty brewing',
 'argo cafe',
 'r & b ale & pizza house',
 'peaceful restaurant',
 'pho 99',
 'tacofino commissary',
 'miyako sushi',
 "what's up? hot dog!",
 'waves coffee',
 'st lawrence restaurant',
 'cuhillo',
 'the uncommon cafe',
 'juniper',
 'gold stone bakery & restaurant',
 'back and forth bar',
 'the sardine can',
 'soft peaks ice cream',
 '131 water kitchen & bar',
 'jules bistro',
 'crystal palace',
 "tokyo joe's",
 'cactus club cafe',
 'joongwon',
 'chongqing on robson',
 "stepho's",
 'a taste of india',
 "hon's wun-tun house",
 'miss korean bbq',
 'forage',
 'timber',
 'miko sushi',
 'cora',
 'chatime',
 "pok'e time",
 'abode',
 'shenanigans',
 "hail mary's",
 'red burr

In [7]:
#Create tuples of brand names, matched brand names, and the score
score_sort = [(x,) + i
             for x in unique_name 
             for i in process.extract(x, unique_name, scorer=fuzz.token_sort_ratio)]
#Create a dataframe from the tuples
similarity_sort = pd.DataFrame(score_sort, columns=['name_sort','match_sort','score_sort'])
similarity_sort.head()

Unnamed: 0,name_sort,match_sort,score_sort
0,starbucks,starbucks,100
1,starbucks,starbucks - whiterock,64
2,starbucks,stackables,63
3,starbucks,steamworks,63
4,starbucks,sushi star,63


In [9]:
similarity_sort['sorted_name_sort'] = np.minimum(similarity_sort['name_sort'], similarity_sort['match_sort'])
similarity_sort.head()

Unnamed: 0,name_sort,match_sort,score_sort,sorted_name_sort
0,starbucks,starbucks,100,starbucks
1,starbucks,starbucks - whiterock,64,starbucks
2,starbucks,stackables,63,stackables
3,starbucks,steamworks,63,starbucks
4,starbucks,sushi star,63,starbucks


In [20]:
high_score_sort = similarity_sort[(similarity_sort['score_sort'] >= 80) &
                (similarity_sort['name_sort'] !=  similarity_sort['match_sort']) &
                (similarity_sort['sorted_name_sort'] != similarity_sort['match_sort'])]
high_score_sort = high_score_sort.drop('sorted_name_sort',axis=1).copy()
high_score_sort

Unnamed: 0,name_sort,match_sort,score_sort
91,triple o's,triple-o's,100
92,triple o's,triple o’s,100
93,triple o's,triple os,95
128,miyako sushi,sushi misoya,83
136,waves coffee,waves coffee house,80
...,...,...,...
17064,c&u vietnamese restaurant,pho duy vietnamese restaurant,81
17161,café foam,jam café,80
17201,grand chinese restaurant,on may chinese restaurant,86
17202,grand chinese restaurant,the real chinese restaurant,82


In [None]:
high_score_sort.groupby(['name_sort','score_sort']).agg(
                        {'match_sort': ', '.join}).sort_values(
                        ['score_sort'], ascending=False)