In [1]:
#https://www.datacamp.com/community/tutorials/fuzzy-string-python
#https://towardsdatascience.com/how-to-do-fuzzy-matching-in-python-pandas-dataframe-6ce3025834a6
import pandas as pd
import numpy as np
from fuzzywuzzy import process, fuzz

In [2]:
# Load the OSM data and extract entries that are restaurants.
OSM = pd.read_json('amenities-vancouver.json.gz', lines=True)
rest_list = ['cafe', 'fast_food', 'bbq', 'restaurant', 'pub',
       'bar', 'ice_cream', 'bistro', 'juice_bar']
restaurants = OSM[OSM.amenity.isin(rest_list)]
restaurants['name'] = restaurants['name'].str.lower()
restaurants = restaurants[~restaurants.name.isnull()].reset_index(drop=True) #remove null names
restaurants

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  restaurants['name'] = restaurants['name'].str.lower()


Unnamed: 0,lat,lon,timestamp,amenity,name,tags
0,49.260812,-123.125736,2020-03-20T18:22:12.000-07:00,cafe,starbucks,"{'brand:wikidata': 'Q37158', 'official_name': ..."
1,49.260953,-123.125704,2019-08-02T18:11:20.000-07:00,fast_food,salad loop,{'opening_hours': 'Mo-Fr 07:00-17:00; Sa 10:00...
2,49.126650,-123.182470,2020-03-30T09:08:51.000-07:00,restaurant,best bite indian cuisine,"{'addr:housenumber': '10-3891', 'phone': '+1-6..."
3,49.283192,-123.109050,2015-12-18T21:41:07.000-08:00,pub,the cambie,"{'toilets:wheelchair': 'no', 'wheelchair': 'li..."
4,49.265951,-123.246630,2011-11-19T08:06:36.000-08:00,pub,mahony and sons,{'wheelchair': 'yes'}
...,...,...,...,...,...,...
5057,49.250408,-123.076261,2017-07-08T05:22:57.000-07:00,restaurant,house of dosas,"{'addr:housenumber': '1391', 'phone': '+1-604-..."
5058,49.278424,-122.806704,2013-03-26T23:45:49.000-07:00,cafe,creekside coffee,{}
5059,49.278770,-122.797628,2013-03-26T23:45:49.000-07:00,restaurant,togo sushi,{'cuisine': 'japanese'}
5060,49.282666,-122.826978,2019-09-13T13:56:49.000-07:00,pub,brown's social house,"{'addr:housenumber': '215', 'brewery': 'Guinne..."


In [3]:
# Load the text file of the list of chain restaurants.
chains_text = open("canadian_chains.txt", "r")
chainList = chains_text.read().split('\n')
chainList = [item.lower() for item in chainList]

In [4]:
# Match names to the list of chains.
matchList = []
ratioList = []
def match_chains(str2Match):
    ratio = process.extractOne(str2Match, chainList, scorer=fuzz.token_set_ratio, score_cutoff=91)
    if ratio == None:
        matchList.append(None)
        ratioList.append(None)
    else:
        matchList.append(ratio[0])
        ratioList.append(ratio[1])
    
ratios = np.vectorize(match_chains,otypes=[tuple])(restaurants['name'])

restaurants['match'] = matchList
restaurants['ratio'] = ratioList

In [5]:
indiedf = restaurants[(restaurants['match'].isnull())]
indiedf

Unnamed: 0,lat,lon,timestamp,amenity,name,tags,match,ratio
1,49.260953,-123.125704,2019-08-02T18:11:20.000-07:00,fast_food,salad loop,{'opening_hours': 'Mo-Fr 07:00-17:00; Sa 10:00...,,
2,49.126650,-123.182470,2020-03-30T09:08:51.000-07:00,restaurant,best bite indian cuisine,"{'addr:housenumber': '10-3891', 'phone': '+1-6...",,
3,49.283192,-123.109050,2015-12-18T21:41:07.000-08:00,pub,the cambie,"{'toilets:wheelchair': 'no', 'wheelchair': 'li...",,
4,49.265951,-123.246630,2011-11-19T08:06:36.000-08:00,pub,mahony and sons,{'wheelchair': 'yes'},,
9,49.171276,-123.134873,2019-10-27T17:08:25.000-07:00,restaurant,oriental rice noodle,"{'addr:housenumber': '8100', 'phone': '+1-604-...",,
...,...,...,...,...,...,...,...,...
5053,49.219011,-122.928008,2018-02-12T04:09:16.000-08:00,cafe,raw cuts sandwich shop,"{'addr:housenumber': '7885', 'level': '0', 'ad...",,
5054,49.262659,-123.251745,2019-11-04T20:07:41.000-08:00,cafe,magma cafè,"{'wheelchair': 'yes', 'addr:housenumber': '633...",,
5057,49.250408,-123.076261,2017-07-08T05:22:57.000-07:00,restaurant,house of dosas,"{'addr:housenumber': '1391', 'phone': '+1-604-...",,
5058,49.278424,-122.806704,2013-03-26T23:45:49.000-07:00,cafe,creekside coffee,{},,


In [6]:
chainsdf = restaurants[~(restaurants['match'].isnull())]
chainsdf

Unnamed: 0,lat,lon,timestamp,amenity,name,tags,match,ratio
0,49.260812,-123.125736,2020-03-20T18:22:12.000-07:00,cafe,starbucks,"{'brand:wikidata': 'Q37158', 'official_name': ...",starbucks,100.0
5,49.193580,-123.180788,2019-09-14T05:00:22.000-07:00,cafe,tim hortons,"{'brand:wikidata': 'Q175106', 'level': '0', 'b...",tim horton's,96.0
6,49.228400,-122.848383,2019-11-20T11:48:37.000-08:00,cafe,tim hortons,"{'brand:wikidata': 'Q175106', 'addr:housenumbe...",tim horton's,96.0
7,49.212449,-122.919749,2019-07-02T01:10:30.000-07:00,cafe,tim hortons,"{'brand:wikidata': 'Q175106', 'website': 'http...",tim horton's,96.0
8,49.212659,-122.919347,2019-07-02T01:10:30.000-07:00,cafe,starbucks,"{'brand:wikidata': 'Q37158', 'official_name': ...",starbucks,100.0
...,...,...,...,...,...,...,...,...
5051,49.233313,-123.117346,2019-08-29T17:49:06.000-07:00,restaurant,white spot,"{'brand:wikidata': 'Q7995414', 'addr:housenumb...",white spot,100.0
5055,49.264044,-123.174166,2019-09-13T13:56:49.000-07:00,cafe,tim hortons,"{'brand:wikidata': 'Q175106', 'addr:housenumbe...",tim horton's,96.0
5056,49.263998,-123.173758,2019-06-08T06:31:22.000-07:00,fast_food,subway,"{'brand:wikidata': 'Q244457', 'website': 'http...",subway,100.0
5059,49.278770,-122.797628,2013-03-26T23:45:49.000-07:00,restaurant,togo sushi,{'cuisine': 'japanese'},togo sushi,100.0
