In [1]:
import pandas as pd
import numpy as np

In [2]:
google_places_path = "/home/jczestochowska/workspace/ada/chicago_food/data/processed/google_places.csv"
restaurants_path = "/home/jczestochowska/workspace/ada/chicago_food/data/processed/restaurants.csv"

In [3]:
google_places = pd.read_csv(google_places_path)
restaurants = pd.read_csv(restaurants_path)

Let's see if all restaurants we got from google are the ones we asked for and are in Chicago

In [4]:
google_places.head()

Unnamed: 0,place_id,place_name,rating,total_number_of_ratings,price_level,address,city,zip_code
0,ChIJleWegJHTD4gRsIMGm3IjUcM,yolk - test kitchen,4.4,180.0,2.0,1767 n milwaukee ave,chicago,60647
1,ChIJI1dehI0xDogRRkvKIpUspHM,las asadas restaurant,4.0,166.0,2.0,3834 w 47th st,chicago,60632-4136
2,ChIJBRuLGC_MD4gRfUVkzaQBIZk,mini palapita,4.3,342.0,1.0,4968 n elston ave,chicago,60630
3,ChIJ-auSyWItDogRQwUqcwljejE,crazy bird chicken,4.7,67.0,,1160 w grand ave,chicago,60642-5837
4,ChIJIYRFg7IsDogRAW0zLguKL-A,redhead piano bar,4.2,989.0,2.0,16 w ontario st,chicago,60654


In [5]:
restaurants.head()

Unnamed: 0,place_id,place_name,latitude,longitude,address,zip_code
0,ChIJleWegJHTD4gRsIMGm3IjUcM,yolk test kitchen,41.913588,-87.682203,1767 n milwaukee ave,60647.0
1,ChIJI1dehI0xDogRRkvKIpUspHM,las asadas mexican grill,41.808025,-87.720037,3834 w 47th st,60632.0
2,ChIJBRuLGC_MD4gRfUVkzaQBIZk,la palapita,41.808025,-87.720037,3834 w 47th st,60632.0
3,ChIJ-auSyWItDogRQwUqcwljejE,crazy bird,41.891193,-87.657055,1160 w grand ave,60642.0
4,ChIJIYRFg7IsDogRAW0zLguKL-A,the redhead piano bar,41.893371,-87.628783,16-18 w ontario st,60654.0


We notice that address from chicago database has whitespace at the end so we remove it from each row

In [6]:
restaurants.address = restaurants.address.apply(str.strip)

In [7]:
print(f"Google places API returned {len(google_places)} restaurants")

Google places API returned 13006 restaurants


Check if all restaurants are in Chicago

In [8]:
not_chicago_restaurants = google_places[~(google_places.city == "chicago")]
print(f"Number of restaurants with non-chicago city {len(not_chicago_restaurants)}")
not_chicago_restaurants.city.unique()

Number of restaurants with non-chicago city 2053


array(['60666', 'des plaines', 'libertyville', 'asheville', 'schaumburg',
       'elk grove village', 'morton grove', 'wauwatosa', '60622',
       'oak park', 'harwood heights', 'frankfort', 'lubbock', 'houston',
       '60641', 'holton', '45014', nan, 'naperville', '28594', 'rosemont',
       'carol stream', 'stone park', 'westchester', '46617', 'cicero',
       'forest park', 'madison', 'crestwood', 'addison', 'mancelona',
       'khet phra nakhon', '87544', 'sterling heights', '60642',
       'downers grove', 'st. louis', '050015', '60618', 'cary', 'joliet',
       'east chicago', 'miami', 'roseland', '60632', 'new york', '60603',
       'wilmette', '60649', 'dallas', 'evanston', '60623', 'st. charles',
       'bolingbrook', 'hoffman estates', 'springfield', 'elgin', '1205',
       'atlanta', '60629', 'detroit', '60657', 'skokie', 'wood dale',
       'thornlie', 'hickory hills', 'clayton', 'hinsdale', 'highland',
       'franklin park', 'yuma', 'lansing', 'wheaton', 'bellwood',
    

We can see that sometimes the zip code was returned as a city, Chicago zip codes start with 60 therefore those entries where city equals a number starting 60 are probably in Chicago and their city name is in address column

In [9]:
address_chicago_count = len(google_places[google_places.address == "chicago"])
print(f"Number of places with chicago as address and zip code in city column is {address_chicago_count}")

Number of places with chicago as address and zip code in city column is 163


In [10]:
google_places[google_places.city.str.startswith("60", na=False)]

Unnamed: 0,place_id,place_name,rating,total_number_of_ratings,price_level,address,city,zip_code
13,ChIJVwHOlyi0D4gRNjLVMUoK00A,romano's macaroni grill,3.7,860.0,2.0,chicago,60666,
74,ChIJqQyWfSa0D4gRN_IZ45o_HCE,argo tea,3.5,45.0,1.0,chicago,60666,
106,ChIJTxSyOsfSD4gRUy8JoHTgkVw,piece out,,,,chicago,60622,
183,ChIJIVrbHPjMD4gRQfUHdZXic5U,la estrella blanca,3.9,29.0,,chicago,60641,
278,ChIJM_1zjXe2D4gRDAmIxvGytuw,chili's grill & bar,3.2,913.0,2.0,chicago,60666,
...,...,...,...,...,...,...,...,...
12644,ChIJq9zsX_TSD4gRrRW0u2HLN1E,sweet bean & more,5.0,2.0,,chicago,60614,
12699,ChIJj_2hl5DMD4gRNunA9w1SI1I,central starlite family restaurant,,,,chicago,60639,
12775,ChIJz5h8elIzDogRgDuu28QU6Ic,ruby's soul food,,,,chicago,60651,
12813,ChIJDcM0Ey4zDogR2rNIPGIV_gs,come & get it,,,,chicago,60644,


If zip code starts with 60 move it to zip code column

In [11]:
chicago_idx = google_places.city.str.startswith("60", na=False)
google_places.loc[chicago_idx, 'zip_code'] = google_places['city']

In [12]:
google_places[google_places.city.str.startswith("60", na=False)]

Unnamed: 0,place_id,place_name,rating,total_number_of_ratings,price_level,address,city,zip_code
13,ChIJVwHOlyi0D4gRNjLVMUoK00A,romano's macaroni grill,3.7,860.0,2.0,chicago,60666,60666
74,ChIJqQyWfSa0D4gRN_IZ45o_HCE,argo tea,3.5,45.0,1.0,chicago,60666,60666
106,ChIJTxSyOsfSD4gRUy8JoHTgkVw,piece out,,,,chicago,60622,60622
183,ChIJIVrbHPjMD4gRQfUHdZXic5U,la estrella blanca,3.9,29.0,,chicago,60641,60641
278,ChIJM_1zjXe2D4gRDAmIxvGytuw,chili's grill & bar,3.2,913.0,2.0,chicago,60666,60666
...,...,...,...,...,...,...,...,...
12644,ChIJq9zsX_TSD4gRrRW0u2HLN1E,sweet bean & more,5.0,2.0,,chicago,60614,60614
12699,ChIJj_2hl5DMD4gRNunA9w1SI1I,central starlite family restaurant,,,,chicago,60639,60639
12775,ChIJz5h8elIzDogRgDuu28QU6Ic,ruby's soul food,,,,chicago,60651,60651
12813,ChIJDcM0Ey4zDogR2rNIPGIV_gs,come & get it,,,,chicago,60644,60644


if address is chicago move it to city column

In [13]:
address_idx = (google_places.address == "chicago")
google_places.loc[address_idx, "city"] = "chicago"
google_places.loc[address_idx, "address"] = np.nan

In [14]:
google_places[google_places.address == 'chicago']

Unnamed: 0,place_id,place_name,rating,total_number_of_ratings,price_level,address,city,zip_code


In [15]:
not_chicago_restaurants = google_places[~(google_places.city == "chicago")]
print(f"Number of restaurants with non-chicago city {len(not_chicago_restaurants)}")
not_chicago_restaurants.city.unique()

Number of restaurants with non-chicago city 1890


array(['des plaines', 'libertyville', 'asheville', 'schaumburg',
       'elk grove village', 'morton grove', 'wauwatosa', 'oak park',
       'harwood heights', 'frankfort', 'lubbock', 'houston', 'holton',
       '45014', nan, 'naperville', '28594', 'rosemont', 'carol stream',
       'stone park', 'westchester', '46617', 'cicero', 'forest park',
       'madison', 'crestwood', 'addison', 'mancelona', 'khet phra nakhon',
       '87544', 'sterling heights', 'downers grove', 'st. louis',
       '050015', 'cary', 'joliet', 'east chicago', 'miami', 'roseland',
       'new york', 'wilmette', 'dallas', 'evanston', 'st. charles',
       'bolingbrook', 'hoffman estates', 'springfield', 'elgin', '1205',
       'atlanta', 'detroit', 'skokie', 'wood dale', 'thornlie',
       'hickory hills', 'clayton', 'hinsdale', 'highland',
       'franklin park', 'yuma', 'lansing', 'wheaton', 'bellwood',
       'north york', 'grand rapids', 'milwaukee', 'bridgeview', 'jeddah',
       'oakbrook terrace', 'hillside

We can see that sometimes 60 startin zip codes are still present, let check city for those values

In [16]:
google_places[google_places.city.str.startswith("60", na=False)]

Unnamed: 0,place_id,place_name,rating,total_number_of_ratings,price_level,address,city,zip_code
2408,ChIJ6xFHBL6vD4gRchf5OrA9qUg,northwest transportation center,4.0,20.0,,schaumburg,60173,60173
5469,ChIJsTHl8TevD4gRVlyTfs6j5Y0,nic's organic fast food,3.1,31.0,,schaumburg,60173,60173
6613,ChIJzQ0X9bYzDogRHT07KfAcfNA,mc café,4.5,2.0,,cicero,60804,60804
7438,ChIJT0JHiqUzDogRlqHkJuSCgrs,nueva vida,4.2,6.0,,cicero,60804,60804
8272,ChIJt1juNU80DogRzGgeK6nsrs0,red robin gourmet burgers and brews,3.9,1032.0,2.0,north riverside,60546,60546
8551,ChIJo3gfbs66D4gRNGDpJP0lFEo,mr d's sports bar & grill,4.3,56.0,,arlington heights,60005,60005
8572,ChIJR5CMqiGuD4gR3N5M6VH7ryE,vincenzo palmieri dpm,4.8,19.0,,elk grove village,60007,60007
9575,ChIJQ-X4Dq4zDogR543sVOJGvsk,hibachi seafood buffet,3.4,8.0,,cicero,60804,60804
9736,ChIJ7aMROx4FD4gRRAVVecrnbUY,fantasea destinations,,,,elgin,60123,60123
9794,ChIJs-zK38JKDogROyQY0tbyclU,sharky's fish,4.5,2.0,,bellwood,60104,60104


It turns out that entries as cicero, north riverside, schaumburg or elgin are within Chicago aglomeration they are smaller towns or districts.

Let's check how many entries don't have Chicago area zip code

In [17]:
google_places[(~(google_places.zip_code.str.startswith("60", na=False)) & (google_places.city != "chicago"))]

Unnamed: 0,place_id,place_name,rating,total_number_of_ratings,price_level,address,city,zip_code
25,ChIJ8W5KpKj0WYgRXlQU3Lknu-c,liberty house coffee and café,4.7,234.0,2.0,221 s liberty st,asheville,28801-2334
94,ChIJs1HOuK4FBYgREBfd2sHyGwg,la terraza,4.6,77.0,,11520 w bluemound rd # 2,wauwatosa,53226-4000
163,ChIJFXchGjxt_oYR3OZ7Dv4sz3A,mi linda michoacana,4.5,240.0,,2002 34th st,lubbock,79411-1832
174,ChIJ2TzjVki8QIYR8rBeeXUQuGA,doña chela restaurant,4.3,212.0,1.0,1112 76th st,houston,77012-1004
186,ChIJf4UCjadCGYgRPk63PEDsfwo,compass group usa inc,,,,6477 syers rd,holton,49425-7508
...,...,...,...,...,...,...,...,...
12967,ChIJ8WUga24zXIgRrtJ5T0sC08M,mediterranean delight,4.8,237.0,1.0,160 bus terminal rd a,oak ridge,37830
12974,ChIJScl8s92LOIgRwRKOCn4btuI,couscous house,4.9,100.0,1.0,1611 morse rd,columbus,43229
12975,ChIJrUpLxVkDBYgR2GJViXvlxkY,champion chicken,3.5,391.0,1.0,8718 w lisbon ave,milwaukee,53222-2859
13001,ChIJIVSzGQ3nEYgR8nZhwaBQ9ZM,food king china buffet,3.6,224.0,1.0,214 w ridge rd,griffith,46319-1041


We have 1071 rows which correspond to restaurants in different places than Chicago area. They are of no use to us so we remove them.

In [18]:
google_places_cleaned = google_places[((google_places.zip_code.str.startswith("60", na=False)) | (google_places.city == "chicago"))]

In [19]:
google_places_cleaned

Unnamed: 0,place_id,place_name,rating,total_number_of_ratings,price_level,address,city,zip_code
0,ChIJleWegJHTD4gRsIMGm3IjUcM,yolk - test kitchen,4.4,180.0,2.0,1767 n milwaukee ave,chicago,60647
1,ChIJI1dehI0xDogRRkvKIpUspHM,las asadas restaurant,4.0,166.0,2.0,3834 w 47th st,chicago,60632-4136
2,ChIJBRuLGC_MD4gRfUVkzaQBIZk,mini palapita,4.3,342.0,1.0,4968 n elston ave,chicago,60630
3,ChIJ-auSyWItDogRQwUqcwljejE,crazy bird chicken,4.7,67.0,,1160 w grand ave,chicago,60642-5837
4,ChIJIYRFg7IsDogRAW0zLguKL-A,redhead piano bar,4.2,989.0,2.0,16 w ontario st,chicago,60654
...,...,...,...,...,...,...,...,...
12999,ChIJW9Fy4mXLD4gRZa_kB9Ch0y8,golden crown restaurant,4.6,11.0,,2341 n narragansett ave,chicago,60639-2654
13000,ChIJF1jwR_AsDogRqxHTjLkRiCk,lalo's on maxwell,3.9,1005.0,2.0,733 w maxwell st,chicago,60607-5016
13002,ChIJ5fNghEVMDogR4UWYi3iHLPE,dip'n good dips,,,,17 monterey ave,villa park,60181-2822
13004,ChIJ6a1jjZssDogR3PQwtmza-vs,mei's kitchen,3.5,86.0,,1108 s michigan ave,chicago,60605


We will remove any duplicating rows from both dataframes

In [20]:
google_places_cleaned.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [21]:
restaurants.drop_duplicates(inplace=True)

Now we want to compare addresses and names of restaurants. we will start by joining two dataframes on place_id

In [22]:
merged = restaurants.merge(google_places_cleaned, on="place_id", how="inner", suffixes=("_restaurants","_google"))

In [23]:
len(merged)

12057

In [24]:
merged.columns

Index(['place_id', 'place_name_restaurants', 'latitude', 'longitude',
       'address_restaurants', 'zip_code_restaurants', 'place_name_google',
       'rating', 'total_number_of_ratings', 'price_level', 'address_google',
       'city', 'zip_code_google'],
      dtype='object')

to assess if for sure we have correct entries we will compare zip codes if no zip_code is available from restaurants then we will compare names, first step would be to check where `zip_code_restaurants` is NaN
and second would be to remove second zip code component from `zip_code_google`

In [25]:
merged[merged.zip_code_restaurants.isna()]

Unnamed: 0,place_id,place_name_restaurants,latitude,longitude,address_restaurants,zip_code_restaurants,place_name_google,rating,total_number_of_ratings,price_level,address_google,city,zip_code_google
960,ChIJq7u58SPTD4gRqV7j2UThRLM,s.g.a. coffee cafe,41.958548,-87.786919,4300 n narragansett,,coffee lab & roasters,4.9,40.0,,2823 n lincoln ave,chicago,60657
1351,ChIJo3LREajSD4gROdM5Sjl8pTg,casa central la posada,41.8781,,,,casa central,3.5,31.0,,1343 n california ave,chicago,60622-2803
7752,ChIJBV-jIEvTD4gROrQeGm3dLt0,starbucks coffee,41.896318,-87.63584,750 n franklin st.,,starbucks,4.0,275.0,2.0,750 n franklin st,chicago,60654
8872,ChIJV1gyFIErDogR4vUnQw74VN4,mccornick place,41.851047,-87.622093,2301 s indiana,,mccormick place,4.5,8493.0,,2301 s king dr,chicago,60616
8873,ChIJV1gyFIErDogR4vUnQw74VN4,mccornick place,41.851047,-87.622093,2301 s indiana,,mccormick place,4.5,8494.0,,2301 s king dr,chicago,60616
10790,ChIJTwjQ-HssDogRz558Go2FZu8,burger king,41.850454,-87.623891,2320-2322 s michigan ave,,burger king,3.4,731.0,1.0,2328 s michigan ave,chicago,60616
11136,ChIJYZbHo0zTD4gReP8dX6Qfaco,think simple foods,41.896504,-87.632784,141 w chicago ave,,think simple,5.0,2.0,,770 n lasalle dr,chicago,60610


We have only couple of rows where zip_code_restaurants is not available, we can review them manually, it turns out we will have to remove row 960, 11136 and McCormick place which is not a restaurant but a business center in Chicago.

In [26]:
merged.drop(axis=0, index=[960, 11136, 8872,8873], inplace=True)

Now we will remove second zip code google component

In [27]:
merged[merged.zip_code_google.isna()]

Unnamed: 0,place_id,place_name_restaurants,latitude,longitude,address_restaurants,zip_code_restaurants,place_name_google,rating,total_number_of_ratings,price_level,address_google,city,zip_code_google
2974,ChIJQQMfpi_TD4gRbkamIyMmFvU,kikka- kingsbury,41.909399,-87.653051,1550 n kingsbury st,60642.0,north kingsbury street,,,,n kingsbury st,chicago,
3411,ChIJ2aXLDIA3DogRtia7eMEyzE4,"harris family grill on 83rd, llc",41.744483,-87.556373,2801 e 83rd st,60617.0,west 83rd street,,,,w 83rd st,chicago,
3751,ChIJ-fxWG6PND4gR7_Y1lx9ULuo,elston nugget inc,41.932281,-87.68819,2406 w diversey ave,60647.0,north elston avenue,,,,n elston ave,chicago,
6132,ChIJedCe4RXOD4gRl2sn7ZmQxyQ,sushi world,41.995479,-87.712875,6251 n mccormick rd,60659.0,sushi world,4.4,410.0,2.0,,chicago,
6133,ChIJedCe4RXOD4gRl2sn7ZmQxyQ,sushi luxe,41.976357,-87.668568,5204 n clark st,60640.0,sushi world,4.4,410.0,2.0,,chicago,
6702,ChIJrS7zoBzND4gRFlZJQaAkBBw,hermosa,41.917156,-87.736189,4356 w armitage ave,60639.0,hermosa,,,,,chicago,
9671,ChIJra52C_AsDogRw9Xe4NvtHps,maxwell st & fresh lemonade,41.794426,-87.644363,740 w garfield blvd,60609.0,west maxwell street,,,,w maxwell st,chicago,
9672,ChIJra52C_AsDogRw9Xe4NvtHps,maxwell street,41.90967,-87.751213,4948 w north ave,60639.0,west maxwell street,,,,w maxwell st,chicago,
10021,ChIJ1XHQCS4tDogRehV3FO2Qr_M,grandbar,41.891087,-87.667204,1600 w grand ave,60622.0,grand & ashland,,,,,chicago,
10468,ChIJqRc0MtMoDogR6i4WiN_xSk0,new park manor,41.765395,-87.61511,7109 s dr martin luther king jr dr,60619.0,park manor,,,,,chicago,


Most of those entries also have rating NaN, drop then as those rows are useless for us

In [28]:
merged.dropna(subset=["rating"],axis=0, inplace=True)

Filling nans with string value to remove second `zip_code_google` component

In [29]:
merged.zip_code_google.fillna("nan", inplace=True)

In [30]:
merged.zip_code_google.str.len().unique()

array([ 5, 10,  3])

In [31]:
merged.zip_code_google = merged.zip_code_google.apply(lambda x: x[:5])

In [32]:
merged.zip_code_google = merged.zip_code_google.replace("nan", np.nan)

In [33]:
merged.zip_code_google = merged.zip_code_google.apply(float)

Now we will take only those entries where every place has a matching zip code

In [34]:
matching = merged[merged.zip_code_google == merged.zip_code_restaurants]

At last to be sure that we have rating corresponding to a correct place we will compare names using python difflib, from docs "ratio() returns a float in [0, 1], measuring the similarity of the sequences. As a rule of thumb, a ratio() value over 0.6 means the sequences are close matches:". We will use this property to discard any rows where name similarity is lower than 0.6

In [35]:
from difflib import SequenceMatcher

def get_similarity_score(restaurant_name, google_name):
    return SequenceMatcher(lambda x: x == " ",
                    restaurant_name,
                    google_name).ratio()

In [36]:
matching['name_similarity_score'] = matching.apply(lambda row: get_similarity_score(row["place_name_restaurants"], row["place_name_google"]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [37]:
matching = matching[matching.name_similarity_score > 0.6]

As a last step we remove rows with duplicated place_id

In [38]:
matching = matching.drop_duplicates(subset=["place_id"])

In [39]:
len(matching)

5700

we have ratings for 5700 restaurants

We will transform matching dataframe to use it for further analysis, we will remove useless columns and save it to csv file

In [40]:
matching.drop(["name_similarity_score", "place_name_google", "price_level", "address_google", "zip_code_google", "city"], axis=1, inplace=True)

In [41]:
matching.rename(columns={"place_name_restaurants": "place_name", "address_restaurants": "address", "zip_code_restaurants": "zip_code"}, inplace=True)

In [43]:
matching.to_csv("../data/processed/restaurants_with_ratings.csv")