# Univ. of Illinois Data Mining Project on Coursera
## Task 00 - Extract Cuisines from Yelp Restaurants
2018-09-22
loganjtravis@gmail.com (Logan Travis)

In [1]:
# Imports
import json, os
import numpy as np
import pandas as pd

### Summary

Extract all "cuisines" for restaurants in the Yelp data set. See course page [Week 1 > Orientation > Data Set and Toolkit Acquisition](https://www.coursera.org/learn/data-mining-project/supplement/Ij7rp/data-set-and-toolkit-acquisition). Unfortunatley, the `yelp_academic_dataset_business.json` data set groups cuisines with other business descriptors in the `categories` feature. Extracted "cuisines" therefore include descriptions not normally associated with food...

### Get Data Set

In [2]:
# Set paths to data source and output
PATH_SOURCE = "source/yelp_dataset_challenge_academic_dataset/"
PATH_OUTPUT = "source/"

# Set review file path
PATH_SOURCE_YELP_BUSINESSES = PATH_SOURCE + "yelp_academic_dataset_business.json"

# Set ouptput file path (using GZIP compression)
PATH_OUTPUT_YELP_CUISINES = PATH_OUTPUT + "yelp_academic_dataset_cuisine.csv"
PATH_OUTPUT_YELP_REST_TO_CUISINES = PATH_OUTPUT + "yelp_academic_dataset_restaurant_to_cuisine.pkl.gzip"

The Yelp data set did not provide a schema. All files use JSON formatting but the `pandas.read_json` ([link to documentation](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html)) requires an `orient` parameter to correctly parse JSON. Here are the first three lines of the review dataset:

In [3]:
# Since the data did not include a schema description, print the first 3 lines
with open(PATH_SOURCE_YELP_BUSINESSES) as f:
    for i in range(3):
        print(f"Line {i:d}: {json.loads(next(f))}\n")

Line 0: {'business_id': 'vcNAWiLM4dR7D2nwwJ7nCA', 'full_address': '4840 E Indian School Rd\nSte 101\nPhoenix, AZ 85018', 'hours': {'Tuesday': {'close': '17:00', 'open': '08:00'}, 'Friday': {'close': '17:00', 'open': '08:00'}, 'Monday': {'close': '17:00', 'open': '08:00'}, 'Wednesday': {'close': '17:00', 'open': '08:00'}, 'Thursday': {'close': '17:00', 'open': '08:00'}}, 'open': True, 'categories': ['Doctors', 'Health & Medical'], 'city': 'Phoenix', 'review_count': 7, 'name': 'Eric Goldberg, MD', 'neighborhoods': [], 'longitude': -111.983758, 'state': 'AZ', 'stars': 3.5, 'latitude': 33.499313, 'attributes': {'By Appointment Only': True}, 'type': 'business'}

Line 1: {'business_id': 'JwUE5GmEO-sH1FuwJgKBlQ', 'full_address': '6162 US Highway 51\nDe Forest, WI 53532', 'hours': {}, 'open': True, 'categories': ['Restaurants'], 'city': 'De Forest', 'review_count': 26, 'name': 'Pine Cone Restaurant', 'neighborhoods': [], 'longitude': -89.335844, 'state': 'WI', 'stars': 4.0, 'latitude': 43.2388

Each row is an object and they appear consistent so set `orient` to "records" and `lines` to `True`.

In [4]:
# Load businesses into Pandas dataframe
dfYelpBusinesses = pd.read_json(PATH_SOURCE_YELP_BUSINESSES, orient="records", lines=True)

In [5]:
# Print dataframe shape and head
print(f"Shape: {dfYelpBusinesses.shape}")
dfYelpBusinesses.head()

Shape: (42153, 15)


Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
0,{'By Appointment Only': True},vcNAWiLM4dR7D2nwwJ7nCA,"[Doctors, Health & Medical]",Phoenix,"4840 E Indian School Rd\nSte 101\nPhoenix, AZ ...","{'Tuesday': {'close': '17:00', 'open': '08:00'...",33.499313,-111.983758,"Eric Goldberg, MD",[],True,7,3.5,AZ,business
1,"{'Take-out': True, 'Good For': {'dessert': Fal...",JwUE5GmEO-sH1FuwJgKBlQ,[Restaurants],De Forest,"6162 US Highway 51\nDe Forest, WI 53532",{},43.238893,-89.335844,Pine Cone Restaurant,[],True,26,4.0,WI,business
2,"{'Take-out': True, 'Good For': {'dessert': Fal...",uGykseHzyS5xAMWoN6YUqA,"[American (Traditional), Restaurants]",De Forest,"505 W North St\nDe Forest, WI 53532","{'Monday': {'close': '22:00', 'open': '06:00'}...",43.252267,-89.353437,Deforest Family Restaurant,[],True,16,4.0,WI,business
3,"{'Take-out': True, 'Wi-Fi': 'free', 'Takes Res...",LRKJF43s9-3jG9Lgx4zODg,"[Food, Ice Cream & Frozen Yogurt, Fast Food, R...",De Forest,"4910 County Rd V\nDe Forest, WI 53532","{'Monday': {'close': '22:00', 'open': '10:30'}...",43.251045,-89.374983,Culver's,[],True,7,4.5,WI,business
4,"{'Take-out': True, 'Has TV': False, 'Outdoor S...",RgDg-k9S5YD_BaxMckifkg,"[Chinese, Restaurants]",De Forest,"631 S Main St\nDe Forest, WI 53532","{'Monday': {'close': '22:00', 'open': '11:00'}...",43.240875,-89.343722,Chang Jiang Chinese Kitchen,[],True,3,4.0,WI,business


### Simplify Data Prior to Cuisine Extraction

In [6]:
# Set `business_id` as index
dfYelpBusinesses.set_index("business_id", inplace=True)

In [7]:
# Filter for restaurants
dfYelpBusinesses = dfYelpBusinesses[\
        dfYelpBusinesses.categories.apply(lambda cats: "Restaurants" in cats)\
]

In [8]:
# Remove "Restuarants" from `categories`
_ = dfYelpBusinesses.categories.apply(lambda cats: cats.remove("Restaurants"))

In [9]:
# Drop unnecessary columns; Note: used full `loc` syntax
# in case I want to include more than `categories` later
dfYelpBusinesses = dfYelpBusinesses.loc[:, ["categories"]] #, "type", "attributes"]]

In [10]:
# Print simplified dataframe shape and head
print(f"Simplified shape: {dfYelpBusinesses.shape}")
dfYelpBusinesses.head()

Simplified shape: (14303, 1)


Unnamed: 0_level_0,categories
business_id,Unnamed: 1_level_1
JwUE5GmEO-sH1FuwJgKBlQ,[]
uGykseHzyS5xAMWoN6YUqA,[American (Traditional)]
LRKJF43s9-3jG9Lgx4zODg,"[Food, Ice Cream & Frozen Yogurt, Fast Food]"
RgDg-k9S5YD_BaxMckifkg,[Chinese]
rdAdANPNOcvUtoFgcaY9KA,[American (Traditional)]


### Extract Cuisines

In [11]:
# Build a set of all cuisines
cuisines = set(dfYelpBusinesses.categories.aggregate(sum))

# Convert to Pandas series
cuisines = pd.Series(list(cuisines))

In [12]:
# Print number of cuisines plus sample
print(f"Found {cuisines.size:,} cuisines. Sample:")
print(cuisines[:10])

Found 240 cuisines. Sample:
0    Ice Cream & Frozen Yogurt
1                    Hospitals
2                      Seafood
3               Transportation
4                    Cambodian
5                       Diners
6       Food Delivery Services
7                       Donuts
8                   Salvadoran
9           Convenience Stores
dtype: object


Not all of the above represent cuisines. Mixed businesses (e.g., a restaurant inside a golf pro-shop) includes non-food descriptors. Close enough for now.

In [13]:
# Exclude categories from cuisines
exclude = [
    "Flowers & Gifts",
    "Horseback Riding",
    "Golf",
    "Hospitals",
    "Performing Arts",
    "Shopping",
    "Arcades",
    "Arts & Crafts",
    "Drugstores",
    "Venues & Event Spaces",
    "Grocery",
    "Outlet Stores",
    "Festivals",
    "Gyms",
    "Leisure Centers",
    "Fitness & Instruction",
    "Cooking Schools",
    "Colleges & Universities",
    "Social Clubs",
    "Auto Repair",
    "Music Venues",
    "Real Estate",
    "Apartments",
    "Casinos",
    "Food Delivery Services",
    "Adult Entertainment",
    "Jazz & Blues",
    "Medical Spas",
    "Country Dance Halls",
    "Home Decor",
    "Tours",
    "Gas & Service Stations",
    "Art Galleries",
    "Gift Shops",
    "Hotels & Travel",
    "Specialty Schools",
    "Taxis",
    "Appliances",
    "Sports Wear",
    "Amusement Parks",
    "Print Media",
    "Caterers",
    "Arts & Entertainment",
    "Health & Medical",
    "Karaoke",
    "RV Parks",
    "Mass Media",
    "Lounges",
    "Event Planning & Services",
    "Day Spas",
    "Active Life",
    "Personal Shopping",
    "Education",
    "Transportation",
    "Dance Clubs",
    "Dry Cleaning & Laundry",
    "Beauty & Spas",
    "Kitchen & Bath",
    "Car Wash",
    "Home Services",
    "Shopping Centers",
    "Nightlife",
    "Internet Cafes",
    "Bowling",
    "Cinema",
    "Home & Garden",
    "Public Services & Government",
    "Cultural Center",
    "Kids Activities",
    "Airports",
    "Gay Bars",
    "Hookah Bars",
    "Convenience Stores",
    "Local Services",
    "Fashion",
    "Pool Halls",
    "Sporting Goods",
    "Do-It-Yourself Food",
    "Party & Event Planning",
    "Automotive",
    "Hotels",
    "Health Markets",
    "Swimming Pools",
    "Personal Chefs",
    "Landmarks & Historical Buildings"
]
cuisines = cuisines[~cuisines.isin(exclude)]

In [14]:
# Print number of cuisines after exclusions
print(f"Found {cuisines.size:,} cuisines after exclusion. Sample:")
print(cuisines[:10])

Found 155 cuisines after exclusion. Sample:
0     Ice Cream & Frozen Yogurt
2                       Seafood
4                     Cambodian
5                        Diners
7                        Donuts
8                    Salvadoran
11                   Vegetarian
13                Mediterranean
15                   Venezuelan
17                 Fish & Chips
dtype: object


### Limit Business Categories to Cuisines

In [15]:
# Update business `categories` with to just cuisines
xsetCuisines = lambda listOfStr: np.intersect1d(listOfStr, cuisines.values)
dfYelpBusinesses.categories = dfYelpBusinesses.categories.apply(lambda cats: xsetCuisines(cats))

In [16]:
# Label empty categories as "Food"
defaultFood = lambda listOfStr: ["Food"] if len(listOfStr) == 0 else listOfStr
dfYelpBusinesses.categories = dfYelpBusinesses.categories.apply(lambda cats: defaultFood(cats))

### Save Data

Save the both cuisine series as CSV and restaurant-to-cuisine dataframe as a GZIPped pickle to "/source/{filename}.{extension}".

In [19]:
dfYelpBusinesses.to_pickle(PATH_OUTPUT_YELP_REST_TO_CUISINES)
print(f"Restauarant-to-Cuisines size: {os.path.getsize(PATH_OUTPUT_YELP_REST_TO_CUISINES) >> 10:,} KB")

Restauarant-to-Cuisines size: 969 KB


In [18]:
cuisines.to_csv(PATH_OUTPUT_YELP_CUISINES, index=False)
print(f"Cuisines size: {os.path.getsize(PATH_OUTPUT_YELP_CUISINES) >> 10:,} KB")

Cuisines size: 1 KB
