### Load raw dataset

In [1]:
import pandas as pd
import seaborn as sns

yolosac = pd.read_csv('data/yolosac.csv')
yolosac['category'] = yolosac['category'].apply(eval)
print (yolosac['category'])

0                                      [Massage therapist]
1                                      [Massage therapist]
2                                          [Door supplier]
3                                                 [Church]
4                                                 [Church]
                               ...                        
17933                                [American restaurant]
17934                                        [Gas station]
17935                                      [Movie theater]
17936                                   [Pizza restaurant]
17937    [Electronics store, Appliance store, Computer ...
Name: category, Length: 17938, dtype: object


In [25]:
# Top 30 business categories
print(yolosac['category'].explode().value_counts().iloc[:30])
yolosac['category'].explode().value_counts().to_csv('data/business_categories.csv')

category
Restaurant              1352
Fast food restaurant     649
Auto repair shop         513
Park                     463
Mexican restaurant       429
Beauty salon             428
Takeout Restaurant       398
Coffee shop              394
Grocery store            382
Breakfast restaurant     380
Gas station              377
Hair salon               364
Sandwich shop            359
Cafe                     353
American restaurant      347
Tourist attraction       322
Pizza restaurant         320
Caterer                  319
Nail salon               318
Convenience store        310
Bar                      285
Hamburger restaurant     285
Clothing store           260
Cell phone store         235
Tire shop                222
Barber shop              211
ATM                      210
Used car dealer          209
Chinese restaurant       207
Auto parts store         199
Name: count, dtype: int64


### Find correlation between raw categories

#### One Hot Encoding of raw categories

In [2]:
# Credit: https://towardsdatascience.com/dealing-with-list-values-in-pandas-dataframes-a177e534f173
def bool_df(item_lists, unique_items):
  bool_dict = {}
  # Iterate over all tags
  for i, item in enumerate(unique_items):
    # Apply Boolean mask
    bool_dict[item] = item_lists.apply(lambda x: item in x)
  
  # Return as dataframe
  return pd.DataFrame(bool_dict)

In [3]:
# List of unique business categories with frequency
unique_cats = yolosac['category'].explode().value_counts()
# Generate boolean matrix where each category is a column (one hot encoding)
categories_bool = bool_df(item_lists=yolosac['category'], unique_items=unique_cats.index)
categories_bool

Unnamed: 0,Restaurant,Fast food restaurant,Auto repair shop,Park,Mexican restaurant,Beauty salon,Takeout Restaurant,Coffee shop,Grocery store,Breakfast restaurant,...,Amusement ride supplier,Fund management company,Raw food restaurant,Civil engineering company,Industrial building,Mechanical contractor,Finance broker,Stock broker,Asphalt contractor,Wildlife park
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17933,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
17934,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
17935,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
17936,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


#### Correlation Matrix between raw buiness categories

In [12]:
# Generate correlation table between categories (takes ~2 mins)
categories_corr = categories_bool.corr(method="pearson")
categories_corr

Unnamed: 0,Restaurant,Fast food restaurant,Auto repair shop,Park,Mexican restaurant,Beauty salon,Takeout Restaurant,Coffee shop,Grocery store,Breakfast restaurant,...,Amusement ride supplier,Fund management company,Raw food restaurant,Civil engineering company,Industrial building,Mechanical contractor,Finance broker,Stock broker,Asphalt contractor,Wildlife park
Restaurant,1.000000,0.345007,-0.048988,-0.046473,0.128078,-0.043253,0.296780,0.065274,-0.034801,0.180908,...,-0.002132,-0.002132,-0.002132,-0.002132,-0.002132,-0.002132,-0.002132,-0.002132,-0.002132,-0.002132
Fast food restaurant,0.345007,1.000000,-0.033244,-0.031537,0.186558,-0.028335,0.424817,0.101322,-0.028580,0.294913,...,-0.001447,-0.001447,-0.001447,-0.001447,-0.001447,-0.001447,-0.001447,-0.001447,-0.001447,-0.001447
Auto repair shop,-0.048988,-0.033244,1.000000,-0.027929,-0.026858,-0.026826,-0.025846,-0.025713,-0.025310,-0.025242,...,-0.001281,-0.001281,-0.001281,-0.001281,-0.001281,-0.001281,-0.001281,-0.001281,-0.001281,-0.001281
Park,-0.046473,-0.031537,-0.027929,1.000000,-0.025479,-0.025448,-0.024519,-0.024393,-0.024010,-0.023946,...,-0.001215,-0.001215,-0.001215,-0.001215,-0.001215,-0.001215,-0.001215,-0.001215,-0.001215,0.045872
Mexican restaurant,0.128078,0.186558,-0.026858,-0.025479,1.000000,-0.024472,0.130006,-0.023458,-0.010453,0.118869,...,-0.001169,-0.001169,-0.001169,-0.001169,-0.001169,-0.001169,-0.001169,-0.001169,-0.001169,-0.001169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Mechanical contractor,-0.002132,-0.001447,-0.001281,-0.001215,-0.001169,-0.001167,-0.001125,-0.001119,-0.001101,-0.001098,...,-0.000056,-0.000056,-0.000056,-0.000056,1.000000,1.000000,-0.000056,-0.000056,-0.000056,-0.000056
Finance broker,-0.002132,-0.001447,-0.001281,-0.001215,-0.001169,-0.001167,-0.001125,-0.001119,-0.001101,-0.001098,...,-0.000056,-0.000056,-0.000056,-0.000056,-0.000056,-0.000056,1.000000,1.000000,-0.000056,-0.000056
Stock broker,-0.002132,-0.001447,-0.001281,-0.001215,-0.001169,-0.001167,-0.001125,-0.001119,-0.001101,-0.001098,...,-0.000056,-0.000056,-0.000056,-0.000056,-0.000056,-0.000056,1.000000,1.000000,-0.000056,-0.000056
Asphalt contractor,-0.002132,-0.001447,-0.001281,-0.001215,-0.001169,-0.001167,-0.001125,-0.001119,-0.001101,-0.001098,...,-0.000056,-0.000056,-0.000056,-0.000056,-0.000056,-0.000056,-0.000056,-0.000056,1.000000,-0.000056


In [13]:
# Helper function to extract similar categories
def similarCats(cat):
  return categories_corr[str(cat)].sort_values(ascending=False).loc[categories_corr[str(cat)] > 0].index.to_list()

### Extract simplified business categories

In [14]:
restaurant = similarCats('Restaurant')
print(*restaurant, sep='\n')

Restaurant
Fast food restaurant
Sandwich shop
Takeout Restaurant
Caterer
Hamburger restaurant
Lunch restaurant
American restaurant
Breakfast restaurant
Tex-Mex restaurant
Bar
Burrito restaurant
Taco restaurant
Asian restaurant
Vegetarian restaurant
Mexican restaurant
Chinese restaurant
Salad shop
Diner
Steak house
Family restaurant
Seafood restaurant
Bar & grill
Hot dog restaurant
Californian restaurant
Japanese restaurant
Gastropub
Brunch restaurant
Deli
Cajun restaurant
Sports bar
Juice shop
European restaurant
Pasta shop
Box lunch supplier
Barbecue restaurant
Coffee shop
Sushi restaurant
Fine dining restaurant
Chicken wings restaurant
Hot dog stand
Down home cooking restaurant
Italian restaurant
Cafeteria
Traditional American restaurant
Soup restaurant
New American restaurant
Delivery Restaurant
Thai restaurant
Vietnamese restaurant
Takeout restaurant
Buffet restaurant
Mediterranean restaurant
Grill
Pizza restaurant
Bistro
Greek restaurant
Catering food and drink supplier
Pretzel st

In [51]:
hotel = similarCats('Hotel')
print(*hotel, sep='\n')

Army facility
Lodging
Event venue
Meeting room
Motel
Inn
Bed & breakfast
Conference center
Business center
Gambling house
Lodge
Hotel
Extended stay hotel
Banquet hall
Meeting planning service
Wedding venue
Indoor lodging
Function room facility
Casino
Bar
Boutique


In [52]:
yolosac[categories_bool['Hotel'] & categories_bool['Restaurant']]

Unnamed: 0,name,gmap_id,category,avg_rating,num_of_reviews
14921,Park Winters,0x8084dd62863323ab:0x73b08ffc3fac394,"[Event venue, Bed & breakfast, Hotel, Inn, Res...",4.9,138
16053,Rogelio's Dine & Sleep Inn,0x809aad5d5b95d979:0x27d24a0876dd5d9d,"[Restaurant, Bar, Bed & breakfast, Casino, Chi...",4.2,88


In [53]:
beauty = similarCats('Beauty salon')
print(*beauty, sep='\n')

Body piercing shop
Facial spa
Barber shop
Cosmetics industry
Tanning salon
Massage therapist
Spa
Office space rental agency
Ear piercing service
Beauty salon
Mehndi designer
Hair extensions supplier
Medical spa
Waxing hair removal service
Eyebrow bar
Reflexologist
Hair replacement service
Health and beauty shop
Organic shop
Hair care
Skin care clinic
Hair extension technician
Permanent make-up clinic
Aromatherapy service
Day spa
Stylist
Nail salon
Hair removal service
Make-up artist
Massage supply store
Massage spa
Beauty school
Cosmetics store
Beauty supply store
Hairdresser
Wig shop
Hair salon
Barber school
Beauty product supplier
Beauty
Eyelash salon
Trade school
Barber supply store


In [54]:
car = similarCats('Auto repair shop')
print(*car, sep='\n')

Brake shop
Isuzu dealer
Auto market
Diagnostic center
Truck dealer
Lexus dealer
Car accessories store
Travel agency
Hyundai dealer
Truck repair shop
Car security system installer
Toyota dealer
Diesel engine repair service
Kia dealer
Convenience store
Auto sunroof shop
Powder coating service
Car stereo store
Car factory
Auto bodywork mechanic
Vehicle inspection
Manufacturer
Used car dealer
Auto wrecker
Porsche dealer
Boat repair shop
Car inspection station
Car detailing service
Racing car parts store
Gas station
Auto dent removal service
Auto body shop
Auto tag agency
Chrysler dealer
Repair service
Jeep dealer
Auto spring shop
Auto machine shop
Tire repair shop
Muffler shop
Department of motor vehicles
BMW dealer
Window tinting service
Buick dealer
Car repair and maintenance
Tire shop
Maserati dealer
Salvage yard
Chevrolet dealer
Engine rebuilding service
Honda dealer
Gas company
Nissan dealer
Electronics repair shop
Auto glass shop
Ford dealer
RV dealer
Audi dealer
Car dealer
Decal sup

In [56]:
education = similarCats('School')
education.update(similarCats('College'))
print(*education, sep='\n')

Restaurant supply store
Medical school
Private educational institution
Elementary school
School district office
Child care agency
Non-profit organization
Graduate school
Vocational school
Cleaning products supplier
Dental school
Day care center
University
Vocational college
Business school
Catholic school
Mosque
Nursing school
Senior high school
Public educational institution
Primary school
School
Technical school
Adult education school
Junior college
Distribution service
Private college
Yoga studio
After school program
Comedy club
Uniform store
Preschool
Computer training school
Martial arts school
Car detailing service
Educational institution
College
Law school
Middle school
Private university
Trade school
High school
Community college


In [57]:
health = similarCats('Doctor')
health.update(similarCats('Hospital'))
print(*health, sep='\n')

Psychotherapist
Plastic surgeon
Walk-in clinic
Massage therapist
Dentist
Specialized hospital
Mental health clinic
Medical Center
Wellness center
Emergency care service
Medical clinic
Hospital
Ophthalmologist
General hospital
General practitioner
Counselor
Surgeon
Hearing aid store
Emergency care physician
Business center
Weight loss service
Pediatrician
Family practice physician
Obstetrician-gynecologist
Internist
Emergency room
Medical group
Dental clinic
Marriage or relationship counselor
Psychologist
Radiologist
Community health centre
Physical therapist
Audiologist
Urgent care center
Cardiologist
Orthopedic surgeon
Chiropractor
Doctor
Optician
Gift shop
Medical office


In [66]:
clothes = similarCats('Clothing store')
print(*clothes, sep='\n')

Clothing store
Men's clothing store
Women's clothing store
Shoe store
Department store
Children's clothing store
Fashion accessories store
Jeans shop
T-shirt store
Swimwear store
Baby store
Sporting goods store
Sportswear store
Outdoor sports store
Running store
Outlet store
Toy store
Plus size clothing store
Camping store
Discount store
Baby clothing store
Lingerie store
Dress store
Underwear store
Jewelry store
Custom t-shirt store
Luggage store
Sunglasses store
Youth clothing store
Baseball goods store
Electronics store
Work clothes store
Skateboard shop
Home goods store
Boutique
Skate shop
Outdoor clothing and equipment shop
Watch store
Vintage clothing store
Clothes and fabric manufacturer
Clothing supplier
Outerwear store
Novelty store
Used clothing store
Tuxedo shop
Snowboard shop
Golf shop
Furniture store
Western apparel store
Craft store
Outdoor activity organiser
Surf shop
Hunting and fishing store
Grocery store
Clothes market
Protective clothing supplier
Bicycle store
Boot s

In [67]:
rec = similarCats('Park')
print(*rec, sep='\n')

Park
Tourist attraction
Baseball field
Recreation
Basketball court
Playground
Tennis court
Softball field
Amphitheater
Wildlife park
Scenic spot
Fishing pond
Fountain
Animals
Soccer field
Amusement park
State park
Amusement park ride
Hockey rink
Dog park
Skateboard park
Government
Baseball
Theme park
Zoo
Picnic ground
Recreation center
Corporate campus
Wildlife refuge
City park
Memorial park
Golf instructor
Golf club
Pumpkin patch
Ranch
Swimming pool
Nature preserve
Public golf course
Sports equipment rental service
Auditorium
Children's amusement center
Event venue
Golf course
Amusement center
Public swimming pool
Government office
Sports complex
Wedding venue
Live music venue


In [9]:
yolosac[categories_bool['ATM'] & ~(categories_bool['Bank'] | categories_bool['Gas station'])]

Unnamed: 0,name,gmap_id,category,avg_rating,num_of_reviews
83,Bank of America ATM,0x80852903058f1645:0x23304ec837207bc3,[ATM],4.0,2
201,Bank of America ATM,0x80852903058f1645:0x23304ec837207bc3,[ATM],4.0,2
548,Bank of America ATM (Drive-thru),0x809b277f9207d47b:0x41e30805fe829932,[ATM],2.5,8
563,Chase ATM,0x809ade22e9018c3d:0xebc08a39ab6be790,[ATM],5.0,1
567,ampm,0x809ada95f0c5d52b:0xf2659572b88e6e19,"[Convenience store, ATM, Tobacco shop]",4.0,24
...,...,...,...,...,...
14917,Golden 1 Credit Union,0x8084d14ac0a9c627:0xde4ea9ab9c0168bf,"[ATM, Credit union]",3.1,18
15428,Circle Six | Food Stores,0x809ad7576a5c227f:0x421bb1d3dee1b5bb,"[Grocery store, ATM]",3.7,158
15589,Golden 1 Credit Union,0x809ae41640f1c733:0xd316d8cdba932a24,"[ATM, Credit union]",3.4,16
15939,ampm,0x809b2b059474df0d:0xdd78d0a4c834d6b6,"[Convenience store, ATM, Tobacco shop]",3.4,26


### Recategorize businesses

In [19]:
def simplify(lst, cats):
  for cat in cats:
    if cat in lst:
        return cats[0]
  return lst

In [20]:
yolosac2 = yolosac


yolosac2['category'] = yolosac2['category'].map( lambda x : simplify(x, restaurant))

print(yolosac2[categories_bool['Chinese restaurant']].iloc[:30])

                                                   name  \
1220                        Hometown Chinese Restaurant   
1392                                     house of chang   
2023                                        Asia Garden   
2289                                         Wu Village   
2652                                         G Hut Cafe   
3612                                     Beijing Garden   
3998                                       Yum Cha Café   
4399                                     Fortune Palace   
4467                          Sunflower Chinese Cuisine   
4513                  Woking Express Halal Asian Fusion   
5046                                        Jade Garden   
5050                           California Wok and Grill   
5054                                      Tea Cup House   
5079                             River Wok Chinese Food   
5339                     Jen Kitchen Chinese Restaurant   
5382         Liang's Joy Luck Garden Seafood Restaurant 