Try using 'categories' attributes and calculate a simple text similarity, to measure content-based similarity.

In [1]:
%matplotlib inline

In [2]:
import os, sys, time
import numpy as np
import numpy.ma as ma
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
business = pd.read_csv("business.csv")
business.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,1314 44 Avenue NE,"{'BikeParking': 'False', 'BusinessAcceptsCredi...",Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",Calgary,"{'Monday': '8:30-17:0', 'Tuesday': '11:0-21:0'...",1,51.091813,-114.031675,Minhas Micro Brewery,,T2E 6L6,24,4.0,AB
1,,"{'Alcohol': 'none', 'BikeParking': 'False', 'B...",AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,"{'Friday': '17:0-23:0', 'Saturday': '17:0-23:0...",0,35.960734,-114.939821,CK'S BBQ & Catering,,89002,3,4.5,NV
2,1335 rue Beaubien E,"{'Alcohol': 'beer_and_wine', 'Ambience': ""{'ro...",O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",Montréal,"{'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...",0,45.540503,-73.5993,La Bastringue,Rosemont-La Petite-Patrie,H2G 1K7,5,4.0,QC
3,211 W Monroe St,,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",Phoenix,,1,33.449999,-112.076979,Geico Insurance,,85003,8,1.5,AZ
4,2005 Alyth Place SE,{'BusinessAcceptsCreditCards': 'True'},8USyCYqpScwiNEb58Bt6CA,"Home & Garden, Nurseries & Gardening, Shopping...",Calgary,"{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",1,51.035591,-114.027366,Action Engine,,T2H 0N5,4,2.0,AB


In [4]:
business.shape

(188593, 15)

In [5]:
# split into Las Vegas and Phoenix that are 'Restaurant' or 'Food' and open
business_lv = business[(business.city == 'Las Vegas') & (business.categories.str.contains('Restaurant|Food')) & (business.is_open == 1)]
business_ph = business[(business.city == 'Phoenix') & (business.categories.str.contains('Restaurant|Food')) & (business.is_open == 1)]

print(business_lv.shape, business_ph.shape)

(5451, 15) (3542, 15)


In [6]:
business_lv.head(10)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
32,2255 N Rampart Blvd,"{'Alcohol': 'none', 'Ambience': ""{'romantic': ...",kgffcoxT6BQp-gJ-UQ7Czw,"Fast Food, Restaurants, Sandwiches",Las Vegas,"{'Monday': '9:0-21:0', 'Tuesday': '9:0-21:0', ...",1,36.201794,-115.281981,Subway,Summerlin,89128,13,2.5,NV
33,2227 N Rampart Blvd,"{'Alcohol': 'beer_and_wine', 'Ambience': ""{'ro...",0jtRI7hVMpQHpUVtUy4ITw,"Beer, Wine & Spirits, Italian, Food, American ...",Las Vegas,"{'Monday': '7:0-14:30', 'Tuesday': '7:0-19:0',...",1,36.20199,-115.283122,Omelet House Summerlin,Summerlin,89128,242,4.0,NV
141,4505 E Bonanza Rd,"{'Alcohol': 'none', 'Ambience': ""{'romantic': ...",zhxnD7J5_sCrKSw5cwI9dQ,"Chicken Wings, Restaurants, Fast Food",Las Vegas,"{'Monday': '10:0-23:0', 'Tuesday': '10:0-23:0'...",1,36.17314,-115.077945,Popeyes Louisiana Kitchen,Sunrise,89110,16,1.5,NV
143,"560 N. Nellis Blvd, Ste E1","{'Alcohol': 'none', 'Ambience': ""{'romantic': ...",2kWrSFkIes_d2BMg4YrRtA,"Restaurants, Pizza",Las Vegas,"{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",1,36.169353,-115.061694,Pizza Hut,Sunrise,89110,19,2.5,NV
145,3480 S Maryland Pkwy,"{'BusinessAcceptsCreditCards': 'True', 'GoodFo...",6llKs7K_tn8ChXcIM-oTvg,"Japanese, Restaurants",Las Vegas,,1,36.125934,-115.135253,Sansei Japan,Eastside,89169,3,4.5,NV
161,6400 SE Centennial Center Blvd,"{'BikeParking': 'True', 'BusinessAcceptsCredit...",nqnYDW_FMwuTejAPNEOhqA,"Party & Event Planning, Event Planning & Servi...",Las Vegas,"{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...",1,36.274733,-115.268934,Seasonal Adventures Pumpkin Patch,Centennial,89149,11,3.0,NV
197,2505 E Tropicana Ave,"{'Alcohol': 'none', 'Ambience': ""{'romantic': ...",YV9GVfmDSDM7HSV0jVdTOA,"Restaurants, Salad, Fast Food, Mexican",Las Vegas,"{'Monday': '10:0-23:0', 'Tuesday': '10:0-23:0'...",1,36.099671,-115.116144,El Pollo Loco,Southeast,89121,34,3.0,NV
203,4910 S Maryland Pkwy,"{'Alcohol': 'none', 'Ambience': ""{'romantic': ...",F7OsiFk9aLZtqZczA84xpw,"Southern, Chicken Wings, Fast Food, American (...",Las Vegas,"{'Monday': '10:0-23:0', 'Tuesday': '10:0-23:0'...",1,36.100395,-115.136306,Popeyes Louisiana Kitchen,Southeast,89119,56,2.0,NV
249,2201 S Fort Apache Rd,"{'Alcohol': 'full_bar', 'Ambience': ""{'romanti...",XeDLyY2a7nZ3IEY4RYslXA,"American (New), Restaurants, Food, Breweries, ...",Las Vegas,"{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",1,36.147496,-115.298066,Chicago Brewing Company,Westside,89117,446,3.5,NV
257,7530 S Las Vegas Blvd,"{'Alcohol': 'full_bar', 'Ambience': ""{'romanti...",MDVbJicJvqaO4WGx0vEabQ,"Bars, American (Traditional), Nightlife, Resta...",Las Vegas,"{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",1,36.052802,-115.171496,Chili's,Southeast,89123,230,3.0,NV


In [7]:
business_lv.iloc[0,:]

address                                       2255 N Rampart Blvd
attributes      {'Alcohol': 'none', 'Ambience': "{'romantic': ...
business_id                                kgffcoxT6BQp-gJ-UQ7Czw
categories                     Fast Food, Restaurants, Sandwiches
city                                                    Las Vegas
hours           {'Monday': '9:0-21:0', 'Tuesday': '9:0-21:0', ...
is_open                                                         1
latitude                                                  36.2018
longitude                                                -115.282
name                                                       Subway
neighborhood                                            Summerlin
postal_code                                                 89128
review_count                                                   13
stars                                                         2.5
state                                                          NV
Name: 32, 

In [8]:
business_ph.categories.head(10)

5                                     Coffee & Tea, Food
12     Bars, Sports Bars, Dive Bars, Burgers, Nightli...
22     Nightlife, Bars, American (Traditional), Sport...
24                                  Mexican, Restaurants
39                         Pizza, Fast Food, Restaurants
47     Event Planning & Services, Soup, Salad, Mexica...
101                           Beer, Wine & Spirits, Food
187    American (Traditional), Comfort Food, Barbeque...
302                                 Mexican, Restaurants
360    Fast Food, Restaurants, Sandwiches, American (...
Name: categories, dtype: object

In [9]:
# reset pandas dataframe index
#business_lv.reset_index() #old index is added as a column
#business_lv.reset_index(drop=True) #does not add old index as a column

In [10]:
type(business_ph.categories[5])

str

In [11]:
business_ph.categories[5].split(', ')

['Coffee & Tea', 'Food']

In [12]:
# define function for comparing word for word

def compare_texts(text1, text2):
    # Takes two texts
    # hashes their words into twos lists
    # calculates the intersection and union
    # of the two lists, and returns
    # Jacard similarity value
     
    array_1 = text1.split(', ')
    array_2 = text2.split(', ')
            
    intersection = len(list(set(array_1).intersection(array_2)))    
    union = len(set(array_1)) + len(set(array_2)) - intersection
    jacard_similarity = intersection / union
    return jacard_similarity

In [13]:
compare_texts(business_lv.categories[32], business_ph.categories[360])

0.75

### Similarity

In [21]:
# define function for returning the business with highest similarity

def return_similar_business(b_id, df1, df2):
    # Looks up business ID in df1
    # calculates Jacard similarity value based on texts in 'categories'
    # against all businesses in df2
    # returns top 10 business ID with highest similarity
    
    text1 = df1.categories[df1[df1.business_id == b_id].index.values[0]]
    
    similarity = {}
        
    for i in df2.business_id:
        sim = compare_texts(text1, df2.categories[df2[df2.business_id == i].index.values[0]])
        similarity[i] = sim
    
    # return list of businesses sorted by similarity measure
    return sorted(similarity.items(), key=lambda x: x[1], reverse=True)

    #dict(zip(keys, values))

In [22]:
sim = return_similar_business('0jtRI7hVMpQHpUVtUy4ITw', business_lv, business_ph)
len(sim)

3542

In [23]:
sim[:10]

[('y8zlR75tcmvkg49dqdLGDg', 0.625),
 ('zPAMOTc_VEfgPxseUtWw-w', 0.5555555555555556),
 ('mmZaOPp2BFMmWe6suuJndQ', 0.5555555555555556),
 ('MM8n7SPaIuMlDGbUM5SZhA', 0.5555555555555556),
 ('j0_DUr3vBXY-JP-b0bf93A', 0.5555555555555556),
 ('8EvgKu_VpVXJ0KUvrxS7BA', 0.5555555555555556),
 ('0h5yI6np7weAVH0jsjHqPw', 0.5),
 ('uOJ87zqbU1MtShCSHDkiyw', 0.5),
 ('EDcZRvERC22Cvw1yi4-VKg', 0.5),
 ('DWx1UgbHLHU-2Ko02klhZg', 0.5)]

Check the most similar business categories.

In [19]:
print(business_lv.categories[business_lv.business_id == '0jtRI7hVMpQHpUVtUy4ITw'])
print(business_ph.categories[business_ph.business_id == 'y8zlR75tcmvkg49dqdLGDg'])

33    Beer, Wine & Spirits, Italian, Food, American ...
Name: categories, dtype: object
147237    Food, Pizza, Beer, Wine & Spirits, Italian, Re...
Name: categories, dtype: object


In [20]:
sim_list

[('y8zlR75tcmvkg49dqdLGDg', 0.625),
 ('zPAMOTc_VEfgPxseUtWw-w', 0.5555555555555556),
 ('mmZaOPp2BFMmWe6suuJndQ', 0.5555555555555556),
 ('MM8n7SPaIuMlDGbUM5SZhA', 0.5555555555555556),
 ('j0_DUr3vBXY-JP-b0bf93A', 0.5555555555555556),
 ('8EvgKu_VpVXJ0KUvrxS7BA', 0.5555555555555556),
 ('0h5yI6np7weAVH0jsjHqPw', 0.5),
 ('uOJ87zqbU1MtShCSHDkiyw', 0.5),
 ('EDcZRvERC22Cvw1yi4-VKg', 0.5),
 ('DWx1UgbHLHU-2Ko02klhZg', 0.5),
 ('VBJwRxOAtvjy7taLXEjmfg', 0.45454545454545453),
 ('cTJjTKz2huGZ-ElScC2pSw', 0.45454545454545453),
 ('82I9SSilxh3zL8F1UQ45Xw', 0.45454545454545453),
 ('mUVAMNN7BCQ9HGA9w_7C1g', 0.45454545454545453),
 ('aMF8cG445ONPRKBsBYqofg', 0.4444444444444444),
 ('mAryfEwdr7Lby2CzmLFbow', 0.4444444444444444),
 ('JQYkVnUzS5vgFxZgCuW7ag', 0.4444444444444444),
 ('qBd6uxX63lQuM7lzJ_Xihg', 0.42857142857142855),
 ('6Zk5F7fsTr8n2CJTlaxHlw', 0.42857142857142855),
 ('UoLOE3b3iL_J-D1b8C-QfQ', 0.42857142857142855),
 ('X8sgCwr1xIkmTItsxYrsyw', 0.42857142857142855),
 ('wKbdS9JDsk2-GgaTOYD4GA', 0.428571

Now we want to use this business similarity function to iterate through all businesses the specific user has visited.

In [25]:
review_lv = pd.read_csv("review_lv.csv")
review_ph = pd.read_csv("review_ph.csv")

In [26]:
review_ph.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,city,categories,is_open
0,VyVIneSU7XAWgMBllI6LnQ,0,2017-07-08,0,F1UHGw4-6OYyvFFb-E4LMg,5,DELICIOUS! forgot to take a before picture but...,0,5ngpW5tf3ep680eG1HxHzA,Phoenix,"Barbeque, Event Planning & Services, Restauran...",1
1,Xg5qEQiB-7L6kGJ5F4K3bQ,0,2015-12-08,0,QBZz9m3pL6r8Mvbdm2ZBYQ,5,I was in Phoenix on business last week. My PHX...,0,Me267nGzYOJAGNSbNR_e6A,Phoenix,"Barbeque, Restaurants",1
2,Jj8ubiwwuCR-rrhrrjcryw,0,2016-08-14,0,3zNW-BK3YlJnGBdCxpE46g,2,"Been here 3x and no beef #45, will stop coming...",0,0pf5VuzE4_1pwj5NJHG5TQ,Phoenix,"Vietnamese, Restaurants",1
3,ERCZtj8qxNxfXJrdXPdEsw,0,2016-04-29,2,ml9QmqSdQllqSO78RteQfA,1,"In case my email didn't work, still giving a 1...",0,0pf5VuzE4_1pwj5NJHG5TQ,Phoenix,"Fashion, Shopping, Shoe Stores",1
4,CtI7MMRF-arzGJxA1nJccQ,0,2016-06-18,0,_03MY3VpWImsHHmDdll0Mw,4,"Despite the long drive thru line, they were re...",0,0pf5VuzE4_1pwj5NJHG5TQ,Phoenix,"Coffee & Tea, Food",1


In [40]:
[i for i in review_ph.business_id[review_ph.user_id == '0pf5VuzE4_1pwj5NJHG5TQ'].values]

['Jj8ubiwwuCR-rrhrrjcryw',
 'ERCZtj8qxNxfXJrdXPdEsw',
 'CtI7MMRF-arzGJxA1nJccQ',
 'F9cXFXk60E2sz_YTSCFp_Q',
 'OX0T9dWI8b7meu-ljTo22A',
 'nUaLFTfUqVKMFQonsWBnag',
 '-YR7K3rw6VAQ1-MjslvsoQ',
 '7m1Oa1VYV98UUuo_6i0EZg',
 'SVMR0kRmdd-bbSNTTECJBA',
 'QVYyTgJFz7lcMv31ZmghGg',
 'Szcr-yMxq76zRaSTUapAcw',
 'vrGwjE59kp6Yh_ZlkflD-w',
 'BNzcfz3jmBo_1wUB5YIsXg',
 '8-su-O_330PebTOp60RILQ',
 'szhJLmdLDVFTevm8fu0T4A',
 'SvlTuwslYKdqvqWtDe2dgA',
 'qwy9xdDuCwaVf8r-Wjx5ig',
 'Wc9UpJhOcdSj7olZkz7SJA',
 'XGLGvB8S6-Anmt7gZkPLkQ',
 'hkmaUwfNQtFAJFpxYOHBkg',
 'yz3KtngrF2PQrif8JW7gdw',
 'RkXZ4Gx6SHmSMe_R9c1Dcw',
 'ikTlMObBu3aI4xNmE9X4mA',
 'T2XdzAcj0M_kBD9_BHfZlw',
 'fbLYRHIZAt3q839whhaBUg',
 'K1rAToLggkTvjRz46IU-Tg',
 'gxEkxkqQrrK62DJnXxXDLg',
 'yfMK1YGTDsiD-fE5JJtzWg',
 'H45fQ69_8DpWz-yB9oPhhw',
 '70umoFq_Wa24Nr78UOsilQ',
 'O8sBSjxL8hQbA41lKtcoJg',
 'y8k_rQBYeHZHpbX141Jhwg',
 'X1dOyxoGQnKavvl6lXxjMg',
 '4r5yWUyeo2EDc5DDsAMh9A',
 '24Td_CQH1bonWKff1rt2vg',
 'RTWR4yDwK9uHgYycUC6_SA']

In [41]:
def return_similar_businesses(uid, review1, review2, business1, business2):
    # Looks up the input uid in review1 and returns the business ids that user left review for
    # Iterates through the returned business ids to calculate similarity based on categories text
    # Returns top 10 similar business ids from review2 for each business id
    
    b_list = [i for i in review1.business_id[review1.user_id == uid].values]
    
    dic = {}
    for b in b_list:
        sim = return_similar_business(b, business1, business2)
        dic[b] = sim[:10]
    return dic

In [42]:
dict_test = return_similar_businesses('0pf5VuzE4_1pwj5NJHG5TQ', review_ph, review_lv, business_ph, business_lv)

IndexError: index 0 is out of bounds for axis 0 with size 0