In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df_restaurants = pd.read_csv('data_restaurant.csv')
df_reviews = pd.read_csv('data_reviews.csv')
#dropping columns which are not useful
df_restaurants.drop(columns = ['Unnamed: 0'], inplace = True)
df_reviews.drop(columns = ['Unnamed: 0', 'cool', 'date', 'funny', 'review_id', 'text', 'useful'], inplace = True)  

In [4]:
df_restaurants = df_restaurants[['business_id', 'name']].copy()    #selecting required columns from original dataframe

In [5]:
df_restaurants.head()

Unnamed: 0,business_id,name
0,Apn5Q_b6Nz61Tq4XzPdf9A,Minhas Micro Brewery
1,AjEbIBw6ZFfln7ePHha9PA,CK'S BBQ & Catering
2,O8S5hYJ1SMc8fA4QBtVujA,La Bastringue
3,6OuOZAok8ikONMS_T3EzXg,Thai One On
4,8-NRKkPY1UiFXW20WXKiXg,Filiberto's Mexican Food


In [6]:
df_reviews.head()

Unnamed: 0,business_id,stars,user_id
0,iCQpiavjjPzJ5_3gPD5Ebg,2,msQe1u7Z_XuqjGoqhB0J5g
1,pomGBqfbxcqPv14c3XH-ZQ,5,msQe1u7Z_XuqjGoqhB0J5g
2,jtQARsP6P-LbkyjbO1qNGg,1,msQe1u7Z_XuqjGoqhB0J5g
3,elqbBhBfElMNSrjFqW3now,2,msQe1u7Z_XuqjGoqhB0J5g
4,Ums3gaP2qM3W1XcA5r6SsQ,5,msQe1u7Z_XuqjGoqhB0J5g


In [7]:
df_restaurants.shape, df_reviews.shape

((57173, 2), (237468, 3))

In [8]:
#Merging both the dataframes
restaurant_df = pd.merge(df_reviews, df_restaurants.drop_duplicates(['business_id']), on="business_id", how="left")

In [9]:
restaurant_df.columns

Index(['business_id', 'stars', 'user_id', 'name'], dtype='object')

In [10]:
restaurant_df = restaurant_df[['business_id', 'user_id', 'name', 'stars']]  #Rearranging the columns

In [11]:
restaurant_df.head()

Unnamed: 0,business_id,user_id,name,stars
0,iCQpiavjjPzJ5_3gPD5Ebg,msQe1u7Z_XuqjGoqhB0J5g,Secret Pizza,2
1,pomGBqfbxcqPv14c3XH-ZQ,msQe1u7Z_XuqjGoqhB0J5g,Leticia's Mexican Cocina,5
2,jtQARsP6P-LbkyjbO1qNGg,msQe1u7Z_XuqjGoqhB0J5g,H&H BBQ Plus 2,1
3,elqbBhBfElMNSrjFqW3now,msQe1u7Z_XuqjGoqhB0J5g,Pin Kaow Thai Restaurant,2
4,Ums3gaP2qM3W1XcA5r6SsQ,msQe1u7Z_XuqjGoqhB0J5g,Braddah's Island Style,5


In [12]:
restaurant_df.shape

(237468, 4)

In [13]:
#No of unique users
users = restaurant_df['user_id'].unique()
len(users)

104262

In [14]:
train_data, test_data = train_test_split(restaurant_df, test_size = 0.20, random_state=0)
train_data.head(5)

Unnamed: 0,business_id,user_id,name,stars
230703,3ZHyw0d8mk0HOEP7v4fW1A,zfLngxTs7Dvs2tXmz4vSmw,Krung Thai Restaurant,3
97906,VSX3ixrqQFwwGBUf15s0EQ,g5BftD2l8pS4dW7zv8P87Q,Grassroots Kitchen & Tap,4
155206,N5mcKKdcmwirw1bFhE80Sw,8GwL3hkpkFUmDJkO8jOvmg,The Living Room Wine CafÃ© & Lounge,2
93758,v3rXLmTCX6ZFR6kIYTY2fg,LjsaERLKfh_GtLqLcEw9Hg,Istanbul Grille,5
112167,QsJ6orXv_VB7xz-MpqHzSQ,ydEqM2xb86e2OUIV6ZQkVg,Pomegranate Cafe,5


In [15]:
users = train_data['user_id'].unique()
len(users)

90320

## Popularity Based Recommender

In [16]:
def popularity_recommender(train_df, user_id):
    #Calculating average ratings for each restaurants
    restaurants_grouped = train_df.groupby(['business_id']).agg({'stars': 'mean'}).reset_index()
    restaurants_grouped.rename(columns = {'stars': 'average_rating'},inplace=True)
    #Sort the restaurants based on ratings in descending order
    df_sort = restaurants_grouped.sort_values(['average_rating', 'business_id'], ascending = [False, True])
    #Generate a ranking system based on ratings
    df_sort['Rank'] = restaurants_grouped['average_rating'].rank(ascending = 0, method = 'first')
    #Get top 10 restaurants
    df_top_ten = df_sort.head(10)
    #Adding user_id for which the recommendations are created
    df_top_ten['user_id'] = user_id
    return df_top_ten

In [17]:
user_id = users[5]
popularity_recommender(restaurant_df, user_id)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,business_id,average_rating,Rank,user_id
4,--GM_ORV2cYS-h38DSaCLw,5.0,1.0,7H0vJk-mjuXgvvGwj_8pyg
5,--Gc998IMjLn8yr-HTzGUg,5.0,2.0,7H0vJk-mjuXgvvGwj_8pyg
7,--KCl2FvVQpvjzmZSPyviA,5.0,3.0,7H0vJk-mjuXgvvGwj_8pyg
22,-0aIra_B6iALlfqAriBSYA,5.0,4.0,7H0vJk-mjuXgvvGwj_8pyg
36,-2WW_cjFa9-UwKSmbEs78Q,5.0,5.0,7H0vJk-mjuXgvvGwj_8pyg
38,-2isRNVb6PDuBagELL5EBw,5.0,6.0,7H0vJk-mjuXgvvGwj_8pyg
52,-59dl6bXMKpmv38TosZJlg,5.0,7.0,7H0vJk-mjuXgvvGwj_8pyg
64,-79cl_yASWXiv7RmzirNxA,5.0,8.0,7H0vJk-mjuXgvvGwj_8pyg
76,-8F04F54iDT6VgWPCgybug,5.0,9.0,7H0vJk-mjuXgvvGwj_8pyg
87,-9CvNuJ35elp7YwIy4xHDw,5.0,10.0,7H0vJk-mjuXgvvGwj_8pyg


## Similarity Based Personal Recommender System

In [18]:
#Get unique restaurants for a given user
def get_user_items(train_df, user):
    user_data =train_df[train_df['user_id'] == user]
    user_items = list(user_data['business_id'].unique())
    return user_items

In [19]:
#Get unique users for a given restaurant
def get_items_users(train_df, item):
    item_data = train_df[train_df['business_id'] == item]
    item_users = set(item_data['user_id'].unique())
    return item_users

In [20]:
#Get all unique restaurants from training data
def get_all_items(train_df):
    all_items = list(train_df['business_id'].unique())
    return all_items

In [21]:
def construct_cocurrence_matrix(train_df, user_rest, all_rest):
    #Get users for all restaurants in user_rest
    user_rest_users = []
    for i in range(0,len(user_rest)):
        user_rest_users.append(get_items_users(train_df,user_rest[i]))
        
        '''
        Initialize the item cocurrence matrix of size
        len(user_rest) x len(rest)
        '''
        cocurrence_matrix = np.matrix(np.zeros(shape=(len(user_rest), len(all_rest))),float)
        '''
        Calculate similarity between user restaurants and all unique restaurants
        in training data
        '''
        
    for i in range(0,len(all_rest)):
        #Calculate unique customers of restaurant i
        rest_i_data = train_df[train_df['business_id'] == all_rest[i]]
        users_i = set(rest_i_data['user_id'].unique())
            
        for j in range(0, len(user_rest)):
            #Get unique customers of restaurant j
            users_j = user_rest_users[j]
                
            #Calculate intersection of customers of restaurant i and j
            users_intersection = users_i.intersection(users_j)
                
            #Calculate cocurrence_matrix[i,j] 
            if len(users_intersection)!= 0:
                #Calculate union of customers of restaurant i and j
                users_union = users_i.union(users_j)
                cocurrence_matrix[j,i] = float(len(users_intersection))/float(len(users_union))
            else:
                cocurrence_matrix[j,i] = 0
    return cocurrence_matrix

In [22]:
def generate_recommendations(user, cocurrence_matrix, all_rest, user_rest):
    print("Non zero values in cocurrence matrix: %d" %np.count_nonzero(cocurrence_matrix))
    
    #Calculate a weighted average of the scores for all the songs
    user_sim_scores = cocurrence_matrix.sum(axis = 0)/float(cocurrence_matrix.shape[0])
    user_sim_scores = np.array(user_sim_scores)[0].tolist()
    
    #Sorting indices of user_sim_scores based upon their value
    sort_index = sorted(((e,i) for i,e in enumerate(list(user_sim_scores))), reverse = True)
    
    #Create a dataframe
    columns = ['user_id', 'business_id', 'score', 'rank']
    df = pd.DataFrame(columns = columns)
    
    #Filling the dataframe with top 10 recommendations
    rank = 1
    for i in range(0,len(sort_index)):
        if ~np.isnan(sort_index[i][0]) and all_rest[sort_index[i][1]] not in user_rest and rank <10:
            df.loc[len(df)] = [user, all_rest[sort_index[i][1]], sort_index[i][0], rank]
            rank = rank + 1
        
    #Handling the case where there is no recommendations
    if df.shape[0] == 0:
        print("Current user has no data for training the item similarity based recommendations")
        return -1
    else:
        return df


In [23]:
def recommend(train_df,user):
    #Get all unique restaurants for this user
    user_rest = get_user_items(train_df,user)
    print("No. of unique restaurants for the user: %d" %len(user_rest))
    
    #Get all unique restaurants in the training data
    all_rest = get_all_items(train_df)
    print("No. of unique restaurants in the training set: %d" %len(all_rest))
    
    #Construct cocurrence matrix
    cocurrence_matrix = construct_cocurrence_matrix(train_df,user_rest, all_rest)
    
    #Use the cocurrence matrix to make recommendations
    df_recommend = generate_recommendations(user, cocurrence_matrix, all_rest,user_rest)
    return df_recommend

In [24]:
def get_similar_items(train_df,item_list):
    user_rest = item_list
    all_rest = get_all_items(train_df)
    
    print("No. of unique restaurants in the training set: %d" %len(all_rest))
    
    #Construct item cocurrence matrix to make recommendations
    cocurrence_matrix = construct_cocurrence_matrix(user_rest, all_rest)
    
    #Use the cocurrence amtrix to make recommendations
    user =""
    df_recommendations = generate_recommendations(user, cocurrence_matrix, all_rest, user_rest)
    
    return df_recommendations

In [37]:
user_id = users[351]
user_items = get_user_items(train_data, user_id)

In [38]:
recommend(train_data,user_id)

No. of unique restaurants for the user: 7
No. of unique restaurants in the training set: 36257
Non zero values in cocurrence matrix: 522


Unnamed: 0,user_id,business_id,score,rank
0,i5O1z5htF7slUq_0D94dOQ,FoyNJ9w3cYW_YnDjiaHK2A,0.025275,1
1,i5O1z5htF7slUq_0D94dOQ,Y0E0MsmJagLlyiSUn9uv-w,0.025275,2
2,i5O1z5htF7slUq_0D94dOQ,HvVV1N9RyxsJhayQeDuuTA,0.02381,3
3,i5O1z5htF7slUq_0D94dOQ,nuJbVf0KUcpMG3i3RtNd_A,0.023191,4
4,i5O1z5htF7slUq_0D94dOQ,GI9nWATB9s_1ECMSNnXEJA,0.023191,5
5,i5O1z5htF7slUq_0D94dOQ,KYxiIkANSVjLhdO939GK6A,0.023191,6
6,i5O1z5htF7slUq_0D94dOQ,5aGYZbjkXpCecz4QH2ZKFg,0.021429,7
7,i5O1z5htF7slUq_0D94dOQ,WCkh_ZcsWKU1wx4bqyU9KA,0.020408,8
8,i5O1z5htF7slUq_0D94dOQ,VvXI3pyUFl1MAimZsX5RrA,0.019918,9
