In [1]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation,TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
from gensim.models.doc2vec import Doc2Vec,TaggedDocument

In [2]:
reviews = pd.read_csv("yelp_academic_dataset_review.csv")
businesses = pd.read_csv("yelp_academic_dataset_business.csv")

In [3]:
reviews = reviews.merge(businesses, how='left', on='business_id')
reviews = reviews[(reviews['city'] == 'Philadelphia') & (reviews['categories'].str.contains('Restaurants', na=False))]
reviews.reset_index(drop=True, inplace=True)
reviews = reviews[['review_id', 'user_id', 'business_id', 'stars_x', 'text']]
reviews['liked'] = (reviews['stars_x'] > 3).astype(int)
reviews

Unnamed: 0,review_id,user_id,business_id,stars_x,text,liked
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,"Wow! Yummy, different, delicious. Our favo...",1
1,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1.0,I am a long term frequent customer of this est...,0
2,8JFGBuHMoiNDyfcxuWNtrA,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,4.0,Good food--loved the gnocchi with marinara\nth...,1
3,oyaMhzBSwfGgemSGuZCdwQ,Dd1jQj7S-BFGqRbApFzCFw,YtSqYv1Q_pOltsVPSx54SA,5.0,Tremendous service (Big shout out to Douglas) ...,1
4,Xs8Z8lmKkosqW5mw_sVAoA,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5.0,My absolute favorite cafe in the city. Their b...,1
...,...,...,...,...,...,...
687284,Wrt6pZizQzw-ZTKrvMwrGw,ua6QuBe6mar6pDrhHETzJQ,Bk_1vsPtOtO0bojfQZQIOw,2.0,"Ok, I have to say, after living on this block ...",0
687285,X5R98ygOtbhryDiKA-J2qQ,LHWtjTG7e1NzNPYUbUo-9w,rgeuy1qbw6Z8B6CSVANHIA,5.0,I've been to the other Federal Donuts location...,1
687286,MVg4YUQeEhCA7Z7RsBJSVg,7-7A0Avj47slLGV7yBFc8w,ytynqOUb3hjKeJfRj5Tshw,3.0,"I was so excited about all the food I saw, but...",0
687287,nLjbVsETpqO17RbFcqskkA,am7-gkH_PDz598oTdYSD6A,3gVSrS4kffGGZT8oXHsIcw,3.0,"*Later Yelp* I've only been here once, but I l...",0


In [35]:
sample = reviews.sample(10000).reset_index(drop=True)
sample

Unnamed: 0,review_id,user_id,business_id,stars_x,text,liked
0,z_NsFhbdJo7RYKnZT5WWAw,hZHRtmXk5JhSszriw3jnkg,lGwmPQcKEGPIEWKGSoM1kw,5.0,I like theirs food.the special blend sauce is ...,1
1,bVVDKOEVdDa1HgD-kfaxoA,gejdn9yFiMmvBQYJZ5V7iQ,RQAF6a0akMiot5lZZnMNNw,5.0,The best cheesesteaks plain and simple. If you...,1
2,PHd1EEmTSHNK8pl8XTBxqw,7QhdBy475-dT4NQUu701Ng,DsKzHnkLKnxZTVsFpts4oA,1.0,For a restaurant/tavern that's attached to a n...,0
3,cnbhYcx0RC_9utV1VXmvrQ,ZtEI69EQuYsxKKsmqqGogg,PDGyduDCysMP5L__5BnYJA,5.0,"I was so relieved to see they were reopening ""...",1
4,FLHizI8QtACw80kac35UGg,gC9xulH0GzQAAzWh-OH0Pg,8-1vYscR7TNGF7LcJeNZBQ,5.0,The fresh Grill Chicken BBQ Pizza is to die fo...,1
...,...,...,...,...,...,...
9995,_Co9V_xydJj1YeAz0pO1Rg,zPPyxgbjhcy2tHo3F1jMXw,sV6up2rehuTegmfo3uYUtA,3.0,Nothing to write home about. Service was great...,0
9996,MOUnCwuGyB5Aqz4btFD0WA,4CLAGiVV7EyRGhntHWazQA,U2y7fsqDgxAXskoJNVxbwg,5.0,I love this place!! I come here once a week- p...,1
9997,l-zv0sX202_Amq8EMAD1VA,u766nLu7-4ptpYTPdlGbnA,rgeuy1qbw6Z8B6CSVANHIA,5.0,This place was attached to our hotel so we sto...,1
9998,WW3yDjijXivr9R37LqQS3Q,ITpldUH1k90kuxJfccW4SA,kzy0Kf7z1ucbL2sVbfAFsg,3.0,My husband I went here for the second time rec...,0


In [5]:
# Normalization
corpus = []
ps = PorterStemmer()
sw = set(stopwords.words('english'))
for i in range(0, reviews['text'].size):
    # get review and remove non alpha chars
    review = re.sub('[^a-zA-Z]', ' ', reviews['text'][i])
    # to lower-case
    review = review.lower()
    # split into tokens, apply stemming and remove stop words
    review = ' '.join([w for w in review.split() if w not in sw])
    corpus.append(review)

corpus

['wow yummy different delicious favorite lamb curry korma different kinds naan let outside deter almost changed minds go try something new glad',
 'long term frequent customer establishment went order take apps told busy really place maybe half full best dick reach ass yes go fuck frequent customer great tipper glad kanella opened never going back dmitris',
 'good food loved gnocchi marinara baked eggplant appetizer good service slow despite go back food good',
 'tremendous service big shout douglas complemented delicious food pretty expensive establishment avg main course definitely backs atmosphere comparable top tier restaurants across country',
 'absolute favorite cafe city black white latte probably best ever sweet right amount foam soups always really good even non soup people lot space work noise level perfect music perfect level always enjoy patrons bring dogs keep giving business long',
 'boyfriend tried deli first time today turkey avocado bacon panini ha buffalo chicken wrap

In [65]:
# Feature extraction
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

print(X.shape)

(687289, 153651)


In [66]:
n_topics = 8

lsa_model = TruncatedSVD(n_components=n_topics)
lsa_topic_matrix = lsa_model.fit_transform(X)

In [55]:
# Define helper functions
def get_keys(topic_matrix):
    '''
    returns an integer list of predicted topic 
    categories for a given topic matrix
    '''
    keys = topic_matrix.argmax(axis=1).tolist()
    return keys

def keys_to_counts(keys):
    '''
    returns a tuple of topic categories and their 
    accompanying magnitudes for a given list of keys
    '''
    count_pairs = Counter(keys).items()
    categories = [pair[0] for pair in count_pairs]
    counts = [pair[1] for pair in count_pairs]
    return (categories, counts)

def get_top_n_words(n, keys, document_term_matrix, count_vectorizer):
    '''
    returns a list of n_topic strings, where each string contains the n most common 
    words in a predicted category, in order
    '''
    top_word_indices = []
    for topic in range(n_topics):
        temp_vector_sum = 0
        for i in range(len(keys)):
            if keys[i] == topic:
                temp_vector_sum += document_term_matrix[i]
        temp_vector_sum = temp_vector_sum.toarray()
        top_n_word_indices = np.flip(np.argsort(temp_vector_sum)[0][-n:],0)
        top_word_indices.append(top_n_word_indices)   
    top_words = []
    for topic in top_word_indices:
        topic_words = []
        for index in topic:
            temp_word_vector = np.zeros((1,document_term_matrix.shape[1]))
            temp_word_vector[:,index] = 1
            the_word = count_vectorizer.inverse_transform(temp_word_vector)[0][0]
            topic_words.append(the_word.encode('ascii').decode('utf-8'))
        top_words.append(" ".join(topic_words))         
    return top_words


In [67]:
lda_model = LatentDirichletAllocation(n_components=n_topics, learning_method='online', 
                                          random_state=0, verbose=0)
lda_topic_matrix = lda_model.fit_transform(X)

In [68]:
lda_keys = get_keys(lda_topic_matrix)
lda_categories, lda_counts = keys_to_counts(lda_keys)

In [69]:
top_n_words_lda = get_top_n_words(10, lda_keys, X, vectorizer)

for i in range(len(top_n_words_lda)):
    print("Topic {}: ".format(i+1), top_n_words_lda[i])

Topic 1:  good place food like great get cheese one go really
Topic 2:  good food delicious great like sauce ordered dish also one
Topic 3:  chocolate cake pizza cream donuts good ice best like sweet
Topic 4:  food us place service time good one like would order
Topic 5:  food great place good service back delicious really menu restaurant
Topic 6:  beer like food bar place good one great get go
Topic 7:  coffee place great good food breakfast like philly one brunch
Topic 8:  food good place like chicken great really also ordered rice


In [70]:
lda_topic_matrix.shape

(687289, 8)

In [71]:
# add the topics to this dataset as 8 new columns
column_names = ['topic_{}'.format(i+1) for i in range(n_topics)]
topics = pd.DataFrame(lda_topic_matrix, columns=column_names)
reviews = pd.concat([reviews, topics], axis=1)
reviews.to_csv("reviews_topics_user.csv")

reviews

Unnamed: 0,review_id,user_id,stars_x,text,liked,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,5.0,"Wow! Yummy, different, delicious. Our favo...",1,0.313264,0.313915,0.005438,0.254500,0.005445,0.005442,0.005436,0.096559
1,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,1.0,I am a long term frequent customer of this est...,0,0.297167,0.035878,0.003572,0.649079,0.003580,0.003576,0.003574,0.003574
2,8JFGBuHMoiNDyfcxuWNtrA,smOvOajNG0lS4Pq7d8g4JQ,4.0,Good food--loved the gnocchi with marinara\nth...,1,0.007362,0.146414,0.065403,0.007363,0.751392,0.007353,0.007353,0.007360
3,oyaMhzBSwfGgemSGuZCdwQ,Dd1jQj7S-BFGqRbApFzCFw,5.0,Tremendous service (Big shout out to Douglas) ...,1,0.005212,0.079793,0.005209,0.005212,0.748513,0.145623,0.005225,0.005212
4,Xs8Z8lmKkosqW5mw_sVAoA,IQsF3Rc6IgCzjVV9DE8KXg,5.0,My absolute favorite cafe in the city. Their b...,1,0.565553,0.003054,0.071990,0.003056,0.071111,0.003053,0.159809,0.122374
...,...,...,...,...,...,...,...,...,...,...,...,...,...
687284,Wrt6pZizQzw-ZTKrvMwrGw,ua6QuBe6mar6pDrhHETzJQ,2.0,"Ok, I have to say, after living on this block ...",0,0.111870,0.001052,0.001052,0.364793,0.063413,0.001052,0.053158,0.403610
687285,X5R98ygOtbhryDiKA-J2qQ,LHWtjTG7e1NzNPYUbUo-9w,5.0,I've been to the other Federal Donuts location...,1,0.320783,0.052310,0.097361,0.001987,0.130098,0.097764,0.177424,0.122272
687286,MVg4YUQeEhCA7Z7RsBJSVg,7-7A0Avj47slLGV7yBFc8w,3.0,"I was so excited about all the food I saw, but...",0,0.215101,0.005958,0.059662,0.303467,0.096133,0.164183,0.149531,0.005965
687287,nLjbVsETpqO17RbFcqskkA,am7-gkH_PDz598oTdYSD6A,3.0,"*Later Yelp* I've only been here once, but I l...",0,0.003794,0.126797,0.064905,0.048054,0.593378,0.003798,0.003792,0.155481


# NEED TO FILTER FOR USERS THAT REVIEWED AT LEAST 5 TIMES

In [72]:
# Average topic distribution for items user interacted with only liked items
user_profile = reviews[reviews["liked"] == 1].drop(columns=['review_id', 'stars_x', 'text']).groupby("user_id", as_index=False)[column_names].mean()
user_profile.to_csv("user_profiles.csv")

user_profile

Unnamed: 0,user_id,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8
0,---r61b7EpVPkb4UVme5tA,0.149137,0.005005,0.005000,0.395696,0.430137,0.005009,0.005001,0.005015
1,--0kuuLmuYBe3Rmu0Iycww,0.373362,0.160772,0.122085,0.005390,0.156996,0.170622,0.005386,0.005387
2,--2tyArRmSoyKx5r-FVG0A,0.257389,0.009424,0.013265,0.180111,0.185537,0.197084,0.096957,0.060232
3,--2vR0DIsmQ6WfcSzKWigw,0.033140,0.009672,0.812296,0.013007,0.012538,0.021357,0.097229,0.000761
4,--4AjktZiHowEIBCMd4CZA,0.206812,0.005182,0.017323,0.049723,0.425025,0.080224,0.049057,0.166654
...,...,...,...,...,...,...,...,...,...
159922,zzrhWsiCwAKQzbgMZIOtgg,0.230308,0.045626,0.063714,0.082449,0.372313,0.085547,0.103842,0.016201
159923,zztAOHhlNfNzOxUeyxeyFw,0.356585,0.005961,0.005957,0.005964,0.430900,0.122631,0.066047,0.005957
159924,zzvCl_egPyWpxO7EvWc2IA,0.004241,0.067078,0.004237,0.149818,0.644001,0.059269,0.067116,0.004239
159925,zzwYLnmIvj8C7wJmRjtkRA,0.042339,0.122963,0.006520,0.006525,0.709200,0.099399,0.006525,0.006529


In [21]:
frequent_users = reviews[reviews['liked'] == 1]['user_id'].value_counts() > 5
frequent_users.sum()

14626

# NEED TO FILTER FOR RESTAURANTS THAT WERE REVIEWED AT LEAST 5 TIMES

In [16]:
reviews_topic = pd.read_csv("reviews_topics_user.csv")
reviews_topic = pd.concat([reviews_topic, reviews['business_id']], axis=1)
reviews_topic = reviews_topic.drop(columns=['Unnamed: 0', 'review_id', 'user_id', 'stars_x', 'text'])
restaurant_profile = reviews_topic[reviews_topic['liked'] == 1].groupby("business_id", as_index=False)[column_names].mean()
restaurant_profile.to_csv("restaurant_profiles.csv")
restaurant_profile

Unnamed: 0,business_id,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8
0,-0M0b-XhtFagyLmsBtOe8w,0.078602,0.046403,0.022593,0.140175,0.436417,0.175497,0.080066,0.020248
1,-0PN_KFPtbnLQZEeb23XiA,0.402912,0.018541,0.005863,0.084059,0.078386,0.047164,0.063135,0.299939
2,-0TffRSXXIlBYVbb5AwfTg,0.108954,0.216670,0.018824,0.104431,0.367122,0.047990,0.033901,0.102109
3,-0eUa8TsXFFy0FCxHYmrjg,0.418875,0.053791,0.043648,0.096559,0.211321,0.037843,0.081673,0.056292
4,-1B9pP_CrRBJYPICE5WbRA,0.082961,0.078748,0.013261,0.108814,0.360558,0.038375,0.020408,0.296874
...,...,...,...,...,...,...,...,...,...
5772,zxY4DgtXsVHihSUpsmwamg,0.641424,0.130154,0.012577,0.066264,0.075898,0.037468,0.031334,0.004880
5773,zy7uNOvpykrq-XlmDY_wHA,0.248098,0.050230,0.022135,0.199817,0.365974,0.024520,0.074811,0.014414
5774,zyMkbavgHASQtqVwaock9A,0.678724,0.030783,0.018321,0.031066,0.141033,0.033699,0.032831,0.033543
5775,zz-fcqurtm77bZ_rVvo2Lw,0.463548,0.108644,0.059583,0.041887,0.111181,0.117334,0.068131,0.029690


In [22]:
frequent_restaurants = reviews_topic[reviews_topic['liked'] == 1]['business_id'].value_counts() > 5
frequent_restaurants.sum()

4726

In [105]:
user_profile = pd.read_csv("user_profiles.csv")
user_profile = user_profile.drop(columns=['Unnamed: 0'])
restaurant_profile = pd.read_csv("restaurant_profiles.csv")
restaurant_profile = restaurant_profile.drop(columns=['Unnamed: 0'])

In [28]:
# Recommendations

# select one user
user_id = "---r61b7EpVPkb4UVme5tA"
usr_lst = user_profile[user_profile['user_id'] == user_id].drop(columns=['user_id']).values

# removing the restaurants already visited by the user
visited_restaurants = reviews[reviews['user_id'] == user_id]['business_id'].values # including even the ones the user didn't like
new_restaurants = restaurant_profile[-restaurant_profile['business_id'].isin(visited_restaurants)]
new_restaurants.reset_index(drop=True, inplace=True)

# business matrix
bus_matrix = new_restaurants.drop(columns=['business_id']).values

# Compute similarity between user profile and all document topics
similarity_scores = cosine_similarity(usr_lst, bus_matrix)

# Recommend items with highest similarity scores
N = 5
recommended_indices = similarity_scores.argsort()[0][::-1][:N]  # Top N recommendations
restaurant_profile.iloc[recommended_indices]["business_id"]

5088    sDaBm5V1VQMOPfCm76turg
4495    l_FzESKdJPs41Ll5y35xUg
343     2hCIzMXhSbgObLS4al82rg
3185    YRw-uBpdzRZngN6zzoC4WA
2124    MYoRNLb5chwjQe3c_k37Gg
Name: business_id, dtype: object

# Doc2Vec

This takes the text completely untreated. Maybe we can look into the possibility of normalizing it before.

Less than 2 minutes with the 10000 sample

In [36]:
# Filtering for users and restaurants that have at least 5 reviews posted/received

# counts
user_counts = reviews['user_id'].value_counts()
restaurant_counts = reviews['business_id'].value_counts()

# creating filters
users_with_5_plus_reviews = user_counts[user_counts >= 5].index
restaurants_with_5_plus_reviews = restaurant_counts[restaurant_counts >= 5].index

# applying filters to reviews df
filtered_reviews = reviews[
    (reviews['user_id'].isin(users_with_5_plus_reviews)) &
    (reviews['business_id'].isin(restaurants_with_5_plus_reviews)) &
    (reviews['liked'] == 1)
]

filtered_reviews.reset_index(drop=True, inplace=True)
filtered_reviews

Unnamed: 0,review_id,user_id,business_id,stars_x,text,liked
0,8JFGBuHMoiNDyfcxuWNtrA,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,4.0,Good food--loved the gnocchi with marinara\nth...,1
1,Xs8Z8lmKkosqW5mw_sVAoA,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5.0,My absolute favorite cafe in the city. Their b...,1
2,JBWZmBy69VMggxj3eYn17Q,aFa96pz67TwOFu4Weq5Agg,kq5Ghhh14r-eCxlVmlyd8w,5.0,My boyfriend and I tried this deli for the fir...,1
3,cvQXRFLCyr0S7EgFb4lZqw,ZGjgfSvjQK886kiTzLwfLQ,EtKSTHV5Qx_Q7Aur9o4kQQ,5.0,"On a scale of one to things that are awesome, ...",1
4,r2IBPY_E8AE5_GpsqlONyg,IKbjLnfBQtEyVzEu8CuOLg,VJEzpfLs_Jnzgqh5A_FVTg,4.0,It was my fiance's birthday and he decided he ...,1
...,...,...,...,...,...,...
287601,ZcEx4UEnTnR_TEPEqwkKjA,gkg9VqsxPCgpfYXO1dl8CA,Ea663rIHyKXz2VP2DPH7Cg,4.0,I decided to try this place out after Christma...,1
287602,rtt1Ymczj-1Lb26JMsY2lA,M1cMsRL4L7IUr9RILDywEQ,vt_esoDw6HG5ClM12OPkMg,4.0,"5 stars for the Bonte waffle, 3 stars for this...",1
287603,me7QTotYCOjWNVA8bzN1eg,bJ5FtCtZX3ZZacz2_2PJjA,wMQkdK2aNMvq2xoojC98Mw,4.0,South Street Diner isn't the best of Philly Di...,1
287604,5n_oSwXspiiSsZgNwjp48g,bJ5FtCtZX3ZZacz2_2PJjA,SOsjW1JARmtHUFtpFlp8rw,4.0,When I first heard that the Peace A Pizza (htt...,1


In [37]:
sample = filtered_reviews.sample(10000).reset_index(drop=True)
sample

Unnamed: 0,review_id,user_id,business_id,stars_x,text,liked
0,sLC4-3d74U8ZB8-ZjVsSkQ,tCqMd0ZG3B8vi2KOxReW1Q,i_FWONQD1ZBqrNE2b-M5Ug,5.0,This and Suraya are our go to restaurant when ...,1
1,XPKAYjFeHchaGeXj_9I9OQ,0iBwDU_88feB_Drpmjkx-g,7pwZZVVlYCxQvVdd8Q03wg,5.0,Best pizza I've tasted in south philly....I ha...,1
2,CAbVpMMfAEu0y5rjzFFVsA,K1Z4CV1kCmvLYJnA30yfAw,U7HYUH8SqZO6OQMNKCr5kQ,5.0,Holy ba-jesus this place is GREAT! Mooooove ov...,1
3,6dXF_yq-525ksUiwhGCCNQ,55fCofuKVb7XP3newzmX3w,tf2nzjkP8XGIeU-lg767XA,4.0,I've been here for drinks at the bar before an...,1
4,wtyetieCLcewtXSKaeJBeQ,UV41Cu3ZdhPaYCDa8uPL8g,benTAK8255VkJu14ZxVtDQ,4.0,Great addition to the neighborhood! My husband...,1
...,...,...,...,...,...,...
9995,L7d6ernOCVXlikiP5OFIWg,n1OrzdUywZXcvrRa7bfQag,y6-J_UjNk69VNLb39c_5CA,5.0,I had a good experience and it was a lot of fu...,1
9996,T-UnHI0o6rPSt4beq0CZjA,3ZtKlGT9UNCv0tqVsiimkg,FIFi_8eNmc-jPHZVHPS3NQ,4.0,Came here after hearing so many great reviews ...,1
9997,KDAIeB1ZDzvCVOLlgjLncA,vFd8aBLg1kFcd0kCkoi-xw,G9LZoNlCfRH941q87_JLIg,4.0,The esquites is a good way to start your meal ...,1
9998,ZZMJlcWZ1J9piomyl2Sltw,uRwxr1xulZk3g8_eimR6tA,LzVvcVm48zYIcQnwLKt1oA,5.0,Ok. Seriously I don't know where to begin! I g...,1


In [38]:
data = sample['text']

# preproces the documents, and create TaggedDocuments
tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()),
							tags=[str(i)]) for i,
			doc in enumerate(data)]

# Doc2vec model
model = Doc2Vec(vector_size=100,
				min_count=2, epochs=50)
model.build_vocab(tagged_data)
model.train(tagged_data,
			total_examples=model.corpus_count,
			epochs=model.epochs)

# document vectors
document_vectors = [model.infer_vector(
	word_tokenize(doc.lower())) for doc in data]

In [39]:
sample['doc2vec'] = pd.Series(document_vectors)

In [40]:
sample

Unnamed: 0,review_id,user_id,business_id,stars_x,text,liked,doc2vec
0,sLC4-3d74U8ZB8-ZjVsSkQ,tCqMd0ZG3B8vi2KOxReW1Q,i_FWONQD1ZBqrNE2b-M5Ug,5.0,This and Suraya are our go to restaurant when ...,1,"[-0.45106882, 0.03996058, 1.1453925, -1.439583..."
1,XPKAYjFeHchaGeXj_9I9OQ,0iBwDU_88feB_Drpmjkx-g,7pwZZVVlYCxQvVdd8Q03wg,5.0,Best pizza I've tasted in south philly....I ha...,1,"[-0.31197488, 0.7667, -0.30364245, -1.3111172,..."
2,CAbVpMMfAEu0y5rjzFFVsA,K1Z4CV1kCmvLYJnA30yfAw,U7HYUH8SqZO6OQMNKCr5kQ,5.0,Holy ba-jesus this place is GREAT! Mooooove ov...,1,"[-2.5873663, 0.85221237, -1.8083317, -3.284638..."
3,6dXF_yq-525ksUiwhGCCNQ,55fCofuKVb7XP3newzmX3w,tf2nzjkP8XGIeU-lg767XA,4.0,I've been here for drinks at the bar before an...,1,"[0.63472456, 2.3002424, -1.4746091, -0.8441891..."
4,wtyetieCLcewtXSKaeJBeQ,UV41Cu3ZdhPaYCDa8uPL8g,benTAK8255VkJu14ZxVtDQ,4.0,Great addition to the neighborhood! My husband...,1,"[-0.010030669, -0.83387834, -3.2065663, 0.2614..."
...,...,...,...,...,...,...,...
9995,L7d6ernOCVXlikiP5OFIWg,n1OrzdUywZXcvrRa7bfQag,y6-J_UjNk69VNLb39c_5CA,5.0,I had a good experience and it was a lot of fu...,1,"[0.24692239, 0.28317174, -0.6580293, -0.288496..."
9996,T-UnHI0o6rPSt4beq0CZjA,3ZtKlGT9UNCv0tqVsiimkg,FIFi_8eNmc-jPHZVHPS3NQ,4.0,Came here after hearing so many great reviews ...,1,"[-0.3912266, 1.0629753, -0.8093995, 0.04401463..."
9997,KDAIeB1ZDzvCVOLlgjLncA,vFd8aBLg1kFcd0kCkoi-xw,G9LZoNlCfRH941q87_JLIg,4.0,The esquites is a good way to start your meal ...,1,"[-0.4884597, 0.5126089, -0.102682665, -0.78100..."
9998,ZZMJlcWZ1J9piomyl2Sltw,uRwxr1xulZk3g8_eimR6tA,LzVvcVm48zYIcQnwLKt1oA,5.0,Ok. Seriously I don't know where to begin! I g...,1,"[0.34513474, -1.5451471, -1.2362162, 0.3502437..."


In [41]:
sample_expanded = pd.DataFrame(sample['doc2vec'].tolist(), index=sample.index)
sample_expanded.columns = [f'doc2vec_{i}' for i in range(sample_expanded.shape[1])]
sample = sample.drop('doc2vec', axis=1).join(sample_expanded)

sample

Unnamed: 0,review_id,user_id,business_id,stars_x,text,liked,doc2vec_0,doc2vec_1,doc2vec_2,doc2vec_3,...,doc2vec_90,doc2vec_91,doc2vec_92,doc2vec_93,doc2vec_94,doc2vec_95,doc2vec_96,doc2vec_97,doc2vec_98,doc2vec_99
0,sLC4-3d74U8ZB8-ZjVsSkQ,tCqMd0ZG3B8vi2KOxReW1Q,i_FWONQD1ZBqrNE2b-M5Ug,5.0,This and Suraya are our go to restaurant when ...,1,-0.451069,0.039961,1.145393,-1.439584,...,-1.377381,1.748691,-1.017092,-0.620256,1.416039,0.672509,-0.662186,-0.605508,-0.397329,0.444505
1,XPKAYjFeHchaGeXj_9I9OQ,0iBwDU_88feB_Drpmjkx-g,7pwZZVVlYCxQvVdd8Q03wg,5.0,Best pizza I've tasted in south philly....I ha...,1,-0.311975,0.766700,-0.303642,-1.311117,...,-0.219193,-0.324556,0.023984,-0.764924,0.503930,-0.840846,0.436587,1.447701,0.141035,1.260364
2,CAbVpMMfAEu0y5rjzFFVsA,K1Z4CV1kCmvLYJnA30yfAw,U7HYUH8SqZO6OQMNKCr5kQ,5.0,Holy ba-jesus this place is GREAT! Mooooove ov...,1,-2.587366,0.852212,-1.808332,-3.284638,...,-0.965245,2.605013,1.049494,-0.973607,0.048613,-0.816860,-0.450335,-0.254436,0.703874,-2.108874
3,6dXF_yq-525ksUiwhGCCNQ,55fCofuKVb7XP3newzmX3w,tf2nzjkP8XGIeU-lg767XA,4.0,I've been here for drinks at the bar before an...,1,0.634725,2.300242,-1.474609,-0.844189,...,-1.077259,1.561410,0.601046,0.024337,-0.618392,-1.030566,0.404822,1.013267,1.445827,0.959639
4,wtyetieCLcewtXSKaeJBeQ,UV41Cu3ZdhPaYCDa8uPL8g,benTAK8255VkJu14ZxVtDQ,4.0,Great addition to the neighborhood! My husband...,1,-0.010031,-0.833878,-3.206566,0.261463,...,-1.021978,0.923961,-0.598759,0.381343,-0.377123,-1.242114,1.200541,-0.579291,-1.025504,-1.072454
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,L7d6ernOCVXlikiP5OFIWg,n1OrzdUywZXcvrRa7bfQag,y6-J_UjNk69VNLb39c_5CA,5.0,I had a good experience and it was a lot of fu...,1,0.246922,0.283172,-0.658029,-0.288496,...,-0.514799,0.439222,-0.090009,-0.354539,0.404292,0.468304,-0.219190,0.176248,0.067341,-0.357425
9996,T-UnHI0o6rPSt4beq0CZjA,3ZtKlGT9UNCv0tqVsiimkg,FIFi_8eNmc-jPHZVHPS3NQ,4.0,Came here after hearing so many great reviews ...,1,-0.391227,1.062975,-0.809399,0.044015,...,-0.416782,0.008877,-1.017066,-0.666284,-0.487110,0.852823,-0.199479,0.283766,0.822444,-0.365639
9997,KDAIeB1ZDzvCVOLlgjLncA,vFd8aBLg1kFcd0kCkoi-xw,G9LZoNlCfRH941q87_JLIg,4.0,The esquites is a good way to start your meal ...,1,-0.488460,0.512609,-0.102683,-0.781004,...,-0.554141,1.082571,0.499381,-1.086260,0.159616,0.603335,0.084637,0.795217,-0.267860,-1.525839
9998,ZZMJlcWZ1J9piomyl2Sltw,uRwxr1xulZk3g8_eimR6tA,LzVvcVm48zYIcQnwLKt1oA,5.0,Ok. Seriously I don't know where to begin! I g...,1,0.345135,-1.545147,-1.236216,0.350244,...,1.073642,-0.393033,1.016322,1.192494,1.686341,-0.482530,0.347534,1.070352,-0.218979,0.286283


In [42]:
user_profile_doc2vec =  sample.drop(columns=['review_id', 'business_id', 'stars_x', 'text', 'liked']).groupby("user_id", as_index=False)[sample_expanded.columns].mean()
user_profile_doc2vec

Unnamed: 0,user_id,doc2vec_0,doc2vec_1,doc2vec_2,doc2vec_3,doc2vec_4,doc2vec_5,doc2vec_6,doc2vec_7,doc2vec_8,...,doc2vec_90,doc2vec_91,doc2vec_92,doc2vec_93,doc2vec_94,doc2vec_95,doc2vec_96,doc2vec_97,doc2vec_98,doc2vec_99
0,-25VzPSb4ox-GdWmKDtshA,-1.301838,0.683539,-1.051339,-1.094017,-0.747729,-0.833799,0.650213,0.057953,1.348980,...,0.573487,-2.113169,0.360323,-0.535257,1.204833,-0.551977,-2.105661,-0.501908,1.357714,-0.720134
1,-2cKJFFNJ9XVyWBt62mWvA,-0.151490,-0.216041,-0.107470,-0.243526,1.580797,-0.797302,0.593055,0.974836,-0.153719,...,1.805690,0.652622,0.294857,0.795383,-1.274903,0.057253,0.071028,0.872815,-0.228553,0.310425
2,-3P0apvgEEM_cCIRyl3bRw,0.236318,2.744531,-0.608267,-1.657934,3.090559,0.068579,-0.849246,0.799614,0.237397,...,-0.628696,0.827344,-0.515293,-0.476057,0.677622,1.687139,-0.035063,0.322998,-0.489022,-0.299178
3,-47Rm6DWQxl2skeCl-vFHg,0.370408,0.428403,0.516310,-1.042840,0.018001,-0.645075,-0.154592,1.334038,-0.387951,...,-0.174853,0.241877,-0.856027,-0.951270,-0.794492,-0.223143,1.088800,0.232143,0.869431,-0.111912
4,-6nlh2IbzCbGhWndMLZt1g,-0.490849,1.209373,0.809219,0.774592,1.435144,1.191466,0.677854,0.247982,-1.505491,...,1.042966,1.719276,0.128132,0.546436,0.307675,-1.225703,-0.480467,-0.763259,0.089684,-0.882564
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6869,zx2NkJtfSvJhid6rxvYMlg,-0.605965,0.609501,1.637628,0.549139,1.322350,-0.446108,-1.380047,1.224981,-0.433248,...,-1.072574,0.575207,-0.101791,-0.401478,1.251195,-0.162863,0.925879,-0.317275,0.751595,0.509876
6870,zx43prq7t59Bg5xDOm1t3g,-0.709193,0.686933,-0.610342,-0.181626,0.990744,0.724226,0.863436,0.079055,0.719572,...,0.343236,0.777813,-0.147987,-0.186692,0.813609,-0.552430,0.343865,-0.028431,0.980844,-0.464169
6871,zxeG7pecqW6SPGrYIP0HSg,-0.860021,0.834991,-0.421162,0.072532,0.567860,-1.418928,0.307376,0.950814,-0.329542,...,0.226755,0.058169,-0.579540,0.204547,0.415189,-0.217747,-0.782550,0.029245,0.799248,-0.818683
6872,zyB2nCLs1x27K-sXLj8aDw,-1.109280,1.809126,0.327575,-1.649184,0.699434,0.420974,1.010823,-0.129215,0.679385,...,0.389543,0.470620,1.365513,-0.859373,0.764206,0.698052,-0.405975,1.627416,1.461163,0.564337


In [43]:
restaurant_profile_doc2vec =  sample.drop(columns=['review_id', 'user_id', 'stars_x', 'text', 'liked']).groupby("business_id", as_index=False)[sample_expanded.columns].mean()
restaurant_profile_doc2vec

Unnamed: 0,business_id,doc2vec_0,doc2vec_1,doc2vec_2,doc2vec_3,doc2vec_4,doc2vec_5,doc2vec_6,doc2vec_7,doc2vec_8,...,doc2vec_90,doc2vec_91,doc2vec_92,doc2vec_93,doc2vec_94,doc2vec_95,doc2vec_96,doc2vec_97,doc2vec_98,doc2vec_99
0,-0PN_KFPtbnLQZEeb23XiA,-0.343017,-0.505310,-0.858467,0.003974,0.787255,-0.108218,-0.586535,0.059905,-0.503976,...,1.399061,0.399789,0.378700,-1.964260,1.759055,-0.005532,0.085946,-0.410141,-0.270524,-1.346960
1,-0TffRSXXIlBYVbb5AwfTg,-0.256850,0.386000,-0.395760,-0.288296,0.908090,0.097290,0.177546,0.526278,0.072453,...,0.341356,0.316282,0.211761,-0.403023,0.887313,0.112904,-0.708907,0.217309,0.161457,0.002104
2,-0eUa8TsXFFy0FCxHYmrjg,-1.996013,-0.591426,-0.340084,0.765210,1.110209,0.302426,-0.393390,1.676181,-1.171620,...,0.088121,-0.763345,-0.578279,-0.319888,0.266820,-0.783076,1.112962,-0.464288,-0.267040,0.401994
3,-1B9pP_CrRBJYPICE5WbRA,-0.580241,0.041306,-1.091146,-0.004317,0.898353,-0.218667,0.276324,0.688997,-0.240543,...,0.161978,0.000830,-0.179954,-0.247282,0.541362,0.065893,-0.439497,-0.208325,-0.346133,0.181681
4,-2-ih3mE8KPyeKVIzpBfPQ,-0.559072,0.800264,-1.458378,-1.029026,1.027816,-1.389903,-0.046550,-0.018216,0.806164,...,0.521168,1.143190,-0.088687,0.298586,-0.121110,0.215365,0.153591,0.145992,-1.041175,-0.501286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2708,zujdPV3HT-Y-CKE1GgkMHQ,-0.947553,0.213453,0.003991,-0.664370,0.246650,-0.866719,-0.252758,0.717134,-0.073692,...,1.202132,0.740893,0.008193,-0.695996,0.243963,-0.439433,0.138277,-0.317086,0.559143,-0.189497
2709,zvvl3c1FO3O3BZdhusficA,-0.819395,-0.129767,-0.617420,-1.098633,-0.498430,0.256153,1.815694,2.251440,0.900176,...,-2.639077,-1.241327,0.676447,1.896346,0.158675,-0.319389,-0.485694,-0.450734,-0.193166,-0.385535
2710,zwTmOj4B_OVPMTMYijQiKg,-2.712824,-0.325418,0.666085,-0.555177,-0.564782,-1.448224,0.332883,-0.482389,0.923767,...,2.069994,-0.723348,0.643271,0.883189,0.090938,0.331887,-0.136998,0.153328,1.740309,0.384210
2711,zwd4dyQ5ovnjVojWfAuhMw,-0.793893,0.197163,0.410260,-1.641406,0.800636,0.292556,-0.536209,0.672725,0.405609,...,-0.062029,0.088131,-1.129928,-0.273341,1.933125,1.361800,1.406788,-0.432450,1.515419,-0.152973


In [45]:
# Recommendations

# select one user
user_id = "-25VzPSb4ox-GdWmKDtshA"
usr_lst = user_profile_doc2vec[user_profile_doc2vec['user_id'] == user_id].drop(columns=['user_id']).values

# removing the restaurants already visited by the user
visited_restaurants = reviews[reviews['user_id'] == user_id]['business_id'].values # including even the ones the user didn't like
new_restaurants = restaurant_profile_doc2vec[-restaurant_profile_doc2vec['business_id'].isin(visited_restaurants)]
new_restaurants.reset_index(drop=True, inplace=True)

# business matrix
bus_matrix = new_restaurants.drop(columns=['business_id']).values

# Compute similarity between user profile and all document topics
similarity_scores = cosine_similarity(usr_lst, bus_matrix)

# Recommend items with highest similarity scores
N = 5
recommended_indices = similarity_scores.argsort()[0][::-1][:N]  # Top N recommendations
new_restaurants.iloc[recommended_indices]["business_id"]

1394    WP9GAuhvmUhm8MAxMqhgrQ
2106    lWedWkinrM5j13pyimbpbA
2101    lRbHFOIFuusN2WOR_ypQ_A
2090    l8gAoQqqVfphPe9jmIJZ3g
2353    rYqmaOIULRouz_1db07OdQ
Name: business_id, dtype: object

In [103]:
def recommend(user_id, reviews, user_profile, restaurant_profile, type='user_item', N=5, K=5):
    
    '''
    Provides N recommendations of restaurants to the given user based on three
    different methods explained in the comments

    Inputs:
    user_id - the user the recommendations are aimed at
    reviews - original df with every review
    user_profile - df with the profiles of the users (vectors from LSA/LDA/doc2vec)
    restaurant_profile - df with the profiles of the restaurants (vectors from LSA/LDA/doc2vec)
    type - type of recommendations (user_item, users or items) (default = "user_item")
    N - How many recommendations to output (user_item method) (default = 5)
    K - Number of neighbors to consider (users or items method) (default = 5)
    N_peruser - Number of recommendations per user (only for the users method) (default = 1) [REMOVED]

    Outputs:
    Series of N recommended restaurants (business_id)
    '''

    # Extracting the vector relative to the user and removing the user from the profiles
    usr_lst = user_profile[user_profile['user_id'] == user_id].drop(columns=['user_id']).values
    user_profile = user_profile[user_profile['user_id'] != user_id].reset_index(drop=True)
    
    # Removing the restaurants already visited by the user
    visited_restaurants = reviews[reviews['user_id'] == user_id]['business_id'].values # including even the ones the user didn't like
    new_restaurants = restaurant_profile[-restaurant_profile['business_id'].isin(visited_restaurants)]
    new_restaurants.reset_index(drop=True, inplace=True)

    # Final user and business matrixes in array format
    user_matrix = user_profile.drop(columns=['user_id']).values
    bus_matrix = new_restaurants.drop(columns=['business_id']).values

    if type == 'user_item':
        # Measures the similarity between user and available restaurants
        # Recommends the restaurants that are most similar to the user
        similarity_scores = cosine_similarity(usr_lst, bus_matrix)
        recommended_indices = similarity_scores.argsort()[0][::-1][:N]
        recommendations = new_restaurants.iloc[recommended_indices]["business_id"]
        
    elif type == 'users':
        # Measures the similarity between the user and the other users
        # Recommends the highest rated restaurants by other users that the user has not experienced

        ### Previous version - possibly to delete
        # similarity_scores = cosine_similarity(usr_lst, user_matrix)
        # similar_user_indices = similarity_scores.argsort()[0][::-1][:K]
        # similar_users = user_profile.iloc[similar_user_indices]['user_id']
        # recommendations = reviews[(reviews['user_id'].isin(similar_users)) & (-reviews['business_id'].isin(visited_restaurants))].groupby('user_id').apply(lambda x: x.nlargest(N_peruser, 'stars_x')).reset_index(drop=True)['business_id']    

        ### Properly using the CF approach
        # Similarities between users
        similarity_scores = cosine_similarity(usr_lst, user_matrix)

        # Finding the K most similar users
        similar_user_indices = similarity_scores.argsort()[0][::-1][:K]
        similar_users = user_profile.iloc[similar_user_indices]['user_id']

        # creating a matrix for the similar users
        user_item = reviews[reviews['user_id'].isin(similar_users)][['user_id', 'business_id', 'stars_x']]
        user_item = pd.pivot_table(user_item, index='user_id', columns='business_id', values='stars_x', aggfunc='mean')

        # averaging the ratings for each restaurant [MAYBE TRY AVERAGE WEIGHTED BY SIMILARITY]
        average_ratings = np.mean(user_item, axis=0)

        # removing restaurants already visited
        for i in visited_restaurants:
            try:
                average_ratings.drop(i, inplace=True)
            except:
                continue

        # recommending the top N restaurants
        recommendations = pd.Series(average_ratings.sort_values(ascending=False).head(N).keys())

    elif type == 'items':
        # Measures the similarity between the restaurants
        # Recommends the restaurants that are most similar to the ones the user liked before

        ### Previous version - possibly to delete
        # favorite_business = reviews[reviews['user_id'] == user_id].sort_values('stars_x', ascending=False)['business_id'].head(1)
        # bus_list = restaurant_profile[restaurant_profile['business_id'] == favorite_business].drop(columns=['business_id']).values
        # similarity_scores = cosine_similarity(bus_list, bus_matrix)
        # similar_bus_indices = similarity_scores.argsort()[0][::-1][:K]
        # recommendations = new_restaurants.iloc[similar_bus_indices]['business_id']

        ### Properly using the CF approach
        # getting the user's favorite restaurant
        favorite_business = reviews[reviews['user_id'] == user_id].sort_values('stars_x', ascending=False)['business_id'].head(1).values[0]
        bus_list = restaurant_profile[restaurant_profile['business_id'] == favorite_business].drop(columns=['business_id']).values
        
        # Similarities between users
        similarity_scores = cosine_similarity(bus_list, bus_matrix)

        # Finding the K most similar restaurants
        similar_bus_indices = similarity_scores.argsort()[0][::-1][:K]
        similar_restaurants = new_restaurants.iloc[similar_bus_indices]['business_id']

        # recommending the top N restaurants [WE SHOULD WEIGHT THE RATING BY SIMILARITY OF THE RESTAURANTS THE USER RATED]
        # ASSUMING A RATING OF 5 FOR THESE
        recommendations = similar_restaurants
    
    else:
        raise ValueError("Invalid type. Please choose 'user_item', 'users', or 'items'.")
    
    return recommendations

In [47]:
recommend('-25VzPSb4ox-GdWmKDtshA', reviews, user_profile_doc2vec, restaurant_profile_doc2vec)

1394    WP9GAuhvmUhm8MAxMqhgrQ
2106    lWedWkinrM5j13pyimbpbA
2101    lRbHFOIFuusN2WOR_ypQ_A
2090    l8gAoQqqVfphPe9jmIJZ3g
2353    rYqmaOIULRouz_1db07OdQ
Name: business_id, dtype: object

In [77]:
recommend('-25VzPSb4ox-GdWmKDtshA', reviews, user_profile_doc2vec, restaurant_profile_doc2vec, type='users')

0    -0TffRSXXIlBYVbb5AwfTg
1    A9bqf3aoiQXX9WBswewivg
2    mUIBtlWNPD7sz3rGGWQ1RA
3    Ay0j65KKb3sOrOFUVWs44g
4    B08ELtRljjxnefQ3ayqUZg
Name: business_id, dtype: object

In [107]:
recommend('-25VzPSb4ox-GdWmKDtshA', reviews, user_profile, restaurant_profile, type='items')

1092    AvdhZ3RgY3xJbW-JMACcMg
2139    MqAvonk4-cJ-7pW78tR1Rw
2485    QgDsoMYcLcVcBQ3UgVay4A
3450    aaqm5vJvVj3vyMeGwPqxEw
2526    R9VYqhQeGtHpaadx4eK3_A
Name: business_id, dtype: object