In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('mod_data2.txt', encoding = "ISO-8859-1")

In [4]:
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate.]Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [5]:
import string
import re
def clean_text(text):
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)    
    return text

In [6]:
yelp_data = df[['business_id', 'user_id', 'stars', 'text']]

In [7]:
%%time
yelp_data['text'] = yelp_data['text'].apply(clean_text)

Wall time: 438 ms


In [8]:
#Split train test
vld_size=0.15
X_train, X_valid, y_train, y_valid = train_test_split(yelp_data['text'], df['business_id'], test_size = vld_size) 

In [9]:
userid_df = yelp_data[['user_id','text']]
business_df = yelp_data[['business_id', 'text']]

In [10]:
userid_df = userid_df.groupby('user_id').agg({'text': ' '.join})
business_df = business_df.groupby('business_id').agg({'text': ' '.join})

In [11]:
#userid vectorizer
userid_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=1000)
userid_vectors = userid_vectorizer.fit_transform(userid_df['text'])
userid_vectors.shape

(262, 1000)

In [12]:
#Business id vectorizer
businessid_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=1000)
businessid_vectors = businessid_vectorizer.fit_transform(business_df['text'])
businessid_vectors.shape

(253, 1000)

# Matrix Factorization

In [13]:
userid_rating_matrix = pd.pivot_table(yelp_data, values='stars', index=['user_id'], columns=['business_id'])
userid_rating_matrix.shape

(262, 253)

In [14]:
userid_rating_matrix.head()

business_id,-0bUDim5OGuv8R0Qqq6J4A,-kVXDEqGHOWKxQ3EhvFgVA,-yxfBYGB6SEqszmxJxd97A,06kfoeRs9Acj82Yl3i9p_w,08Z_Zzp8PyEmWWpYurIO-Q,0BpMvu5B9fY-KEbOuxLtFQ,0QTn4pMzKv1mnQifcP9YoQ,0QrA-Klgp1R-GzUr6uJS7Q,0RqNRc6RiILzZxK9J8Kqug,145SQ_msdQ1yo7J0E63VoA,...,xmtKVO7C7KYVqMBe2eQq3A,y6uO4ydAwBHUujfiSktxZg,yGmdo1ENajB98iryHGoWFw,yJr24Yy1K6bt2G9fX3_zPA,yPJFfglhMHAKciUpjTmgBg,yb17xHvhDJthJGS10uhFeQ,yc5AH9H71xJidA_J2mChLA,zDfaNgSYLn-TwquB5A6AaA,znBnrQNq1FdUt5aIGAbyuQ,zp713qNhx8d9KCJJnrw1xA
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-OMlS6yWkYjVldNhC31wYg,,,,,,,,,,,...,,,,,,,,,,
-V3CXSxnUzjkLVqi1xGrkA,,,,,,,,,,,...,,,,,,,,,,
-Wa5j14Rhps3DC2SMaaatQ,,,,,,,,,,,...,,,,,,,,,,
-yg_AMAU2HNh48zcuQ13mw,,,,,,,,,,,...,,,,,,,,,,
0CMz8YaO3f8xu4KqQgKb9Q,,,,,,,,,,,...,,,,,,,,,,


In [15]:
P = pd.DataFrame(userid_vectors.toarray(), index=userid_df.index, columns=userid_vectorizer.get_feature_names())
Q = pd.DataFrame(businessid_vectors.toarray(), index=business_df.index, columns=businessid_vectorizer.get_feature_names())


In [16]:
Q.head()

Unnamed: 0_level_0,!,+,-,1,10,15,2,20,3,30,...,yellow,yelp,yes,yesterday,yet,yoga,you,yum,yummy,zach
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-0bUDim5OGuv8R0Qqq6J4A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-kVXDEqGHOWKxQ3EhvFgVA,0.0,0.0,0.085288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-yxfBYGB6SEqszmxJxd97A,0.0,0.214001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.069083,0.0,0.0,0.074398,0.0,0.0
06kfoeRs9Acj82Yl3i9p_w,0.0,0.0,0.083387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
08Z_Zzp8PyEmWWpYurIO-Q,0.219165,0.5709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
def matrix_factorization(R, P, Q, steps=1, gamma=0.001,lamda=0.02):
    for step in range(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    eij=R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i]=P.loc[i]+gamma*(eij*Q.loc[j]-lamda*P.loc[i])
                    Q.loc[j]=Q.loc[j]+gamma*(eij*P.loc[i]-lamda*Q.loc[j])
        e=0
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    e= e + pow(R.loc[i,j]-np.dot(P.loc[i],Q.loc[j]),2)+lamda*(pow(np.linalg.norm(P.loc[i]),2)+pow(np.linalg.norm(Q.loc[j]),2))
        if e<0.001:
            break
        
    return P,Q


In [18]:
%%time
P, Q = matrix_factorization(userid_rating_matrix, P, Q, steps=1, gamma=0.001,lamda=0.02)

Wall time: 4.8 s


In [19]:
#Store P, Q and vectorizer in pickle file
import pickle
output = open('recommendation.pkl', 'wb')
pickle.dump(P,output)
pickle.dump(Q,output)
pickle.dump(userid_vectorizer,output)
output.close()

## Prediction for input text

In [21]:
words = 'I am intrested in vegeterian restaruents and I want eat falafal oh my god'
test_df= pd.DataFrame([words], columns=['text'])
test_df['text'] = test_df['text'].apply(clean_text)
test_vectors = userid_vectorizer.transform(test_df['text'])
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index, columns=userid_vectorizer.get_feature_names())

predictItemRating=pd.DataFrame(np.dot(test_v_df.loc[0],Q.T),index=Q.index,columns=['Rating'])
topRecommendations=pd.DataFrame.sort_values(predictItemRating,['Rating'],ascending=[0])[:3]
#topRecommendations


In [22]:
L = topRecommendations.index.values.tolist()
L

['tdcjXyFLMKAsvRhURNOkCg', 'K2_Hmmo5crTYWiT_1sWnfQ', 'uI4YqMarUpchI4I3ZWgOGA']

In [23]:
CSV = pd.read_csv('cuisine_data_final.csv', header=None)
CSV

Unnamed: 0,tdcjXyFLMKAsvRhURNOkCg,Turkish (VEG),"['Kuru Fasulye','Börek. Börek','Çorba','Ciğ Kofte','İmam Bayıldı']",words = 'I am intrested in vegeterian restaruents and I want eat falafal oh my god'
0,K2_Hmmo5crTYWiT_1sWnfQ,Iranian (VEG),['Kashk-e Bademjan (eggplant with whey sauce d...,
1,uI4YqMarUpchI4I3ZWgOGA,Mexican (VEG),['Butternut Squash Chipotle Chili with Avocado...,
2,1NZLxU5WvB5roPFzneAlLw,Mexican (NON VEG),"['Chicken Quesadillas','Red Snapper Vera Cruza...","words = 'I ate the tacos here and loved it , t..."
3,b5cEoKR8iQliq-yT2_O0LQ,All American (NON VEG),"['The Hamburger(Beaf or chicken)','Clam Chowde...",
4,c1yGkETheht_1vjda7G5sA,South American(NON VEG),"['Ceviche','Empanadas','Chimichurri and Red Wi...",
5,VY_tvNUCCXGXQeSvJl757Q,South Indian(VEG),"['Idli','Masala Doas', 'Set Dosa', 'Pongal','U...",words = 'OMG the dosa and Idli was too good here'
6,odhXwWaYZvD_icIN6f_DbA,North Indian (VEG),"['Dal Chaval','Kichidi','Paneer Butter Masala'...",
7,FIQz9u8Cy7aTXfMQugUYvA,Indian (VEG),"['Idli','Rava Idli','Paneer Butter Masala','Ki...",
8,MUsqPXthMdHoPEow9M3MfA,Chinese (NON VEG),"['Szechwan (Sichuan) Chilli Chicken ','Sweet a...",words = 'I loved the wonton soup and the sprin...
9,RqbSeoeqXTwts5pfhw7nJg,Chinese (VEG),"['15-Minute Garlic Noodles','General Tso Tofu ...",


## Prediction and Accuracy test on Validation set

In [3]:
f = open('recommendation.pkl', 'rb')
P, Q, userid_vectorizer = pickle.load(f), pickle.load(f), pickle.load(f)

In [14]:
test_df = pd.DataFrame([sentence], columns=['text'])
test_df['text'] = test_df['text'].apply(self.clean_text)
test_vectors = userid_vectorizer.transform(test_df['text'])
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index,
                         columns=userid_vectorizer.get_feature_names())

pandas.core.series.Series

In [36]:
test_df = X_valid.to_frame()
test_df['text'] = test_df['text'].apply(clean_text)
test_vectors = userid_vectorizer.transform(test_df['text'])
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index,
                         columns=userid_vectorizer.get_feature_names())


In [49]:
y_pred = []
for key, row in test_v_df.iterrows():
    predictItemRating=pd.DataFrame(np.dot(row,Q.T),index=Q.index,columns=['Rating'])
    topRecommendations=pd.DataFrame.sort_values(predictItemRating,['Rating'],ascending=[0])[:1]
    y_pred.append(topRecommendations.index[0])

In [52]:
#Calculate Accuracy
from sklearn.metrics import accuracy_score
print('Accuracy for validation set is: ',accuracy_score(y_valid, y_pred))

Accuracy for validation set is:  0.684
