In [1]:
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
from keras.models import load_model
from sklearn.model_selection import train_test_split
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from keras.models import Model
warnings.filterwarnings('ignore')
%matplotlib inline

## Analyse Dataset

### Food.com   Dataset
- https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions
- https://aclanthology.org/D19-1613/

In [3]:
rating_df =pd.read_csv('RAW_interactions.csv')


In [4]:
rating_df

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."
...,...,...,...,...,...
1132362,116593,72730,2003-12-09,0,Another approach is to start making sauce with...
1132363,583662,386618,2009-09-29,5,These were so delicious! My husband and I tru...
1132364,157126,78003,2008-06-23,5,WOW! Sometimes I don't take the time to rate ...
1132365,53932,78003,2009-01-11,4,Very good! I used regular port as well. The ...


In [5]:
rating_df=rating_df[['user_id','recipe_id','date','rating']]


In [6]:
rating_df.rename(columns = {'user_id':'userId', 'recipe_id':'item'}, inplace = True)

In [7]:

x = rating_df.groupby('item').count()['rating'] >= 60
userIndex = x[x].index
nm_rating_df=rating_df[rating_df['item'].isin(userIndex)]
y = nm_rating_df.groupby('userId').count()['rating']>=100
items = y[y].index
rating_df = nm_rating_df[nm_rating_df['userId'].isin(items)]
rating_df.shape

(14357, 4)

### Generate Train and test data

In [8]:
# train_df, test_df = train_test_split(rating_df, test_size=0.25, random_state=0)


In [9]:
final_ratings=rating_df.sort_values(by=['userId','date','rating'], ascending=[False,False,False]) 
final_ratings.shape

(14357, 4)

In [10]:
test_df=final_ratings.groupby('userId').head(30)
test_df.shape

(2880, 4)

In [11]:
train_df=final_ratings[~final_ratings.isin(test_df).all(1)]
train_df.shape

(11477, 4)

In [12]:
train_df.head(2)

Unnamed: 0,userId,item,date,rating
721496,1581225,38298,2017-01-16,5
88267,1581225,97496,2017-01-06,5


In [13]:
product_ids = list(set(list(rating_df.item.unique())))
user_ids = list(set(list(rating_df.userId.unique())))

In [14]:
dict_products = {}
index = 0
for ids in sorted(product_ids):
    dict_products[ids] = index
    index += 1

In [15]:
dict_users = {}
index = 0
for ids in sorted(user_ids):
    dict_users[ids] = index
    index += 1

In [16]:
train_df["item"] = train_df["item"].map(dict_products)
train_df["userId"] = train_df["userId"].map(dict_users)

In [17]:
test_df["item"] = test_df["item"].map(dict_products)
test_df["userId"] = test_df["userId"].map(dict_users)

In [18]:
for col in ["userId", "item", "rating"]:
    train_df[col] = train_df[col].astype(np.float32)
    test_df[col] = test_df[col].astype(np.float32)

In [19]:
num_unique_users=len(set(list(rating_df.userId.unique()) ))
num_unique_products=len(set(list(rating_df.item.unique())))

### DNN

In [20]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import optimizers as opt
from tensorflow.keras import regularizers as rgl
from tensorflow.keras.layers import Embedding, multiply, concatenate, Flatten, Input, Dense, Dropout

In [21]:
dropout_rate=0.05
dense_1=512
dense_2=128
dense_3=64

activation_func="relu"
regularizer=rgl.l2(0.0001)#l2 regularization
max_rating=5
min_rating=0

In [22]:
EMBEDDING_SIZE=64
users_input = Input(shape=(1,), name="user_input")
users_embedding = Embedding(num_unique_users + 1, EMBEDDING_SIZE, embeddings_regularizer=regularizer, name="user_embeddings")(users_input)
users_bias = Embedding(num_unique_users + 1, 1, embeddings_regularizer=regularizer, name="user_bias")(users_input)

product_input = Input(shape=(1,), name="product_input")
product_embedding = Embedding(num_unique_products + 1, EMBEDDING_SIZE, embeddings_regularizer=regularizer, name="product_embedding")(product_input)
product_bias = Embedding(num_unique_products + 1, 1, embeddings_regularizer=regularizer, name="product_bias")(product_input)

dot_product_users_product = multiply([users_embedding, product_embedding])

input_terms = concatenate([dot_product_users_product, users_bias, product_bias])

input_terms = Flatten(name="fl_inputs")(input_terms)

output = Dense(dense_1, activation=activation_func, name="dense_0")(input_terms)

output = Dropout(dropout_rate)(output)

output = Dense(dense_2, activation=activation_func, name="dense_1")(output)

output = Dropout(dropout_rate)(output)

output = Dense(dense_3, activation=activation_func, name="dense_2")(output)

output = Dropout(dropout_rate)(output)

output = Dense(1, activation="relu", name="output")(output) * (max_rating - min_rating) + min_rating

In [23]:
model = Model(inputs=[users_input, product_input], outputs=output)

In [24]:
opt_adam = opt.RMSprop(lr = 0.001)
model.compile(optimizer=opt_adam, loss= ['mse'], metrics=['mean_absolute_error'])

In [25]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3,restore_best_weights=True)
model.fit(x=[train_df.userId, train_df.item], y=train_df.rating, batch_size=32, epochs=10, verbose=1 ,callbacks=[callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x261c9dafb00>

In [26]:
model.predict([test_df.userId, test_df.item])

array([[3.9290614],
       [3.959827 ],
       [4.0382442],
       ...,
       [4.772168 ],
       [4.806385 ],
       [4.8231144]], dtype=float32)

In [27]:
model.evaluate([test_df.userId, test_df.item], test_df.rating)



[1.0681740045547485, 0.7049938440322876]

In [28]:
result=model.predict([test_df.userId, test_df.item])

In [29]:
uniqueproducts=train_df.item.unique()

In [30]:
userlist=list(test_df.userId.unique())[:200]

In [31]:
test_df=test_df[test_df['userId'].isin(userlist)]

In [32]:
train_df=train_df[train_df['userId'].isin(userlist)]

In [33]:
import itertools
result=pd.DataFrame(list(itertools.product(userlist,uniqueproducts)))

In [34]:
result.columns =['userId', 'item']

In [35]:
predictions=model.predict([result.userId, result.item])

In [36]:
predictions=list(predictions)

In [37]:
ratings=[]

for e in predictions:
 ratings.append(e[0])


In [38]:
result['rating']=ratings

In [39]:
result=result.sort_values(['userId','rating'], ascending=[True,False])

In [40]:
result=result[result['rating']>4]

In [41]:
from collections import Counter
recommend_dict = { }
for index, row in result.iterrows():
    user=row['userId']
    item=row['item']
    if user not in recommend_dict.keys():
        recommend_dict[user]={}
    
    if(~((train_df['userId'] == user) & (train_df['item'] ==item)).any()):
            if item not in recommend_dict[user].keys():
                recommend_dict[user][item] =1
            else:
                recommend_dict[user][item]+=1
                    
        
    
        
        

In [42]:
# Iterating over values
recommendations=[]
for userid, items in recommend_dict.items():
    total=0
    mlist=[]
    for i in items:
        total+=1
        mlist.append(i)
        if(total>10):
            break
    recommendations.append([userid,mlist])

In [43]:
type(recommendations)

list

In [44]:
recommendations=pd.DataFrame(recommendations,columns=['user_id','items'])

In [45]:
recommendations.head(5)

Unnamed: 0,user_id,items
0,0.0,"[21.0, 392.0, 1282.0, 556.0, 35.0, 806.0, 327...."
1,1.0,"[858.0, 391.0, 692.0, 1132.0, 1224.0, 813.0, 1..."
2,2.0,"[861.0, 1210.0, 21.0, 391.0, 571.0, 349.0, 918..."
3,3.0,"[1282.0, 1227.0, 392.0, 861.0, 977.0, 1071.0, ..."
4,4.0,"[813.0, 777.0, 875.0, 477.0, 915.0, 1282.0, 80..."


### Hit Rate

In [46]:
total=0
hit=0
for index, row in recommendations.iterrows():
    uid=row['user_id']
    results=list(row['items'])
    total+=1
    for rs in results:
        
        if(((test_df['item'].isin(results)) & (test_df['userId'] ==uid)).any()):
                hit+=1
                break
        
hit_ratio=hit/total   
print(hit_ratio)

0.5416666666666666


### Hit Rate on liked items

In [47]:
total=0
hit=0
for index, row in recommendations.iterrows():
    uid=row['user_id']
    results=list(row['items'])
    total+=1
    for rs in results:
        
        if(((test_df['item'].isin(results)) & (test_df['userId'] ==uid) & (test_df['rating']>4)).any()):
                hit+=1
                break
        
hit_ratio=hit/total   
print(hit_ratio)

0.4583333333333333
