In [1]:
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
from keras.models import load_model
from sklearn.model_selection import train_test_split
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from keras.models import Model
warnings.filterwarnings('ignore')
%matplotlib inline

## Analyse Dataset

### Amazon product review   Dataset
- https://www.kaggle.com/datasets/saurav9786/amazon-product-reviews?resource=download

In [3]:
!pip install gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!gdown https://drive.google.com/uc?id=1dFuCcqIO0_vQcNg_47Ondz3akBG1ojzb

Downloading...
From: https://drive.google.com/uc?id=1dFuCcqIO0_vQcNg_47Ondz3akBG1ojzb
To: /content/ratings_Electronics.csv
100% 319M/319M [00:01<00:00, 226MB/s]


In [5]:
rating_df =pd.read_csv('ratings_Electronics.csv',  names=['userId','item','rating','time'])
rating_df=rating_df[['userId','item','rating','time']]

x = rating_df.groupby('item').count()['rating'] >= 30
userIndex = x[x].index
nm_rating_df=rating_df[rating_df['item'].isin(userIndex)]
y = nm_rating_df.groupby('userId').count()['rating']>=30
items = y[y].index
rating_df = nm_rating_df[nm_rating_df['userId'].isin(items)]
rating_df.shape

(131062, 4)

### Generate Train and test data

In [6]:
# train_df, test_df = train_test_split(rating_df, test_size=0.25, random_state=0)


In [7]:
final_ratings=rating_df.sort_values(by=['userId','time','rating'], ascending=[False,False,False]) 
final_ratings.shape

(131062, 4)

In [8]:
test_df=final_ratings.groupby('userId').head(5)
test_df.shape

(13430, 4)

In [9]:
train_df=final_ratings[~final_ratings.isin(test_df).all(1)]
train_df.shape

(117632, 4)

In [10]:
train_df.head(2)

Unnamed: 0,userId,item,rating,time
7145947,AZZYW4YOE1B6E,B00BP5MB56,3.0,1389744000
3215796,AZZYW4YOE1B6E,B003BYRGJU,5.0,1388534400


In [11]:
product_ids = list(set(list(rating_df.item.unique())))
user_ids = list(set(list(rating_df.userId.unique())))

In [12]:
dict_products = {}
index = 0
for ids in sorted(product_ids):
    dict_products[ids] = index
    index += 1

In [13]:
dict_users = {}
index = 0
for ids in sorted(user_ids):
    dict_users[ids] = index
    index += 1

In [14]:
train_df["item"] = train_df["item"].map(dict_products)
train_df["userId"] = train_df["userId"].map(dict_users)

In [15]:
test_df["item"] = test_df["item"].map(dict_products)
test_df["userId"] = test_df["userId"].map(dict_users)

In [16]:
for col in ["userId", "item", "rating"]:
    train_df[col] = train_df[col].astype(np.float32)
    test_df[col] = test_df[col].astype(np.float32)

In [17]:
num_unique_users=len(set(list(rating_df.userId.unique()) ))
num_unique_products=len(set(list(rating_df.item.unique())))

### DNN

In [18]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import optimizers as opt
from tensorflow.keras import regularizers as rgl
from tensorflow.keras.layers import Embedding, multiply, concatenate, Flatten, Input, Dense, Dropout

In [19]:
dropout_rate=0.05
dense_1=256
dense_2=128
dense_3=64

activation_func="relu"
regularizer=rgl.l2(0.00001)#l2 regularization
max_rating=5
min_rating=0

In [20]:
EMBEDDING_SIZE=64
users_input = Input(shape=(1,), name="user_input")
users_embedding = Embedding(num_unique_users + 1, EMBEDDING_SIZE, embeddings_regularizer=regularizer, name="user_embeddings")(users_input)
users_bias = Embedding(num_unique_users + 1, 1, embeddings_regularizer=regularizer, name="user_bias")(users_input)

product_input = Input(shape=(1,), name="product_input")
product_embedding = Embedding(num_unique_products + 1, EMBEDDING_SIZE, embeddings_regularizer=regularizer, name="product_embedding")(product_input)
product_bias = Embedding(num_unique_products + 1, 1, embeddings_regularizer=regularizer, name="product_bias")(product_input)

dot_product_users_product = multiply([users_embedding, product_embedding])

input_terms = concatenate([dot_product_users_product, users_bias, product_bias])

input_terms = Flatten(name="fl_inputs")(input_terms)

output = Dense(dense_1, activation=activation_func, name="dense_0")(input_terms)

output = Dropout(dropout_rate)(output)

output = Dense(dense_2, activation=activation_func, name="dense_1")(output)

output = Dropout(dropout_rate)(output)

output = Dense(dense_3, activation=activation_func, name="dense_2")(output)

output = Dropout(dropout_rate)(output)

output = Dense(1, activation="relu", name="output")(output) * (max_rating - min_rating) + min_rating

In [21]:
model = Model(inputs=[users_input, product_input], outputs=output)

In [22]:
opt_adam = opt.Adam(lr = 0.001)
model.compile(optimizer=opt_adam, loss= ['mse'], metrics=['mean_absolute_error'])

In [23]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3,restore_best_weights=True)
model.fit(x=[train_df.userId, train_df.item], y=train_df.rating,validation_split = 0.1, batch_size=256, epochs=30, verbose=1 ,callbacks=[callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30


<keras.callbacks.History at 0x7fd5b64a8dd0>

In [24]:
model.predict([test_df.userId, test_df.item])

array([[4.196298 ],
       [4.9617147],
       [4.3071566],
       ...,
       [4.9449806],
       [4.752428 ],
       [4.9635153]], dtype=float32)

In [25]:
model.evaluate([test_df.userId, test_df.item], test_df.rating)



[1.0552324056625366, 0.666104793548584]

In [26]:
model.predict([pd.Series([9.0]), pd.Series([1.0])])

array([[4.7862005]], dtype=float32)

In [27]:
result=model.predict([test_df.userId, test_df.item])

In [28]:
uniqueproducts=train_df.item.unique()

In [29]:
userlist=list(test_df.userId.unique())[:200]

In [30]:
test_df=test_df[test_df['userId'].isin(userlist)]

In [31]:
train_df=train_df[train_df['userId'].isin(userlist)]

In [32]:
import itertools
result=pd.DataFrame(list(itertools.product(userlist,uniqueproducts)))

In [33]:
result.columns =['userId', 'item']

In [34]:
predictions=model.predict([result.userId, result.item])

In [35]:
predictions=list(predictions)

In [36]:
ratings=[]

for e in predictions:
 ratings.append(e[0])


In [37]:
result['rating']=ratings

In [38]:
result=result.sort_values(['userId','rating'], ascending=[True,False])

In [39]:
result=result[result['rating']>4]

In [40]:
from collections import Counter
recommend_dict = { }
for index, row in result.iterrows():
    user=row['userId']
    item=row['item']
    if user not in recommend_dict.keys():
        recommend_dict[user]={}
    
    if(~((train_df['userId'] == user) & (train_df['item'] ==item)).any()):
            if item not in recommend_dict[user].keys():
                recommend_dict[user][item] =1
            else:
                recommend_dict[user][item]+=1
                    
        
    
        
        

In [41]:
# Iterating over values
recommendations=[]
for userid, items in recommend_dict.items():
    total=0
    mlist=[]
    for i in items:
        total+=1
        mlist.append(i)
        if(total>10):
            break
    recommendations.append([userid,mlist])

In [42]:
type(recommendations)

list

In [43]:
recommendations=pd.DataFrame(recommendations,columns=['user_id','items'])

In [44]:
recommendations.head(5)

Unnamed: 0,user_id,items
0,2486.0,"[21298.0, 1374.0, 23767.0, 13869.0, 3803.0, 25..."
1,2487.0,"[21039.0, 11469.0, 8204.0, 2174.0, 16270.0, 50..."
2,2488.0,"[17719.0, 19536.0, 22211.0, 21487.0, 22804.0, ..."
3,2489.0,"[14450.0, 23105.0, 1736.0, 6415.0, 11470.0, 14..."
4,2490.0,"[9648.0, 22163.0, 1242.0, 8159.0, 13338.0, 113..."


### Hit Rate

In [45]:
total=0
hit=0
for index, row in recommendations.iterrows():
    uid=row['user_id']
    results=list(row['items'])
    total+=1
    for rs in results:
        
        if(((test_df['item'].isin(results)) & (test_df['userId'] ==uid)).any()):
                hit+=1
                break
        
hit_ratio=hit/total   
print(hit_ratio)

0.015


### Hit Rate on liked items

In [46]:
total=0
hit=0
for index, row in recommendations.iterrows():
    uid=row['user_id']
    results=list(row['items'])
    total+=1
    for rs in results:
        
        if(((test_df['item'].isin(results)) & (test_df['userId'] ==uid) & (test_df['rating']>4)).any()):
                hit+=1
                break
        
hit_ratio=hit/total   
print(hit_ratio)

0.015
