# **Importing libraries**

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# **Reading the dataset**

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/amazon-data/Amazon_data.csv


In [3]:
df = pd.read_csv('/kaggle/input/amazon-data/Amazon_data.csv')
df.head()

Unnamed: 0,item_id,title,brand,user_id,rating,timestamp,sub_cat,main_cat
0,B00ENFVJJO,PowerA DualShock 4 Charging Station for PlaySt...,by\n \n PowerA,A30E7ZUWMO2E8V,5.0,"01 5, 2017",Video_Games,Electronics and Technology
1,B00AEM2EGW,"JE206 Men Women Faux Spinal Gemstone Ring, Sil...",Quality Brand,A4FQC6UQO6W84,5.0,"09 20, 2013",All_Beauty,Beauty and Fashion
2,B006GHA8EE,The Witcher 2: Assassins Of Kings Enhanced Edi...,by\n \n WB Games,A3GPR3LKLWHSXS,2.0,"01 21, 2014",Video_Games,Electronics and Technology
3,B0001HAI8G,MVP Baseball 2004 - Gamecube,by\n \n Electronic Arts,A1PQNBC3IG5DWD,1.0,"08 23, 2015",Video_Games,Electronics and Technology
4,B000CBCVFE,Resident Evil Deadly Silence - Nintendo DS,by\n \n Capcom,A3FJKFJ61BU32J,5.0,"08 17, 2017",Video_Games,Electronics and Technology


# **Data pre-processing**

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   item_id    50000 non-null  object 
 1   title      49856 non-null  object 
 2   brand      46137 non-null  object 
 3   user_id    50000 non-null  object 
 4   rating     50000 non-null  float64
 5   timestamp  50000 non-null  object 
 6   sub_cat    50000 non-null  object 
 7   main_cat   50000 non-null  object 
dtypes: float64(1), object(7)
memory usage: 3.1+ MB


In [5]:
df.isna().sum()

item_id         0
title         144
brand        3863
user_id         0
rating          0
timestamp       0
sub_cat         0
main_cat        0
dtype: int64

In [6]:
df = df.drop_duplicates()
df = df[df['brand'].notna()]
df.drop('timestamp', axis='columns', inplace=True)
df.drop('brand', axis='columns', inplace=True)
df.drop('sub_cat', axis='columns', inplace=True)
df.drop('main_cat', axis='columns', inplace=True)
df.drop('item_id', axis='columns', inplace=True)

In [7]:
df.isna().sum()

title      0
user_id    0
rating     0
dtype: int64

In [8]:
df.shape

(46095, 3)

In [9]:
df['title'] = df['title'].astype('string')
df['user_id'] = df['user_id'].astype('string')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46095 entries, 0 to 49998
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   title    46095 non-null  string 
 1   user_id  46095 non-null  string 
 2   rating   46095 non-null  float64
dtypes: float64(1), string(2)
memory usage: 1.4 MB


In [10]:
# converting item id's to unique integer id's
def generate_id(s):
    return abs(hash(s)) % (10 ** 10)

# df.loc[:, ('item_id')] = df['item_id'].apply(generate_id)
df['item_id'] = df['title'].apply(generate_id)
df = df.iloc[:-1, :]

In [11]:
df

Unnamed: 0,title,user_id,rating,item_id
0,PowerA DualShock 4 Charging Station for PlaySt...,A30E7ZUWMO2E8V,5.0,9684468121
1,"JE206 Men Women Faux Spinal Gemstone Ring, Sil...",A4FQC6UQO6W84,5.0,1409967169
2,The Witcher 2: Assassins Of Kings Enhanced Edi...,A3GPR3LKLWHSXS,2.0,8340696149
3,MVP Baseball 2004 - Gamecube,A1PQNBC3IG5DWD,1.0,9569002861
4,Resident Evil Deadly Silence - Nintendo DS,A3FJKFJ61BU32J,5.0,2585246229
...,...,...,...,...
49992,History Channel: Civil War: A Nation Divided -...,AV0T8159EFP0Q,3.0,9802589503
49993,Amazon Music [PC] [Download],A2D239DN8M1NB8,1.0,3518628090
49994,PaintShop Pro X5 Ultimate [Old Version],A26TNHDJOJDLF0,5.0,9376315784
49995,HORI Stereo Chat Headset 4 for PlayStation 4,AO3VLMCH15G2V,4.0,4664981332


In [12]:
df['item_id'].unique().shape

(21159,)

In [13]:
df['title'].unique().shape

(21159,)

# **Popularity Based Recommender**

In [14]:
item_ratings = df[['item_id', 'rating']]
item_ratings = item_ratings.groupby('item_id').mean()

In [15]:
item_ratings.sort_values(by='rating', ascending=False).head(5)

Unnamed: 0_level_0,rating
item_id,Unnamed: 1_level_1
4980917055,5.0
5273835289,5.0
5302768214,5.0
5304753702,5.0
5306823579,5.0


In [16]:
item_ratings['item_id'] = item_ratings.index
item_ratings = item_ratings.reset_index(drop=True)

# **Collaborative Filtering**

In [17]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD

In [18]:
reader = Reader(line_format = 'user item rating', rating_scale=(0,5))
data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [19]:
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ed971662ce0>

**Evaluating the model**

In [20]:
pred = model.test(testset)
pred.sort(key=lambda x: x.est, reverse=True)

cb = pd.DataFrame.from_records(pred)
cb.drop(cb.columns[[0, 3, 4]],axis = 1, inplace=True)
cb.rename(columns = {'1':'item_id'}, inplace = True)
cb.columns = ['item_id', 'rating']

In [21]:
cb

Unnamed: 0,item_id,rating
0,8381866440,5.0
1,8381866440,3.0
2,8381866440,5.0
3,8381866440,5.0
4,8381866440,1.0
...,...,...
9214,9617457283,3.0
9215,9617457283,2.0
9216,7347397974,1.0
9217,7347397974,1.0


In [22]:
from surprise import accuracy
rmse1 = accuracy.rmse(pred)
mae1 = accuracy.mae(pred)

RMSE: 1.4080
MAE:  1.1514


**Finding recommendations**

In [23]:
train_set = data.build_full_trainset()
model.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ed971662ce0>

In [24]:
uid = 'A2F2J3J2ALMM2A'
items = df['item_id'].unique()
arr = []
for item in items:
    iid = str(item)
    arr.append(model.predict(uid, iid))
predictions = pd.DataFrame.from_records(arr)
predictions.drop(predictions.columns[[0, 2, 4]],axis = 1, inplace=True)
predictions.columns = ['item_id', 'rating']
predictions.head(5)

Unnamed: 0,item_id,rating
0,9684468121,4.045845
1,1409967169,4.045845
2,8340696149,4.045845
3,9569002861,4.045845
4,2585246229,4.045845


# **Content Based Filtering**

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [26]:
unique_titles = pd.DataFrame(df['title'].unique())

In [27]:
unique_titles.columns = ['title']
unique_titles

Unnamed: 0,title
0,PowerA DualShock 4 Charging Station for PlaySt...
1,"JE206 Men Women Faux Spinal Gemstone Ring, Sil..."
2,The Witcher 2: Assassins Of Kings Enhanced Edi...
3,MVP Baseball 2004 - Gamecube
4,Resident Evil Deadly Silence - Nintendo DS
...,...
21154,Fight Night Round 3 - Xbox 360
21155,Leegoal(TM) Punk Gothic Colorful Long Arm Warm...
21156,Favebridal Women's Long Formal Mermaid Gold La...
21157,History Channel: Civil War: A Nation Divided -...


In [28]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(unique_titles['title'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [29]:
title = df.loc[df['user_id'] == 'A2F2J3J2ALMM2A', 'title'].values[0]
idx = unique_titles.index[unique_titles['title'] == title][0]
idx
score_series = pd.DataFrame(cosine_sim[idx])
score_series
score_series.columns = ['rating']
unique_items = df['item_id'].unique()
score_series['item_id'] = unique_items
score_series.sort_values(by='rating', ascending=False, inplace=True)

In [30]:
score_series.head()

Unnamed: 0,rating,item_id
4017,1.0,9761228353
16744,0.366486,9864882471
12683,0.364229,4725582703
9372,0.329617,1823946570
6608,0.329166,6490323890


In [31]:
score_series['rating'] = score_series['rating'] * [5]

In [32]:
item = df.loc[df['user_id'] == 'A2F2J3J2ALMM2A', 'item_id'].values[0]

# **Getting new recommendations**

In [33]:
predictions['item_id'] = predictions['item_id'].astype('int64')

In [34]:
hybrid_rec = item_ratings.merge(predictions ,on='item_id').merge(score_series,on='item_id')

In [35]:
hybrid_rec

Unnamed: 0,rating_x,item_id,rating_y,rating
0,4.5,483220,4.045845,0.0
1,5.0,625290,4.045845,0.0
2,4.0,864109,4.045845,0.0
3,2.0,924352,4.045845,0.0
4,3.0,1235031,4.045845,0.0
...,...,...,...,...
21154,5.0,9997270519,4.045845,0.0
21155,5.0,9997749516,4.045845,0.0
21156,5.0,9998530680,4.045845,0.0
21157,5.0,9998969939,4.045845,0.0


In [36]:
hybrid_rec['r'] = hybrid_rec['rating']*0.2 + hybrid_rec['rating_x']*0.4 + hybrid_rec['rating_y']*0.4

In [37]:
hybrid_rec = hybrid_rec[['item_id', 'r']]

In [38]:
hybrid_rec.sort_values(by='r', ascending=False)['item_id'].head()

20626    9761228353
20872    9864882471
3888     1823946570
13746    6490323890
12495    5902636591
Name: item_id, dtype: int64

# **Comparing the performance of the approaches**

In [39]:
pbr = item_ratings.loc[item_ratings['item_id'] == item, 'rating'].values[0]

In [40]:
i = np.where(predictions['item_id'].values == item)[0][0]
cfr = predictions.at[predictions.index[i],'rating']

In [41]:
cbf = score_series.loc[score_series['item_id'] == item, 'rating'].values[0]

In [42]:
predicted = pbr * 0.2 + cfr * 0.4 + cbf * 0.4

In [43]:
actual = df.loc[df['user_id'] == 'A2F2J3J2ALMM2A', 'rating'].values[0]

In [44]:
import math
def rmse(prediction, actual):
    return math.sqrt((prediction-actual)**2)

def mae(prediction, actual):
    return abs(prediction-pbr)
rmse_hybrid = round(rmse(predicted, actual), 4)
mae_hybrid = round(mae(predicted, actual), 4)
print(f"rmse: {rmse_hybrid}")
print(f"mae: {mae_hybrid}")

rmse: 0.4217
mae: 0.2217


In [45]:
d = [[round(rmse1, 4), rmse_hybrid], [round(mae1, 4), mae_hybrid]]
comp = pd.DataFrame(d, columns=['existing', 'improved'], index=['RMSE', 'MAE'])
comp

Unnamed: 0,existing,improved
RMSE,1.408,0.4217
MAE,1.1514,0.2217
