<a href="https://colab.research.google.com/github/insarov2014/recommended-items/blob/main/collaborative_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import python modules
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

I will import a small data set that records which items each user liked. I'll apply a user-based collaborative filtering recommender to find recommended items for any given user.

In [2]:
# Mount Google Drive so the dataset stored there can be read from Colab
from google.colab import drive
drive.mount('/content/gdrive')

# Location of the ratings CSV on the mounted drive
RATINGS_CSV_PATH = "/content/gdrive/My Drive/Colab Notebooks/recommender system/Copy of user_item_ratings.csv"

# Read dataset
RawDataFile = pd.read_csv(RATINGS_CSV_PATH)

# Preview the first ten rows
RawDataFile.head(10)

Mounted at /content/gdrive


Unnamed: 0,user,item,x,observed,liked
0,1,1,1.546881,1,1
1,2,1,0.178921,1,0
2,3,1,-0.282547,1,1
3,4,1,-0.767299,0,0
4,5,1,-0.576404,1,0
5,6,1,-0.914856,1,0
6,7,1,0.369911,0,1
7,8,1,-1.467684,1,0
8,9,1,-1.745362,1,0
9,10,1,-0.667382,1,0


In [92]:
# How many individual users?
RawDataFile['user'].nunique()

100

In [93]:
# How many individual items?
RawDataFile['item'].nunique()

28

In [3]:
RawDataFile.shape

(2800, 5)

In [94]:
# Keep only the rows flagged as observed; rows with observed == 0 are
# used further down to check the recommendations
is_observed = RawDataFile['observed'].eq(1)
train_data = RawDataFile.loc[is_observed]

In [5]:
train_data.shape

(2515, 5)

In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2515 entries, 0 to 2799
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   user      2515 non-null   int64  
 1   item      2515 non-null   int64  
 2   x         2515 non-null   float64
 3   observed  2515 non-null   int64  
 4   liked     2515 non-null   int64  
dtypes: float64(1), int64(4)
memory usage: 117.9 KB


Ref: https://github.com/satishrath185/Product-Recommendation/blob/master/Product%20Recommendations.ipynb

In [20]:
# Build the user x item matrix of "liked" flags from the observed ratings.
# Rows are users, columns are items; pairs without an observed rating come out as NaN.
# NOTE(review): presumably one row per (user, item) pair, in which case 'sum'
# simply carries the single flag through - confirm against the data.
user_item_matrix = pd.pivot_table(
    train_data,
    values='liked',
    index='user',
    columns='item',
    aggfunc='sum',
)

user_item_matrix

item,1,2,3,4,5,6,7,8,9,10,...,19,20,21,22,23,24,25,26,27,28
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,1.0,,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,1.0,1.0,,,0.0,1.0,...,1.0,0.0,,1.0,,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,,1.0,1.0,1.0,,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,,1.0,,0.0
4,,,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
5,0.0,1.0,1.0,0.0,,1.0,,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,,0.0,0.0,,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,1.0,,0.0,0.0,1.0
97,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
98,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,,1.0,0.0,0.0,,1.0,0.0,0.0
99,1.0,0.0,1.0,1.0,1.0,1.0,,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [21]:
# Binarise the matrix: 1 where the user liked the item, 0 otherwise.
# NaN (no observed rating) compares False against 0, so it becomes 0 as well.
# The vectorised comparison replaces DataFrame.applymap, which is deprecated
# in recent pandas releases and evaluates a Python lambda per cell.
user_item_matrix = user_item_matrix.gt(0).astype(int)
user_item_matrix

item,1,2,3,4,5,6,7,8,9,10,...,19,20,21,22,23,24,25,26,27,28
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,1,1,1,1,0,1,0,0,0,...,1,0,0,0,1,1,0,0,1,0
2,0,1,0,0,1,1,0,0,0,1,...,1,0,0,1,0,0,0,1,0,0
3,1,0,0,0,0,1,1,1,0,0,...,0,0,1,0,0,0,0,1,0,0
4,0,0,0,1,1,0,0,1,0,0,...,1,1,0,0,1,1,0,1,0,1
5,0,1,1,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0,0,0,1,1,1,0,1,0,0,...,1,1,1,0,0,1,0,0,0,1
97,0,1,1,0,0,1,1,0,0,0,...,1,0,1,0,1,0,0,0,1,0
98,1,1,0,1,1,1,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
99,1,0,1,1,1,1,0,0,1,0,...,0,0,1,0,1,0,1,0,0,0


Collaborative Filtering

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of user rows of the user-item matrix;
# the resulting frame still carries default positional labels on both axes
pairwise_user_similarity = cosine_similarity(user_item_matrix)
user_user_simMatrix = pd.DataFrame(pairwise_user_similarity)
user_user_simMatrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.000000,0.365148,0.218218,0.435194,0.577350,0.288675,0.365148,0.416667,0.510310,0.348155,...,0.174078,0.204124,0.456435,0.435194,0.365148,0.365148,0.639010,0.320256,0.400320,0.384900
1,0.365148,1.000000,0.239046,0.286039,0.421637,0.632456,0.300000,0.365148,0.223607,0.762770,...,0.286039,0.223607,0.300000,0.572078,0.300000,0.300000,0.500000,0.701646,0.263117,0.210819
2,0.218218,0.239046,1.000000,0.227921,0.125988,0.125988,0.358569,0.327327,0.133631,0.341882,...,0.569803,0.400892,0.358569,0.341882,0.478091,0.358569,0.358569,0.314485,0.419314,0.125988
3,0.435194,0.286039,0.227921,1.000000,0.201008,0.301511,0.286039,0.783349,0.426401,0.181818,...,0.272727,0.533002,0.286039,0.272727,0.095346,0.762770,0.190693,0.418121,0.334497,0.904534
4,0.577350,0.421637,0.125988,0.201008,1.000000,0.222222,0.316228,0.384900,0.589256,0.502519,...,0.301511,0.235702,0.421637,0.402015,0.421637,0.316228,0.632456,0.369800,0.277350,0.222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.365148,0.300000,0.358569,0.762770,0.316228,0.105409,0.400000,0.730297,0.335410,0.190693,...,0.476731,0.782624,0.200000,0.286039,0.300000,1.000000,0.300000,0.350823,0.350823,0.843274
96,0.639010,0.500000,0.358569,0.190693,0.632456,0.316228,0.300000,0.273861,0.447214,0.476731,...,0.190693,0.223607,0.600000,0.286039,0.400000,0.300000,1.000000,0.263117,0.438529,0.105409
97,0.320256,0.701646,0.314485,0.418121,0.369800,0.647150,0.263117,0.560449,0.098058,0.585369,...,0.501745,0.392232,0.263117,0.585369,0.350823,0.350823,0.263117,1.000000,0.538462,0.369800
98,0.400320,0.263117,0.419314,0.334497,0.277350,0.369800,0.526235,0.320256,0.294174,0.250873,...,0.668994,0.294174,0.438529,0.418121,0.701646,0.350823,0.438529,0.538462,1.000000,0.277350


In [23]:
# Label both axes of the similarity matrix with the user ids.
# The axes are assigned directly instead of adding a temporary 'user' column
# and calling set_index, which mixed a string label into the integer column labels.
user_ids = user_item_matrix.index
user_user_simMatrix.index = user_ids
user_user_simMatrix.columns = user_ids
user_user_simMatrix.head()

user,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.365148,0.218218,0.435194,0.57735,0.288675,0.365148,0.416667,0.51031,0.348155,...,0.174078,0.204124,0.456435,0.435194,0.365148,0.365148,0.63901,0.320256,0.40032,0.3849
2,0.365148,1.0,0.239046,0.286039,0.421637,0.632456,0.3,0.365148,0.223607,0.76277,...,0.286039,0.223607,0.3,0.572078,0.3,0.3,0.5,0.701646,0.263117,0.210819
3,0.218218,0.239046,1.0,0.227921,0.125988,0.125988,0.358569,0.327327,0.133631,0.341882,...,0.569803,0.400892,0.358569,0.341882,0.478091,0.358569,0.358569,0.314485,0.419314,0.125988
4,0.435194,0.286039,0.227921,1.0,0.201008,0.301511,0.286039,0.783349,0.426401,0.181818,...,0.272727,0.533002,0.286039,0.272727,0.095346,0.76277,0.190693,0.418121,0.334497,0.904534
5,0.57735,0.421637,0.125988,0.201008,1.0,0.222222,0.316228,0.3849,0.589256,0.502519,...,0.301511,0.235702,0.421637,0.402015,0.421637,0.316228,0.632456,0.3698,0.27735,0.222222


Making Recommendations

In [24]:
# Items user 1 liked: the columns holding a positive value in that user's row.
# (The sorted-similarity expression that used to open this cell was computed
# and then discarded, so it has been removed.)
user1_row = user_item_matrix.loc[1]
items_bought_by_user1 = user1_row[user1_row > 0]
print("Items Bought by user1: ")
print(items_bought_by_user1)

Items Bought by user1: 
item
1     1
2     1
3     1
4     1
5     1
7     1
11    1
15    1
19    1
23    1
24    1
27    1
Name: 1, dtype: int64


In [78]:
# to build a function that gives items recommended to a given user
def recommend_for_givenUser(given_user_id):
  """Recommend items liked by the most similar user that the given user has not liked.

  Other users are visited from most to least similar, using the row of
  ``user_user_simMatrix`` for ``given_user_id``. For each of them the items
  they liked (positive cells in ``user_item_matrix``) minus the items the
  given user liked are computed, and the first non-empty result is returned.

  Args:
    given_user_id: row label of the user in ``user_item_matrix`` and
      ``user_user_simMatrix``.

  Returns:
    A set of item labels. Empty when no other user liked anything the given
    user has not already liked.

  Raises:
    KeyError: if ``given_user_id`` is not a row label of the matrices.
  """
  # NOTE(review): user_item_matrix holds 0 both for "observed but not liked"
  # and for "not observed", so an item the user already rated negatively can
  # still be recommended here.
  given_row = user_item_matrix.loc[given_user_id]
  items_bought_by_givenUserID = set(given_row[given_row > 0].index)

  ranked_user_ids = user_user_simMatrix.loc[given_user_id].sort_values(ascending=False).index

  for target_user_id in ranked_user_ids:
    # Skip the user explicitly rather than assuming they sort to position 0:
    # another user with similarity 1.0 can tie with them.
    if target_user_id == given_user_id:
      continue
    target_row = user_item_matrix.loc[target_user_id]
    items_to_recommend_to_givenUser = set(target_row[target_row > 0].index) - items_bought_by_givenUserID
    # Fall through to the next most similar user when this one adds nothing new
    if items_to_recommend_to_givenUser:
      return items_to_recommend_to_givenUser

  return set()

In [79]:
# to build a function that explains whether or not the recommended item for a given user appeared in his item-liked history
def recommend_items_compared_to_history(given_user_id):
  """Compare a user's recommendations with that user's rows where observed == 0.

  Prints a one-line verdict describing whether the recommended items appear
  among those rows and, if so, how many of them the user liked.

  Args:
    given_user_id: value of the 'user' column identifying the user.

  Returns:
    The rows of ``RawDataFile`` (observed == 0, this user) whose item was
    recommended, or ``None`` when no recommended item appears among them.
  """
  recommended = recommend_for_givenUser(given_user_id)

  # Rows of this user that were left out of the training data
  is_unobserved = RawDataFile['observed'] == 0
  is_given_user = RawDataFile['user'] == given_user_id
  unobserved_rows = RawDataFile[is_unobserved & is_given_user]

  overlap = list(recommended.intersection(set(unobserved_rows['item'])))

  # Nothing to compare against: report the raw recommendations and stop
  if not overlap:
    print('The recommended items were not in the history of the given user id')
    print(recommended)
    return None

  final = unobserved_rows[unobserved_rows['item'].isin(overlap)]
  liked_flags = final['liked']

  if liked_flags.nunique() != 1:
    # Mixed outcome: share of the overlapping items flagged liked == 1
    percent = liked_flags.value_counts().loc[1] / len(liked_flags) * 100
    print('Some of the recommended items were in the history of the given user id. And the given user liked %.2f percent of the small group of items.' %(percent))
  elif liked_flags.value_counts().index[0] == 0:
    print('Some of the recommended items were in the history of the given user id. But the given user did not like them.')
  else:
    print('Some of the recommended items were in the history of the given user id. And the given user liked all of them.')

  return final

Now I randomly pick a few users and check how the recommender works.

case 1: user 1

In [80]:
recommend_for_givenUser(given_user_id=1)

{22}

In [81]:
RawDataFile[(RawDataFile['observed']==0)&(RawDataFile['user']==1)]

Unnamed: 0,user,item,x,observed,liked
700,1,8,-1.341051,0,0


In [82]:
recommend_items_compared_to_history(given_user_id=1)

The recommended items were not in the history of the given user id
{22}


case 2: user 3

In [83]:
recommend_for_givenUser(given_user_id=3)

{5, 13, 15, 16, 20, 25, 28}

In [84]:
RawDataFile[(RawDataFile['observed']==0)&(RawDataFile['user']==3)]

Unnamed: 0,user,item,x,observed,liked
402,3,5,-0.146724,0,1
802,3,9,-0.711622,0,1
1202,3,13,0.086942,0,1
1402,3,15,0.756329,0,0
2402,3,25,-0.613616,0,1
2602,3,27,0.57071,0,0


In [85]:
recommend_items_compared_to_history(given_user_id=3)

Some of the recommended items were in the history of the given user id. And the given user liked 75.00 percent of the small group of items.


Unnamed: 0,user,item,x,observed,liked
402,3,5,-0.146724,0,1
1202,3,13,0.086942,0,1
1402,3,15,0.756329,0,0
2402,3,25,-0.613616,0,1


case 3: user 47

In [86]:
recommend_for_givenUser(given_user_id=47)

{13}

In [87]:
RawDataFile[(RawDataFile['observed']==0)&(RawDataFile['user']==47)]

Unnamed: 0,user,item,x,observed,liked
546,47,6,-1.459666,0,0
1246,47,13,1.124976,0,1
1546,47,16,0.485647,0,0
2346,47,24,-0.314309,0,0


In [88]:
recommend_items_compared_to_history(given_user_id=47)

Some of the recommended items were in the history of the given user id. And the given user liked all of them.


Unnamed: 0,user,item,x,observed,liked
1246,47,13,1.124976,0,1


case 4: user 66

In [89]:
recommend_for_givenUser(given_user_id=66)

{5, 7, 11, 20}

In [90]:
RawDataFile[(RawDataFile['observed']==0)&(RawDataFile['user']==66)]

Unnamed: 0,user,item,x,observed,liked
465,66,5,0.863403,0,1
1065,66,11,0.189609,0,0
1665,66,17,1.08604,0,1


In [91]:
recommend_items_compared_to_history(given_user_id=66)

Some of the recommended items were in the history of the given user id. And the given user liked 50.00 percent of the small group of items.


Unnamed: 0,user,item,x,observed,liked
465,66,5,0.863403,0,1
1065,66,11,0.189609,0,0


The recommender seems reasonable: it gives a basic indication of what a user may like.