# COLLABORATIVE FILTERING IMPLEMENTATION FOR E-COMMERCE DATASET

# Imports tatements

In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns 
import random 

from IPython.display import Image 
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors 

from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset 

from surprise.model_selection import train_test_split, cross_validate,GridSearchCV
from surprise.prediction_algorithms import CoClustering
from surprise.prediction_algorithms import NMF
from surprise import accuracy

# Data Loading

In [9]:
# data = pd.read_excel('../data/Rec_sys_data.xlsx',encoding= 'unicode_escape')

In [5]:
data = pd.read_excel('../data/Rec_sys_data.xlsx')


In [6]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,DeliveryDate,Discount%,ShipMode,ShippingCost,CustomerID
0,536365,84029E,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.2,ExpressAir,30.12,17850
1,536365,71053,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.21,ExpressAir,30.12,17850
2,536365,21730,6,2010-12-01 08:26:00,2010-12-03 08:26:00,0.56,Regular Air,15.22,17850
3,536365,84406B,8,2010-12-01 08:26:00,2010-12-03 08:26:00,0.3,Regular Air,15.22,17850
4,536365,22752,2,2010-12-01 08:26:00,2010-12-04 08:26:00,0.57,Delivery Truck,5.81,17850


DATASET INFO:

			• InvoiceNo: The invoice number of a particular transaction
			
			• StockCode: The unique identifier for a particular item
			
			• Quantity: The quantity of that item bought by the customer
			
			• InvoiceDate: The date and time when the transaction was made
			
			• DeliveryDate: The date and time when the delivery happened
			
			• Discount%: Percentage of discount on the purchased item
			
			• ShipMode: Mode of shipping
			
			• ShippingCost: Cost of shipping that item
			
			• CustomerID: The unique identifier of a particular customer

# Getting knowing the data

In [10]:
data.shape

(272404, 9)

Checking for null values

In [11]:
data.isnull().sum().sort_values(ascending=False)

InvoiceNo       0
StockCode       0
Quantity        0
InvoiceDate     0
DeliveryDate    0
Discount%       0
ShipMode        0
ShippingCost    0
CustomerID      0
dtype: int64

There is no null values in the data

Getting basics statistics

In [12]:
data.describe()

Unnamed: 0,InvoiceNo,Quantity,Discount%,ShippingCost,CustomerID
count,272404.0,272404.0,272404.0,272404.0,272404.0
mean,553740.733319,13.579536,0.300092,17.053491,15284.323523
std,9778.082879,149.136756,0.176023,10.01321,1714.478624
min,536365.0,1.0,0.0,5.81,12346.0
25%,545312.0,2.0,0.15,5.81,13893.0
50%,553902.0,6.0,0.3,15.22,15157.0
75%,562457.0,12.0,0.45,30.12,16788.0
max,569629.0,74215.0,0.6,30.12,18287.0


In [13]:
# Changing the type of StockCode
data.StockCode = data.StockCode.astype(str)

# Collaborative Filtering using Memory-Based Approach

USER-To-USER Implementation:

In [14]:
# Creating data matrix for purchase history
purchase_df = (data.groupby(['CustomerID', 'StockCode'])['Quantity'].sum()\
                .unstack().reset_index().fillna(0).set_index('CustomerID'))

In [15]:
purchase_df.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214R,90214S,90214V,90214Y,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
12350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,5.0


In [16]:
# The data matrix shown above reveals the total quantity purchased by each
# user against each item. Only information about whether the item was bought or not by
# the user is needed, not the quantity.

In [17]:
def encode_units(x):
     
					if x < 1:
      # If the quantity is less than 1
						return 0 # Not purchased
					if x >= 1: 
      # If the quantity is greater than 1
						return 1 # Purchased

In [18]:
purchase_df = purchase_df.applymap(encode_units)
purchase_df.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214R,90214S,90214V,90214Y,BANK CHARGES,C2,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12347,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12348,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [19]:
# The purchase data matrix reveals the behavior of customers across all items.
# This matrix finds the user similarity scores matrix, and the similarity metric uses cosine
# similarity. The user similarity score matrix has user-to-user similarity for each user pair.

In [20]:
# Applying cosine_similarity to the purchase dataframe
user_similarities = cosine_similarity(purchase_df)

In [21]:
user_similarities

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.07063224, ..., 0.        , 0.0876668 ,
        0.02125256],
       [0.        , 0.07063224, 1.        , ..., 0.        , 0.12309149,
        0.08206099],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.0876668 , 0.12309149, ..., 0.        , 1.        ,
        0.11111111],
       [0.        , 0.02125256, 0.08206099, ..., 0.        , 0.11111111,
        1.        ]])

In [22]:
# Converting the user_similarities array into a dataframe 
user_similarity_data = pd.DataFrame(user_similarities,index=purchase_df.index,columns=purchase_df.index)
user_similarity_data.head()

CustomerID,12346,12347,12348,12350,12352,12353,12354,12355,12356,12358,...,18269,18270,18272,18273,18278,18280,18281,18282,18283,18287
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.114708,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12347,0.0,1.0,0.070632,0.053567,0.048324,0.0,0.029001,0.091885,0.075845,0.0,...,0.041739,0.0,0.050669,0.0,0.036811,0.069843,0.0,0.0,0.087667,0.021253
12348,0.0,0.070632,1.0,0.051709,0.031099,0.0,0.027995,0.118262,0.146427,0.061546,...,0.0,0.0,0.024456,0.0,0.0,0.0,0.0,0.0,0.123091,0.082061
12350,0.0,0.053567,0.051709,1.0,0.035377,0.0,0.0,0.0,0.033315,0.070014,...,0.0,0.0,0.027821,0.0,0.0,0.0,0.0,0.0,0.052511,0.0
12352,0.0,0.048324,0.031099,0.035377,1.0,0.0,0.095765,0.040456,0.10018,0.084215,...,0.110264,0.065233,0.133855,0.0,0.0,0.0,0.0,0.0,0.094742,0.056143


Get Recommendation from a given user by creating a function

In [26]:
def fetch_similar_users(user_id, k=5):
    """
    This function separates the selected user from all other users and then takes a cosine
    similarity of the selected user with all users to find similar users. Return the top k similar
    users (by CustomerID) to our selected user.
    """
    # Separating data rows for the entered user id
    user_similarity = user_similarity_data[user_similarity_data.index == user_id]
    
    # Data of all other users
    other_users_similarities = user_similarity_data[user_similarity_data.index != user_id]
    
    # Calculate cosine similarity between user and each other user
    similarities = cosine_similarity(user_similarity, other_users_similarities)[0].tolist()
    user_indices = other_users_similarities.index.tolist()
    index_similarity_pair = dict(zip(user_indices, similarities))
    
    # Sort by similarity
    sorted_index_similarity_pair = sorted(index_similarity_pair.items(), reverse=True)
    top_k_users_similarities = sorted_index_similarity_pair[:k]
    similar_users = [u[0] for u in top_k_users_similarities]
    
    print('The users with behaviour similar to that of user {0} are:'.format(user_id))
    
    return similar_users


In [28]:
similar_users = fetch_similar_users(12347)
similar_users

The users with behaviour similar to that of user 12347 are:


[18287, 18283, 18282, 18281, 18280]

 Getting recommendations by showing the items bought by similar users.

In [31]:
def simular_users_recommendation(userid):
     
					similar_users = fetch_similar_users(userid)
					#obtaining all the items bought by similar users
					simular_users_recommendation_list = []
					for j in similar_users:
										item_list = data[data["CustomerID"]==j]['StockCode'].to_list()
										simular_users_recommendation_list.append(item_list)
          
					#this gives us multi-dimensional list
					# we need to flatten it
					flat_list = []
					for sublist in simular_users_recommendation_list:
										for item in sublist:
															flat_list.append(item)
					final_recommendations_list = list(dict.fromkeys(flat_list))
					# storing 10 random recommendations in a list
					ten_random_recommendations = random.sample(final_recommendations_list, 10)
					print('Items bought by Similar users based on Cosine Similarity')
					#returning 10 random recommendations
					return ten_random_recommendations

In [32]:
simular_users_recommendation(12347)

The users with behaviour similar to that of user 12347 are:
Items bought by Similar users based on Cosine Similarity


['23234',
 '21908',
 '20676',
 '22964',
 '22379',
 '47559B',
 '23344',
 '82580',
 '22356',
 '22645']

# Item-to-Item Collaborative Filtering

In [33]:
# Creating data matrix, which contains all the item IDs across their purchase history
items_purchase_df = (data.groupby(['StockCode','CustomerID'])['Quantity'].sum()\
                     .unstack().reset_index().fillna(0).set_index('StockCode'))

In [34]:
items_purchase_df.head()

CustomerID,12346,12347,12348,12350,12352,12353,12354,12355,12356,12358,...,18269,18270,18272,18273,18278,18280,18281,18282,18283,18287
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10123C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10124A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Let's encode the items_purchase_df with wether an item has been purchased by a customer or not 
items_purchase_df = items_purchase_df.applymap(encode_units)

In [36]:
items_purchase_df

CustomerID,12346,12347,12348,12350,12352,12353,12354,12355,12356,12358,...,18269,18270,18272,18273,18278,18280,18281,18282,18283,18287
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10120,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10123C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10124A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DOT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
PADS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# finding similarity score
item_similarities = cosine_similarity(items_purchase_df)

In [38]:
item_similarities

array([[1.        , 0.        , 0.10882144, ..., 0.07933288, 0.        ,
        0.06698641],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.10882144, 0.        , 1.        , ..., 0.07673883, 0.        ,
        0.01388487],
       ...,
       [0.07933288, 0.        , 0.07673883, ..., 1.        , 0.        ,
        0.06579517],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.06698641, 0.        , 0.01388487, ..., 0.06579517, 0.        ,
        1.        ]])

In [39]:
# Converting item_similarities array to a data frame
item_similarity_data = pd.DataFrame(item_similarities,index=items_purchase_df.index,columns=items_purchase_df.index)
item_similarity_data.head()

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214R,90214S,90214V,90214Y,BANK CHARGES,C2,DOT,M,PADS,POST
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,1.0,0.0,0.108821,0.091287,0.0,0.0,0.094281,0.062932,0.091902,0.110096,...,0.0,0.0,0.0,0.0,0.0,0.032275,0.0,0.079333,0.0,0.066986
10080,0.0,1.0,0.0,0.0,0.0,0.0,0.043033,0.028724,0.067116,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10120,0.108821,0.0,1.0,0.132453,0.0,0.0,0.068399,0.068483,0.026669,0.079872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076739,0.0,0.013885
10123C,0.091287,0.0,0.132453,1.0,0.0,0.0,0.172133,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10124A,0.0,0.0,0.0,0.0,1.0,0.288675,0.074536,0.049752,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Getting recommendations for a given user.

In [40]:
def fetch_similar_items(item_id,k=10):
     
     # separating data rows of the selected item
					item_similarity = item_similarity_data[item_similarity_data.index == item_id]
					# a data of all other items
					other_items_similarities = item_similarity_data[item_similarity_data.index != item_id]
					# calculate cosine similarity between selected item with other items
					similarities = cosine_similarity(item_similarity,other_items_similarities)[0].tolist()
					# create list of indices of these items
					item_indices = other_items_similarities.index.tolist()
					# create key/values pairs of item index and their similarity
					index_similarity_pair = dict(zip(item_indices, similarities))
					# sort by similarity
					sorted_index_similarity_pair = sorted(index_similarity_pair.items())
					# grab k items from the top
					top_k_item_similarities = sorted_index_similarity_pair[:k]
					similar_items = [u[0] for u in top_k_item_similarities]

					print('Similar items based on purchase behaviour (item-to-item collaborative filtering)')
					return similar_items


In [41]:
similar_items = fetch_similar_items('10002')
similar_items

Similar items based on purchase behaviour (item-to-item collaborative filtering)


['10080',
 '10120',
 '10123C',
 '10124A',
 '10124G',
 '10125',
 '10133',
 '10135',
 '11001',
 '15030']

In [45]:
# Recommendations by showing similar items to those bought by a particular user.
def similar_item_recommendation(user_id):
    similar_items_recommendation_list = []
    # Obtaining all the similar items to items bought by user
    item_list = data[data["CustomerID"] == user_id]['StockCode'].to_list()

    for item in item_list:
        similar_items = fetch_similar_items(item)
        similar_items_recommendation_list.append(item_list)
    
    # This gives us multi-dimensional list
    # We need to flatten it
    flat_list = []
    for sublist in similar_items_recommendation_list:
        for item in sublist:
            flat_list.append(item)
    
    final_recommendations_list = list(dict.fromkeys(flat_list))
    # Storing 10 random recommendations in a list
    ten_random_recommendations = random.sample(final_recommendations_list, 10)
    print('Similar Items bought by our users based on Cosine Similarity')
    # Returning 10 random recommendations
    return ten_random_recommendations


In [43]:
# This function gets the list of similar items for all previously bought items by our given
# customer (ID). This list is then flattened to get a final list of unique items, from which
# randomly chosen ten items as recommendations for our given user are shown

In [48]:
# simular_item_recommendation(12347)

In [47]:
similar_item_recommendation(12347)

Similar items based on purchase behaviour (item-to-item collaborative filtering)
Similar items based on purchase behaviour (item-to-item collaborative filtering)
Similar items based on purchase behaviour (item-to-item collaborative filtering)
Similar items based on purchase behaviour (item-to-item collaborative filtering)
Similar items based on purchase behaviour (item-to-item collaborative filtering)
Similar items based on purchase behaviour (item-to-item collaborative filtering)
Similar items based on purchase behaviour (item-to-item collaborative filtering)
Similar items based on purchase behaviour (item-to-item collaborative filtering)
Similar items based on purchase behaviour (item-to-item collaborative filtering)
Similar items based on purchase behaviour (item-to-item collaborative filtering)
Similar items based on purchase behaviour (item-to-item collaborative filtering)
Similar items based on purchase behaviour (item-to-item collaborative filtering)
Similar items based on purch

['22423',
 '22699',
 '22422',
 '21171',
 '22728',
 '22372',
 '22775',
 '85232D',
 '22726',
 '71477']

# MACHINE LEARNING-BASEDA APPROACH

# Collaborative Filtering using KNN-based Approach

A CSR (Compressed Sparse Row) matrix is a data structure used to efficiently store sparse matrices. Sparse matrices are matrices in which most of the elements are zero. 

In a CSR matrix, only the nonzero elements and their corresponding row and column indices are stored. This significantly reduces the memory required to represent the matrix compared to using a dense matrix representation where every element is stored.

A CSR matrix consists of three one-dimensional arrays:
1. **data array**: It stores the nonzero elements of the matrix in row-major order.
2. **indices array**: It stores the column indices corresponding to the nonzero elements.
3. **indptr array**: It stores the indices into the data and indices arrays indicating where each row starts and ends.

By using CSR format, operations like matrix-vector multiplication, matrix-matrix multiplication, and other linear algebraic operations can be efficiently performed on sparse matrices. It's particularly useful when dealing with large matrices where most of the elements are zero.

In [49]:
# convert the sparse matrix into a CSR matrix.
purchase_matrix = csr_matrix(purchase_df.values)

In [50]:
knn_model = NearestNeighbors(metric = 'euclidean', algorithm = 'brute')
knn_model

In [51]:
knn_model.fit(purchase_matrix)

Fetch similar users

In [55]:
# simular_users_knn = []


In [61]:
def fetch_similar_users_knn(purchase_df,query_index):
     
     # Creating empty list where we will store user id of similar users
					simular_users_knn = []
     
					# Storing the distance and index of nearest neighbor
					distances, indices = knn_model.kneighbors(purchase_df.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 5)
					for i in range(0, len(distances.flatten())):
										if i == 0:
															print('Recommendations for {0}:\n'.format(purchase_df.index[query_index]))
										else:
															print('{0}: {1}, with distance of {2}:'.format(i, purchase_df.index[indices.flatten()[i]], distances.flatten()[i]))
															simular_users_knn.append( purchase_df.index[indices.flatten()[i]])


In [62]:
fetch_similar_users_knn(purchase_df,1497)

Recommendations for 14729:

1: 16917, with distance of 8.12403840463596:
2: 16989, with distance of 8.12403840463596:
3: 15124, with distance of 8.12403840463596:
4: 12897, with distance of 8.246211251235321:


In [63]:
simular_users_knn

[]

In [65]:
def knn_recommendation(simular_users_knn):
    # Obtaining all the items bought by similar users
    knn_recommnedations = []
    for j in simular_users_knn:
        item_list = data[data["CustomerID"]==j]['StockCode'].to_list()
        knn_recommnedations.append(item_list)
    
    # This gives us a multi-dimensional list. We need to flatten it.
    flat_list = []
    for sublist in knn_recommnedations:
        for item in sublist:
            flat_list.append(item)
    
    final_recommendations_list = list(dict.fromkeys(flat_list))
    
    # Storing 10 random recommendations in a list
    ten_random_recommendations = random.sample(final_recommendations_list, 10)
    print('Items bought by Similar users based on KNN')
    
    # Returning 10 random recommendations
    return ten_random_recommendations


In [66]:
knn_recommendation(simular_users_knn)

ValueError: Sample larger than population or is negative

In [68]:
from scipy.sparse import csr_matrix

purchase_matrix = csr_matrix(purchase_df.values)

from sklearn.neighbors import NearestNeighbors


model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(purchase_matrix)

In [69]:
simu_knn = []


In [70]:
def similar_users_knn(purchase,query_index):
    distances, indices = model_knn.kneighbors(purchase.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)
    for i in range(0, len(distances.flatten())):
        if i == 0:
            print('Recommendations for {0}:\n'.format(purchase.index[query_index]))
        else:
            print('{0}: {1}, with distance of {2}:'.format(i, purchase.index[indices.flatten()[i]], distances.flatten()[i]))
            simu_knn.append(purchase.index[indices.flatten()[i]])

In [72]:
similar_users_knn(purchase_df,1497)


Recommendations for 14729:

1: 15124, with distance of 0.7592282938284616:
2: 16917, with distance of 0.7592282938284616:
3: 16989, with distance of 0.772492122403355:
4: 15319, with distance of 0.7893247570999039:
5: 17255, with distance of 0.799356911523718:


In [73]:
simu_knn


[15124, 16917, 16989, 15319, 17255]

In [76]:
def simu_recommendation_knn(simu_knn):
    
    #obtaining all the items bought by similar users
    simu_rec = []
    for j in simu_knn:
        desc = data[data["CustomerID"]==j]['Description'].to_list()
        simu_rec.append(desc)
    
    #this gives us multi-dimensional list
    # we need to flatten it
    flat_list = []
    for sublist in simu_rec:
        for item in sublist:
            flat_list.append(item)
    final_list = list(dict.fromkeys(flat_list))
    
    # storing 10 random recommendations in a list
    ten_recs = random.sample(final_list, 10)
    
    print('Items bought by Similar users based on KNN')
    
    #returning 10 random recommendations
    return ten_recs

In [79]:
simu_recommendation_knn(simu_knn)


KeyError: 'Description'

In [78]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,DeliveryDate,Discount%,ShipMode,ShippingCost,CustomerID
0,536365,84029E,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.2,ExpressAir,30.12,17850
1,536365,71053,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.21,ExpressAir,30.12,17850
2,536365,21730,6,2010-12-01 08:26:00,2010-12-03 08:26:00,0.56,Regular Air,15.22,17850
3,536365,84406B,8,2010-12-01 08:26:00,2010-12-03 08:26:00,0.3,Regular Air,15.22,17850
4,536365,22752,2,2010-12-01 08:26:00,2010-12-04 08:26:00,0.57,Delivery Truck,5.81,17850


# Collaborative Filtering Using Matrix Factorization

In [80]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,DeliveryDate,Discount%,ShipMode,ShippingCost,CustomerID
0,536365,84029E,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.2,ExpressAir,30.12,17850
1,536365,71053,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.21,ExpressAir,30.12,17850
2,536365,21730,6,2010-12-01 08:26:00,2010-12-03 08:26:00,0.56,Regular Air,15.22,17850
3,536365,84406B,8,2010-12-01 08:26:00,2010-12-03 08:26:00,0.3,Regular Air,15.22,17850
4,536365,22752,2,2010-12-01 08:26:00,2010-12-04 08:26:00,0.57,Delivery Truck,5.81,17850


In [81]:
items_purchase_df.head()

CustomerID,12346,12347,12348,12350,12352,12353,12354,12355,12356,12358,...,18269,18270,18272,18273,18278,18280,18281,18282,18283,18287
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10120,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10123C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10124A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Formatting the data into the proper format required by the surprise package:

In [82]:
new_data = items_purchase_df.stack().to_frame()
#Renaming the column as Quantity
new_data = new_data.reset_index().rename(columns={0:"Quantity"})
new_data

Unnamed: 0,StockCode,CustomerID,Quantity
0,10002,12346,0
1,10002,12347,0
2,10002,12348,0
3,10002,12350,0
4,10002,12352,0
...,...,...,...
12903081,POST,18280,0
12903082,POST,18281,0
12903083,POST,18282,0
12903084,POST,18283,0


In [83]:
print(items_purchase_df.shape)
print(new_data.shape)

(3538, 3647)
(12903086, 3)


In [84]:
# Storing all customer ids in customers
customer_ids = new_data['CustomerID']
# Storing all item descriptions in items
item_ids = new_data['StockCode']


In [85]:
from collections import Counter

In [86]:
# counting no. of orders made by each customer
count_orders = Counter(customer_ids)

In [87]:
count_orders 

Counter({12346: 3538,
         12347: 3538,
         12348: 3538,
         12350: 3538,
         12352: 3538,
         12353: 3538,
         12354: 3538,
         12355: 3538,
         12356: 3538,
         12358: 3538,
         12359: 3538,
         12360: 3538,
         12361: 3538,
         12362: 3538,
         12363: 3538,
         12364: 3538,
         12365: 3538,
         12370: 3538,
         12372: 3538,
         12373: 3538,
         12375: 3538,
         12377: 3538,
         12378: 3538,
         12379: 3538,
         12380: 3538,
         12381: 3538,
         12383: 3538,
         12384: 3538,
         12386: 3538,
         12388: 3538,
         12390: 3538,
         12393: 3538,
         12394: 3538,
         12395: 3538,
         12397: 3538,
         12399: 3538,
         12401: 3538,
         12402: 3538,
         12405: 3538,
         12406: 3538,
         12407: 3538,
         12408: 3538,
         12409: 3538,
         12410: 3538,
         12412: 3538,
         1

In [88]:
# storing the count and customer id in a dataframe
customer_count_df = pd.DataFrame.from_dict(count_orders, orient='index').reset_index().rename(columns={0:"Quantity"})

In [89]:
# Drop all customer IDs with less than 120 orders.
customer_count_df = customer_count_df[customer_count_df["Quantity"]>120]


In [90]:
# Rename the index column as 'CustomerID' for the inner join.
customer_count_df.rename(columns={'index':'CustomerID'},inplace=True)
customer_count_df


Unnamed: 0,CustomerID,Quantity
0,12346,3538
1,12347,3538
2,12348,3538
3,12350,3538
4,12352,3538
...,...,...
3642,18280,3538
3643,18281,3538
3644,18282,3538
3645,18283,3538


In [92]:
# counting no. of times an item was ordered
count_items = Counter(item_ids)

# storing the count and item description in a dataframe
item_count_df = pd.DataFrame.from_dict(count_items, orient='index').reset_index().rename(columns={0:"Quantity"})
item_count_df

Unnamed: 0,index,Quantity
0,10002,3647
1,10080,3647
2,10120,3647
3,10123C,3647
4,10124A,3647
...,...,...
3533,C2,3647
3534,DOT,3647
3535,M,3647
3536,PADS,3647


In [93]:
# Drop all items that were ordered less than 120 times.
item_count_df = item_count_df[item_count_df["Quantity"]>120]


In [94]:
# Rename the index column as 'Description' for the inner join.
item_count_df.rename(columns={'index':'StockCode'},inplace=True)
item_count_df

Unnamed: 0,StockCode,Quantity
0,10002,3647
1,10080,3647
2,10120,3647
3,10123C,3647
4,10124A,3647
...,...,...
3533,C2,3647
3534,DOT,3647
3535,M,3647
3536,PADS,3647


In [95]:
# Apply a join on both DataFrames with stacked data to create the shortlisted
# final DataFrame.
#Merging stacked df with item count df
new_data1 = pd.merge(new_data, item_count_df, on='StockCode', how='inner')
#Merging with customer count df
new_data1 = pd.merge(new_data1, customer_count_df, on='CustomerID', how='inner')
# dropping columns which are not necessary
new_data1.drop(['Quantity_y','Quantity_x'],axis=1,inplace=True)
new_data1



Unnamed: 0,StockCode,CustomerID,Quantity
0,10002,12346,3538
1,10080,12346,3538
2,10120,12346,3538
3,10123C,12346,3538
4,10124A,12346,3538
...,...,...,...
12903081,C2,18287,3538
12903082,DOT,18287,3538
12903083,M,18287,3538
12903084,PADS,18287,3538


In [96]:
new_data1.describe()

Unnamed: 0,CustomerID,Quantity
count,12903090.0,12903086.0
mean,15279.12,3538.0
std,1720.738,0.0
min,12346.0,3538.0
25%,13784.0,3538.0
50%,15253.0,3538.0
75%,16756.0,3538.0
max,18287.0,3538.0


In [97]:
# Read the data in a format supported by the surprise library.
reader = Reader(rating_scale=(0,5095))

Load the dataset in a format supported by the surprise library:

In [98]:
formated_data = Dataset.load_from_df(new_data1, reader)

In [99]:
formated_data

<surprise.dataset.DatasetAutoFolds at 0x217d6952c50>

In [100]:
# splitting the data to train and test for validating the models.
# performing train test split on the dataset
train_set, test_set = train_test_split(formated_data, test_size= 0.2)


# Implementation of   NMF

In [101]:
# defining the model
algo1 = NMF()


In [102]:
algo1

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x217d8c0ac50>

In [103]:
# model fitting
algo1.fit(train_set)

In [None]:
# model prediction
pred1 = algo1.test(test_set)

Performance Metrics:

In [None]:
# RMSE
accuracy.rmse(pred1)

In [None]:
#MAE
accuracy.mae(pred1)

In [None]:
cross_validate(algo1, formated_data, verbose=True)

# Co-Clustering Method Implementation 

In [None]:
# defining the model
algo2 = CoClustering()


In [None]:
# model fitting
algo2.fit(train_set)

In [None]:
# model prediction
pred2 = algo2.test(test_set)

In [None]:
# Calculate the RMSE and MAE performance metrics using built-in functions.
# RMSE
accuracy.rmse(pred2)
#MAE
accuracy.mae(pred2)

In [None]:
cross_validate(algo2, formated_data, verbose=True)

# Using SVD(Singular value decomposition)

In [None]:
# defining the model
import SVD
algo3 = SVD()

In [None]:
# model fitting
algo3.fit(train_set)

In [None]:
# model prediction
pred3 = algo3.test(test_set)

In [None]:
# Calculate the RMSE and MAE performance metrics using built-in functions.
# RMSE
accuracy.rmse(pred3)

In [None]:
#MAE
accuracy.mae(pred3)

The RMSE and MAE are significantly high for this model. Until now, this has
performed the worst (worse than NMF and co-clustering).

In [None]:
cross_validate(algo3, formated_data, verbose=True)

# Making Recommendations with Above  predefined algorithms

In [None]:
new_data[(new_data['StockCode']=='47590B')&(new_data['CustomerID']==15738)].Quantity.sum()

In [None]:
algo2.test([['47590B',15738,78]])

In [None]:
new_data

In [None]:
# Viewing best and the worst predictions

In [None]:
predictions_data = pd.DataFrame(pred2, columns=['item_id', 'customer_id','quantity', 'prediction', 'details'])

In [None]:
def get_item_orders(user_id):
    try:
        # For an item, return the number of orders made
        return len(train_set.ur[train_set.to_inner_uid(user_id)])
    except ValueError:
        # User not present in training
        return 0

In [None]:
def get_customer_orders(item_id):
    try:
        # For a customer, return the number of orders made
        return len(train_set.ir[train_set.to_inner_iid(item_id)])
    except ValueError:
        # Item not present in training
        return 0


In [None]:
predictions_data['item_orders'] = predictions_data.item_id.apply(get_item_orders)
predictions_data['customer_orders'] = predictions_data.customer_id.apply(get_customer_orders)

In [None]:
# Calculate the error component to get the best and worst predictions.
predictions_data['error'] = abs(predictions_data.prediction - predictions_data.quantity)
predictions_data

In [None]:
best_predictions = predictions_data.sort_values(by='error')[:10]
best_predictions

In [None]:
worst_predictions = predictions_data.sort_values(by='error')[-10:]
worst_predictions

You can now use the predictions data to get to the recommendations. First, find the
customers that have bought the same items as a given user, and then from the other
items they have bought, to fetch the top items and recommend them.

In [None]:
# Getting item list for user 12347
item_list = predictions_data[predictions_data['customer_id']==12347]['item_id'].values.tolist()
item_list

In [None]:
# Get the list of customers who bought the same items as user 12347.
# Getting list of unique customers who also bought same items (item_list)

customer_list = predictions_data[predictions_data['item_id'].isin(item_list)]['customer_id'].values
customer_list = np.unique(customer_list).tolist()
customer_list


In [None]:
# filtering those customers from predictions data
filtered_data = predictions_data[predictions_data['customer_id'].isin(customer_list)]
# removing the items already bought
filtered_data = filtered_data[~filtered_data['item_id'].isin(item_list)]
# getting the top items (prediction)
recommended_items = filtered_data.sort_values('prediction',ascending=False).reset_index(drop=True)\
                                                  .head(10)['item_id'].values.tolist()
recommended_items

In [None]:
print('1')