In [54]:
# Importing the requried libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_excel('Online Retail.xlsx')

In [3]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
df.shape

(541909, 8)

In [5]:
print(df.dtypes)

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object


In [6]:
df.size

4335272

In [7]:
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

### Data pre-processing

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      541909 non-null object
StockCode      541909 non-null object
Description    540455 non-null object
Quantity       541909 non-null int64
InvoiceDate    541909 non-null datetime64[ns]
UnitPrice      541909 non-null float64
CustomerID     406829 non-null float64
Country        541909 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


* We can see that a lot of customer ids are missing from the data. These rows are not useful for us as we cannot attribute these purchases to any customer So it is better to remove these from the data

In [9]:
# Removing the rows that do not have customer id
df_clean = df[df.CustomerID.isna() == False]

In [10]:
# Data summary after removing the missing values
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      406829 non-null object
StockCode      406829 non-null object
Description    406829 non-null object
Quantity       406829 non-null int64
InvoiceDate    406829 non-null datetime64[ns]
UnitPrice      406829 non-null float64
CustomerID     406829 non-null float64
Country        406829 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.9+ MB


In [11]:
# Creating a lookup table for stockcode and descriptions
df_lookup = df_clean[['StockCode', 'Description']].drop_duplicates() # Only get unique item-description pairs
df_lookup['StockCode'] = df_lookup.StockCode.astype(str) # easy for lookup

In [12]:
df_lookup.head()

Unnamed: 0,StockCode,Description
0,85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,71053,WHITE METAL LANTERN
2,84406B,CREAM CUPID HEARTS COAT HANGER
3,84029G,KNITTED UNION FLAG HOT WATER BOTTLE
4,84029E,RED WOOLLY HOTTIE WHITE HEART.


In [13]:
df_lookup.StockCode.value_counts()

23196     4
23236     4
23209     3
22776     3
23131     3
         ..
90081C    1
20769     1
21619     1
21360     1
16169E    1
Name: StockCode, Length: 3684, dtype: int64

In [14]:
df_lookup[df_lookup.StockCode=='23196']

Unnamed: 0,StockCode,Description
237422,23196,RETRO LEAVES MAGNETIC NOTEPAD
238991,23196,RETO LEAVES MAGNETIC SHOPPING LIST
246802,23196,LEAVES MAGNETIC SHOPPING LIST
252851,23196,VINTAGE LEAF MAGNETIC NOTEPAD


In [15]:
df_lookup.shape

(3916, 2)

In [16]:
df_clean['CustomerID'] = df_clean.CustomerID.astype(int) # Convert to int for customer ID

df_clean = df_clean[['StockCode', 'Quantity', 'CustomerID']] # Get rid of unnecessary info

grouped_cleaned = df_clean.groupby(['CustomerID', 'StockCode']).sum().reset_index() # Group together

grouped_cleaned.Quantity[grouped_cleaned.Quantity == 0] = 1 # Replace a sum of zero purchases with a one to
# indicate purchased # This would have come as zero as the returns are recorded as negatives

grouped_purchased = grouped_cleaned[grouped_cleaned['Quantity'] > 0] # Only get customers where purchase totals were positive

In [17]:
# Final dataset
grouped_purchased.head()

Unnamed: 0,CustomerID,StockCode,Quantity
0,12346,23166,1
1,12347,16008,24
2,12347,17021,36
3,12347,20665,6
4,12347,20719,40


In [19]:
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve

In [20]:
# Creating the sparse matrix

customers = sorted(list(set(grouped_purchased.CustomerID)))# Get our unique customers

products = list(set(grouped_purchased.StockCode)) # Get our unique products that were purchased

quantity = list(grouped_purchased.Quantity) # All of our purchases


# Get the associated row indices
rows = grouped_purchased.CustomerID.astype('category', CategoricalDtype = customers).cat.codes 

# Get the associated column indices
cols = grouped_purchased.StockCode.astype('category', CategoricalDtype = products).cat.codes 

purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(products)))

In [21]:
len(customers)

4338

In [22]:
# Final sparse matrix
purchases_sparse.shape

(4338, 3664)

In [23]:
purchases_sparse.todense()[5:20,:]

matrix([[0, 0, 0, ..., 1, 0, 7],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 7],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 1]], dtype=int32)

In [28]:
matrix_size = purchases_sparse.shape[0]*purchases_sparse.shape[1] # Number of possible interactions in the matrix
num_purchases = len(purchases_sparse.nonzero()[0]) # Number of items interacted with
sparsity = 100*(1 - (num_purchases/matrix_size))
sparsity

98.32190920694744

Maximum possible sparsity for collaborative filtering approach is 99.5%. As our sparsity is ~98.3%, we can still expect decent results.

In [29]:
import random

def make_train(ratings, pct_test = 0.2):
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of user,item index into list
    
    random.seed(0) # Set the random seed to zero for reproducibility
    
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of user-item pairs without replacement
    
    user_inds = [index[0] for index in samples] # Get the user row indices
    item_inds = [index[1] for index in samples] # Get the item column indices
    
    training_set[user_inds, item_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    
    return training_set, test_set, list(set(user_inds)) # Output the unique list of user rows that were altered

In [30]:
# 20% of the data has been masked for this exercise
product_train, product_test, product_users_altered = make_train(purchases_sparse, pct_test = 0.2)

In [31]:
#!pip install implicit

### Building the recommendation system

In [33]:
import implicit

alpha = 15
user_vecs, item_vecs = implicit.alternating_least_squares((product_train*alpha).astype('double'), 
                                                          factors=20, 
                                                          regularization = 0.1, 
                                                         iterations = 50)

This method is deprecated. Please use the AlternatingLeastSquares class instead


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




### Evaluating the recommendation system using AUC-ROC curve

In [34]:
from sklearn import metrics

def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr) 

In [35]:
def calc_mean_auc(training_set, altered_users, predictions, test_set):
    
    store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
    popularity_auc = [] # To store popular AUC scores
    pop_items = np.array(test_set.sum(axis = 0)).reshape(-1) # Get sum of item iteractions to find most popular
    item_vecs = predictions[1]
    for user in altered_users: # Iterate through each user that had an item altered
        training_row = training_set[user,:].toarray().reshape(-1) # Get the training set row
        zero_inds = np.where(training_row == 0) # Find where the interaction had not yet occurred
        
        # Get the predicted values based on our user/item vectors
        user_vec = predictions[0][user,:]
        pred = user_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
        
        # Get only the items that were originally zero
        # Select all ratings from the MF prediction for this user that originally had no iteraction
        actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1) 
        
        # Select the binarized yes/no interaction pairs from the original full data
        # that align with the same pairs in training 
        pop = pop_items[zero_inds] # Get the item popularity for our chosen items
        
        store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
        
        popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and score
    # End users iteration
    
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))  
   # Return the mean AUC rounded to three decimal places for both test and popularity benchmark

In [36]:
calc_mean_auc(product_train, product_users_altered, 
              [sparse.csr_matrix(user_vecs), sparse.csr_matrix(item_vecs.T)], product_test)
# AUC for our recommender system

(0.87, 0.815)

### Example for recommendation

In [37]:
customers_arr = np.array(customers) # Array of customer IDs from the ratings matrix
products_arr = np.array(products) # Array of product IDs from the ratings matrix

In [38]:
def get_items_purchased(customer_id, mf_train, customers_list, products_list, item_lookup):
    
    cust_ind = np.where(customers_list == customer_id)[0][0] # Returns the index row of our customer id
    
    purchased_ind = mf_train[cust_ind,:].nonzero()[1] # Get column indices of purchased items
    
    prod_codes = products_list[purchased_ind] # Get the stock codes for our purchased items
    
    return item_lookup.loc[item_lookup.StockCode.isin(prod_codes)]



In [39]:
customers_arr[:5]

array([12346, 12347, 12348, 12349, 12350])

In [41]:
get_items_purchased(12346, product_train, customers_arr, products_arr, df_lookup)

Unnamed: 0,StockCode,Description
4757,22031,BOTANICAL LAVENDER BIRTHDAY CARD


In [42]:
from sklearn.preprocessing import MinMaxScaler

In [43]:
def rec_items(customer_id, mf_train, user_vecs, item_vecs, customer_list, item_list, item_lookup, num_items = 10):
    
    cust_ind = np.where(customer_list == customer_id)[0][0] # Returns the index row of our customer id
    pref_vec = mf_train[cust_ind,:].toarray() # Get the ratings from the training set ratings matrix
    pref_vec = pref_vec.reshape(-1) + 1 # Add 1 to everything, so that items not purchased yet become equal to 1
    pref_vec[pref_vec > 1] = 0 # Make everything already purchased zero
    rec_vector = user_vecs[cust_ind,:].dot(item_vecs.T) # Get dot product of user vector and all item vectors
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0] 
    recommend_vector = pref_vec*rec_vector_scaled 
    # Items already purchased have their recommendation multiplied by zero
    product_idx = np.argsort(recommend_vector)[::-1][:num_items] # Sort the indices of the items into order 
    # of best recommendations
    rec_list = [] # start empty list to store items
    for index in product_idx:
        code = item_list[index]
        rec_list.append([code, item_lookup.Description.loc[item_lookup.StockCode == code].iloc[0]]) 
        # Append our descriptions to the list
    codes = [item[0] for item in rec_list]
    descriptions = [item[1] for item in rec_list]
    final_frame = pd.DataFrame({'StockCode': codes, 'Description': descriptions}) # Create a dataframe 
    return final_frame[['StockCode', 'Description']] # Switch order of columns around


In [45]:
rec_items(12346, product_train, user_vecs, item_vecs, customers_arr, products_arr, df_lookup,
                       num_items = 10)

Unnamed: 0,StockCode,Description
0,22030,SWALLOWS GREETING CARD
1,46126A,ELVIS WALLHANGING / CURTAIN
2,22032,BOTANICAL LILY GREETING CARD
3,85230G,ORANGE VOTIVE CANDLE
4,47503H,ASS FLORAL PRINT SPIRIT LEVEL
5,21946,PARTY TIME DESIGN FLANNEL
6,22151,PLACE SETTING WHITE HEART
7,22180,RETROSPOT LAMP
8,85199L,LARGE HANGING IVORY & RED WOOD BIRD
9,22161,HEART DECORATION RUSTIC HANGING


In [47]:
get_items_purchased(12353, product_train, customers_arr, products_arr, df_lookup)

Unnamed: 0,StockCode,Description
1413,22493,PAINT YOUR OWN CANVAS SET
2191,22496,SET OF 2 ROUND TINS DUTCH CHEESE
2437,22497,SET OF 2 TINS VINTAGE BATHROOM
6447,21677,HEARTS STICKERS


In [48]:
rec_items(12353, product_train, user_vecs, item_vecs, customers_arr, products_arr, df_lookup,
                       num_items = 10)

Unnamed: 0,StockCode,Description
0,90202D,PINK ENAMEL FLOWER HAIR TIE
1,90210B,CLEAR ACRYLIC FACETED BANGLE
2,21358,TOAST ITS - HAPPY BIRTHDAY
3,47594A,CAROUSEL DESIGN WASHBAG
4,47591B,SCOTTIES CHILDRENS APRON
5,22494,EMERGENCY FIRST AID TIN
6,84562A,PINK/WHITE RIBBED MELAMINE JUG
7,21359,RELAX LARGE WOOD LETTERS
8,90214B,"LETTER ""B"" BLING KEY RING"
9,22567,20 DOLLY PEGS RETROSPOT


In [49]:
get_items_purchased(12361, product_train, customers_arr, products_arr, df_lookup)

Unnamed: 0,StockCode,Description
88,21033,JUMBO BAG CHARLIE AND LOLA TOYS
552,21244,BLUE POLKADOT PLATE
1360,37343,POLKADOT MUG PINK
2423,90214M,"LETTER ""M"" BLING KEY RING"
37557,84802A,WHITE ANEMONE ARTIFICIAL FLOWER
37663,90108,BLUE BLOSSOM HAIR CLIP
112810,16248B,BLUE HOLE PUNCH
215637,35818P,"ACRYLIC JEWEL ICICLE, PINK"


In [50]:
rec_items(12361, product_train, user_vecs, item_vecs, customers_arr, products_arr, df_lookup,
                       num_items = 10)

Unnamed: 0,StockCode,Description
0,21373,MIRRORED WALL ART SNOWFLAKES
1,84800L,LARGE WHITE/PINK ROSE ART FLOWER
2,85170B,SET/6 BLACK BIRD T-LIGHT CANDLES
3,21034,REX CASH+CARRY JUMBO SHOPPER
4,16156L,"WRAP, CAROUSEL"
5,47593B,SCOTTIE DOGS BABY BIB
6,21035,SET/2 RED RETROSPOT TEA TOWELS
7,22076,6 RIBBONS EMPIRE
8,M,Manual
9,22073,RED RETROSPOT STORAGE JAR


references :https://jessesw.com/Rec-System/