In [1]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
from sklearn import metrics

from sklearn.preprocessing import MinMaxScaler

import implicit

In [47]:
retail_df = pd.read_excel('data/Online Retail.xlsx')

In [48]:
retail_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [49]:
retail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [50]:
retail_df = retail_df[retail_df['CustomerID'].notna()]

In [51]:
retail_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    406829 non-null  object        
 1   StockCode    406829 non-null  object        
 2   Description  406829 non-null  object        
 3   Quantity     406829 non-null  int64         
 4   InvoiceDate  406829 non-null  datetime64[ns]
 5   UnitPrice    406829 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      406829 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.9+ MB


In [52]:
grouped_df = retail_df[['CustomerID', 'StockCode', 'Description', 'Quantity']].groupby(['CustomerID', 'StockCode', 'Description']).sum().reset_index()
grouped_df.loc[grouped_df['Quantity'] == 0, ['Quantity']] = 1
grouped_df = grouped_df.loc[grouped_df['Quantity'] > 0]

In [53]:
grouped_df.head()

Unnamed: 0,CustomerID,StockCode,Description,Quantity
0,12346.0,23166,MEDIUM CERAMIC TOP STORAGE JAR,1
1,12347.0,16008,SMALL FOLDING SCISSOR(POINTED EDGE),24
2,12347.0,17021,NAMASTE SWAGAT INCENSE,36
3,12347.0,20665,RED RETROSPOT PURSE,6
4,12347.0,20719,WOODLAND CHARLOTTE BAG,40


In [54]:
grouped_df.Quantity.describe()

count    268324.000000
mean         18.374290
std          93.902948
min           1.000000
25%           2.000000
50%           6.000000
75%          12.000000
max       12540.000000
Name: Quantity, dtype: float64

In [55]:
import plotly.express as px

fig = px.histogram(grouped_df, x='Quantity', title='Distribution of the purchase quantity', nbins=500)
fig.show();

In [56]:
print(f'Number of unique customers: {grouped_df.CustomerID.nunique()}')
print(f'Number of unique items: {grouped_df.StockCode.nunique()}')

print(f'Average purchase quantity per interaction: {int(grouped_df.Quantity.mean())}')
print(f'Minimum purchase quantity per interaction: {grouped_df.Quantity.min()}')
print(f'Maximum purchase quantity per interaction: {grouped_df.Quantity.max()}')

Number of unique customers: 4338
Number of unique items: 3664
Average purchase quantity per interaction: 18
Minimum purchase quantity per interaction: 1
Maximum purchase quantity per interaction: 12540


### Implicit Feedback

In [57]:
unique_customers = grouped_df.CustomerID.unique()
customer_ids = dict(zip(unique_customers, np.arange(unique_customers.shape[0], dtype=np.int32)))

unique_items = grouped_df.StockCode.unique()
item_ids = dict(zip(unique_items, np.arange(unique_items.shape[0], dtype=np.int32)))

grouped_df['customer_id'] = grouped_df.CustomerID.apply(lambda i: customer_ids[i])
grouped_df['item_id'] = grouped_df.StockCode.apply(lambda i: item_ids[i])

In [58]:
grouped_df.head()

Unnamed: 0,CustomerID,StockCode,Description,Quantity,customer_id,item_id
0,12346.0,23166,MEDIUM CERAMIC TOP STORAGE JAR,1,0,0
1,12347.0,16008,SMALL FOLDING SCISSOR(POINTED EDGE),24,1,1
2,12347.0,17021,NAMASTE SWAGAT INCENSE,36,1,2
3,12347.0,20665,RED RETROSPOT PURSE,6,1,3
4,12347.0,20719,WOODLAND CHARLOTTE BAG,40,1,4


In [59]:
sparse_item_customer = sparse.csr_matrix((grouped_df['Quantity'].astype(float), (grouped_df['item_id'], grouped_df['customer_id'])))
sparse_customer_item = sparse.csr_matrix((grouped_df['Quantity'].astype(float), (grouped_df['customer_id'], grouped_df['item_id'])))

In [60]:
sparse_item_customer

<3664x4338 sparse matrix of type '<class 'numpy.float64'>'
	with 266724 stored elements in Compressed Sparse Row format>

In [61]:
sparse_customer_item

<4338x3664 sparse matrix of type '<class 'numpy.float64'>'
	with 266724 stored elements in Compressed Sparse Row format>

In [62]:
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

alpha = 15
data = (sparse_item_customer * alpha).astype('double')

model.fit(data)

  0%|          | 0/50 [00:00<?, ?it/s]

### Finding the Similar Items

In [63]:
grouped_df.loc[grouped_df['item_id'] == 1319].head()

Unnamed: 0,CustomerID,StockCode,Description,Quantity,customer_id,item_id
3078,12409.0,71053,WHITE METAL LANTERN,12,50,1319
6411,12462.0,71053,WHITE METAL LANTERN,4,95,1319
11523,12556.0,71053,WHITE METAL LANTERN,2,166,1319
22679,12748.0,71053,WHITE METAL LANTERN,1,326,1319
25856,12840.0,71053,WHITE METAL LANTERN,4,390,1319


In [64]:
item_id = 1319
n_similar = 10

item_vecs = model.item_factors
customer_vecs = model.user_factors

item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

scores = item_vecs.dot(item_vecs[item_id]) / item_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / item_norms[item_id]), key=lambda x: -x[1])

In [65]:
for item in similar:
    idx, score = item
    print(grouped_df.Description.loc[grouped_df.item_id == idx].iloc[0])

WHITE METAL LANTERN
LANTERN CREAM GAZEBO 
HANGING METAL HEART LANTERN
PHOTO FRAME CORNICE
RED HANGING HEART T-LIGHT HOLDER
SMALL GLASS HEART TRINKET POT
WOODEN PICTURE FRAME WHITE FINISH
LOVE BUILDING BLOCK WORD
WOODEN FRAME ANTIQUE WHITE 
WOOD BLACK BOARD ANT WHITE FINISH


### Recommend Items to Customers

In [66]:
def recommend(customer_id, sparse_customer_item, customer_vecs, item_vecs, num_items=10):
    
    customer_interactions = sparse_customer_item[customer_id,:].toarray()
    customer_interactions = customer_interactions.reshape(-1) + 1
    customer_interactions[customer_interactions > 1] = 0
    
    rec_vector = customer_vecs[customer_id,:].dot(item_vecs.T).toarray()
    
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    recommend_vector = customer_interactions * rec_vector_scaled

    item_idx = np.argsort(recommend_vector)[::-1][:num_items]
    
    descriptions = []
    scores = []

    for idx in item_idx:
        descriptions.append(grouped_df.Description.loc[grouped_df.item_id == idx].iloc[0])
        scores.append(recommend_vector[idx])

    recommendations = pd.DataFrame({'description': descriptions, 'score': scores})

    return recommendations

In [69]:
customer_vecs = sparse.csr_matrix(model.user_factors)
item_vecs = sparse.csr_matrix(model.item_factors)

# Create recommendations for customer with id 2
customer_id = 2

recommendations = recommend(customer_id, sparse_customer_item, customer_vecs, item_vecs)

print(recommendations)

                          description     score
0       PACK OF 6 LARGE FRUIT STRAWS   1.000000
1      PACK OF 6 PANNETONE GIFT BOXES  0.971729
2         BOX OF 24 COCKTAIL PARASOLS  0.955932
3  3 PIECE SPACEBOY COOKIE CUTTER SET  0.935987
4   PACK OF 6 COCKTAIL PARASOL STRAWS  0.930685
5  SET/10 BLUE POLKADOT PARTY CANDLES  0.921161
6      SET OF 9 HEART SHAPED BALLOONS  0.916775
7  SET/10 PINK POLKADOT PARTY CANDLES  0.910592
8        SMALL HEART MEASURING SPOONS  0.907234
9    PACK OF 12 PINK PAISLEY TISSUES   0.903123


In [76]:
grouped_df.loc[grouped_df['customer_id'] == 2].sort_values('Quantity', ascending=False)[['customer_id', 'Description', 'Quantity']].head(20)

Unnamed: 0,customer_id,Description,Quantity
120,2,DOUGHNUT LIP GLOSS,200
121,2,ICE CREAM PEN LIP GLOSS,192
114,2,PACK OF 12 HEARTS DESIGN TISSUES,144
112,2,PACK OF 12 SUKI TISSUES,144
118,2,60 CAKE CASES VINTAGE CHRISTMAS,144
105,2,PACK OF 72 SKULL CAKE CASES,144
113,2,PACK OF 12 BLUE PAISLEY TISSUES,144
123,2,60 TEATIME FAIRY CAKE CASES,144
111,2,PACK OF 12 WOODLAND TISSUES,144
110,2,PACK OF 12 RED RETROSPOT TISSUES,144


### Evaluation the RecSys

In [71]:
import random

def make_train(ratings, pct_test = 0.2):
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of item,user index into list

    
    random.seed(0) # Set the random seed to zero for reproducibility
    
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of item-user pairs without replacement

    item_inds = [index[0] for index in samples] # Get the item row indices

    customer_inds = [index[1] for index in samples] # Get the user column indices

    
    training_set[item_inds, customer_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    
    return training_set, test_set, list(set(customer_inds))

In [72]:
item_train, item_test, item_customers_altered = make_train(sparse_item_customer, pct_test = 0.2)

In [73]:
def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)

In [74]:
def calc_mean_auc(training_set, altered_customers, predictions, test_set):
    store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
    popularity_auc = [] # To store popular AUC scores
    pop_items = np.array(test_set.sum(axis = 1)).reshape(-1) # Get sum of item iteractions to find most popular
    item_vecs = predictions[1]
    for customer in altered_customers: # Iterate through each user that had an item altered
        training_column = training_set[:,customer].toarray().reshape(-1) # Get the training set column
        zero_inds = np.where(training_column == 0) # Find where the interaction had not yet occurred
        
        # Get the predicted values based on our user/item vectors
        customer_vec = predictions[0][customer,:]
        pred = customer_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
        
        # Get only the items that were originally zero
        # Select all ratings from the MF prediction for this user that originally had no iteraction
        actual = test_set[:,customer].toarray()[zero_inds,0].reshape(-1)
        
        # Select the binarized yes/no interaction pairs from the original full data
        # that align with the same pairs in training 
        pop = pop_items[zero_inds] # Get the item popularity for our chosen items
        
        store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
        
        popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and score
    # End users iteration
    
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))

In [75]:
calc_mean_auc(item_train, item_customers_altered,
              [customer_vecs, item_vecs.T], item_test)

(0.937, 0.815)