In [60]:
# import Libraries
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import scipy.sparse as sp
from scipy.special import expit
from sklearn.feature_extraction import DictVectorizer
import pickle
import csv
import copy
import itertools
from lightfm import LightFM
import lightfm.evaluation
import helper_functions
from lightfm.evaluation import auc_score, reciprocal_rank,precision_at_k
from scipy.sparse import coo_matrix # for constructing sparse matrix
# timing
import time

In [61]:
# data folder path
PATH_TO_DATA = 'data/'
# read csv files
# dec_df = pd.read_csv(PATH_TO_DATA + '2019-Dec.csv')
nov_df = pd.read_csv(PATH_TO_DATA + '2019-Nov.csv')
oct_df = pd.read_csv(PATH_TO_DATA + '2019-Oct.csv')

In [62]:
# filtering purchase data only 
# dec_df = dec_df.loc[dec_df['event_type'] == 'purchase']
nov_df = nov_df.loc[nov_df['event_type'] == 'purchase']
oct_df = oct_df.loc[oct_df['event_type'] == 'purchase']

In [63]:
# concate 2 month data
Final_data = pd.concat([oct_df, nov_df], ignore_index=True)
Final_data.shape

(1065266, 9)

In [64]:
# drop dublicate if any 
Final_data.drop_duplicates(inplace=True)
# final data
Final_data.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:02:14 UTC,purchase,1004856,2053013555631882655,electronics.smartphone,samsung,130.76,543272936,8187d148-3c41-46d4-b0c0-9c08cd9dc564
1,2019-10-01 00:04:37 UTC,purchase,1002532,2053013555631882655,electronics.smartphone,apple,642.69,551377651,3c80f0d6-e9ec-4181-8c5c-837a30be2d68
2,2019-10-01 00:06:02 UTC,purchase,5100816,2053013553375346967,,xiaomi,29.51,514591159,0e5dfc4b-2a55-43e6-8c05-97e1f07fbb56
3,2019-10-01 00:07:07 UTC,purchase,13800054,2053013557418656265,furniture.bathroom.toilet,santeri,54.42,555332717,1dea3ee2-2ded-42e8-8e7a-4e2ad6ae942f
4,2019-10-01 00:09:26 UTC,purchase,4804055,2053013554658804075,electronics.audio.headphone,apple,189.91,524601178,2af9b570-0942-4dcd-8f25-4d84fba82553


In [65]:
# Threshold data to only include users and models with min 4 products.
threshold_data_interations = helper_functions.threshold_interactions_df(Final_data, 'user_id', 'product_id', 4, 4)

Starting interactions info
Number of rows: 378642
Number of cols: 71122
Sparsity: 0.004%
Ending interactions info
Number of rows: 56150
Number of columns: 21245
Sparsity: 0.047%


In [66]:
# threshold data
threshold_data_interations.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:02:14 UTC,purchase,1004856,2053013555631882655,electronics.smartphone,samsung,130.76,543272936,8187d148-3c41-46d4-b0c0-9c08cd9dc564
1,2019-10-01 00:04:37 UTC,purchase,1002532,2053013555631882655,electronics.smartphone,apple,642.69,551377651,3c80f0d6-e9ec-4181-8c5c-837a30be2d68
5,2019-10-01 00:09:54 UTC,purchase,4804056,2053013554658804075,electronics.audio.headphone,apple,161.98,551377651,3c80f0d6-e9ec-4181-8c5c-837a30be2d68
8,2019-10-01 00:12:14 UTC,purchase,4802036,2053013554658804075,electronics.audio.headphone,apple,171.56,533624186,e5ac3caa-e6d5-4d6b-ae06-2c18cd9ca683
10,2019-10-01 02:19:10 UTC,purchase,1004246,2053013555631882655,electronics.smartphone,apple,736.18,515246296,b7dbae4f-cad3-463e-89ce-41990cf48dea


In [67]:
# getting unique items and users list, this will use when we recommend products to certain user
items = helper_functions.get_item_list(threshold_data_interations, "product_id")
users_list = helper_functions.get_user_list(threshold_data_interations, "user_id")

In [68]:
# Go from dataframe to interaction matrix
# Also, build index to ID mappers.
interations, uid_to_idx, idx_to_uid,\
mid_to_idx, idx_to_mid = helper_functions.df_to_matrix(threshold_data_interations, 'user_id', 'product_id')

interations

<56150x21245 sparse matrix of type '<class 'numpy.float64'>'
	with 410712 stored elements in Compressed Sparse Row format>

In [69]:
# train test split
train, test, user_index = helper_functions.train_test_split(interations, 4, fraction=0.2)

In [70]:
# getting dict of product and category 
feat_dlist = [{} for _ in idx_to_mid]
for idx, row in threshold_data_interations.iterrows():
    feat_key = '{}'.format(row.category_id)
    idx = mid_to_idx.get(row.product_id)
    if idx is not None:
        feat_dlist[idx][feat_key] = 1

In [71]:
feat_dlist[7]

{'2053013554658804075': 1}

In [72]:
# sparse matrix for product and catgories
dv = DictVectorizer()
item_features = dv.fit_transform(feat_dlist)

In [73]:
item_features

<21245x752 sparse matrix of type '<class 'numpy.float64'>'
	with 21278 stored elements in Compressed Sparse Row format>

In [74]:
# initialising model with warp loss function
model_without_features = LightFM(loss = "warp")

## Experiment 1 using only Interation Matrix

In [75]:
# fitting into user to product interaction matrix only / pure collaborative filtering factor

start = time.time()


model_without_features.fit(train,
          user_features=None, 
          item_features=None, 
          sample_weight=None, 
          epochs=40, 
          num_threads=4,
          verbose=False)


end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

time taken = 17.65 seconds


In [76]:
# auc metric score (ranging from 0 to 1)

start = time.time()

# AUC Score
auc_without_features = auc_score(model = model_without_features, 
                        test_interactions = test,
                        num_threads = 4, check_intersections = False)

end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

time taken = 8.77 seconds


In [77]:
print("average AUC without adding item-feature interaction = {0:.{1}f}".format(auc_without_features.mean(), 2))

average AUC without adding item-feature interaction = 0.89


## Experiment 2 with Item Features

In [78]:
# initialising model with warp loss function
model_with_features = LightFM(loss = "warp")

In [79]:
# fitting the model with hybrid collaborative filtering + content based (product + features)
start = time.time()


# fitting the model
model_with_features.fit(train,
          user_features=None, 
          item_features=item_features, 
          sample_weight=None, 
          epochs=40, 
          num_threads=4,
          verbose=False)


end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

time taken = 16.12 seconds


In [80]:
start = time.time()
# AUC Score
auc_with_features = auc_score(model = model_with_features, 
                        test_interactions = test,
                        train_interactions = train, 
                        item_features = item_features,
                        num_threads = 4, check_intersections=False)

end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))



# model, test_interactions, train_interactions=None, 
# user_features=None, item_features=None, preserve_rows=False, num_threads=1, check_intersections=True

time taken = 11.54 seconds


In [81]:
print("average AUC with adding item-feature interaction = {0:.{1}f}".format(auc_with_features.mean(), 2))

average AUC with adding item-feature interaction = 0.89


### Requesting Products / Items Recommendation

we need to combine the training and the test set and retrain everything

In [82]:
def combined_train_test(train, test):
    """
    
    test set is the more recent rating/number_of_order of users.
    train set is the previous rating/number_of_order of users.
    non-zero value in the test set will replace the elements in 
    the train set matrices

    """
    # initialising train dict
    train_dict = {}
    for train_row, train_col, train_data in zip(train.row, train.col, train.data):
        train_dict[(train_row, train_col)] = train_data
        
    # replacing with the test set
    
    for test_row, test_col, test_data in zip(test.row, test.col, test.data):
        train_dict[(test_row, test_col)] = max(test_data, train_dict.get((test_row, test_col), 0))
        
    
    # converting to the row
    row_element = []
    col_element = []
    data_element = []
    for row, col in train_dict:
        row_element.append(row)
        col_element.append(col)
        data_element.append(train_dict[(row, col)])
        
    # converting to np array
    
    row_element = np.array(row_element)
    col_element = np.array(col_element)
    data_element = np.array(data_element)
    
    return coo_matrix((data_element, (row_element, col_element)), shape = (train.shape[0], train.shape[1]))

In [83]:
user_to_product_interaction = combined_train_test(train.tocoo(), test.tocoo())

In [84]:
user_to_product_interaction

<56150x21245 sparse matrix of type '<class 'numpy.float64'>'
	with 410712 stored elements in COOrdinate format>

In [85]:
# retraining the final model with combined dataset

final_model = LightFM(loss = "warp")

# fitting to combined dataset with pure collaborative filtering result

start = time.time()


final_model.fit(user_to_product_interaction,
          user_features=None, 
          item_features=None, 
          sample_weight=None, 
          epochs=40, 
          num_threads=4,
          verbose=False)


end = time.time()
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

time taken = 19.34 seconds


In [86]:
class recommendation_sampling:
    
    def __init__(self, model, items = items, user_to_product_interaction_matrix = user_to_product_interaction, 
                user2index_map = uid_to_idx):
        
        self.user_to_product_interaction_matrix = user_to_product_interaction_matrix
        self.model = model
        self.items = items
        self.user2index_map = user2index_map
    
    def recommendation_for_user(self, user):
        
        # getting the userindex
        
        userindex = self.user2index_map.get(user, None)
        
        if userindex == None:
            return None
        
        users = [userindex]
        print(users)
        # products already bought
        
        known_positives = self.items[self.user_to_product_interaction_matrix.tocsr()[userindex].indices]
        
        # scores from model prediction
        scores = self.model.predict(user_ids = users[0], item_ids = np.arange(self.user_to_product_interaction_matrix.shape[1]))
        
        # top items
        
        top_items = self.items[np.argsort(-scores)]
        
        # printing out the result
        print("User ID: %s" % user)
        print("     Known positives Products:")
        
        for x in known_positives[:5]:
            print("                  %s" % x)
            
            
        print("     Recommended Products:")
        
        for x in top_items[:5]:
            print("                  %s" % x)

In [87]:
# giving recommendations
recom = recommendation_sampling(model = final_model)

### Recommend Products to Single User

In [88]:
recom.recommendation_for_user(random.choice(users_list))

[25260]
User ID: 513631154
     Known positives Products:
                  1005105
                  1004249
                  1005115
                  4803780
     Recommended Products:
                  1005115
                  1005105
                  1004249
                  4804056
                  1004237


### Recommend Products to Multiple Users 

In [89]:
def sample_recommendation(model, data, uid_to_idx, items, user_ids):


    n_users, n_items = data.shape

    for user_id in user_ids:
        userindex = uid_to_idx.get(user_id, None)
        if userindex == None:
            continue
        users = [userindex]
        known_positives = items[data.tocsr()[userindex].indices]

        scores = model.predict(users[0], item_ids = np.arange(data.shape[1]))
        top_items = items[np.argsort(-scores)]

        print("User ID: %s" % user_id)
        print("     Known positives Product:")

        for x in known_positives[:5]:
            print("        %s" % x)

        print("     Recommended Product:")

        for x in top_items[:5]:
            print("        %s" % x)

In [90]:
user_list = [random.choice(users_list),random.choice(users_list), random.choice(users_list)]
sample_recommendation(final_model, user_to_product_interaction, uid_to_idx, items, user_list)

User ID: 515385233
     Known positives Product:
        1004856
        1004750
        1004833
        1004781
     Recommended Product:
        1004856
        1004767
        1004833
        1004870
        1004836
User ID: 510530201
     Known positives Product:
        5833318
        5833334
        5833319
        5848909
        5877612
     Recommended Product:
        5833330
        5833326
        5809910
        5833325
        5761411
User ID: 513320540
     Known positives Product:
        1004246
        12703015
        1003304
     Recommended Product:
        1004767
        1004856
        1005115
        1004870
        1004833
