In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix # for constructing sparse matrix
from lightfm import LightFM # for model
from lightfm.evaluation import auc_score
import time
import sklearn
from sklearn import model_selection



Data

In [2]:
# Load the three sheets with a single pass over the workbook instead of
# re-opening and re-parsing the same file once per sheet.
rec_sys_sheets = pd.read_excel(
    'data/Rec_sys_data.xlsx',
    sheet_name=['order', 'customer', 'product'],
)
#orders data
order_df = rec_sys_sheets['order']
#customers data
customer_df = rec_sys_sheets['customer']
#products data
product_df = rec_sys_sheets['product']

# Attach customer attributes, then product attributes, to every order line.
# Left joins keep every order row even when no matching customer/product exists.
merged_df = pd.merge(order_df, customer_df, on='CustomerID', how='left')
merged_df = pd.merge(merged_df, product_df, on='StockCode', how='left')

In [3]:
# Peek at the first two rows of the joined order/customer/product frame.
merged_df.head(2)

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,DeliveryDate,Discount%,ShipMode,ShippingCost,CustomerID,Gender,Age,Income,Zipcode,Customer Segment,Product Name,Description,Category,Brand,Unit Price
0,536365,84029E,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.2,ExpressAir,30.12,17850,female,48,Medium,84306,Middle class,"3 1/2""W x 20""D x 20""H Funston Craftsman Smooth...",Our Rustic Collection is an instant classic. O...,Home Improvement|Hardware|Brackets and Angle I...,Ekena Milwork,199.11
1,536365,71053,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.21,ExpressAir,30.12,17850,female,48,Medium,84306,Middle class,Awkward Styles Shamrock Flag St. Patrick's Day...,Our St Patrick's Day Collection is perfect for...,Clothing|Men|Mens T-Shirts & Tank Tops|Mens Gr...,Awkward Styles,23.95


In [None]:
def unique_users(data, column):
    """Return the distinct values of ``data[column]``, sorted ascending.

    NOTE(review): this function is defined again in the following cell; when
    the cells run in order, that later definition is the one in effect.
    """
    distinct_ids = data[column].unique()
    return np.sort(distinct_ids)
def unique_items(data, column):
    """Return the distinct values of ``data[column]`` without sorting them.

    NOTE(review): this function is defined again in the following cell; when
    the cells run in order, that later definition is the one in effect.
    """
    return data[column].unique()
 

In [5]:
def unique_users(data, column):
    """Return the distinct values found in ``data[column]`` in ascending order.

    Parameters
    ----------
    data : frame-like object indexable by column name
    column : name of the column holding the user identifiers

    Returns
    -------
    Sorted array of the unique values.
    """
    distinct_ids = data[column].unique()
    return np.sort(distinct_ids)
def unique_items(data, column):
    """Return the distinct values found in ``data[column]``.

    Unlike ``unique_users`` the result is not sorted; the values are returned
    exactly as ``Series.unique()`` yields them.
    """
    return data[column].unique()

# Users come from the orders sheet (sorted ids); items come from the product sheet.
user_list = unique_users(data=order_df, column="CustomerID")
item_list = unique_items(data=product_df, column="Product Name")

def features_to_add(customer, column1,column2,column3):
    """Collect the distinct values of three columns into one flat array.

    The columns are stacked in the order ``column1``, ``column3``, ``column2``
    (note: the third column comes before the second) and de-duplicated, so the
    position of each value in the result follows that stacking order.
    """
    stacking_order = (column1, column3, column2)
    stacked = pd.concat([customer[name] for name in stacking_order], ignore_index = True)
    return stacked.unique()

# Every distinct customer segment, age and gender value becomes one feature id.
feature_unique_list = features_to_add(
    customer=customer_df,
    column1='Customer Segment',
    column2="Age",
    column3="Gender",
)

def mapping(user_list, item_list, feature_unique_list):
    """Build forward and reverse index lookups for users, items and features.

    Each input sequence is enumerated in the order given, so an element's
    position becomes its integer index. If a value occurs more than once, the
    forward lookup keeps its last position while the reverse lookup keeps
    every position.

    Returns
    -------
    A 6-tuple of dicts:
    (user -> index, index -> user,
     item -> index, index -> item,
     feature -> index, index -> feature)
    """
    def _lookup_pair(ids):
        # Materialise once so both dicts are built from the same single pass.
        entries = list(ids)
        forward = {entry: position for position, entry in enumerate(entries)}
        reverse = dict(enumerate(entries))
        return forward, reverse

    user_to_index_mapping, index_to_user_mapping = _lookup_pair(user_list)
    item_to_index_mapping, index_to_item_mapping = _lookup_pair(item_list)
    feature_to_index_mapping, index_to_feature_mapping = _lookup_pair(feature_unique_list)

    return (
        user_to_index_mapping, index_to_user_mapping,
        item_to_index_mapping, index_to_item_mapping,
        feature_to_index_mapping, index_to_feature_mapping,
    )

# Unpack the six lookup dicts in the order mapping() returns them.
(
    user_to_index_mapping,
    index_to_user_mapping,
    item_to_index_mapping,
    index_to_item_mapping,
    feature_to_index_mapping,
    index_to_feature_mapping,
) = mapping(user_list, item_list, feature_unique_list)

# Total quantity per (customer, product) pair.
user_to_product = (
    merged_df[['CustomerID', 'Product Name', 'Quantity']]
    .groupby(['CustomerID', 'Product Name'], as_index=False)
    .agg({'Quantity': 'sum'})
)

# Total quantity per (product, customer segment) pair.
product_to_feature = (
    merged_df[['Product Name', 'Customer Segment', 'Quantity']]
    .groupby(['Product Name', 'Customer Segment'], as_index=False)
    .agg({'Quantity': 'sum'})
)

# Hold out 33% of the customer-product rows for evaluation; the fixed seed
# keeps the split the same from run to run.
user_to_product_train, user_to_product_test = model_selection.train_test_split(
    user_to_product,
    test_size=0.33,
    random_state=42,
)

def interactions(data, row, col, value, row_map, col_map):
    """Build a sparse COO matrix from a long-format table.

    Parameters
    ----------
    data : frame holding one record per (row entity, column entity) pair
    row, col, value : names of the columns in ``data`` that supply the row
        key, the column key and the cell value
    row_map, col_map : dicts translating a key to its integer position; a key
        missing from the dict raises ``KeyError``

    Returns
    -------
    ``coo_matrix`` of shape ``(len(row_map), len(col_map))``.
    """
    # Translate the raw keys into matrix coordinates.
    row_positions = data[row].map(lambda key: row_map[key]).values
    col_positions = data[col].map(lambda key: col_map[key]).values
    cell_values = data[value].values

    matrix_shape = (len(row_map), len(col_map))
    return coo_matrix((cell_values, (row_positions, col_positions)), shape = matrix_shape)

# Customer x product matrix for the training split.
user_to_product_interaction_train = interactions(
    data=user_to_product_train,
    row="CustomerID",
    col="Product Name",
    value="Quantity",
    row_map=user_to_index_mapping,
    col_map=item_to_index_mapping,
)
# Customer x product matrix for the held-out split (same shape as the training one).
user_to_product_interaction_test = interactions(
    data=user_to_product_test,
    row="CustomerID",
    col="Product Name",
    value="Quantity",
    row_map=user_to_index_mapping,
    col_map=item_to_index_mapping,
)

# Product x feature matrix; only the customer-segment features receive values here.
product_to_feature_interaction = interactions(
    data=product_to_feature,
    row="Product Name",
    col="Customer Segment",
    value="Quantity",
    row_map=item_to_index_mapping,
    col_map=feature_to_index_mapping,
)

In [14]:
# Summed quantity per (Product Name, Customer Segment) pair.
# NOTE(review): this displays the whole frame; a .head() may be enough here.
product_to_feature

Unnamed: 0,Product Name,Customer Segment,Quantity
0,"""In Vinyl W.e Trust"" Rasta Quote Men's T-shirt",Corporate,712
1,"""In Vinyl W.e Trust"" Rasta Quote Men's T-shirt",Middle class,272
2,"""In Vinyl W.e Trust"" Rasta Quote Men's T-shirt",Small Business,388
3,"""Soccer"" Vinyl Graphic - Large - Ivory",Corporate,1940
4,"""Soccer"" Vinyl Graphic - Large - Ivory",Middle class,1418
...,...,...,...
2078,ifrogz Tadpole On the Go Bluetooth Speaker w/ ...,Middle class,14
2079,ifrogz Tadpole On the Go Bluetooth Speaker w/ ...,Small Business,1
2080,storefront christmas LED Decoration Light Gold...,Corporate,682
2081,storefront christmas LED Decoration Light Gold...,Middle class,1603


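Each customer segment (Corporate, Middle class, Small Business — loosely: big business, small business and entrepreneur-like customers) is used as an item feature. In the matrix printed below, each entry reads (item index, segment index) → total quantity; for example, `(262, 1) 712` means that item 262 was bought in a total quantity of 712 by customers in the Corporate segment, and so on.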

In [7]:
# Print the sparse product x feature matrix as (row, col) -> value entries.
print(product_to_feature_interaction)

<COOrdinate sparse matrix of dtype 'int64'
	with 2083 stored elements and shape (797, 43)>
  Coords	Values
  (262, 1)	712
  (262, 2)	272
  (262, 0)	388
  (167, 1)	1940
  (167, 2)	1418
  (167, 0)	2797
  (482, 1)	233
  (482, 2)	428
  (482, 0)	303
  (399, 1)	38
  (399, 2)	24
  (399, 0)	4
  (676, 1)	6
  (676, 0)	18
  (379, 1)	21
  (379, 2)	8
  (379, 0)	31
  (409, 2)	1
  (478, 1)	108
  (478, 2)	78
  (478, 0)	89
  (527, 1)	1266
  (527, 2)	340
  (527, 0)	612
  (156, 1)	7562
  :	:
  (198, 2)	1521
  (198, 0)	1231
  (199, 1)	69
  (199, 2)	87
  (199, 0)	109
  (76, 1)	9422
  (76, 2)	8254
  (76, 0)	10122
  (12, 1)	1648
  (12, 2)	1822
  (12, 0)	1754
  (623, 1)	8
  (623, 2)	3
  (623, 0)	4
  (326, 1)	76
  (326, 2)	103
  (326, 0)	124
  (551, 1)	139
  (551, 2)	96
  (551, 0)	164
  (278, 2)	14
  (278, 0)	1
  (378, 1)	682
  (378, 2)	1603
  (378, 0)	494


In [8]:
# random_state pins LightFM's random initialisation so a re-run starts from the
# same point (same seed value as the train/test split).
# NOTE(review): with num_threads > 1 the updates may still be applied in a
# different order between runs — confirm whether exact reproducibility is needed.
model_with_features = LightFM(loss = "logistic", random_state=42)
start = time.time()
#===================
# fitting the model with hybrid collaborative filtering + content based (product + features)
model_with_features.fit_partial(
    user_to_product_interaction_train,
    user_features=None,
    item_features=product_to_feature_interaction,
    sample_weight=None,
    epochs=10,
    num_threads=20,
    verbose=False,
)
#===================
end = time.time()
# Wall-clock training time, two decimals.
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

time taken = 1.20 seconds


In [9]:
# Score the feature-aware model on the held-out interactions.
# NOTE(review): the result is only assigned, not displayed, in this cell.
auc_with_features = auc_score(
    model=model_with_features,
    test_interactions=user_to_product_interaction_test,
    train_interactions=user_to_product_interaction_train,
    item_features=product_to_feature_interaction,
    num_threads=4,
    check_intersections=False,
)

In [10]:
def train_test_merge(training_data, testing_data):
    """Combine the train and test interaction matrices into one COO matrix.

    Every (row, col) cell stored in either input appears once in the result.
    A cell present in both inputs keeps the larger of the two values; a cell
    present in only one input keeps that input's value unchanged (including
    negative values, which were previously clamped to 0 for test-only cells).

    Parameters
    ----------
    training_data, testing_data : scipy sparse matrices of the same shape.
        Any sparse format is accepted; both are viewed as COO internally.

    Returns
    -------
    ``coo_matrix`` with the shape of ``training_data``.
    """
    training_data = training_data.tocoo()
    testing_data = testing_data.tocoo()

    # Start from the training cells (a repeated coordinate keeps its last value).
    merged = {}
    for r, c, v in zip(training_data.row, training_data.col, training_data.data):
        merged[(r, c)] = v

    # Fold in the test cells: max on overlap, plain copy otherwise.
    for r, c, v in zip(testing_data.row, testing_data.col, testing_data.data):
        cell = (r, c)
        merged[cell] = max(v, merged[cell]) if cell in merged else v

    # Explicit dtypes keep the arrays well-typed even when there are no cells.
    rows = np.array([cell[0] for cell in merged], dtype=np.int64)
    cols = np.array([cell[1] for cell in merged], dtype=np.int64)
    values = np.array(
        list(merged.values()),
        dtype=np.result_type(training_data.dtype, testing_data.dtype),
    )
    return coo_matrix((values, (rows, cols)), shape = training_data.shape)
# Full interaction matrix (train + test) used to fit the final model.
user_to_product_interaction = train_test_merge(
    training_data=user_to_product_interaction_train,
    testing_data=user_to_product_interaction_test,
)

In [11]:
# Final model, fitted on the merged train + test interactions.
# random_state pins LightFM's random initialisation so a re-run starts from the
# same point (same seed value as the train/test split).
# NOTE(review): with num_threads > 1 the updates may still be applied in a
# different order between runs — confirm whether exact reproducibility is needed.
final_model = LightFM(loss = "logistic", no_components=30, random_state=42)
start = time.time()
final_model.fit(
    user_to_product_interaction,
    user_features=None,
    item_features=product_to_feature_interaction,
    sample_weight=None,
    epochs=10,
    num_threads=20,
    verbose=False,
)
end = time.time()
# Wall-clock training time, two decimals.
print("time taken = {0:.{1}f} seconds".format(end - start, 2))

time taken = 4.56 seconds


In [12]:
def get_recommendations(model,user,items,user_to_product_interaction_matrix,user2index_map,product_to_feature_interaction_matrix,top_n=10):
    """Print a user's known purchases and the model's top-ranked items.

    Parameters
    ----------
    model : object exposing ``predict(user_ids=..., item_ids=..., item_features=...)``
        that returns one score per item id
    user : raw user identifier, looked up in ``user2index_map``
    items : array of item labels, indexable by an integer array
    user_to_product_interaction_matrix : sparse user x item matrix; the cells
        stored in the user's row are reported as known positives
    user2index_map : dict translating a raw user identifier to its row index
    product_to_feature_interaction_matrix : item-feature matrix forwarded to
        ``model.predict``
    top_n : how many known positives and how many recommendations to print
        (default 10, the previously hard-coded value)

    Returns
    -------
    None. The report is written to stdout; an unknown user prints nothing.
    """
    user_index = user2index_map.get(user)
    if user_index is None:
        # Unknown user: nothing to look up or score.
        return None

    # Items already present in this user's row of the interaction matrix.
    user_row = user_to_product_interaction_matrix.tocsr()[user_index]
    known_positives = items[user_row.indices]
    print('User index =', user_index)

    # Score every item for this user.
    n_items = user_to_product_interaction_matrix.shape[1]
    scores = model.predict(
        user_ids = user_index,
        item_ids = np.arange(n_items),
        item_features = product_to_feature_interaction_matrix,
    )
    # Negate so argsort yields the highest score first.
    top_items = items[np.argsort(-scores)]

    print("User %s" % user)
    print(" Known positives:")
    for label in known_positives[:top_n]:
        print(" %s" % label)
    print(" Recommended:")
    for label in top_items[:top_n]:
        print(" %s" % label)

In [19]:
# Show known purchases and recommendations for customer 15127.
get_recommendations(
    model=final_model,
    user=15127,
    items=item_list,
    user_to_product_interaction_matrix=user_to_product_interaction,
    user2index_map=user_to_index_mapping,
    product_to_feature_interaction_matrix=product_to_feature_interaction,
)

User index = 1735
User 15127
 Known positives:
 Mediven Sheer and Soft 15-20 mmHg Thigh w/ Lace Silicone Top Band CT Wheat II - Ankle 8-8.75 inches
 MightySkins Skin Decal Wrap Compatible with DJI Sticker Protective Cover 100's of Color Options
 billyboards Porcelain School Chalkboard
 MightySkins Skin Decal Wrap Compatible with Smok Sticker Protective Cover 100's of Color Options
 Unik Occasions Sparkling Collection Monogram Cake Topper, Silver
 Ebe Men Gold Shield Half Rim Spring Hinge Reading Glasses a970
 Style and Apply Banksy Paratrooper Rat Wall Decal
 Zoan Synchrony Duo Sport Electric Snow Helmet Tourer Graphics Flou Green XXL 821-338
 Ebe Reading Glasses Mens Womens Tortoise Rectanuglar Chic Anti Glare grade ckbdp9123
 MightySkins Protective Vinyl Skin Decal for PowerA Pro Ex Xbox One Controller case wrap cover sticker skins Baby Blue Designer
 Recommended:
 Mediven Sheer and Soft 15-20 mmHg Thigh w/ Lace Silicone Top Band CT Wheat II - Ankle 8-8.75 inches
 MightySkins Skin De