# Imports

In [1]:
# General
import random
import os
import numpy as np
import pandas as pd
import zipfile

# warning
import warnings
warnings.filterwarnings("ignore")

# display, plots
from IPython.display import display_html
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
import plotly.express as px

# recommender systems
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank
from lightfm import LightFM
from scipy.sparse import csr_matrix

ModuleNotFoundError: No module named 'lightfm'

# Loading Data from Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [2]:
articles = pd.read_csv('drive/MyDrive/w207-final-project-data/articles.zip', compression='zip')
transactions = pd.read_csv('drive/MyDrive/w207-final-project-data/transactions.zip', compression='zip')

FileNotFoundError: [Errno 2] No such file or directory: 'drive/MyDrive/w207-final-project-data/articles.zip'

In [5]:
print('Size of articles metadata', articles.shape)
print('Size of user-articles transactions data', transactions.shape)

Size of articles metadata (105542, 25)
Size of user-articles transactions data (31788324, 5)


# Articles Data Pre-Processing

In [6]:
articles.shape

(105542, 25)

In [7]:
articles.head(3)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


# Create a subset of Article DataFrame from the selected features

In [8]:
features = ['article_id',
            'product_type_name','product_group_name','graphical_appearance_name', ## product groups info
            'colour_group_name','perceived_colour_value_name','perceived_colour_master_name',## color groups
            'department_name', ##departments
            'index_name','index_group_name','section_name', ##sections
            'garment_group_name', ##garment groups
            'detail_desc' ##Article detail description
           ]

articles = articles[features]
articles.sort_values(by=['article_id'], inplace=True)
articles.head(3)

Unnamed: 0,article_id,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name,detail_desc
0,108775015,Vest top,Garment Upper body,Solid,Black,Dark,Black,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,Vest top,Garment Upper body,Solid,White,Light,White,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,Vest top,Garment Upper body,Stripe,Off White,Dusty Light,White,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.


# Create one-hot encoding Matrix for Articles Data

In [9]:
articles = pd.get_dummies(articles, columns = [
            'product_type_name','product_group_name','graphical_appearance_name', ## product groups info
            'colour_group_name','perceived_colour_value_name','perceived_colour_master_name',## color groups
            'department_name', ##departments
            'index_name','index_group_name','section_name', ##sections
            'garment_group_name' ##garment groups
           ],sparse=True)

In [10]:
articles.head(3)

Unnamed: 0,article_id,detail_desc,product_type_name_Accessories set,product_type_name_Alice band,product_type_name_Baby Bib,product_type_name_Backpack,product_type_name_Bag,product_type_name_Ballerinas,product_type_name_Beanie,product_type_name_Belt,...,garment_group_name_Shorts,garment_group_name_Skirts,garment_group_name_Socks and Tights,garment_group_name_Special Offers,garment_group_name_Swimwear,garment_group_name_Trousers,garment_group_name_Trousers Denim,"garment_group_name_Under-, Nightwear",garment_group_name_Unknown,garment_group_name_Woven/Jersey/Knitted mix Baby
0,108775015,Jersey top with narrow shoulder straps.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,108775044,Jersey top with narrow shoulder straps.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,108775051,Jersey top with narrow shoulder straps.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
articles = articles.sort_values('article_id').reset_index().drop('index', axis=1)
print(articles.shape)

(105542, 602)


# Create CSR Matrix from Articles dataframe

In [12]:
articles_csr = csr_matrix(articles.drop(['article_id', 'detail_desc'], axis=1).values)
articles_csr

<105542x600 sparse matrix of type '<class 'numpy.longlong'>'
	with 1160962 stored elements in Compressed Sparse Row format>

# Create a dictionary of articles and their detailed description

In [13]:
item_dict ={}
df = articles[['article_id', 'detail_desc']].reset_index()

for i in range(df.shape[0]):
    item_dict[(df.loc[i,'article_id'])] = df.loc[i,'detail_desc']

# print first 5 items:
for item in list(item_dict)[0:5]:
    print (item, item_dict[item])

108775015 Jersey top with narrow shoulder straps.
108775044 Jersey top with narrow shoulder straps.
108775051 Jersey top with narrow shoulder straps.
110065001 Microfibre T-shirt bra with underwired, moulded, lightly padded cups that shape the bust and provide good support. Narrow adjustable shoulder straps and a narrow hook-and-eye fastening at the back. Without visible seams for greater comfort.
110065002 Microfibre T-shirt bra with underwired, moulded, lightly padded cups that shape the bust and provide good support. Narrow adjustable shoulder straps and a narrow hook-and-eye fastening at the back. Without visible seams for greater comfort.


# Transactions Data Pre-Processing

In [14]:
transactions.shape

(31788324, 5)

In [15]:
transactions.head(3)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2


In [16]:
transactions['transaction_date_time'] = pd.to_datetime(transactions["t_dat"])

In [17]:
transactions.drop(columns=['t_dat','price','sales_channel_id'],inplace=True,axis=0)

In [18]:
transactions.head()

Unnamed: 0,customer_id,article_id,transaction_date_time
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,2018-09-20
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,2018-09-20
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,2018-09-20
3,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,2018-09-20
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,2018-09-20


In [19]:
transactions = transactions.groupby(['customer_id','article_id']).size().reset_index()

In [20]:
transactions.columns = ['customer_id','article_id','cust_art_int']

In [21]:
transactions.sort_values(by=['cust_art_int'], ascending=False, inplace=True)

In [22]:
transactions.reset_index(drop=True, inplace=True)

In [23]:
transactions.shape

(27306439, 3)

In [24]:
transactions.head()

Unnamed: 0,customer_id,article_id,cust_art_int
0,d00063b94dcb1342869d4994844a2742b5d62927f36843...,678342001,570
1,94665b46e194622ccdbcadc0170f13a2f8ede1ff6d057d...,629420001,199
2,61da44a2758206d5701771f4315637b40c8321b5111916...,507909001,188
3,ef38ec0f0cb29ee8bbb87efc82fd16f4b99127e3eeefe6...,570002001,170
4,5cba04ed9a3759bc02a8a9e01efccc07ce76c35c1a70dc...,688558002,166


# Randomly select 1K Customers for Development Set

In [25]:
# randomly select 1000 customers
dev_transactions = transactions[transactions['customer_id'].isin(
               random.sample(list(transactions['customer_id'].unique()), k=1000))]

In [26]:
dev_transactions.shape

(19128, 3)

In [27]:
customer_article_interaction = pd.pivot_table(dev_transactions, index='customer_id', columns='article_id', values='cust_art_int')

In [28]:
customer_article_interaction = customer_article_interaction.fillna(0)

In [29]:
dev_transactions_csr = csr_matrix(customer_article_interaction.values)
dev_transactions_csr

<1000x13118 sparse matrix of type '<class 'numpy.float64'>'
	with 19128 stored elements in Compressed Sparse Row format>

In [31]:
# lightfm.cross_validation.random_train_test_split

dev_train_csr, dev_test_csr = random_train_test_split(dev_transactions_csr, test_percentage=0.2, random_state=None)

# Randomly select 10K Customers for Training Set

The team has selected random 10k customers for hyptertuning, as the computing power wasn't able to process all customers included in the transaction data provided.

In [None]:
# randomly select 10000 customers
transactions = transactions[transactions['customer_id'].isin(
               random.sample(list(transactions['customer_id'].unique()), k=10000))]

In [None]:
transactions.shape

(200197, 3)

# Converting transactions data into sparse matrix

In [None]:
from pandas.api.types import CategoricalDtype

#Converting transactions data into a sparse matrix
customer_u = CategoricalDtype(sorted(transactions.customer_id.unique()), ordered=True)
article_u = CategoricalDtype(sorted(transactions.article_id.unique()), ordered=True)

row = transactions.customer_id.astype(customer_u).cat.codes
col = transactions.article_id.astype(article_u).cat.codes
transactions_csr = csr_matrix((transactions['cust_art_int'], (row, col)),shape=(customer_u.categories.size, article_u.categories.size))
transactions_csr

<10000x46649 sparse matrix of type '<class 'numpy.longlong'>'
	with 200197 stored elements in Compressed Sparse Row format>

#### Split Transactions sparse data into Train and Test data using LightFM CrossValidation random train_test split

In [None]:
# lightfm.cross_validation.random_train_test_split

train_csr, test_csr = random_train_test_split(transactions_csr, test_percentage=0.2, random_state=None)

# Base LightFM Model

In [None]:
base_model = LightFM(loss='warp',
                random_state=0,
                learning_rate=0.90,
                no_components=150,
                user_alpha=0.000005)

base_model = base_model.fit(train_csr,
                  epochs=100,
                  num_threads=16, verbose=False)

In [None]:
base_model_dict = {}
base_model_dict['Train Precision at k=12'] = precision_at_k(base_model, train_csr, k=12, num_threads=16).mean()
base_model_dict['Test Precision at k=12'] = precision_at_k(base_model, test_interactions = test_csr, train_interactions = None, k=12, num_threads=16).mean()
base_model_dict['Train AUC Score'] = auc_score(base_model, train_csr, num_threads=16).mean()
base_model_dict['Test AUC Score'] = auc_score(base_model, test_interactions = test_csr, train_interactions = None, num_threads=16).mean()

base_model_df = pd.DataFrame(data=base_model_dict,index=['Score']).transpose()
base_model_df

Unnamed: 0,Score
Train Precision at k=12,0.001115
Test Precision at k=12,8e-05
Train AUC Score,0.614166
Test AUC Score,0.499281


#### Recall@12 and Reciprocal Rank Results

In [None]:
model_recall_at_k = recall_at_k(base_model, test_interactions = test_csr, train_interactions = train_csr, k=12, num_threads=16).mean()
model_recall_at_k

6.698487427818475e-05

In [None]:
model_reciprocal_rank = reciprocal_rank(base_model, test_interactions = test_csr, train_interactions = None, num_threads=16).mean()
model_reciprocal_rank

0.0007122948

# Experimentation with Hyper Parameters

## Learning loss Hyperparameter

In [None]:
loss_method = ['logistic', 'bpr', 'warp', 'warp-kos']
loss_precision_list= []
loss_auc_score_list = []

for i in loss_method: 
  loss_model = LightFM(loss=i,
                  random_state=0,
                  learning_rate=0.90,
                  no_components=150,
                  user_alpha=0.000005,
                  item_alpha=0.000005)
  
  loss_model = loss_model.fit(train_csr,
                    epochs=100,
                    num_threads=16, verbose=False)
  
  loss_precision_list.append(precision_at_k(loss_model, test_interactions = test_csr, train_interactions = None, k=12, num_threads=16).mean())
  loss_auc_score_list.append(auc_score(loss_model, test_interactions = test_csr, train_interactions = None, num_threads=16).mean())

loss_model_df = pd.DataFrame(data=[loss_method,loss_precision_list,loss_auc_score_list]).transpose()
loss_model_df.rename(columns={
                          0 : 'Loss Method', 
                          1 : 'Precision at 12 score',
                          2 : 'AUC Score',
                          }, inplace=True)
loss_model_df

Unnamed: 0,Loss Method,Precision at 12 score,AUC Score
0,logistic,0.004286,0.692284
1,bpr,0.002006,0.676668
2,warp,1.1e-05,0.498531
3,warp-kos,8e-05,0.499206


## Learning Schedule Hyperparameter

### 'adagrad' learning schedule method

In [None]:
loss_method = ['logistic', 'bpr', 'warp', 'warp-kos']
loss_adagrad_precision_list= []
loss_adagrad_auc_score_list = []

for i in loss_method: 
    loss_adagrad_model = LightFM(loss=i,
                  learning_schedule='adagrad',
                  random_state=0,
                  learning_rate=0.90,
                  no_components=150,
                  user_alpha=0.000005,
                  item_alpha=0.000005)
  
    loss_adagrad_model = loss_adagrad_model.fit(train_csr,
                    epochs=100,
                    num_threads=16, verbose=False)
  
    loss_adagrad_precision_list.append(precision_at_k(loss_adagrad_model, test_interactions = test_csr, train_interactions = None, k=12, num_threads=16).mean())
    loss_adagrad_auc_score_list.append(auc_score(loss_adagrad_model, test_interactions = test_csr, train_interactions = None, num_threads=16).mean())

loss_adagrad_model_df = pd.DataFrame(data=[loss_method,loss_adagrad_precision_list,loss_adagrad_auc_score_list]).transpose()
loss_adagrad_model_df.rename(columns={
                          0 : 'Loss Method with adagrad learning schedule', 
                          1 : 'Precision at 12 score',
                          2 : 'AUC Score',
                          }, inplace=True)
loss_adagrad_model_df

Unnamed: 0,Loss Method with adagrad learning schedule,Precision at 12 score,AUC Score
0,logistic,0.004206,0.69218
1,bpr,0.002063,0.677239
2,warp,6.9e-05,0.499269
3,warp-kos,8e-05,0.499705


### 'adadelta' learning schedule method

In [None]:
loss_method = ['logistic', 'bpr', 'warp', 'warp-kos']
loss_adadelta_precision_list= []
loss_adadelta_auc_score_list = []

for i in loss_method: 
    loss_adadelta_model = LightFM(loss=i,
                  learning_schedule='adadelta',
                  random_state=0,
                  learning_rate=0.90,
                  no_components=150,
                  user_alpha=0.000005,
                  item_alpha=0.000005)
  
    loss_adadelta_model = loss_adadelta_model.fit(train_csr,
                    epochs=100,
                    num_threads=16, verbose=False)
  
    loss_adadelta_precision_list.append(precision_at_k(loss_adadelta_model, test_interactions = test_csr, train_interactions = None, k=12, num_threads=16).mean())
    loss_adadelta_auc_score_list.append(auc_score(loss_adadelta_model, test_interactions = test_csr, train_interactions = None, num_threads=16).mean())

loss_adadelta_model_df = pd.DataFrame(data=[loss_method,loss_adadelta_precision_list,loss_adadelta_auc_score_list]).transpose()
loss_adadelta_model_df.rename(columns={
                          0 : 'Loss Method with adadelta learning schedule', 
                          1 : 'Precision at 12 score',
                          2 : 'AUC Score',
                          }, inplace=True)
loss_adadelta_model_df

Unnamed: 0,Loss Method with adadelta learning schedule,Precision at 12 score,AUC Score
0,logistic,0.004584,0.704104
1,bpr,0.003828,0.630355
2,warp,0.001536,0.620971
3,warp-kos,0.001192,0.595122


### 'adadelta' learning schedule method with rho 

In [None]:
loss_method = ['logistic', 'bpr', 'warp', 'warp-kos']
loss_adadelta_precision_list= []
loss_adadelta_auc_score_list = []

for i in loss_method: 
    loss_adadelta_model = LightFM(loss=i,
                  learning_schedule='adadelta',
                  random_state=0,
                  rho=0.90,
                  no_components=150,
                  user_alpha=0.000005,
                  item_alpha=0.000005)
  
    loss_adadelta_model = loss_adadelta_model.fit(train_csr,
                    epochs=100,
                    num_threads=16, verbose=False)
  
    loss_adadelta_precision_list.append(precision_at_k(loss_adadelta_model, test_interactions = test_csr, train_interactions = None, k=12, num_threads=16).mean())
    loss_adadelta_auc_score_list.append(auc_score(loss_adadelta_model, test_interactions = test_csr, train_interactions = None, num_threads=16).mean())

loss_adadelta_model_df = pd.DataFrame(data=[loss_method,loss_adadelta_precision_list,loss_adadelta_auc_score_list]).transpose()
loss_adadelta_model_df.rename(columns={
                          0 : 'Loss Method with adadelta learning schedule', 
                          1 : 'Precision at 12 score',
                          2 : 'AUC Score',
                          }, inplace=True)
loss_adadelta_model_df

Unnamed: 0,Loss Method with adadelta learning schedule,Precision at 12 score,AUC Score
0,logistic,0.004562,0.702804
1,bpr,0.004458,0.625748
2,warp,0.001811,0.668133
3,warp-kos,0.00149,0.633519


# adadelta with varying rho for logistic loss method

In [None]:
loss_adadelta_rho_precision_list= []
loss_adadelta_rho_auc_score_list = []
loss_adadelta_rho_list = []

for i in list(np.linspace(0.1,0.99,10)): 
    loss_adadelta_rho_list.append(i)

    loss_adadelta_rho_model = LightFM(loss='logistic',
                  learning_schedule='adadelta',
                  random_state=0,
                  rho=i,
                  no_components=150,
                  user_alpha=0.000005,
                  item_alpha=0.000005)
  
    loss_adadelta_rho_model = loss_adadelta_rho_model.fit(train_csr,
                    epochs=100,
                    num_threads=16, verbose=False)
  
    loss_adadelta_rho_precision_list.append(precision_at_k(loss_adadelta_rho_model, test_interactions = test_csr, train_interactions = None, k=12, num_threads=16).mean())
    loss_adadelta_rho_auc_score_list.append(auc_score(loss_adadelta_rho_model, test_interactions = test_csr, train_interactions = None, num_threads=16).mean())

loss_adadelta_rho_model_df = pd.DataFrame(data=[loss_adadelta_rho_list,loss_adadelta_rho_precision_list,loss_adadelta_rho_auc_score_list]).transpose()
loss_adadelta_rho_model_df.rename(columns={
                          0 : 'Rho adadelta learning schedule', 
                          1 : 'Precision at 12 score',
                          2 : 'AUC Score',
                          }, inplace=True)
loss_adadelta_rho_model_df

Unnamed: 0,Rho adadelta learning schedule,Precision at 12 score,AUC Score
0,0.1,0.004367,0.687155
1,0.198889,0.004367,0.686191
2,0.297778,0.004367,0.687042
3,0.396667,0.004413,0.689206
4,0.495556,0.00439,0.691582
5,0.594444,0.004401,0.693967
6,0.693333,0.004413,0.697114
7,0.792222,0.00447,0.699758
8,0.891111,0.00447,0.702511
9,0.99,0.004298,0.703652


# Varying epoch

In [None]:
loss_adadelta_epoch_precision_list= []
loss_adadelta_epoch_auc_score_list = []
loss_adadelta_epoch_list = []

for i in [1,10,25,50,75,100]: 
    loss_adadelta_epoch_list.append(i)

    loss_adadelta_epoch_model = LightFM(loss='logistic',
                  learning_schedule='adadelta',
                  random_state=0,
                  rho=0.99,
                  no_components=150,
                  user_alpha=0.000005,
                  item_alpha=0.000005)
  
    loss_adadelta_epoch_model = loss_adadelta_epoch_model.fit(train_csr,
                    epochs=i,
                    num_threads=16, verbose=False)
  
    loss_adadelta_epoch_precision_list.append(precision_at_k(loss_adadelta_epoch_model, test_interactions = test_csr, train_interactions = None, k=12, num_threads=16).mean())
    loss_adadelta_epoch_auc_score_list.append(auc_score(loss_adadelta_epoch_model, test_interactions = test_csr, train_interactions = None, num_threads=16).mean())

loss_adadelta_epoch_model_df = pd.DataFrame(data=[loss_adadelta_epoch_list,loss_adadelta_epoch_precision_list,loss_adadelta_epoch_auc_score_list]).transpose()
loss_adadelta_epoch_model_df.rename(columns={
                          0 : 'epoch adadelta learning schedule', 
                          1 : 'Precision at 12 score',
                          2 : 'AUC Score',
                          }, inplace=True)
loss_adadelta_epoch_model_df

Unnamed: 0,epoch adadelta learning schedule,Precision at 12 score,AUC Score
0,1.0,0.004539,0.70613
1,10.0,0.004527,0.707522
2,25.0,0.003862,0.705709
3,50.0,0.004424,0.703883
4,75.0,0.004241,0.703569
5,100.0,0.004344,0.7034


# Varying alpha - L2 Regularization

In [None]:
loss_adadelta_alpha_precision_list= []
loss_adadelta_alpha_auc_score_list = []
loss_adadelta_alpha_list = []

for i in [0.05, 0.005, 0.0005, 0.00005, 0.000005]: 
    loss_adadelta_alpha_list.append(i)

    loss_adadelta_alpha_model = LightFM(loss='logistic',
                  learning_schedule='adadelta',
                  random_state=0,
                  rho=0.99,
                  no_components=150,
                  user_alpha=i,
                  item_alpha=i)
  
    loss_adadelta_alpha_model = loss_adadelta_alpha_model.fit(train_csr,
                    epochs=10,
                    num_threads=16, verbose=False)
  
    loss_adadelta_alpha_precision_list.append(precision_at_k(loss_adadelta_alpha_model, test_interactions = test_csr, train_interactions = None, k=12, num_threads=16).mean())
    loss_adadelta_alpha_auc_score_list.append(auc_score(loss_adadelta_alpha_model, test_interactions = test_csr, train_interactions = None, num_threads=16).mean())

loss_adadelta_alpha_model_df = pd.DataFrame(data=[loss_adadelta_alpha_list,loss_adadelta_alpha_precision_list,loss_adadelta_alpha_auc_score_list]).transpose()
loss_adadelta_alpha_model_df.rename(columns={
                          0 : 'alpha adadelta learning schedule', 
                          1 : 'Precision at 12 score',
                          2 : 'AUC Score',
                          }, inplace=True)
loss_adadelta_alpha_model_df

Unnamed: 0,alpha adadelta learning schedule,Precision at 12 score,AUC Score
0,0.05,0.00165,0.102071
1,0.005,0.001387,0.373101
2,0.0005,0.00463,0.680827
3,5e-05,0.004665,0.685821
4,5e-06,0.004367,0.706921


# Final Model

# Logistic Loss

In [None]:
model = LightFM(loss='logistic',
                learning_schedule='adadelta',
                random_state=0,
                rho=0.99,
                no_components=150,
                user_alpha=0.000005,
                item_alpha=0.000005)

model = model.fit(train_csr,
                  epochs=10,
                  num_threads=16, verbose=False)

In [None]:
model_dict = {}
model_dict['Precision at k=12'] = precision_at_k(model, test_interactions = test_csr, train_interactions = None, k=12, num_threads=16).mean()
model_dict['AUC Score'] = auc_score(model, test_interactions = test_csr, train_interactions = None, num_threads=16).mean()

model_df = pd.DataFrame(data=model_dict,index=['Score']).transpose()
model_df

Unnamed: 0,Score
Precision at k=12,0.004481
AUC Score,0.706926


In [None]:
model_recall_at_k = recall_at_k(model, test_interactions = test_csr, train_interactions = train_csr, k=12, num_threads=16).mean()
model_recall_at_k

0.011028412058559778

# BPR Loss

In [None]:
model = LightFM(loss='bpr',
                learning_schedule='adagrad',
                random_state=0,
                learning_rate=0.99,
                no_components=150,
                user_alpha=0.000005,
                item_alpha=0.000005)

model = model.fit(train_csr,
                  epochs=100,
                  num_threads=16, verbose=False)

In [None]:
model_dict = {}
model_dict['Precision at k=12'] = precision_at_k(model, test_interactions = test_csr, train_interactions = None, k=12, num_threads=16).mean()
model_dict['AUC Score'] = auc_score(model, test_interactions = test_csr, train_interactions = None, num_threads=16).mean()

model_df = pd.DataFrame(data=model_dict,index=['Score']).transpose()
model_df

Unnamed: 0,Score
Precision at k=12,0.001593
AUC Score,0.674112


# Warp Loss

In [None]:
model = LightFM(loss='warp',
                learning_schedule='adadelta',
                random_state=0,
                rho=0.99,
                no_components=150,
                user_alpha=0.000005,
                item_alpha=0.000005)

model = model.fit(train_csr,
                  epochs=10,
                  num_threads=16, verbose=False)

In [None]:
model_dict = {}
model_dict['Precision at k=12'] = precision_at_k(model, test_interactions = test_csr, train_interactions = None, k=12, num_threads=16).mean()
model_dict['AUC Score'] = auc_score(model, test_interactions = test_csr, train_interactions = None, num_threads=16).mean()

model_df = pd.DataFrame(data=model_dict,index=['Score']).transpose()
model_df

Unnamed: 0,Score
Precision at k=12,0.002613
AUC Score,0.613763


# Development Set Prediction

In [33]:
dev_model = LightFM(loss='logistic',
                learning_schedule='adadelta',
                random_state=0,
                rho=0.99,
                no_components=150,
                user_alpha=0.000005,
                item_alpha=0.000005)

dev_model = dev_model.fit(dev_train_csr,
                  epochs=10,
                  num_threads=16, verbose=False)

In [35]:
dev_model_dict = {}
dev_model_dict['Precision at k=12'] = precision_at_k(dev_model, test_interactions = dev_test_csr, train_interactions = None, k=12, num_threads=16).mean()
dev_model_dict['AUC Score'] = auc_score(dev_model, test_interactions = dev_test_csr, train_interactions = None, num_threads=16).mean()

dev_model_df = pd.DataFrame(data=dev_model_dict,index=['Score']).transpose()
dev_model_df

Unnamed: 0,Score
Precision at k=12,0.002943
AUC Score,0.412782


# Create Customer Dictionary

In [36]:
customer_id = list(customer_article_interaction.index)
customer_dict = {}
counter = 0 
for i in customer_id:
    customer_dict[i] = counter
    counter += 1

# print first 5 items:
for item in list(customer_dict)[0:5]:
    print (item, customer_dict[item])

0126407060141efdad0df717e928b3fb49451298cfe4bf35531a88b8a4e202d1 0
01a1ea4ceaa38f7f7a979b35bc38246717b0904563a8ccd33fc6035c803cbe93 1
01c99ec0578258a6b50bc55749a294765c52d9613ec23cec4e74a6245b54e2c3 2
01c9e2ab7ba8b932d8a7c0010c0c9abdc4b2addff0b2ec6beb84d3e8bb8a4c71 3
01e77419176cb182b1c2040a51b991934b2d8758f3ce65b8906d8b3e6ec9c995 4


# customer article recommendation function

In [37]:
def article_recommendation_customer(model, customer_article_interaction, customer_id, customer_dict, 
                               item_dict,threshold = 0,nrec_items = 7, show = True):
    
    # model prediction for customer_id
    n_customers, n_items = customer_article_interaction.shape
    customer_x = customer_dict[customer_id]
    scores = pd.Series(model.predict(customer_x,np.arange(n_items), item_features=articles_csr))
    scores.index = customer_article_interaction.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    
    # known items for customer_id
    known_items = list(pd.Series(customer_article_interaction.loc[customer_id,:] \
                                 [customer_article_interaction.loc[customer_id,:] > threshold].index).sort_values(ascending=False))
    
    # recommended items for customer_id
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    
    if show == True:
        print ("customer: " + str(customer_id))
        print("Known purchases:")
        counter = 1
        for i in known_items:
            print(str(counter) + '- ' + i)
            counter+=1
            
        print("\n Recommended Items:")
        counter = 1
        for i in scores:
            print(str(counter) + '- ' + i)
            counter+=1

# Recommend items to purchase for selected user

In [39]:
## define customer_id
customer_id = 0
for i, key in enumerate(customer_dict.keys()):
    if i==555:
        customer_id=key

# customer_id = '0038c7e1a556b3893e698a349af40628f675cca3624c4a353f7f44f597b730cf'

# find artricle purchase recommendations
article_recommendation_customer(dev_model, customer_article_interaction, customer_id, customer_dict, item_dict)

customer: 8c2d6f414d5de8dbe7acd09b104e6b0a9fc5f3b943a975fe6771c63fd4b37278
Known purchases:
1- 5-pocket shorts in washed, stretch cotton denim. Loose fit with a high waist, button fly and cut-off, raw-edge hems. The cotton content of the shorts is partly recycled.
2- Ankle-length trousers in imitation leather with a high waist, zip fly with a concealed button, fake welt front pockets and straight-cut legs that taper to the hems with visible seams.
3- Jumper in a soft knit containing some wool with a high, ribbed collar and zip with a round pull at the top. Low dropped shoulders, long, wide sleeves and wide ribbing at the cuffs and hem.
4- Shorter polo-neck jumper in a rib knit containing some wool with dropped shoulders, long sleeves and roll-edges around the neckline, cuffs and hem. The polyester content of the jumper is recycled.
5- Top in softly draping, ribbed jersey with a sheen. V-neck, narrow shoulder straps and a lace trim at the top.
6- Short-sleeved top in a soft, fine knit w