In [None]:
import cupy as cp
import numpy as np
import time

%load_ext autoreload
%autoreload 2

In [None]:
### Numpy and CPU
# Wall-clock time for NumPy to allocate a (4000, 100, 1) array of ones on the host.
cpu_shape = (4000, 100, 1)
s = time.time()
x_cpu = np.ones(cpu_shape)
e = time.time()
elapsed_cpu = e - s
print(elapsed_cpu)

In [None]:
### CuPy and GPU
# Allocate the same (4000, 100, 1) array as the NumPy timing cell so that the
# two timings measure the same workload (this cell previously used (4000, 1)).
s = time.time()
x_gpu = cp.ones((4000,100,1))
# Block until the work queued on the null stream has finished, so the
# measured interval covers the actual allocation and not just its submission.
cp.cuda.Stream.null.synchronize()
e = time.time()
print(e - s)

In [None]:
from bprH_gpu import bprH, save_model, load_model
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
import cProfile as profile

In [None]:
def behavior_generator(num_user = 10, num_item=10, random_state=None):
    """Generate a synthetic user-item action log.

    One uniform draw ``p`` from [0, 1) is made for every (user, item) pair,
    users in the outer loop and items in the inner loop, and decides which
    rows are emitted for that pair:

    * ``p >= 0.99``        -> one ``'P'`` row
    * ``0.9 <= p < 0.99``  -> a ``'P'`` row followed by a ``'V'`` row
    * ``0.5 <= p < 0.9``   -> one ``'V'`` row
    * ``p < 0.5``          -> no row

    Parameters
    ----------
    num_user : int
        Number of users; IDs are ``'U0' .. 'U<num_user-1>'``.
    num_item : int
        Number of items; IDs are ``'I0' .. 'I<num_item-1>'``.
    random_state : int or None, optional
        Seed for a private ``np.random.RandomState``. When ``None`` (the
        default) the global ``np.random`` state is used, exactly as before,
        so existing ``np.random.seed(...)``-based callers are unaffected.

    Returns
    -------
    pandas.DataFrame
        Columns ``['UserID', 'ItemID', 'Action']``; empty (but with those
        columns) when no row is generated.
    """
    # Both the module and a RandomState instance expose .uniform(low, high).
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    rows = []
    for u in range(num_user):
        user_id = 'U' + str(u)
        for i in range(num_item):
            item_id = 'I' + str(i)
            p = rng.uniform(0, 1)
            if p >= 0.9:
                # Every draw in the top 10% yields a 'P' row ...
                rows.append([user_id, item_id, 'P'])
                if p < 0.99:
                    # ... and all but the top 1% additionally yield a 'V' row.
                    rows.append([user_id, item_id, 'V'])
            elif p >= 0.5:
                rows.append([user_id, item_id, 'V'])
    return pd.DataFrame(rows, columns=['UserID', 'ItemID', 'Action'])

In [None]:
data = behavior_generator(num_user=47, num_item=50)

In [None]:
# to librec
# Two CSV exports (no header, no index) with the Action column replaced by
# the constant 1. `.assign` builds a new frame, so `data` itself keeps its
# 'P'/'V' labels and no column is assigned on a filtered slice.

# Export 1: only the 'P' rows.
data_librec_1 = data[data.Action == 'P'].assign(Action=1)
data_librec_1.to_csv('generated_data.txt', index=False, header=False)

# Export 2: every row of `data`. Previously this section re-wrote
# data_librec_1, so both files had identical content and data_librec_2 was
# never used.
# NOTE(review): a (user, item) pair logged with both 'P' and 'V' appears twice
# in this file - confirm whether the consumer expects de-duplicated pairs.
data_librec_2 = data.assign(Action=1)
data_librec_2.to_csv('generated_data2.txt', index=False, header=False)

In [None]:
# Ideally the data would be split chronologically
# (sklearn.model_selection.TimeSeriesSplit); a simple split is used first.

# Sorted, de-duplicated ID lists taken from the full data set.
original_user_list = sorted(set(data['UserID']))
original_item_list = sorted(set(data['ItemID']))

# 'P' is the target action; every other row is kept as non-target data.
is_target = data['Action'] == 'P'
data_only_with_target = data[is_target]
data_without_target = data[~is_target]

In [None]:
# Hold out 20% of the target ('P') rows as test data; a 5-fold KFold over the
# same rows is the alternative.
# random_state is fixed so that the split - and every metric computed from it -
# is the same on each run of the notebook.
d1, test_data = train_test_split(data_only_with_target, test_size=0.2, random_state=0)

# All non-target rows go to the training side. reset_index returns new frames,
# so nothing is modified in place on the objects returned by the split.
train_data = pd.concat([d1, data_without_target]).reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [None]:
# Optional profiling (disabled): uncomment the four `pr` lines to profile the fit.
#pr = profile.Profile()
#pr.enable()

# Constructor arguments for the bprH model (defined in bprH_gpu).
bprh_params = dict(
    dim=5,
    omega=10,
    rho=1,
    lambda_u=0,
    lambda_v=0,
    lambda_b=0,
    gamma=0.1,
    random_state=20200704,
    num_iter=4000,
)
model = bprH(**bprh_params)

# Fit on the training frame, evaluating against the held-out frame; the ID
# lists and the save path are passed through to fit() unchanged.
fit_options = dict(
    X=train_data,
    eval_X=test_data,
    original_item_list=original_item_list,
    original_user_list=original_user_list,
    saved_path='data/item-set-coselection-test.pkl',
    coselection=True,
    plot_metric=True,
    print_metric=True,
)
model.fit(**fit_options)
#pr.disable()
#pr.dump_stats("bhrH_gpu_profile.pstat")

In [None]:
# Metrics on test data: score the users present in the model's test data at K=10.
# The fourth returned value is discarded.
test_users = sorted(set(model.test_data.UserID))
scoring_list_10, precision_10, recall_10, _ = model.scoring(
    ground_truth=model.test_data,
    user_to_eval=test_users,
    K=10,
)
# NOTE(review): generated user IDs are strings such as 'U0'; confirm whether
# recommend() expects a raw ID or a positional index here.
model.recommend(user_to_recommend=[0], K=10)

In [None]:
# Display the test data held by the fitted model.
model.test_data

In [None]:
# Same K=10 scoring as for the test data, but against the model's training
# data. This overwrites scoring_list_10 / precision_10 / recall_10.
train_users = sorted(set(model.train_data.UserID))
scoring_list_10, precision_10, recall_10, _ = model.scoring(
    ground_truth=model.train_data,
    user_to_eval=train_users,
    K=10,
)

In [None]:
import numpy as np
import scipy

In [None]:
# Build dense user x item indicator matrices: entry (u, i) is 1 when user u
# has a 'P' row for item i in the corresponding split, 0 otherwise.
# The shape comes from the ID lists rather than a hard-coded (4712, 7015), so
# the matrices always match whichever data set is currently loaded.
n_users = len(original_user_list)
n_items = len(original_item_list)

# Resolve each ID to its row/column position once; calling list.index() inside
# the loops did a linear scan per lookup.
user_pos = {user: pos for pos, user in enumerate(original_user_list)}
item_pos = {item: pos for pos, item in enumerate(original_item_list)}

lfm_train_data = np.zeros(shape=(n_users, n_items))
lfm_test_data = np.zeros(shape=(n_users, n_items))
for frame, matrix in ((train_data, lfm_train_data), (test_data, lfm_test_data)):
    purchases = frame[frame.Action == 'P']
    for u, i in zip(purchases.UserID, purchases.ItemID):
        # Users outside original_user_list are skipped, as before; an item
        # outside original_item_list still raises (KeyError).
        if u in user_pos:
            matrix[user_pos[u], item_pos[i]] = 1

In [None]:
# Convert the interaction matrices to SciPy COO sparse matrices (rebinding the
# same names).
# The submodule is imported explicitly: a bare `import scipy` is not guaranteed
# to make `scipy.sparse` available on every SciPy version, and this cell should
# not depend on another library having imported it first.
import scipy.sparse

lfm_train_data = scipy.sparse.coo_matrix(lfm_train_data)
lfm_test_data = scipy.sparse.coo_matrix(lfm_test_data)

In [None]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

# LightFM model with the BPR loss, trained on the train interaction matrix.
# random_state is fixed (same seed value used for the bprH models in this
# notebook) so that training, and therefore the printed metrics, can be
# reproduced between runs.
# Note: this rebinds `model`, replacing any bprH model held under that name.
model = LightFM(no_components=10, learning_rate=0.005, loss='bpr',
                random_state=20200704)
model.fit(lfm_train_data, epochs=1000)

# Per-user precision@10 and AUC, averaged over users, for both matrices.
train_precision = precision_at_k(model, lfm_train_data, k=10).mean()
test_precision = precision_at_k(model, lfm_test_data, k=10).mean()

train_auc = auc_score(model, lfm_train_data).mean()
test_auc = auc_score(model, lfm_test_data).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

In [None]:
# 5-fold cross-validation over the target ('P') rows, shuffled with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
kf.get_n_splits(X=data_only_with_target)

cnt = 0  # fold counter; only used to give each fold its own save path
for train_index, test_index in kf.split(X=data_only_with_target):
    # Build the fold's frames: the held-out target rows form the test data ...
    test_data = data_only_with_target.iloc[test_index]
    # ... and the remaining target rows plus ALL non-target rows form the
    # training data (page 90 5.2 section - make most of auxiliary data in
    # training process).
    fold_targets = data_only_with_target.iloc[train_index]
    train_data = pd.concat([fold_targets, data_without_target])

    # A fresh BPRH model for every fold.
    model = bprH(
        dim=8,
        omega=1,
        rho=1,
        lambda_u=1.5,
        lambda_v=0.1,
        lambda_b=0.1,
        gamma=0.01,
        random_state=20200704,
        num_iter=9000,
    )

    fold_path = 'data/item-set-coselection-' + str(cnt) + '.pkl'
    model.fit(
        X=train_data,
        eval_X=test_data,
        original_item_list=original_item_list,
        original_user_list=original_user_list,
        saved_path=fold_path,
        coselection=True,
        plot_metric=True,
    )
    cnt += 1


In [None]:
# read sobazaar data
# Note: this rebinds `data`, replacing the generated data set used above.
data = pd.read_csv('data/sb_bprh.csv')
# change column name (positional rename: requires the CSV to have exactly
# these six columns, in this order)
data.columns = ['ItemID', 'UserID', 'Action', 'Action_Date', 'Action_Time',
       'SessionID']

# Keep only the three columns used below and drop exact duplicate rows.
# drop_duplicates() is chained and returns a new frame, instead of being
# applied with inplace=True to the result of a column selection.
data = data[['ItemID', 'UserID', 'Action']].drop_duplicates()

In [None]:
# Ideally the data would be split chronologically
# (sklearn.model_selection.TimeSeriesSplit); a simple split is used first.

# Sorted, de-duplicated ID lists taken from the full data set.
original_user_list = sorted(set(data['UserID']))
original_item_list = sorted(set(data['ItemID']))

# 'P' is the target action; every other row is kept as non-target data.
is_target = data['Action'] == 'P'
data_only_with_target = data[is_target]
data_without_target = data[~is_target]

In [None]:
# Splitter for 5 shuffled folds over data_only_with_target; the fixed
# random_state keeps the fold assignment the same between runs.
kf = KFold(random_state=0, shuffle=True, n_splits=5)
# Last expression of the cell, so its return value is displayed.
kf.get_n_splits(X=data_only_with_target)

In [None]:
cnt = 0  # fold counter; only used to give each fold its own save path
for train_index, test_index in kf.split(X=data_only_with_target):
    # Build the fold's frames: held-out target rows are the test data; the
    # remaining target rows plus all non-target rows are the training data.
    test_data = data_only_with_target.iloc[test_index]
    fold_targets = data_only_with_target.iloc[train_index]
    train_data = pd.concat([fold_targets, data_without_target])

    # A fresh BPRH model for every fold.
    model = bprH(
        dim=30,
        omega=1000,
        rho=1,
        lambda_u=1,
        lambda_v=0.5,
        lambda_b=0.5,
        gamma=0.01,
        random_state=20200704,
        num_iter=10000,
    )

    fold_path = 'data/item-set-coselection-' + str(cnt) + '.pkl'
    model.fit(
        X=train_data,
        eval_X=test_data,
        original_item_list=original_item_list,
        original_user_list=original_user_list,
        saved_path=fold_path,
        coselection=True,
        plot_metric=True,
    )
    cnt += 1

In [None]:
# Hold out 20% of the target ('P') rows as test data.
# random_state is fixed so that the split - and every metric computed from it -
# is the same on each run of the notebook.
d1, test_data = train_test_split(data_only_with_target, test_size=0.2, random_state=0)

# All non-target rows go to the training side. reset_index returns new frames,
# so nothing is modified in place on the objects returned by the split.
train_data = pd.concat([d1, data_without_target]).reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [None]:
# Constructor arguments for the bprH model (defined in bprH_gpu).
bprh_params = dict(
    dim=50,
    omega=1000,
    rho=1,
    lambda_u=0.5,
    lambda_v=0.5,
    lambda_b=0.5,
    gamma=0.005,
    random_state=20200710,
    num_iter=5000,
)
model = bprH(**bprh_params)

# Fit on the hold-out split with metric plotting and printing switched off.
fit_options = dict(
    X=train_data,
    eval_X=test_data,
    original_item_list=original_item_list,
    original_user_list=original_user_list,
    saved_path='data/item-set-coselection-test.pkl',
    coselection=True,
    plot_metric=False,
    print_metric=False,
)
model.fit(**fit_options)

Registering Model Parameters
Build I_u_t, I_u_a
Calculate auxiliary-target correlation
Generate Itemset Coselection - Build U_i


100%|██████████| 4712/4712 [01:29<00:00, 52.40it/s]
100%|██████████| 4712/4712 [00:00<00:00, 277356.06it/s]
 22%|██▏       | 1518/7015 [00:16<00:58, 94.73it/s] 

In [8]:
# Score the users present in the model's test data at K=10.
# The fourth returned value is discarded.
test_users = sorted(set(model.test_data.UserID))
scoring_list_10, precision_10, recall_10, _ = model.scoring(
    ground_truth=model.test_data,
    user_to_eval=test_users,
    K=10,
)

In [18]:
# Display the 'P' rows of user 404 in the model's training data.
model.train_data[(model.train_data.UserID == 404) &
                 (model.train_data.Action == "P")]

Unnamed: 0,ItemID,UserID,Action
236,4949,404,P
2268,236,404,P
3576,2035,404,P
3639,6068,404,P
3740,4092,404,P
4496,4434,404,P
5648,6933,404,P
8505,6407,404,P
9195,6675,404,P
9411,6758,404,P


In [14]:
# Display every row of user 404 in the model's test data.
model.test_data[model.test_data.UserID == 404]

Unnamed: 0,ItemID,UserID,Action
261,5046,404,P
1643,5770,404,P
1977,4171,404,P
2008,471,404,P


In [24]:
# Display the fitted model's `V` attribute.
# NOTE(review): presumably the item latent-factor matrix - confirm in bprH_gpu.
model.V

array([[-0.58238604, -0.42182379, -0.69283479, ..., -0.60697936,
         0.19777065,  0.08216565],
       [ 0.65250566,  0.15347718,  1.3230051 , ..., -0.81596365,
         2.49769663,  0.31699304],
       [ 0.84981093, -0.47369144,  0.41294763, ...,  0.90614293,
        -0.55250807, -0.84272059],
       ...,
       [-0.19771952,  1.06898457,  1.89134039, ..., -1.22563057,
        -1.29902831,  0.26265441],
       [ 0.44992601,  1.04122114, -0.70205945, ...,  0.48012517,
        -1.43401496,  1.12638675],
       [ 0.01890585, -1.06995588,  0.85704412, ..., -1.00359167,
         1.58781057, -1.9604453 ]])

In [None]:
# Scratch check: norm of the element-wise difference between two 2x3 GPU arrays.
reference = cp.array([[1, 2, 3], [4, 5, 6]])
perturbed = cp.array([[1.9, 1.7, 1.2], [4.8, 5.2, 9.6]])
cp.linalg.norm(reference - perturbed)