In [1]:
import cupy as cp
import numpy as np
import time

%load_ext autoreload
%autoreload 2

In [None]:
### Numpy and CPU
# Time the allocation of a (4000, 100, 1) array of ones on the host.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can be too coarse for an operation this small.
s = time.perf_counter()
x_cpu = np.ones((4000,100,1))
e = time.perf_counter()
print(e - s)

In [None]:
### CuPy and GPU
# Allocate the same (4000, 100, 1) shape as the NumPy/CPU cell so that the two
# printed timings measure the same amount of work.
s = time.perf_counter()
x_gpu = cp.ones((4000,100,1))
# Block until the work queued on the null stream has finished before stopping
# the clock, so the measurement covers the actual device work.
cp.cuda.Stream.null.synchronize()
e = time.perf_counter()
print(e - s)

In [2]:
from bprH_gpu import bprH, save_model, load_model
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
import cProfile as profile

In [3]:
def behavior_generator(num_user = 10, num_item=10, random_state=None):
    """Generate a synthetic user/item interaction log.

    For every (user, item) pair one uniform sample ``p`` in [0, 1) decides
    which rows are emitted:

    * ``p >= 0.95``        -> one 'P' row
    * ``0.7 <= p < 0.95``  -> one 'P' row followed by one 'V' row
    * ``0.3 <= p < 0.7``   -> one 'V' row
    * ``p < 0.3``          -> no row

    Parameters
    ----------
    num_user : int
        Number of users; ids are 'U0' ... 'U{num_user-1}'.
    num_item : int
        Number of items; ids are 'I0' ... 'I{num_item-1}'.
    random_state : int or None, optional
        Seed for a private ``numpy.random.RandomState``. With the default
        ``None`` the global ``np.random`` state is used, exactly as before,
        so existing callers are unaffected.

    Returns
    -------
    pandas.DataFrame
        Columns ``['UserID', 'ItemID', 'Action']`` with one row per emitted
        action, ordered by user and then by item.
    """
    # A private generator keeps seeded calls from touching the global RNG state.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    data = []
    for u in range(num_user):
        user_id = 'U' + str(u)
        for i in range(num_item):
            item_id = 'I' + str(i)
            p = rng.uniform(0, 1)
            if p >= 0.95:
                data.append([user_id, item_id, 'P'])
            elif p >= 0.7:
                data.append([user_id, item_id, 'P'])
                data.append([user_id, item_id, 'V'])
            elif p >= 0.3:
                data.append([user_id, item_id, 'V'])
    return pd.DataFrame(data, columns=['UserID', 'ItemID', 'Action'])

In [4]:
# Synthetic interaction log for 500 users x 500 items.
data = behavior_generator(500, 500)

In [None]:
# Export to LibRec-style csv files (no index, no header), with the Action
# column replaced by the constant rating 1.
# .assign() returns a new frame, so `data` keeps its 'P'/'V' labels for the
# cells below and no value is written into a filtered slice.

# File 1: target ('P') interactions only.
data_librec_1 = data[data.Action == 'P'].assign(Action=1)
data_librec_1.to_csv('generated_data.txt', index=False, header=False)

# File 2: every interaction. Previously this file was written from
# data_librec_1, which made it a duplicate of generated_data.txt.
data_librec_2 = data.assign(Action=1)
data_librec_2.to_csv('generated_data2.txt', index=False, header=False)

In [5]:
# Ideally the data would be split chronologically
# (sklearn.model_selection.TimeSeriesSplit); a simple split is used for now.

# Sorted id lists covering every user/item in the full data set.
original_user_list = sorted(set(data.UserID))
original_item_list = sorted(set(data.ItemID))

# 'P' is the target action; every other action is auxiliary.
is_target = data.Action == 'P'
data_only_with_target = data[is_target]
data_without_target = data[~is_target]

In [6]:
# Hold out 20% of the target ('P') interactions for evaluation.
# random_state is fixed (same seed as the KFold cells) so the split, and
# therefore the reported metrics, are reproducible across kernel restarts.
d1, test_data = train_test_split(data_only_with_target, test_size=0.2, random_state=0)

# Training data = remaining target rows + all auxiliary rows.
# reset_index() without drop=True keeps the old index as an 'index' column,
# as the original cell did; NOTE(review): unclear whether bprH relies on that
# column, so it is preserved.
train_data = pd.concat([d1, data_without_target]).reset_index()
test_data = test_data.reset_index()

In [None]:
# Profile model construction and training on the synthetic data with cProfile.
pr = profile.Profile()
pr.enable()
try:
    model = bprH(dim=15,
                 omega=100,
                 rho=1,
                 lambda_u=0.01,
                 lambda_v=0.01,
                 lambda_b=0.01,
                 gamma=0.01,
                 random_state=20200704,
                 num_iter=500)

    model.fit(X=train_data,
              eval_X=test_data,
              original_item_list=original_item_list,
              original_user_list=original_user_list,
              saved_path='data/item-set-coselection-test.pkl',
              coselection=True,
              plot_metric=False,
              print_metric=False)
finally:
    # Always stop the profiler, even if training raises; otherwise it would
    # stay enabled for the rest of the kernel session.
    pr.disable()
pr.dump_stats("bhrH_gpu_profile.pstat")

Registering Model Parameters
Build I_u_t, I_u_a
Calculate auxiliary-target correlation
Generate Itemset Coselection - Build U_i
Generate Itemset Coselection - Build S
Initializing User and Item Matrices


100%|██████████| 500/500 [00:14<00:00, 33.35it/s]
100%|██████████| 500/500 [00:00<00:00, 31268.11it/s]
100%|██████████| 500/500 [00:08<00:00, 56.79it/s]
100%|██████████| 500/500 [00:01<00:00, 324.23it/s]
 59%|█████▉    | 296/500 [00:12<00:09, 20.65it/s, len_I=136, len_J=216, len_K=148]

In [8]:
# metrics on test data: top-10 scoring for every user present in the test set
eval_users = sorted(set(model.test_data.UserID))
scoring_list_10, precision_10, recall_10, _ = model.scoring(
    ground_truth=model.test_data,
    user_to_eval=eval_users,
    K=10,
)

In [None]:
import numpy as np
import scipy

In [None]:
# Dense user x item interaction matrices for LightFM: entry [u, i] is 1 when
# user u has a target ('P') action on item i, else 0.
# The shape is derived from the id lists; a hard-coded (50, 50) overflows as
# soon as the data has more than 50 users or items.
n_users = len(original_user_list)
n_items = len(original_item_list)
lfm_train_data = np.zeros(shape=(n_users, n_items))
lfm_test_data = np.zeros(shape=(n_users, n_items))

# id -> row/column position, so each lookup is O(1) instead of list.index().
user_index = {u: k for k, u in enumerate(original_user_list)}
item_index = {i: k for k, i in enumerate(original_item_list)}

for matrix, frame in ((lfm_train_data, train_data), (lfm_test_data, test_data)):
    purchases = frame[frame.Action == 'P']
    for u, i in zip(purchases.UserID, purchases.ItemID):
        matrix[user_index[u], item_index[i]] = 1

In [None]:
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# expose `scipy.sparse` unless another package happened to import it already.
import scipy.sparse

# Convert the dense interaction matrices to sparse COO format.
lfm_train_data = scipy.sparse.coo_matrix(lfm_train_data)
lfm_test_data = scipy.sparse.coo_matrix(lfm_test_data)

In [None]:
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k

# LightFM baseline trained with the BPR loss on the binary interaction matrix.
model = LightFM(loss='bpr', no_components=5, learning_rate=0.005)
model.fit(lfm_train_data, epochs=5000)

# Mean precision@10 and mean AUC over users, on the train matrix ...
train_precision = precision_at_k(model, lfm_train_data, k=10).mean()
train_auc = auc_score(model, lfm_train_data).mean()

# ... and on the held-out matrix.
# NOTE(review): the training interactions are not passed to the evaluation
# calls - confirm whether known positives should be excluded from the ranking.
test_precision = precision_at_k(model, lfm_test_data, k=10).mean()
test_auc = auc_score(model, lfm_test_data).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

In [None]:
# 5-fold cross-validation over the target ('P') interactions of the synthetic data.
kf = KFold(shuffle=True, n_splits=5, random_state=0)
kf.get_n_splits(X=data_only_with_target)

# Hyper-parameters shared by the model of every fold.
hyper_params = dict(dim=8,
                    omega=1,
                    rho=1,
                    lambda_u=1.5,
                    lambda_v=0.1,
                    lambda_b=0.1,
                    gamma=0.01,
                    random_state=20200704,
                    num_iter=9000)

cnt = 0
for train_index, test_index in kf.split(X=data_only_with_target):
    # Build train and test data. All auxiliary rows go into the training set
    # of every fold (page 90, section 5.2: make the most of the auxiliary data
    # in the training process); only target rows are held out.
    train_target = data_only_with_target.iloc[train_index]
    train_data = pd.concat([train_target, data_without_target])
    test_data = data_only_with_target.iloc[test_index]

    # One fresh BPRH model per fold, saved under a fold-specific path.
    fold_path = 'data/item-set-coselection-' + str(cnt) + '.pkl'
    model = bprH(**hyper_params)
    model.fit(X=train_data,
              eval_X=test_data,
              original_item_list=original_item_list,
              original_user_list=original_user_list,
              saved_path=fold_path,
              coselection=True,
              plot_metric=True)
    cnt += 1


In [3]:
# Load the Sobazaar interaction log and rename its six columns to the names
# used by the rest of the notebook.
sobazaar_columns = ['ItemID', 'UserID', 'Action', 'Action_Date', 'Action_Time',
                    'SessionID']
data = pd.read_csv('data/sb_bprh.csv')
data.columns = sobazaar_columns

# data = data.sample(frac=0.5) # TODO: test small data

In [4]:
# Ideally the data would be split chronologically
# (sklearn.model_selection.TimeSeriesSplit); a simple split is used for now.

# Sorted id lists covering every user/item in the full data set.
original_user_list = sorted(set(data.UserID))
original_item_list = sorted(set(data.ItemID))

# 'P' is the target action; every other action is auxiliary.
is_target = data.Action == 'P'
data_only_with_target = data[is_target]
data_without_target = data[~is_target]

In [None]:
# 5-fold splitter (shuffled, fixed seed) for the target interactions.
kf = KFold(shuffle=True, n_splits=5, random_state=0)
# Last expression of the cell: shows the configured number of folds.
kf.get_n_splits(X=data_only_with_target)

In [None]:
# Hyper-parameters shared by the model of every fold.
hyper_params = dict(dim=30,
                    omega=1000,
                    rho=1,
                    lambda_u=1,
                    lambda_v=0.5,
                    lambda_b=0.5,
                    gamma=0.01,
                    random_state=20200704,
                    num_iter=10000)

cnt = 0
for train_index, test_index in kf.split(X=data_only_with_target):
    # Build train and test data: the fold's target rows plus all auxiliary
    # rows for training, the held-out target rows for evaluation.
    train_target = data_only_with_target.iloc[train_index]
    train_data = pd.concat([train_target, data_without_target])
    test_data = data_only_with_target.iloc[test_index]

    # One fresh BPRH model per fold, saved under a fold-specific path.
    fold_path = 'data/item-set-coselection-' + str(cnt) + '.pkl'
    model = bprH(**hyper_params)
    model.fit(X=train_data,
              eval_X=test_data,
              original_item_list=original_item_list,
              original_user_list=original_user_list,
              saved_path=fold_path,
              coselection=True,
              plot_metric=True)
    cnt += 1

In [5]:
# Hold out 20% of the target ('P') interactions for evaluation.
# random_state is fixed (same seed as the KFold cells) so the split, and
# therefore the reported metrics, are reproducible across kernel restarts.
d1, test_data = train_test_split(data_only_with_target, test_size=0.2, random_state=0)

# Training data = remaining target rows + all auxiliary rows.
# reset_index() without drop=True keeps the old index as an 'index' column,
# as the original cell did; NOTE(review): unclear whether bprH relies on that
# column, so it is preserved.
train_data = pd.concat([d1, data_without_target]).reset_index()
test_data = test_data.reset_index()

In [12]:
# Profile model construction and training on the Sobazaar data with cProfile.
pr = profile.Profile()
pr.enable()
try:
    model = bprH(dim=50,
                 omega=1000,
                 rho=1,
                 lambda_u=0.01,
                 lambda_v=0.01,
                 lambda_b=0.01,
                 gamma=0.01,
                 random_state=20200704,
                 num_iter=15000)

    model.fit(X=train_data,
              eval_X=test_data,
              original_item_list=original_item_list,
              original_user_list=original_user_list,
              saved_path='data/item-set-coselection-test.pkl',
              coselection=True,
              plot_metric=False,
              print_metric=False)
finally:
    # Always stop the profiler, even if training raises; otherwise it would
    # stay enabled for the rest of the kernel session.
    pr.disable()
pr.dump_stats("bhrH_gpu_profile.pstat")

100%|██████████| 4712/4712 [02:53<00:00, 27.10it/s]
100%|██████████| 4712/4712 [00:00<00:00, 224537.43it/s]
100%|██████████| 7015/7015 [02:12<00:00, 53.09it/s]
100%|██████████| 7015/7015 [00:32<00:00, 217.23it/s]
100%|██████████| 15000/15000 [30:46<00:00,  8.12it/s, len_I=1, len_J=2, len_K=3]  


Build I_u_t, I_u_a
Calculate auxiliary-target correlation
Generate Itemset Coselection - Build U_i
Generate Itemset Coselection - Build S


In [16]:
# Top-10 scoring for every user present in the model's test data.
eval_users = sorted(set(model.test_data.UserID))
scoring_list_10, precision_10, recall_10, _ = model.scoring(
    ground_truth=model.test_data,
    user_to_eval=eval_users,
    K=10,
)