In [1]:
# import packages
import pandas as pd
import numpy as np

In [2]:
# read data
soba = pd.read_csv("data/Sobazaar-hashID.csv")

# clean data
action_set = set(soba.Action)
item_set = set(soba.ItemID)
user_set = set(soba.UserID)

# Users with at least one purchase behavior
user_p_id = set(soba[soba.Action == 'purchase:buy_clicked'].UserID)
temp = soba[soba.UserID.isin(user_p_id)]
temp = temp[temp.Action.isin(['product_detail_clicked', 'product_wanted', 'purchase:buy_clicked'])]
soba_cleaned_1 = temp.sort_values(by=['UserID', 'Timestamp'])
del temp
soba_cleaned_1.reset_index(drop=True, inplace=True)

# Users with at least one purchase &
# Items with at least one purchase
item_p_id = set(soba_cleaned_1[soba_cleaned_1.Action == 'purchase:buy_clicked'].ItemID)
soba_cleaned_2 = soba_cleaned_1[soba_cleaned_1.ItemID.isin(item_p_id)]
soba_cleaned_2.reset_index(drop=True, inplace=True)

# set the desired version
soba_cleaned = soba_cleaned_2

In [3]:
# parameter setting
dim = 10
num_iter = 1000
w = 1
rho = 0.5
lambda_u = 1
lambda_v = 1
lambda_b = 1
gamma = 1
num_u = len(set(soba_cleaned.UserID))
num_i = len(set(soba_cleaned.ItemID))

In [14]:
# Calculate auxiliary-target correlation C for every user and each types of auxiliary action
target_action = 'purchase:buy_clicked'
auxiliary_action = ['product_detail_clicked', 'product_wanted']
C_u = dict()
for u in set(soba_cleaned.UserID):
    C_u[u] = dict()
    I_t_u = set(soba_cleaned[(soba_cleaned.UserID == u) & (soba_cleaned.Action == target_action)].ItemID)
    # TODO filtered item set
    for X in auxiliary_action:
        I_a_u = set(soba_cleaned[(soba_cleaned.UserID == u) & (soba_cleaned.Action == X)].ItemID)
        if len(I_a_u) == 0:
            C_u[u][X] = 0
            continue
        C_u_at = len(I_t_u.intersection(I_a_u)) / len(I_t_u)
        C_u_ta = len(I_t_u.intersection(I_a_u)) / len(I_a_u)
        if C_u_ta + C_u_at == 0:
            C_u[u][X] = 0
            continue
        C_u_X = 2 * C_u_at * C_u_ta / (C_u_ta + C_u_at)
        C_u[u][X] = C_u_X

In [16]:
temp = pd.DataFrame.from_dict(C_u, orient='index')
temp['alpha'] = w * rho * temp.product_detail_clicked + w * (1 - rho) * temp.product_wanted
alpha_u = temp
del temp
alpha_u.reset_index(inplace=True)

In [19]:
# generate item-set based on co-selection
S = dict()
p = 0
for i in set(soba_cleaned.ItemID):
    p += 1
    print('Process: ', p / len(set(soba_cleaned.ItemID)))
    S[i] = set()
    U_i = set(soba_cleaned[soba_cleaned.ItemID == i].UserID)
    for j in set(soba_cleaned.ItemID):
        U_j = set(soba_cleaned[soba_cleaned.ItemID == j].UserID)
        if len(U_i.intersection(U_j)) >= 2: S[i].add(j)


Process:  0.00014255167498218105
Process:  0.0002851033499643621
Process:  0.00042765502494654313
Process:  0.0005702066999287242
Process:  0.0007127583749109052
Process:  0.0008553100498930863
Process:  0.0009978617248752673
Process:  0.0011404133998574484
Process:  0.0012829650748396293
Process:  0.0014255167498218105
Process:  0.0015680684248039914
Process:  0.0017106200997861725
Process:  0.0018531717747683535
Process:  0.0019957234497505346
Process:  0.0021382751247327157
Process:  0.002280826799714897
Process:  0.0024233784746970776
Process:  0.0025659301496792587
Process:  0.00270848182466144
Process:  0.002851033499643621
Process:  0.0029935851746258017
Process:  0.003136136849607983
Process:  0.003278688524590164
Process:  0.003421240199572345
Process:  0.003563791874554526
Process:  0.003706343549536707
Process:  0.003848895224518888
Process:  0.003991446899501069
Process:  0.00413399857448325
Process:  0.0042765502494654314
Process:  0.004419101924447612
Process:  0.00456165

KeyboardInterrupt: 

In [13]:
# initialization
U = np.random.uniform(size=(num_u, dim))
V = np.random.uniform(size=(dim, num_i))
B = np.random.uniform(size=(num_i, 1))