In [234]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from sklearn.preprocessing import LabelEncoder

In [235]:
def _read_pairs(path, columns):
    """Read a header-less, tab-separated two-column file into a DataFrame."""
    return pd.read_csv(path, sep="\t", names=columns)


# Bundle -> item membership and user -> item interactions.
bi = _read_pairs('data/MovieLens/bundle_item.txt', ['b', 'i'])
ui = _read_pairs('data/MovieLens/user_item.txt', ['u', 'i'])

# User -> bundle interactions, already split upstream into train / test / tune.
ub_train = _read_pairs('data/MovieLens/user_bundle_train.txt', ['u', 'b'])
ub_test = _read_pairs('data/MovieLens/user_bundle_test.txt', ['u', 'b'])
ub_valid = _read_pairs('data/MovieLens/user_bundle_tune.txt', ['u', 'b'])

# Preview the first rows of the two item-level tables.
bi.head(), ui.head()

(   b     i
 0  0   164
 1  0   746
 2  0  1068
 3  0  1179
 4  0  1748,
    u     i
 0  0  2235
 1  0   587
 2  0  2240
 3  0  2884
 4  0  2885)

In [236]:
# Distinct-id counts per table: users and items seen in user_item,
# bundles and items seen in bundle_item.
num_u, num_iu = ui['u'].nunique(), ui['i'].nunique()
num_b, num_i = bi['b'].nunique(), bi['i'].nunique()

num_b, num_i, num_iu, num_u

(2745, 643, 3415, 1000)

In [237]:
# (n_pairs, 2) integer array: column 0 is the bundle id, column 1 the item id.
bi_idx = bi[['b', 'i']].to_numpy()
bi_idx

array([[   0,  164],
       [   0,  746],
       [   0, 1068],
       ...,
       [2744, 2105],
       [2744, 2253],
       [2744, 2872]])

In [238]:
# Smallest bundle id and smallest item id present in bundle_item.
bi_idx[:, 0].min(), bi_idx[:, 1].min()

(0, 1)

In [239]:
# Build the bundle-by-item incidence matrix as a sparse COO tensor:
# entry (b, i) is 1.0 when bundle b contains item i.
#
# * Duplicate (bundle, item) rows are removed first. A COO tensor sums the
#   values of repeated indices, so a duplicated pair would become a 2.0 entry
#   and inflate every bundle size / intersection count derived from this matrix.
# * The indices come from one (2, nnz) ndarray through torch.from_numpy;
#   handing torch.tensor a Python list of ndarrays is the slow conversion path
#   that PyTorch warns about.
# * Values are a float64 torch tensor (the same dtype np.ones produced before).
# * coalesce() leaves the tensor in canonical form (sorted, unique indices).
#
# NOTE(review): the size assumes bundle ids lie in [0, num_b) and item ids in
# [0, num_iu); both counts come from nunique(), so confirm the ids are contiguous.
_bi_pairs = np.unique(bi_idx, axis=0)
bi_graph = torch.sparse_coo_tensor(
    indices=torch.from_numpy(np.ascontiguousarray(_bi_pairs.T)),
    values=torch.ones(len(_bi_pairs), dtype=torch.float64),
    size=(num_b, num_iu),
).coalesce()

In [240]:
# Sanity check: (number of bundles, number of items).
bi_graph.size()

torch.Size([2745, 3415])

In [241]:
# Items per bundle: row sums of the incidence matrix.
total_item = torch.sum(bi_graph, dim=1)

# Identity matrix, used to single out the diagonal (a bundle paired with itself).
mask = torch.eye(num_b)

# Pairwise shared-item counts: entry (a, b) is the number of items that
# bundles a and b have in common.
intersect = torch.matmul(bi_graph, bi_graph.T)

In [242]:
# Convert the per-bundle item counts to a dense tensor so they can be
# reshaped and broadcast in the following cells.
# Note: this rebinds `total_item` in place of the value the previous cell produced.
total_item = total_item.to_dense()
total_item

tensor([ 7.,  9.,  7.,  ..., 10., 10.,  8.], dtype=torch.float64)

In [243]:
# Column and row views of the bundle-size vector; adding them broadcasts to a
# square matrix whose (a, b) entry is size(a) + size(b).
total_item_a = total_item.unsqueeze(1)
total_item_b = total_item.unsqueeze(0)

total_overlap = torch.add(total_item_a, total_item_b)
total_overlap

tensor([[14., 16., 14.,  ..., 17., 17., 15.],
        [16., 18., 16.,  ..., 19., 19., 17.],
        [14., 16., 14.,  ..., 17., 17., 15.],
        ...,
        [17., 19., 17.,  ..., 20., 20., 18.],
        [17., 19., 17.,  ..., 20., 20., 18.],
        [15., 17., 15.,  ..., 18., 18., 16.]], dtype=torch.float64)

In [244]:
# Inspect the pairwise shared-item counts (sparse; the diagonal holds each bundle's own size).
intersect

tensor(indices=tensor([[   0,    0,    0,  ..., 2744, 2744, 2744],
                       [   0,    6,   10,  ..., 2734, 2739, 2744]]),
       values=tensor([7., 2., 1.,  ..., 3., 2., 8.]),
       size=(2745, 2745), nnz=1302853, dtype=torch.float64,
       layout=torch.sparse_coo)

In [245]:
# `mask` is the identity matrix, so `intersect * mask` keeps only the diagonal
# (each bundle's overlap with itself); subtracting it zeroes that diagonal.
inter_non_eye = intersect - intersect * mask
# Same diagonal removal for the size sums, then `+ mask` writes 1 on the
# diagonal so the denominator derived from this matrix is 1 (not 0) for a
# bundle paired with itself.
total_overlap_non_eye = total_overlap - total_overlap * mask + mask

In [246]:
# Inspect the shared-item counts after the diagonal has been zeroed.
inter_non_eye

tensor(indices=tensor([[   0,    0,    0,  ..., 2744, 2744, 2744],
                       [   0,    6,   10,  ..., 2734, 2739, 2744]]),
       values=tensor([0., 2., 1.,  ..., 3., 2., 0.]),
       size=(2745, 2745), nnz=1302853, dtype=torch.float64,
       layout=torch.sparse_coo)

In [247]:
# Inspect the pairwise size sums; the diagonal has been replaced by 1.
total_overlap_non_eye

tensor([[ 1., 16., 14.,  ..., 17., 17., 15.],
        [16.,  1., 16.,  ..., 19., 19., 17.],
        [14., 16.,  1.,  ..., 17., 17., 15.],
        ...,
        [17., 19., 17.,  ...,  1., 20., 18.],
        [17., 19., 17.,  ..., 20.,  1., 18.],
        [15., 17., 15.,  ..., 18., 18.,  1.]], dtype=torch.float64)

In [248]:
# Despite the name, this is the size of the UNION of each bundle pair:
# (size(a) + size(b)) minus the number of shared items. It serves as the
# denominator of the Jaccard index computed next.
total_not_overlap = total_overlap_non_eye - inter_non_eye

In [249]:
# Spot check on one pair: show the item rows of bundles 0 and 6 next to their
# union-size entry, which should equal size(0) + size(6) - shared items.
bi_graph[0], bi_graph[6], total_not_overlap[0, 6] # bundles 0 and 6 overlap in 2 items

(tensor(indices=tensor([[ 164,  746, 1068, 1179, 1748, 2066, 2726]]),
        values=tensor([1., 1., 1., 1., 1., 1., 1.]),
        size=(3415,), nnz=7, dtype=torch.float64, layout=torch.sparse_coo),
 tensor(indices=tensor([[ 922, 1068, 1154, 1748, 2008]]),
        values=tensor([1., 1., 1., 1., 1.]),
        size=(3415,), nnz=5, dtype=torch.float64, layout=torch.sparse_coo),
 tensor(10., dtype=torch.float64))

In [250]:
# Jaccard index for every bundle pair: shared items divided by union size.
# The diagonal is 0 because the self-overlap was zeroed beforehand.
jaccard_pair_bundle = torch.div(inter_non_eye.to_dense(), total_not_overlap)

In [251]:
# (row, col) index pairs of bundles whose Jaccard index is at least 0.7.
# The Jaccard matrix is symmetric, so every qualifying pair is listed twice:
# once as (a, b) and once as (b, a).
eli = torch.nonzero(jaccard_pair_bundle.ge(0.7))
eli

tensor([[   4,  152],
        [   4,  558],
        [   9, 1473],
        ...,
        [2723, 2543],
        [2728,  653],
        [2728, 1915]])

In [252]:
# Sorted, de-duplicated ids of every bundle that has at least one near-duplicate.
# NOTE(review): pairs occur in both orders in `eli`, so BOTH members of each
# near-duplicate pair land in this list — confirm that dropping both (rather
# than keeping one representative per pair) is intended.
eli_unique = torch.unique(eli[:, 0])
eli_unique.numel()

601

In [253]:
# Inspect the ids of the bundles marked for elimination.
eli_unique

tensor([   4,    9,   17,   18,   22,   31,   33,   45,   53,   59,   62,   69,
          72,   76,   79,   85,   93,   99,  100,  103,  106,  113,  114,  115,
         118,  121,  122,  125,  126,  130,  132,  145,  146,  152,  153,  154,
         159,  163,  174,  179,  180,  181,  184,  195,  201,  209,  218,  223,
         225,  226,  232,  234,  236,  238,  243,  244,  247,  249,  252,  253,
         260,  266,  267,  268,  269,  285,  297,  299,  304,  310,  318,  320,
         323,  325,  328,  329,  331,  336,  346,  349,  352,  354,  361,  367,
         369,  372,  377,  378,  379,  381,  387,  389,  391,  392,  401,  405,
         419,  424,  443,  452,  460,  476,  482,  485,  492,  493,  497,  505,
         517,  527,  528,  529,  532,  541,  544,  551,  558,  563,  565,  573,
         577,  584,  592,  593,  594,  597,  602,  603,  604,  606,  609,  617,
         619,  621,  624,  628,  634,  639,  641,  647,  649,  653,  660,  663,
         674,  676,  680,  683,  684,  6

In [254]:
# Spot check one flagged pair (4, 152): compare the item sets of the two bundles.
bi_graph[4], bi_graph[152]

(tensor(indices=tensor([[ 266,  458,  996, 1378, 3122, 3373]]),
        values=tensor([1., 1., 1., 1., 1., 1.]),
        size=(3415,), nnz=6, dtype=torch.float64, layout=torch.sparse_coo),
 tensor(indices=tensor([[ 266,  458,  996, 1007, 1378, 1379, 3122, 3373]]),
        values=tensor([1., 1., 1., 1., 1., 1., 1., 1.]),
        size=(3415,), nnz=8, dtype=torch.float64, layout=torch.sparse_coo))

In [255]:
# All user-bundle interactions from the three original splits, stacked row-wise.
# NOTE(review): `ub` is not referenced by any later cell visible here — confirm it is still needed.
ub = pd.concat([ub_train, ub_test, ub_valid])

In [263]:
def _keep_surviving(frame, dropped_ids):
    """Return the rows of ``frame`` whose bundle id (column ``b``) is not in ``dropped_ids``."""
    return frame[~frame['b'].isin(dropped_ids)]


# Convert the tensor of eliminated bundle ids to a plain list once and reuse it.
_dropped_bundle_ids = eli_unique.tolist()

bi_fil = _keep_surviving(bi, _dropped_bundle_ids)
ub_train_fil = _keep_surviving(ub_train, _dropped_bundle_ids)
ub_test_fil = _keep_surviving(ub_test, _dropped_bundle_ids)
ub_valid_fil = _keep_surviving(ub_valid, _dropped_bundle_ids)

In [264]:
# Preview the bundle-item pairs that remain after the eliminated bundles are removed.
bi_fil

Unnamed: 0,b,i
0,0,164
1,0,746
2,0,1068
3,0,1179
4,0,1748
...,...,...
20509,2744,1097
20510,2744,2043
20511,2744,2105
20512,2744,2253


In [265]:
# Pool the filtered interactions from all three original splits, then shuffle
# the rows so the positional re-split in the next cell is random.
#
# The shuffle is seeded: without a fixed random_state every run produced a
# different permutation, so the exported train/valid/test files changed on
# each Restart & Run All and results could not be reproduced.
SPLIT_SEED = 42
ub_all = pd.concat([ub_train_fil, ub_test_fil, ub_valid_fil])
ub_all = ub_all.sample(frac=1, random_state=SPLIT_SEED)

In [266]:
# Positional cut points for a 70 % / 10 % / 20 % train / valid / test split
# of the shuffled interactions.
_n_interactions = len(ub_all)
train_idx, valid_idx = int(_n_interactions * 0.7), int(_n_interactions * 0.8)

# Note: these rebind the *_fil names that previously held the per-split filtered frames.
ub_train_fil = ub_all.iloc[:train_idx]
ub_valid_fil = ub_all.iloc[train_idx:valid_idx]
ub_test_fil = ub_all.iloc[valid_idx:]

In [267]:
# Row counts, in order: train, test, valid, bundle-item pairs.
tuple(len(frame) for frame in (ub_train_fil, ub_test_fil, ub_valid_fil, bi_fil))

(1502, 430, 215, 16366)

In [268]:
# Write each filtered table to the current working directory as a header-less,
# index-less, tab-separated file.
# NOTE(review): the validation split is written as "user_bundle_valid.txt",
# whereas the input it was loaded from is named "user_bundle_tune.txt" —
# confirm which name downstream loaders expect.
for _frame, _destination in (
    (ub_train_fil, "user_bundle_train.txt"),
    (ub_test_fil, "user_bundle_test.txt"),
    (ub_valid_fil, "user_bundle_valid.txt"),
    (bi_fil, "bundle_item.txt"),
):
    _frame.to_csv(_destination, sep="\t", index=False, header=False)