In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import sys
import scipy.sparse as sp

In [6]:
# Toy example: scale each row of a small 0/1 matrix by the inverse of its row
# sum, then add a per-column term 1 / exp(column sum), broadcast over the rows.
a = np.array([[1, 0, 1],
              [0, 1, 0],
              [1, 0, 1]])
sz = a.sum(axis=1)
col_totals = a.sum(axis=0)
row_scaler = sp.diags(1 / sz)          # diagonal matrix holding 1 / row-sum
col_offset = 1 / np.exp(col_totals)    # one value per column
a = row_scaler @ a + col_offset
a

array([[0.63533528, 0.36787944, 0.63533528],
       [0.13533528, 1.36787944, 0.13533528],
       [0.63533528, 0.36787944, 0.63533528]])

In [5]:
# Directory holding every dataset file read below.
path = 'data/cold/Youshu/'


def _read_pairs(filename, columns):
    """Read one headerless, tab-separated file from `path` and name its columns."""
    frame = pd.read_csv(os.path.join(path, filename), sep='\t', header=None)
    frame.columns = columns
    return frame


# Item-level views: user-item interactions and bundle-item membership.
ui = _read_pairs('user_item.txt', ['user', 'item'])
bi = _read_pairs('bundle_item.txt', ['bundle', 'item'])

# User-bundle interactions, one frame per split file.
ub_train = _read_pairs('user_bundle_train.txt', ['user', 'bundle'])
ub_tune = _read_pairs('user_bundle_tune.txt', ['user', 'bundle'])
ub_test = _read_pairs('user_bundle_test.txt', ['user', 'bundle'])

# All user-bundle rows stacked in train, tune, test order with a fresh 0..n-1 index.
ub = pd.concat([ub_train, ub_tune, ub_test], ignore_index=True)

In [6]:
# Build the sparse bundle-item matrix directly from the raw pair file.
with open(os.path.join(path, 'bundle_item.txt'), 'r') as f:
    # int() ignores surrounding whitespace, so the line terminator does not need
    # to be sliced off. The previous `s[:-1]` slice removed the last *digit* of
    # the final id when the file did not end with a newline. Blank lines are
    # skipped instead of failing on int('').
    bi_pairs = [
        tuple(int(field) for field in line.split('\t'))
        for line in f
        if line.strip()
    ]

indices = np.array(bi_pairs, dtype=np.int32)
values = np.ones(len(bi_pairs))
# Rows are indexed by the first column of the file, columns by the second.
# The shape is hard-coded; 4771 matches the bundle count printed in the EDA
# output further down -- NOTE(review): confirm 32770 is the total item count.
# Converting COO -> CSR sums any duplicated (row, col) entries.
bi_graph = sp.coo_matrix((values, (indices[:, 0], indices[:, 1])), shape=(4771, 32770)).tocsr()

In [8]:
# Item-by-item matrix: entry (i, j) sums, over all rows of bi_graph, the product
# of that row's values in columns i and j.
ii_graph = bi_graph.transpose() @ bi_graph

In [9]:
# Display the array of explicitly stored values of the item-item matrix.
ii_graph.data

array([1., 1., 1., ..., 1., 1., 1.])

# **EDA**

In [73]:
# Distinct item ids appearing in the user-item view and in the bundle-item view.
i_ui = set(ui['item'].unique())
i_bi = set(bi['item'].unique())

# Number of user-item interactions per item, most frequent first.
item_freq_ui = ui['item'].value_counts()
print("Item frequency in UI view:")
print(item_freq_ui)
print('\n\n')

# Split the interacted items at 5 interactions: fewer than 5 is "cold",
# 5 or more is "warm".
below_threshold = item_freq_ui < 5
cold_item = set(item_freq_ui[below_threshold].index)
print("Number of cold-item:", len(cold_item))
warm_item = set(item_freq_ui[~below_threshold].index)
print("Number of warm-item:", len(warm_item))

# Items that occur in some bundle but never in a user-item interaction.
non_interacted_item = i_bi - i_ui
print("Number of non-interacted item:", len(non_interacted_item))

print('\n\n')
print("Statistics of item frequency in UI view:")
print(item_freq_ui.describe())

Item frequency in UI view:
16253    778
14647    410
30487    408
23602    407
30693    393
        ... 
28469      1
29226      1
31209      1
18649      1
7676       1
Name: item, Length: 21034, dtype: int64



Number of cold-item: 15864
Number of warm-item: 5170
Number of non-interacted item: 11736



Statistics of item frequency in UI view:
count    21034.000000
mean         6.585290
std         19.441295
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max        778.000000
Name: item, dtype: float64


In [74]:
# Real bundle-item interactions: how many bundle rows each item appears in.
item_freq_bi = bi['item'].value_counts()

print("Item frequency in BI view:")
print(item_freq_bi)

cold_bundle = {}   # bundle id -> item list, only for bundles flagged as cold
bi_lvl = {}        # bundle id -> item list, for every bundle

# A single groupby pass replaces re-filtering the whole frame once per bundle
# (which scanned `bi` for every bundle id). sort=False keeps the bundles in
# first-appearance order, the same order bi['bundle'].unique() produced.
for x, bundle_items in bi.groupby('bundle', sort=False)['item']:
    items = bundle_items.tolist()
    bi_lvl[x] = items
    n = len(items)  # number of rows for this bundle (repeated items count)
    distinct_items = set(items)
    n_cold_or_unseen = (
        len(distinct_items.intersection(cold_item))
        + len(distinct_items.intersection(non_interacted_item))
    )
    # A bundle is "cold" when cold plus never-interacted items make up at
    # least half of its rows.
    if n_cold_or_unseen >= n * 0.5:
        cold_bundle[x] = items

print('\n\n')
print("Number of cold-bundle:", len(cold_bundle))
# bi_lvl holds one key per distinct bundle, so its size is the bundle count.
print("Number of warm-bundle:", len(bi_lvl) - len(cold_bundle))

print('\n\n')
print("Statistics of item frequency in BI view:")
print(item_freq_bi.describe())

Item frequency in BI view:
17427    394
6109     390
12058    376
16253    353
6740     343
        ... 
16358      1
15370      1
15333      1
14668      1
1796       1
Name: item, Length: 28074, dtype: int64



Number of cold-bundle: 975
Number of warm-bundle: 3796



Statistics of item frequency in BI view:
count    28074.000000
mean         6.292904
std         16.808898
min          1.000000
25%          1.000000
50%          2.000000
75%          5.000000
max        394.000000
Name: item, dtype: float64
