In [2]:
import pandas as pd
from tqdm import tqdm
import os

from datetime import date as dt
from datetime import datetime

from src.utils import train_tune_test_split, filter_rows, get_count

In [6]:
t_mall = '/home/jb/ved/datasets/Tmall/ijcai2016_taobao.csv'
ta_feng = '/home/jb/ved/datasets/Ta_feng/ta_feng_all_months_merged.csv'

# for auto-reloading external modules
%load_ext autoreload
%autoreload 2
pd.options.display.max_rows = 500

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Tmall Dataset

In [233]:
df_tmall = pd.read_csv(t_mall, encoding="ISO-8859-1", sep=',', error_bad_lines=False, usecols=[0,2,4,5], names=['user', 'item', 'action', 'time'])
df_tmall = df_tmall[(df_tmall['action']==1)] 
df_tmall = df_tmall[['user', 'item', 'time']] 

# assume each user only makes at most one transaction everyday
df_tmall['trans_id'] =  df_tmall['user'].astype(str) + df_tmall['time'].astype(str) 
# sort before finding the products in the same basket
df_tmall = df_tmall.sort_values(['trans_id']) 

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [235]:
# Associate transaction id for each row
user = df_tmall.user.values[0]
trans_buf_id = df_tmall.trans_id.values[0]
trans = 0
trans_id_list = []
for i in tqdm(range(df_tmall.shape[0])):
    if df_tmall.user.values[i] == user:
        if df_tmall.trans_id.values[i] != trans_buf_id:
            trans += 1
            trans_buf_id = df_tmall.trans_id.values[i]
        trans_id_list.append(trans)
    else:
        user = df_tmall.user.values[i]
        trans_buf_id = df_tmall.trans_id.values[i]
        trans = 0
        trans_id_list.append(trans)
df_tmall['trans_id'] = trans_id_list

100%|██████████| 9336595/9336595 [01:30<00:00, 102731.73it/s]


In [239]:
# remove users with less than 'min_uc' counts, and items with less than 'min_sc' counts
# k-cores filtering, cf. FPMC
df_tmall, user_activity, item_popularity = filter_rows(df_tmall, min_uc=25, min_sc=15)

In [242]:
users = df_tmall.user.nunique()
items = df_tmall.item.nunique()
print(f"After filtering:\n   users: {users}\n   items: {items}")

After filtering:
   users: 17143
   items: 23925


In [245]:
# exclude users with less than 10 transactions in total
user = df_tmall.user.values[0]
start = 0
df_list = []
train_test = [] # initial train_test indicator
for i in tqdm(range(df_tmall.shape[0])): # find transactions of each customer
    if df_tmall['user'].values[i] == user:
        continue
    df_user = df_tmall.iloc[start:i,:].copy()
    if df_user.trans_id.max() > 9: 
        df_list.append(df_user)
    user = df_tmall.user.values[i]
    start = i
df_tmall_filtered = pd.concat(df_list)
df_tmall_filtered = df_tmall_filtered[['user', 'item', 'trans_id']]
df_tmall_filtered.head(5)

100%|██████████| 640893/640893 [00:08<00:00, 71403.13it/s]


Unnamed: 0,user,item,trans_id
20558752,1000105,205453,0
20696166,1000105,361850,0
30983520,1000105,533325,0
20713303,1000105,1015542,0
37927052,1000105,2126251,1


In [246]:
users = df_tmall_filtered.user.nunique()
items = df_tmall_filtered.item.nunique()
print(f"After transaction filtering:\n   users: {users}\n   items: {items}")

After transaction filtering:
   users: 14279
   items: 23742


In [250]:
sparsity = 1. * df_tmall_filtered.shape[0] / (users * items)
print(f"Dataset sparsity: {round(sparsity*100, 2)}%")

Dataset sparsity: 0.16%


In [252]:
# structures used to numerize the id (ie. to associate incremental integers to have a continuous list of numbers as ids)
users_id = pd.unique(df_tmall_filtered['user'])
items_id = pd.unique(df_tmall_filtered['item'])

In [254]:
item2id = dict((iid, i) for (i, iid) in enumerate(items_id))
user2id = dict((uid, i) for (i, uid) in enumerate(users_id))

# def numerize(tp):
#     uid = list(map(lambda x: user2id[x], tp['user']))
#     iid = list(map(lambda x: item2id[x], tp['item']))
#     return pd.DataFrame(data={'uid': uid, 'iid': iid, 'tid': tp['trans_id']}, columns=['uid', 'iid', 'tid'])

In [256]:
numerized_data = df_tmall_filtered
numerized_data.user = numerized_data.user.apply(lambda x: user2id[x])
numerized_data.item = numerized_data.item.apply(lambda x: item2id[x])
numerized_data = numerized_data.rename(columns={'trans_id': 'tid'})
numerized_data.head(10)

Unnamed: 0,user,item,tid
20558752,0,0,0
20696166,0,1,0
30983520,0,2,0
20713303,0,3,0
37927052,0,4,1
40672291,0,5,2
31106013,0,6,2
31231583,0,7,2
30898588,0,8,2
40772729,0,9,2


In [274]:
# convert into sequences for each user
start = 0
df_dict_train = {}
df_dict_test = {}
df_dict_test_user_bool = {}
buf_seq_train = []
buf_seq_test = []
user = numerized_data.user.values[0]

for i in tqdm(range(numerized_data.shape[0])):
    if numerized_data['user'].values[i] == user:
        continue
    df_user = numerized_data.iloc[start:i,:].copy()
    user_known_items = set()

    for j in range(0, df_user.tid.max() + 1):            
        trans_items = set(df_user.loc[df_user.tid == j,'item'])
        if not len(trans_items) == 0:
            if j == df_user.tid.max():
                test_trans_items = set()
                for k in trans_items:
                    if k not in user_known_items:
                        test_trans_items.add(k)
                if len(test_trans_items) == 0:
                    df_dict_test_user_bool[user] = 0
                else:
                    df_dict_test_user_bool[user] = 1
                buf_seq_test.append(test_trans_items)
            else:    
                for k in trans_items:
                    user_known_items.add(k)
                buf_seq_train.append(trans_items)
    
    df_dict_train[user] = buf_seq_train
    df_dict_test[user] = buf_seq_test

    buf_seq_test = []
    buf_seq_train = []
    user = numerized_data.user.values[i]
    start = i    
    
sequences = pd.DataFrame()
sequences['user'] = df_dict_train.keys()
sequences['test_user'] = df_dict_test_user_bool.values()
sequences['sequence_train'] = df_dict_train.values()
sequences['sequence_test'] = df_dict_test.values()

100%|██████████| 549194/549194 [01:35<00:00, 5738.35it/s]
  return array(a, dtype, copy=False, order=order)


In [277]:
sequences[sequences['test_user'] == 1]

Unnamed: 0,user,test_user,sequence_train,sequence_test
0,0,1,"[{0, 1, 2, 3}, {4}, {5, 6, 7, 8, 9, 10}, {3, 1...","[{0, 1, 2, 3}, {4}, {5, 6, 7, 8, 9, 10}, {3, 1..."
1,1,1,"[{59, 53, 54, 55, 56, 25, 58, 27, 60, 61, 62, ...","[{59, 53, 54, 55, 56, 25, 58, 27, 60, 61, 62, ..."
2,2,1,"[{93}, {94}, {95}, {96, 97, 98, 99, 100, 101},...","[{93}, {94}, {95}, {96, 97, 98, 99, 100, 101},..."
3,3,1,"[{108}, {109}, {110}, {111}, {112, 113}, {114,...","[{108}, {109}, {110}, {111}, {112, 113}, {114,..."
4,4,1,"[{128, 129, 130, 131, 132}, {133}, {134}, {135...","[{128, 129, 130, 131, 132}, {133}, {134}, {135..."
...,...,...,...,...
14271,14271,1,"[{14605}, {4644}, {10338}, {9289}, {2988}, {18...","[{14605}, {4644}, {10338}, {9289}, {2988}, {18..."
14272,14272,1,"[{6394, 61}, {3225, 13820}, {3201}, {717}, {23...","[{6394, 61}, {3225, 13820}, {3201}, {717}, {23..."
14273,14273,1,"[{848, 18928, 15525}, {203}, {1842}, {13584, 8...","[{848, 18928, 15525}, {203}, {1842}, {13584, 8..."
14274,14274,1,"[{18855}, {937}, {4459}, {7314}, {9071}, {1273...","[{18855}, {937}, {4459}, {7314}, {9071}, {1273..."


In [278]:
sequences.at[0, 'sequence_train']

[{0, 1, 2, 3},
 {4},
 {5, 6, 7, 8, 9, 10},
 {3, 11, 12, 13, 14},
 {7, 15, 16, 17, 18, 19, 20},
 {21},
 {22},
 {21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37},
 {38},
 {24},
 {37, 39, 40, 41},
 {11, 42, 43},
 {44},
 {21},
 {8, 45, 46, 47, 48},
 {15, 49, 50}]

In [281]:
sequences.at[0, 'sequence_test']

({51, 52},)

# Ta-Feng Dataset

In [7]:
df_ta_feng = pd.read_csv(ta_feng, encoding="ISO-8859-1", sep=',', error_bad_lines=False, usecols=[0,1,5], names=['time', 'user', 'item'])

# assume each user only makes at most one transaction everyday
df_ta_feng['trans_id'] =  df_ta_feng['user'].astype(str) + df_ta_feng['time'].astype(str) 
# sort before finding the products in the same basket
df_ta_feng = df_ta_feng.sort_values(['trans_id']) 

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [9]:
# Associate transaction id for each row
user = df_ta_feng.user.values[0]
trans_buf_id = df_ta_feng.trans_id.values[0]
trans = 0
trans_id_list = []
for i in tqdm(range(df_ta_feng.shape[0])):
    if df_ta_feng.user.values[i] == user:
        if df_ta_feng.trans_id.values[i] != trans_buf_id:
            trans += 1
            trans_buf_id = df_ta_feng.trans_id.values[i]
        trans_id_list.append(trans)
    else:
        user = df_ta_feng.user.values[i]
        trans_buf_id = df_ta_feng.trans_id.values[i]
        trans = 0
        trans_id_list.append(trans)
df_ta_feng['trans_id'] = trans_id_list

100%|██████████| 817742/817742 [00:06<00:00, 126639.13it/s]


In [10]:
# remove users with less than 'min_uc' counts, and items with less than 'min_sc' counts
# k-cores filtering, cf. FPMC
df_ta_feng, user_activity, item_popularity = filter_rows(df_ta_feng, min_uc=25, min_sc=15)

In [12]:
users = df_ta_feng.user.nunique()
items = df_ta_feng.item.nunique()
print(f"After filtering:\n   users: {users}\n   items: {items}")

After filtering:
   users: 8821
   items: 9027


In [13]:
# exclude users with less than 10 transactions in total
user = df_ta_feng.user.values[0]
start = 0
df_list = []
train_test = [] # initial train_test indicator
for i in tqdm(range(df_ta_feng.shape[0])): # find transactions of each customer
    if df_ta_feng['user'].values[i] == user:
        continue
    df_user = df_ta_feng.iloc[start:i,:].copy()
    if df_user.trans_id.max() > 9: 
        df_list.append(df_user)
    user = df_ta_feng.user.values[i]
    start = i
df_ta_feng_filtered = pd.concat(df_list)
df_ta_feng_filtered = df_ta_feng_filtered[['user', 'item', 'trans_id']]
df_ta_feng_filtered.head(5)

100%|██████████| 467150/467150 [00:03<00:00, 122888.37it/s]


Unnamed: 0,user,item,trans_id
454270,100021,4710105045443,0
451859,100021,4710063341090,0
455830,100021,4710339772139,0
467287,100021,4710094014741,1
460286,100021,9310042571491,1


In [14]:
users = df_ta_feng_filtered.user.nunique()
items = df_ta_feng_filtered.item.nunique()
print(f"After transaction filtering:\n   users: {users}\n   items: {items}")

After transaction filtering:
   users: 1594
   items: 8331


In [16]:
sparsity = 1. * df_ta_feng_filtered.shape[0] / (users * items)
print(f"Dataset sparsity: {round(sparsity*100, 2)}%")

Dataset sparsity: 1.08%


In [18]:
# structures used to numerize the id (ie. to associate incremental integers to have a continuous list of numbers as ids)
users_id = pd.unique(df_ta_feng_filtered['user'])
items_id = pd.unique(df_ta_feng_filtered['item'])

In [19]:
item2id = dict((iid, i) for (i, iid) in enumerate(items_id))
user2id = dict((uid, i) for (i, uid) in enumerate(users_id))

In [20]:
numerized_data = df_ta_feng_filtered
numerized_data.user = numerized_data.user.apply(lambda x: user2id[x])
numerized_data.item = numerized_data.item.apply(lambda x: item2id[x])
numerized_data = numerized_data.rename(columns={'trans_id': 'tid'})
numerized_data.head(10)

Unnamed: 0,user,item,tid
454270,0,0,0
451859,0,1,0
455830,0,2,0
467287,0,3,1
460286,0,4,1
466455,0,5,1
399592,0,6,1
466130,0,7,1
399593,0,8,1
465712,0,9,1


In [21]:
# convert into sequences for each user
start = 0
df_dict_train = {}
df_dict_test = {}
df_dict_test_user_bool = {}
buf_seq_train = []
buf_seq_test = []
user = numerized_data.user.values[0]

for i in tqdm(range(numerized_data.shape[0])):
    if numerized_data['user'].values[i] == user:
        continue
    df_user = numerized_data.iloc[start:i,:].copy()
    user_known_items = set()

    for j in range(0, df_user.tid.max() + 1):            
        trans_items = set(df_user.loc[df_user.tid == j,'item'])
        if not len(trans_items) == 0:
            if j == df_user.tid.max():
                test_trans_items = set()
                for k in trans_items:
                    if k not in user_known_items:
                        test_trans_items.add(k)
                if len(test_trans_items) == 0:
                    df_dict_test_user_bool[user] = 0
                else:
                    df_dict_test_user_bool[user] = 1
                buf_seq_test.append(test_trans_items)
            else:    
                for k in trans_items:
                    user_known_items.add(k)
                buf_seq_train.append(trans_items)
    
    df_dict_train[user] = buf_seq_train
    df_dict_test[user] = buf_seq_test

    buf_seq_test = []
    buf_seq_train = []
    user = numerized_data.user.values[i]
    start = i    
    
sequences = pd.DataFrame()
sequences['user'] = df_dict_train.keys()
sequences['test_user'] = df_dict_test_user_bool.values()
sequences['sequence_train'] = df_dict_train.values()
sequences['sequence_test'] = df_dict_test.values()

100%|██████████| 143310/143310 [00:08<00:00, 16066.21it/s]
  return array(a, dtype, copy=False, order=order)


In [22]:
sequences[sequences['test_user'] == 1]

Unnamed: 0,user,test_user,sequence_train,sequence_test
1,1,1,"[{80}, {80}, {81, 82}, {83, 84}, {81, 85, 86, ...","({124, 125},)"
2,2,1,"[{128, 16, 126, 127}, {129, 130}, {131, 132, 1...","({164, 165},)"
3,3,1,"[{168, 166, 167}, {169, 170, 171, 172, 173, 17...","({211, 212, 213},)"
4,4,1,"[{216, 214, 215}, {217, 218, 219, 220}, {224, ...","({279, 280, 281, 282, 283, 284, 285},)"
5,5,1,"[{288, 289, 290, 291, 286, 287}, {292, 293, 29...","({468},)"
...,...,...,...,...
1588,1588,1,"[{544, 587, 6832, 5910, 1116, 2879}, {159, 609...","({1001, 339, 967},)"
1589,1589,1,"[{1624, 577, 1116, 1541}, {448, 449, 450, 451,...","({963, 4550, 1323, 4245, 3482},)"
1590,1590,1,"[{4837, 841, 1211, 494, 242, 51, 1396, 1302, 1...","({225, 583, 1417, 6764, 2840},)"
1591,1591,1,"[{4185, 590}, {513}, {6650, 6573, 5431}, {1118...","({703, 1071},)"


In [23]:
sequences.at[1588, 'sequence_train']

[{544, 587, 1116, 2879, 5910, 6832},
 {159, 427, 609, 659, 767, 831, 1028, 1157, 4724, 5213},
 {292, 554, 2351, 3248, 5503},
 {1144, 4393, 5634},
 {154, 275, 878, 995, 1343, 1454, 1669, 2030, 2230, 5452, 5929, 7143},
 {154, 609, 844, 1089, 1669, 2084, 8329},
 {3371, 8119},
 {174, 558, 2788, 5101},
 {344, 863, 3009, 5752},
 {154, 544, 609, 844, 5101, 5237, 6822, 7333}]

In [24]:
sequences.at[1588, 'sequence_test']

({339, 967, 1001},)