In [1]:
import pandas as pd
from tqdm import tqdm
import os

from datetime import date as dt
from datetime import datetime

from src.utils import train_tune_test_split, filter_rows, get_count

In [2]:
t_mall = '/home/jb/ved/datasets/Tmall/ijcai2016_taobao.csv'
t_mall_frequences_train = '/home/jb/ved/datasets/Tmall/tmall_frequences_train.csv'
t_mall_frequences_test = '/home/jb/ved/datasets/Tmall/tmall_frequences_test.csv'
tmall_sequences_train = '/home/jb/ved/datasets/Tmall/sequences.tsv'
tmall_baskets_train = '/home/jb/ved/datasets/Tmall/baskets_train.tsv'
tmall_baskets_test = '/home/jb/ved/datasets/Tmall/basket_test.tsv'

ta_feng = '/home/jb/ved/datasets/Ta_feng/ta_feng_all_months_merged.csv'
ta_feng_frequences_train = '/home/jb/ved/datasets/Ta_feng/tafeng_frequences_train.csv'
ta_feng_frequences_test = '/home/jb/ved/datasets/Ta_feng/tafeng_frequences_test.csv'

# for auto-reloading external modules
%load_ext autoreload
%autoreload 2
pd.options.display.max_rows = 500

# Tmall Dataset

In [3]:
df_tmall = pd.read_csv(t_mall, encoding="ISO-8859-1", sep=',', error_bad_lines=False, usecols=[0,2,4,5], names=['user', 'item', 'action', 'time'])
df_tmall = df_tmall[(df_tmall['action']==1)] 
df_tmall = df_tmall[['user', 'item', 'time']] 

# assume each user only makes at most one transaction everyday
df_tmall['trans_id'] =  df_tmall['user'].astype(str) + df_tmall['time'].astype(str) 
# sort before finding the products in the same basket
df_tmall = df_tmall.sort_values(['trans_id']) 

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# Associate transaction id for each row
user = df_tmall.user.values[0]
trans_buf_id = df_tmall.trans_id.values[0]
trans = 0
trans_id_list = []
for i in tqdm(range(df_tmall.shape[0])):
    if df_tmall.user.values[i] == user:
        if df_tmall.trans_id.values[i] != trans_buf_id:
            trans += 1
            trans_buf_id = df_tmall.trans_id.values[i]
        trans_id_list.append(trans)
    else:
        user = df_tmall.user.values[i]
        trans_buf_id = df_tmall.trans_id.values[i]
        trans = 0
        trans_id_list.append(trans)
df_tmall['trans_id'] = trans_id_list

100%|██████████| 9336595/9336595 [01:28<00:00, 105309.85it/s]


In [5]:
# remove users with less than 'min_uc' counts, and items with less than 'min_sc' counts
# k-cores filtering, cf. FPMC
df_tmall, user_activity, item_popularity = filter_rows(df_tmall, min_uc=25, min_sc=15)

In [6]:
users = df_tmall.user.nunique()
items = df_tmall.item.nunique()
print(f"After filtering:\n   users: {users}\n   items: {items}")

After filtering:
   users: 39157
   items: 91572


In [7]:
# exclude users with less than 10 transactions in total
user = df_tmall.user.values[0]
start = 0
df_list = []
train_test = [] # initial train_test indicator
for i in tqdm(range(df_tmall.shape[0])): # find transactions of each customer
    if df_tmall['user'].values[i] == user:
        continue
    df_user = df_tmall.iloc[start:i,:].copy()
    if df_user.trans_id.max() > 9: 
        df_list.append(df_user)
    user = df_tmall.user.values[i]
    start = i
df_tmall_filtered = pd.concat(df_list)
df_tmall_filtered = df_tmall_filtered[['user', 'item', 'trans_id']]
df_tmall_filtered.head(5)

100%|██████████| 1462418/1462418 [00:16<00:00, 87432.66it/s]


Unnamed: 0,user,item,trans_id
20898186,1000038,322669,1
20913818,1000038,1749077,1
35188420,1000038,1814039,2
35188419,1000038,1814039,3
35188422,1000038,1814039,4


In [8]:
users = df_tmall_filtered.user.nunique()
items = df_tmall_filtered.item.nunique()
print(f"After transaction filtering:\n   users: {users}\n   items: {items}")

After transaction filtering:
   users: 32510
   items: 90404


In [9]:
sparsity = 1. * df_tmall_filtered.shape[0] / (users * items)
print(f"Dataset sparsity: {round(sparsity*100, 2)}%")

Dataset sparsity: 0.04%


In [10]:
# structures used to numerize the id (ie. to associate incremental integers to have a continuous list of numbers as ids)
users_id = pd.unique(df_tmall_filtered['user'])
items_id = pd.unique(df_tmall_filtered['item'])

In [11]:
item2id = dict((iid, i) for (i, iid) in enumerate(items_id))
user2id = dict((uid, i) for (i, uid) in enumerate(users_id))

# def numerize(tp):
#     uid = list(map(lambda x: user2id[x], tp['user']))
#     iid = list(map(lambda x: item2id[x], tp['item']))
#     return pd.DataFrame(data={'uid': uid, 'iid': iid, 'tid': tp['trans_id']}, columns=['uid', 'iid', 'tid'])

In [12]:
numerized_data = df_tmall_filtered
numerized_data.user = numerized_data.user.apply(lambda x: user2id[x])
numerized_data.item = numerized_data.item.apply(lambda x: item2id[x])
numerized_data = numerized_data.rename(columns={'trans_id': 'tid'})
numerized_data.head(10)

Unnamed: 0,user,item,tid
20898186,0,0,1
20913818,0,1,1
35188420,0,2,2
35188419,0,2,3
35188422,0,2,4
23180154,0,3,5
18264956,0,4,6
18264958,0,4,6
18264963,0,4,6
40222508,0,5,7


In [13]:
# convert into sequences for each user
start = 0
df_dict_train = {}
df_dict_test = {}
df_dict_test_user_bool = {}
buf_seq_train = []
buf_seq_test = []
user = numerized_data.user.values[0]

for i in tqdm(range(numerized_data.shape[0])):
    if numerized_data['user'].values[i] == user:
        continue
    df_user = numerized_data.iloc[start:i,:].copy()
    user_known_items = set()

    for j in range(0, df_user.tid.max() + 1):            
        trans_items = set(df_user.loc[df_user.tid == j,'item'])
        if not len(trans_items) == 0:
            if j == df_user.tid.max():
                test_trans_items = set()
                for k in trans_items:
                    if k not in user_known_items:
                        test_trans_items.add(k)
                if len(test_trans_items) == 0:
                    df_dict_test_user_bool[user] = 0
                else:
                    df_dict_test_user_bool[user] = 1
                buf_seq_test.append(test_trans_items)
            else:    
                for k in trans_items:
                    user_known_items.add(k)
                buf_seq_train.append(trans_items)
    
    df_dict_train[user] = buf_seq_train
    df_dict_test[user] = buf_seq_test

    buf_seq_test = []
    buf_seq_train = []
    user = numerized_data.user.values[i]
    start = i    
    
sequences = pd.DataFrame()
sequences['user'] = df_dict_train.keys()
sequences['test_user'] = df_dict_test_user_bool.values()
sequences['sequence_train'] = df_dict_train.values()
sequences['sequence_test'] = df_dict_test.values()

100%|██████████| 1251569/1251569 [03:17<00:00, 6322.90it/s]
  return array(a, dtype, copy=False, order=order)


In [14]:
sequences[sequences['test_user'] == 1]

Unnamed: 0,user,test_user,sequence_train,sequence_test
0,0,1,"[{0, 1}, {2}, {2}, {2}, {3}, {4}, {5, 6}, {5},...","({11},)"
1,1,1,"[{12}, {13}, {14}, {15}, {16, 17}, {16}, {18},...","({40, 39},)"
2,2,1,"[{41, 42, 43, 44, 45, 46}, {48, 49, 47}, {50, ...","({126, 127},)"
3,3,1,"[{2}, {128, 129}, {130}, {85}, {131}, {132}, {...","({143},)"
5,5,1,"[{160, 161, 162, 77, 79, 151, 152, 153, 154, 1...","({206},)"
...,...,...,...,...
32504,32504,1,"[{9344}, {6958}, {54858}, {2357}, {46045}, {67...","({39419},)"
32505,32505,1,"[{13731, 12766, 28300, 2062, 3671, 11162, 6970...","({45664},)"
32506,32506,1,"[{23358}, {6701}, {3326}, {52900, 19037, 49897...","({60984, 27705},)"
32507,32507,1,"[{78907, 7869}, {30317, 19492, 20117, 6476}, {...","({5767},)"


In [15]:
sequences.at[0, 'sequence_train']

[{0, 1}, {2}, {2}, {2}, {3}, {4}, {5, 6}, {5}, {2}, {7}, {8}, {9}, {10}]

In [16]:
sequences.at[0, 'sequence_test']

({11},)

In [17]:
# convert the train sequences into lines <(itemset1) (itemset2)... (itemsetk)> for each user into a file

with open(tmall_sequences_train, 'w') as out_file:
    for i in tqdm(range(sequences.shape[0])):

        seq = sequences.sequence_train.values[i]

        c = 0
        seq_string = ""
        for itemset in seq:
            c += 1
            for item in itemset:  
                seq_string = seq_string + str(item) + " "
            if c != len(seq):
                seq_string = seq_string + "-1 "
                
        out_file.write(seq_string+"-2\n")

100%|██████████| 32509/32509 [00:00<00:00, 48384.37it/s]


In [20]:
# convert the train sequences into lines:
#       < user (itemset1) >
#       < user (itemset2) >
#       < user (itemset3) >
#              ...
#       < user (itemsetk) >

with open(tmall_baskets_train, 'w') as train_file, open(tmall_baskets_test, 'w') as test_file:

    for i in tqdm(range(sequences.shape[0])):

        user = sequences.user.values[i]
        seq_train = sequences.sequence_train.values[i]

        seq_string = str(user) + ", "
        for itemset in seq_train:
            for item in itemset:  
                seq_string = seq_string + str(item) + " "

            train_file.write(seq_string + "\n")
            seq_string = str(user) + ", "

        if sequences.test_user.values[i] == 1:
            seq_string = str(user) + " "
            seq_test = sequences.sequence_test.values[i]

            for itemset in seq_test:
                for item in itemset:  
                    seq_string = seq_string + str(item) + " "
                test_file.write(seq_string + "\n")
                seq_string = str(user) + ", "


100%|██████████| 32509/32509 [00:01<00:00, 25095.15it/s]


In [17]:
# convert into a <user, item, frequence> regular csv training file
freqs = {}

for i in tqdm(range(sequences.shape[0])):
    user = sequences.user.values[i]
    seqs = sequences.sequence_train.values[i]
    for seq in seqs:
        for item in seq:                           
            freqs[(user,item)] = freqs.get((user,item), 0) + 1
freqs = list(map(lambda kv: (kv[0][0], kv[0][1], kv[1]), freqs.items()))
frequences_train = pd.DataFrame(freqs, columns = ['user_id' , 'item_id', 'frequence']) 

100%|██████████| 32509/32509 [00:00<00:00, 45600.84it/s]


In [18]:
frequences_train.head()

Unnamed: 0,user_id,item_id,frequence
0,0,0,1
1,0,1,1
2,0,2,4
3,0,3,1
4,0,4,1


In [19]:
frequences_train.to_csv(t_mall_frequences_train, index=False)

In [20]:
# convert into a <user, item, frequence> regular csv test file
freqs = {}

for i in tqdm(range(sequences.shape[0])):
    user = sequences.user.values[i]
    seqs = sequences.sequence_test.values[i]
    for seq in seqs:
        for item in seq:                           
            freqs[(user,item)] = freqs.get((user,item), 0) + 1
freqs = list(map(lambda kv: (kv[0][0], kv[0][1], kv[1]), freqs.items()))
frequences_test = pd.DataFrame(freqs, columns = ['user_id' , 'item_id', 'frequence']) 

100%|██████████| 32509/32509 [00:00<00:00, 117927.52it/s]


In [21]:
frequences_test.to_csv(t_mall_frequences_test, index=False)

# Ta-Feng Dataset

In [81]:
df_ta_feng = pd.read_csv(ta_feng, encoding="ISO-8859-1", sep=',', error_bad_lines=False, usecols=[0,1,5], names=['time', 'user', 'item'])

# assume each user only makes at most one transaction everyday
df_ta_feng['trans_id'] =  df_ta_feng['user'].astype(str) + df_ta_feng['time'].astype(str) 
# sort before finding the products in the same basket
df_ta_feng = df_ta_feng.sort_values(['trans_id']) 

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [82]:
# Associate transaction id for each row
user = df_ta_feng.user.values[0]
trans_buf_id = df_ta_feng.trans_id.values[0]
trans = 0
trans_id_list = []
for i in tqdm(range(df_ta_feng.shape[0])):
    if df_ta_feng.user.values[i] == user:
        if df_ta_feng.trans_id.values[i] != trans_buf_id:
            trans += 1
            trans_buf_id = df_ta_feng.trans_id.values[i]
        trans_id_list.append(trans)
    else:
        user = df_ta_feng.user.values[i]
        trans_buf_id = df_ta_feng.trans_id.values[i]
        trans = 0
        trans_id_list.append(trans)
df_ta_feng['trans_id'] = trans_id_list

100%|██████████| 817742/817742 [00:06<00:00, 121324.29it/s]


In [83]:
# remove users with less than 'min_uc' counts, and items with less than 'min_sc' counts
# k-cores filtering, cf. FPMC
df_ta_feng, user_activity, item_popularity = filter_rows(df_ta_feng, min_uc=25, min_sc=15)

In [84]:
users = df_ta_feng.user.nunique()
items = df_ta_feng.item.nunique()
print(f"After filtering:\n   users: {users}\n   items: {items}")

After filtering:
   users: 8821
   items: 9027


In [85]:
# exclude users with less than 10 transactions in total
user = df_ta_feng.user.values[0]
start = 0
df_list = []
train_test = [] # initial train_test indicator
for i in tqdm(range(df_ta_feng.shape[0])): # find transactions of each customer
    if df_ta_feng['user'].values[i] == user:
        continue
    df_user = df_ta_feng.iloc[start:i,:].copy()
    if df_user.trans_id.max() > 9: 
        df_list.append(df_user)
    user = df_ta_feng.user.values[i]
    start = i
df_ta_feng_filtered = pd.concat(df_list)
df_ta_feng_filtered = df_ta_feng_filtered[['user', 'item', 'trans_id']]
df_ta_feng_filtered.head(5)

100%|██████████| 467150/467150 [00:03<00:00, 126789.51it/s]


Unnamed: 0,user,item,trans_id
454270,100021,4710105045443,0
451859,100021,4710063341090,0
455830,100021,4710339772139,0
467287,100021,4710094014741,1
460286,100021,9310042571491,1


In [86]:
users = df_ta_feng_filtered.user.nunique()
items = df_ta_feng_filtered.item.nunique()
print(f"After transaction filtering:\n   users: {users}\n   items: {items}")

After transaction filtering:
   users: 1594
   items: 8331


In [87]:
sparsity = 1. * df_ta_feng_filtered.shape[0] / (users * items)
print(f"Dataset sparsity: {round(sparsity*100, 2)}%")

Dataset sparsity: 1.08%


In [88]:
# structures used to numerize the id (ie. to associate incremental integers to have a continuous list of numbers as ids)
users_id = pd.unique(df_ta_feng_filtered['user'])
items_id = pd.unique(df_ta_feng_filtered['item'])

In [89]:
item2id = dict((iid, i) for (i, iid) in enumerate(items_id))
user2id = dict((uid, i) for (i, uid) in enumerate(users_id))

In [90]:
numerized_data = df_ta_feng_filtered
numerized_data.user = numerized_data.user.apply(lambda x: user2id[x])
numerized_data.item = numerized_data.item.apply(lambda x: item2id[x])
numerized_data = numerized_data.rename(columns={'trans_id': 'tid'})
numerized_data.head(10)

Unnamed: 0,user,item,tid
454270,0,0,0
451859,0,1,0
455830,0,2,0
467287,0,3,1
460286,0,4,1
466455,0,5,1
399592,0,6,1
466130,0,7,1
399593,0,8,1
465712,0,9,1


In [91]:
# convert into sequences for each user
start = 0
df_dict_train = {}
df_dict_test = {}
df_dict_test_user_bool = {}
buf_seq_train = []
buf_seq_test = []
user = numerized_data.user.values[0]

for i in tqdm(range(numerized_data.shape[0])):
    if numerized_data['user'].values[i] == user:
        continue
    df_user = numerized_data.iloc[start:i,:].copy()
    user_known_items = set()

    for j in range(0, df_user.tid.max() + 1):            
        trans_items = set(df_user.loc[df_user.tid == j,'item'])
        if not len(trans_items) == 0:
            if j == df_user.tid.max():
                test_trans_items = set()
                for k in trans_items:
                    if k not in user_known_items:
                        test_trans_items.add(k)
                if len(test_trans_items) == 0:
                    df_dict_test_user_bool[user] = 0
                else:
                    df_dict_test_user_bool[user] = 1
                buf_seq_test.append(test_trans_items)
            else:    
                for k in trans_items:
                    user_known_items.add(k)
                buf_seq_train.append(trans_items)
    
    df_dict_train[user] = buf_seq_train
    df_dict_test[user] = buf_seq_test

    buf_seq_test = []
    buf_seq_train = []
    user = numerized_data.user.values[i]
    start = i    
    
sequences = pd.DataFrame()
sequences['user'] = df_dict_train.keys()
sequences['test_user'] = df_dict_test_user_bool.values()
sequences['sequence_train'] = df_dict_train.values()
sequences['sequence_test'] = df_dict_test.values()

100%|██████████| 143310/143310 [00:08<00:00, 16606.36it/s]
  return array(a, dtype, copy=False, order=order)


In [92]:
sequences[sequences['test_user'] == 0]

Unnamed: 0,user,test_user,sequence_train,sequence_test
0,0,0,"[{0, 1, 2}, {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...","({},)"
19,19,0,"[{1133, 159}, {1129}, {1136, 1134, 1135}, {114...","({},)"
24,24,0,"[{320, 995, 998, 1001, 1337, 1338, 1339}, {971...","({},)"
31,31,0,"[{463, 1625, 659, 1623, 1624, 409, 1626, 1627,...","({},)"
32,32,0,"[{1646}, {314, 1647}, {1655, 1324, 1453, 1648,...","({},)"
34,34,0,"[{1889, 1170}, {1890}, {834, 1891}, {1892, 189...","({},)"
66,66,0,"[{1730}, {1033, 2778, 2777, 68}, {545, 2779, 2...","({},)"
74,74,0,"[{428, 174}, {428, 2974}, {1753, 2525, 174}, {...","({},)"
85,85,0,"[{291}, {159, 51, 3268, 1463}, {821}, {314}, {...","({},)"
88,88,0,"[{994, 35, 3368, 3369, 3370, 496, 3061}, {834,...","({},)"


In [93]:
sequences.at[1588, 'sequence_train']

[{544, 587, 1116, 2879, 5910, 6832},
 {159, 427, 609, 659, 767, 831, 1028, 1157, 4724, 5213},
 {292, 554, 2351, 3248, 5503},
 {1144, 4393, 5634},
 {154, 275, 878, 995, 1343, 1454, 1669, 2030, 2230, 5452, 5929, 7143},
 {154, 609, 844, 1089, 1669, 2084, 8329},
 {3371, 8119},
 {174, 558, 2788, 5101},
 {344, 863, 3009, 5752},
 {154, 544, 609, 844, 5101, 5237, 6822, 7333}]

In [101]:
sequences.at[1588, 'sequence_test']

({339, 967, 1001},)

In [95]:
# convert into a <user, item, frequence> regular csv file
freqs = {}

for i in tqdm(range(sequences.shape[0])):
    user = sequences.user.values[i]
    seqs = sequences.sequence_train.values[i]
    for seq in seqs:
        for item in seq:                           
            freqs[(user,item)] = freqs.get((user,item), 0) + 1
freqs = list(map(lambda kv: (kv[0][0], kv[0][1], kv[1]), freqs.items()))
frequences_train = pd.DataFrame(freqs, columns = ['user_id' , 'item_id', 'frequence']) 

100%|██████████| 1593/1593 [00:00<00:00, 20293.11it/s]


In [96]:
frequences_train.head()

Unnamed: 0,user_id,item_id,frequence
0,0,0,3
1,0,1,3
2,0,2,5
3,0,3,2
4,0,4,1


In [97]:
frequences_train.to_csv(ta_feng_frequences_train, index=False)

In [98]:
# convert into a <user, item, frequence> regular csv test file
freqs = {}

for i in tqdm(range(sequences.shape[0])):
    user = sequences.user.values[i]
    seqs = sequences.sequence_test.values[i]
    for seq in seqs:
        for item in seq:                           
            freqs[(user,item)] = freqs.get((user,item), 0) + 1
freqs = list(map(lambda kv: (kv[0][0], kv[0][1], kv[1]), freqs.items()))
frequences_test = pd.DataFrame(freqs, columns = ['user_id' , 'item_id', 'frequence']) 

100%|██████████| 1593/1593 [00:00<00:00, 82150.25it/s]


In [99]:
frequences_test.head()

Unnamed: 0,user_id,item_id,frequence
0,1,124,1
1,1,125,1
2,2,164,1
3,2,165,1
4,3,211,1


In [100]:
frequences_test.to_csv(ta_feng_frequences_test, index=False)