In [1]:
# !pip install similaripy==0.1.2 

In [2]:
# !pip install cython==0.29.30

In [32]:
import pandas as pd
import random
import numpy as np
from scipy import sparse
import similaripy as sim
import pickle
from datetime import datetime

In [245]:
import sys
sys.path.append('./models/')
import pernir_preproc

## 0. Set Hyperparameters

In [10]:
# Filtering thresholds, passed to pernir_preproc.filter_orders further down.
min_basket_per_user = 4
min_item_per_basket = 4
# Number of users requested from pernir_preproc.sample_users.
sample_size = 10000
# NOTE(review): seed_len is not referenced in the visible part of this notebook -- confirm it is still needed.
seed_len = 3
# Passed to pernir_preproc.create_masked_baskets; presumably the fraction of each
# basket that gets masked -- TODO confirm against the helper.
mask_percentage = 0.2

# Seed the global RNGs so that user sampling and basket masking are repeatable under
# Restart & Run All. This only helps if the pernir_preproc helpers draw from `random`
# or `numpy.random` (their source is not visible here -- TODO confirm).
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

## 1. Instacart

In [11]:
prior_orders_file_path = '../data/instacart/order_products__prior.csv'
train_orders_file_path = '../data/instacart/order_products__train.csv'
orders_file_path = '../data/instacart/orders.csv'

In [12]:
all_orders = pernir_preproc.read_and_merge_orders(prior_orders_file_path, train_orders_file_path, orders_file_path)

In [30]:
all_orders

Unnamed: 0,basket_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,item_id,add_to_cart_order,reordered
0,2539329,1,prior,1,2,8,,196,1,0
1,2539329,1,prior,1,2,8,,14084,2,0
2,2539329,1,prior,1,2,8,,12427,3,0
3,2539329,1,prior,1,2,8,,26088,4,0
4,2539329,1,prior,1,2,8,,26405,5,0
...,...,...,...,...,...,...,...,...,...,...
33819101,272231,206209,train,14,6,14,30.0,40603,4,0
33819102,272231,206209,train,14,6,14,30.0,15655,5,0
33819103,272231,206209,train,14,6,14,30.0,42606,6,0
33819104,272231,206209,train,14,6,14,30.0,37966,7,0


In [13]:
all_orders, basket_per_user = pernir_preproc.filter_orders(all_orders, min_basket_per_user, min_item_per_basket)
users = pernir_preproc.sample_users(basket_per_user, sample_size)

In [14]:
train_baskets_file_path = '../data/instacart/train_baskets.csv'
test_baskets_file_path = '../data/instacart/test_baskets.csv'
valid_baskets_file_path = '../data/instacart/valid_baskets.csv'

In [15]:
# Split the filtered orders of the sampled users into the three basket sets
# returned by split_baskets (train / test / validation).
train_baskets, test_baskets, valid_baskets = pernir_preproc.split_baskets(
    all_orders, users
)

# Persist each split to the CSV path defined in the previous cell.
pernir_preproc.save_baskets(
    train_baskets, test_baskets, valid_baskets,
    train_baskets_file_path, test_baskets_file_path, valid_baskets_file_path,
)

In [16]:
basket_items_dict = pernir_preproc.create_basket_items_dict(train_baskets)
user_baskets_dict = pernir_preproc.create_user_baskets_dict(train_baskets)
item_base_scores = pernir_preproc.create_item_base_scores(user_baskets_dict, basket_items_dict)
data_list = pernir_preproc.create_data_list(item_base_scores, user_baskets_dict)

In [17]:
file_path = '../data/instacart/user_item_scores.csv'
pernir_preproc.save_dataframe(data_list, file_path)

In [21]:
df = pd.read_csv(file_path)
item_dic, rev_item_dic, user_dic, rev_user_dic = pernir_preproc.create_user_item_dicts(df)
userItem_mat = pernir_preproc.create_user_item_matrix(df, user_dic, item_dic)
final_user_sim_dict = pernir_preproc.create_user_sim_dict(userItem_mat, user_dic, rev_user_dic)

Done: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:02<00:00, 4728.86it/s]


In [22]:
file_path = '../data/instacart/user_sim.pickle'
pernir_preproc.save_user_sim_dict(final_user_sim_dict, file_path)

In [23]:
# Load the validation split that was written to disk earlier in this section.
basket_path = '../data/instacart/valid_baskets.csv'
# NOTE(review): sample_path is never used in the visible cells; the samples built from
# these validation baskets are later written to
# '../data/instacart/test_sample_to_predict_instacart.csv' -- confirm whether the
# "valid" or the "test" name is the intended one.
sample_path = '../data/instacart/valid_samples.csv'

# NOTE: this rebinds basket_items_dict, which until now held the dict built from train_baskets.
basket_users_dict, basket_items_dict = pernir_preproc.read_and_process_baskets(basket_path)

In [24]:
# Build the masked-basket sample lists from the validation baskets loaded above;
# mask_percentage is set in the hyper-parameter cell.
(
    sample_baskets,
    sample_users,
    full_baskets,
    masked_items_list,
    masked_baskets,
    num_to_mask_list,
) = pernir_preproc.create_masked_baskets(
    basket_items_dict, basket_users_dict, mask_percentage
)


In [25]:
# Gather the six outputs of create_masked_baskets into a single frame.
sample_df = pernir_preproc.create_sample_dataframe(
    sample_baskets, sample_users, full_baskets,
    masked_items_list, masked_baskets, num_to_mask_list,
)

In [27]:
sample_df.head(3)

Unnamed: 0,basket_id,user_id,full_basket,masked_items,masked_basket,num_to_mask
0,282,8456,"[2727, 21338, 23767, 43892]",[43892],"[2727, 21338, 23767]",1
1,1016,26337,"[38152, 12916, 36126, 32175]",[32175],"[38152, 12916, 36126]",1
2,1973,115583,"[4778, 10957, 40174, 36618, 7648, 14028, 17696]",[10957],"[4778, 40174, 36618, 7648, 14028, 17696]",1


In [29]:
sample_df.to_csv('../data/instacart/test_sample_to_predict_instacart.csv', index = False)

## 2. Ta Feng

In [288]:
# Ta Feng-specific thresholds. NOTE: these rebind the notebook-wide names set in the
# hyper-parameter cell, so re-running the Instacart cells after this one would pick up
# these values instead of the original ones.
min_basket_per_user = 1
min_item_per_basket = 3
sample_size = 10000

In [289]:
# Read the merged Ta Feng transactions and keep only the three columns used below.
ta_feng_dataset = pd.read_csv('../data/ta_feng/ta_feng_all_months_merged.csv')[
    ['TRANSACTION_DT', 'CUSTOMER_ID', 'PRODUCT_ID']]

# Normalise the transaction date from month/day/year (either '/' or '-' separated)
# to an ISO 'YYYY-MM-DD' string. Vectorised string/datetime operations replace the
# previous row-wise apply + Python strptime loop; the resulting strings are the same.
ta_feng_dataset['TRANSACTION_DT'] = pd.to_datetime(
    ta_feng_dataset['TRANSACTION_DT'].str.replace('/', '-', regex=False),
    format='%m-%d-%Y',
).dt.strftime('%Y-%m-%d')

In [290]:
# Rename the Ta Feng columns to the date / user_id / item_id names used downstream.
df = ta_feng_dataset.rename(
    columns={
        'TRANSACTION_DT': 'date',
        'CUSTOMER_ID': 'user_id',
        'PRODUCT_ID': 'item_id',
    }
)

In [291]:
# Number of distinct customers in the Ta Feng data (re-enabled so the output shown
# below the cell is reproducible on a fresh run).
len(df.user_id.unique())

32266

In [292]:
def labelling_order_id(df):
    """Add a 1-based ``basket_id`` column to ``df`` in place and return ``df``.

    The original data does not carry an explicit order id, so every distinct
    ``(date, user_id)`` pair -- one user's purchases on one day -- is treated
    as a single basket and numbered with ``groupby(...).ngroup()``.
    """
    basket_index = df.groupby(['date', 'user_id']).ngroup()
    df['basket_id'] = basket_index + 1
    return df

labelling_order_id(df).head(3)

Unnamed: 0,date,user_id,item_id,basket_id
0,2000-11-01,1104905,4710199010372,78
1,2000-11-01,418683,4710857472535,34
2,2000-11-01,1057331,4710043654103,75


In [293]:
# Parse the date strings, sort the rows by date / user / basket, then number the rows
# inside each basket (1-based, in post-sort order) to fill the add_to_cart_order column
# that the Instacart data provides natively.
df = (
    df
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(['date', 'user_id', 'basket_id'])
)
df['add_to_cart_order'] = df.groupby(['user_id', 'basket_id']).cumcount().add(1)

In [294]:
# Derive an order number, since the datasets passed in here do not provide one.
def add_order_number(df):
    """Return a new frame equal to ``df`` plus a per-user, 1-based ``order_number``.

    A basket is identified by ``(user_id, date, basket_id)``. ``groupby`` sorts
    its keys, so each user's baskets are numbered by ascending ``date`` and
    then ``basket_id``. Rows with a missing value in any key column form no
    group and are therefore absent from the inner-merged result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``date`` and ``basket_id`` columns.

    Returns
    -------
    pandas.DataFrame
        The previous index moved into an ``index`` column (``reset_index``),
        followed by the original columns and ``order_number``. The input
        frame itself is not modified.
    """
    basket_keys = ['user_id', 'date', 'basket_id']
    df = df.reset_index()
    # One row per distinct basket. size() is used only to collapse the rows;
    # aggregating every column into Python lists (as before) was wasted work
    # because only the key columns are kept.
    orders_num_df = df.groupby(basket_keys).size().reset_index(name='n_rows')
    orders_num_df['order_number'] = orders_num_df.groupby('user_id').cumcount() + 1
    orders_num_df = orders_num_df[basket_keys + ['order_number']]
    df = df.merge(orders_num_df, on = ['date', 'user_id', 'basket_id'])

    return df

In [295]:
fin_df = add_order_number(df)
fin_df.head(3)

Unnamed: 0,index,date,user_id,item_id,basket_id,add_to_cart_order,order_number
0,789,2000-11-01,38317,4714981010038,1,1,1
1,826,2000-11-01,38317,4719090105002,1,2,1
2,863,2000-11-01,45902,4710147100018,2,1,1


In [296]:
all_orders_tf, basket_per_user_tf = pernir_preproc.filter_orders(fin_df, min_basket_per_user, min_item_per_basket)
users_tf = pernir_preproc.sample_users(basket_per_user_tf, sample_size)

In [297]:
train_baskets_file_path_tf = '../data/ta_feng/train_baskets.csv'
test_baskets_file_path_tf = '../data/ta_feng/test_baskets.csv'
valid_baskets_file_path_tf = '../data/ta_feng/valid_baskets.csv'

In [298]:
train_baskets_tf, test_baskets_tf, valid_baskets_tf = pernir_preproc.split_baskets(all_orders_tf, users_tf)

In [299]:
pernir_preproc.save_baskets(train_baskets_tf, test_baskets_tf, valid_baskets_tf, 
                            train_baskets_file_path_tf, test_baskets_file_path_tf, valid_baskets_file_path_tf)

In [300]:
basket_items_dict_tf = pernir_preproc.create_basket_items_dict(train_baskets_tf)
user_baskets_dict_tf = pernir_preproc.create_user_baskets_dict(train_baskets_tf)
item_base_scores_tf = pernir_preproc.create_item_base_scores(user_baskets_dict_tf, basket_items_dict_tf)
data_list_tf = pernir_preproc.create_data_list(item_base_scores_tf, user_baskets_dict_tf)

In [301]:
file_path = '../data/ta_feng/user_item_scores.csv'
pernir_preproc.save_dataframe(data_list_tf, file_path)

In [302]:
df = pd.read_csv(file_path)
item_dic_tf, rev_item_dic_tf, user_dic_tf, rev_user_dic_tf = pernir_preproc.create_user_item_dicts(df)
userItem_mat_tf = pernir_preproc.create_user_item_matrix(df, user_dic_tf, item_dic_tf)
final_user_sim_dict_tf = pernir_preproc.create_user_sim_dict(userItem_mat_tf, user_dic_tf, rev_user_dic_tf)

Done: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3790/3790 [00:00<00:00, 38282.47it/s]


In [303]:
file_path = '../data/ta_feng/user_sim.pickle'
pernir_preproc.save_user_sim_dict(final_user_sim_dict_tf, file_path)

In [304]:
# Load the Ta Feng validation split that was written to disk earlier in this section.
basket_path = '../data/ta_feng/valid_baskets.csv'
# NOTE(review): sample_path is never used in the visible cells; the samples are later
# written to '../data/ta_feng/test_sample_to_predict_ta_feng.csv' -- confirm which
# file name (valid vs. test) is intended.
sample_path = '../data/ta_feng/valid_samples.csv'

# NOTE: this rebinds basket_items_dict_tf, which until now held the dict built from train_baskets_tf.
basket_users_dict_tf, basket_items_dict_tf = pernir_preproc.read_and_process_baskets(basket_path)

In [305]:
# Build the masked-basket sample lists from the Ta Feng validation baskets.
# NOTE: the six result names are shared with the Instacart section and are rebound here.
(
    sample_baskets,
    sample_users,
    full_baskets,
    masked_items_list,
    masked_baskets,
    num_to_mask_list,
) = pernir_preproc.create_masked_baskets(
    basket_items_dict_tf, basket_users_dict_tf, mask_percentage
)


In [306]:
# Gather the six outputs of create_masked_baskets into a single frame
# (rebinds sample_df from the Instacart section).
sample_df = pernir_preproc.create_sample_dataframe(
    sample_baskets, sample_users, full_baskets,
    masked_items_list, masked_baskets, num_to_mask_list,
)

In [307]:
sample_df.head()

Unnamed: 0,basket_id,user_id,full_basket,masked_items,masked_basket,num_to_mask
0,150,1885958,"[4710943100410, 4710992231189, 4711207003027, ...",[4710943100410],"[4710992231189, 4711207003027, 4710046011101]",1
1,166,2034638,"[4710085172696, 4710085172702, 4710085120093, ...",[4710085172702],"[4710085172696, 4710085120093, 4710085150311, ...",1
2,305,375917,"[4711258007371, 4710363913201, 4710311856116, ...","[4710095987402, 4710172020077, 4710063121494, ...","[4711258007371, 4710363913201, 4710311856116, ...",4
3,408,828222,"[4710498123339, 4714981010038, 4715398208339, ...",[8992741941327],"[4710498123339, 4714981010038, 4715398208339, ...",1
4,440,903042,"[4710114718505, 4710054380619, 4710314432157]",[4710114718505],"[4710054380619, 4710314432157]",1


In [308]:
sample_df.to_csv('../data/ta_feng/test_sample_to_predict_ta_feng.csv', index = False)

## 3. Dunnhumby

In [219]:
# Read the Dunnhumby transactions, keep the four columns needed, and rename them to
# the column names used for the other datasets in this notebook. WEEK_NO is mapped to
# `date`, so the "date" of a Dunnhumby row is a week number.
dh_dataset = (
    pd.read_csv('../data/dunnhumby/transaction_data.csv')[
        ['household_key', 'BASKET_ID', 'PRODUCT_ID', 'WEEK_NO']
    ]
    .rename(columns={
        'household_key': 'user_id',
        'BASKET_ID': 'basket_id',
        'PRODUCT_ID': 'item_id',
        'WEEK_NO': 'date',
    })
)

In [220]:
dh_dataset.head(3)

Unnamed: 0,user_id,basket_id,item_id,date
0,2375,26984851472,1004906,1
1,2375,26984851472,1033142,1
2,2375,26984851472,1036325,1


In [221]:
len(dh_dataset.user_id.unique())

2500

In [222]:
# Sort rows by user, week and basket, then number the rows inside each basket
# (1-based, in post-sort order) as add_to_cart_order.
dh_dataset = dh_dataset.sort_values(by=['user_id', 'date', 'basket_id'])
dh_dataset = dh_dataset.assign(
    add_to_cart_order=dh_dataset.groupby(['user_id', 'basket_id']).cumcount().add(1)
)

In [223]:
fin_dh = add_order_number(dh_dataset)
fin_dh.head(3)

Unnamed: 0,index,user_id,basket_id,item_id,date,add_to_cart_order,order_number
0,46996,1,27601281299,825123,8,1,1
1,46997,1,27601281299,831447,8,2,1
2,46998,1,27601281299,840361,8,3,1


In [230]:
# Dunnhumby-specific thresholds. NOTE: these rebind the notebook-wide hyper-parameter
# names, so cells from the earlier sections would see these values if re-run afterwards.
min_basket_per_user = 1
min_item_per_basket = 2
sample_size = 2500  # the dataset has 2500 distinct users, so this keeps all of them

# Kept as the last expression so the shape is actually rendered; as the first
# statement of the cell it was evaluated and silently discarded.
fin_dh.shape

In [231]:
all_orders_dh, basket_per_user_dh = pernir_preproc.filter_orders(fin_dh, min_basket_per_user, min_item_per_basket)
users_dh = pernir_preproc.sample_users(basket_per_user_dh, sample_size)

In [232]:
train_baskets_file_path_dh = '../data/dunnhumby/train_baskets.csv'
test_baskets_file_path_dh = '../data/dunnhumby/test_baskets.csv'
valid_baskets_file_path_dh = '../data/dunnhumby/valid_baskets.csv'

In [233]:
train_baskets_dh, test_baskets_dh, valid_baskets_dh = pernir_preproc.split_baskets(all_orders_dh, users_dh)

In [234]:
pernir_preproc.save_baskets(train_baskets_dh, test_baskets_dh, valid_baskets_dh, 
                            train_baskets_file_path_dh, test_baskets_file_path_dh, valid_baskets_file_path_dh)

In [235]:
basket_items_dict_dh = pernir_preproc.create_basket_items_dict(train_baskets_dh)
user_baskets_dict_dh = pernir_preproc.create_user_baskets_dict(train_baskets_dh)
item_base_scores_dh = pernir_preproc.create_item_base_scores(user_baskets_dict_dh, basket_items_dict_dh)
data_list_dh = pernir_preproc.create_data_list(item_base_scores_dh, user_baskets_dict_dh)

In [236]:
file_path = '../data/dunnhumby/user_item_scores.csv'
pernir_preproc.save_dataframe(data_list_dh, file_path)

In [237]:
df = pd.read_csv(file_path)
item_dic_dh, rev_item_dic_dh, user_dic_dh, rev_user_dic_dh = pernir_preproc.create_user_item_dicts(df)
userItem_mat_dh = pernir_preproc.create_user_item_matrix(df, user_dic_dh, item_dic_dh)
final_user_sim_dict_dh = pernir_preproc.create_user_sim_dict(userItem_mat_dh, user_dic_dh, rev_user_dic_dh)

Done: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2485/2485 [00:00<00:00, 6095.44it/s]


In [238]:
file_path = '../data/dunnhumby/user_sim.pickle'
pernir_preproc.save_user_sim_dict(final_user_sim_dict_dh, file_path)

In [239]:
# Load the Dunnhumby validation split that was written to disk earlier in this section.
basket_path = '../data/dunnhumby/valid_baskets.csv'
# NOTE(review): sample_path is never used in the visible cells; the samples are later
# written to '../data/dunnhumby/test_sample_to_predict_dunnhumby.csv' -- confirm which
# file name (valid vs. test) is intended.
sample_path = '../data/dunnhumby/valid_samples.csv'

# NOTE: this rebinds basket_items_dict_dh, which until now held the dict built from train_baskets_dh.
basket_users_dict_dh, basket_items_dict_dh = pernir_preproc.read_and_process_baskets(basket_path)

In [240]:
# Build the masked-basket sample lists from the Dunnhumby validation baskets.
# NOTE: the six result names are shared with the earlier sections and are rebound here.
(
    sample_baskets,
    sample_users,
    full_baskets,
    masked_items_list,
    masked_baskets,
    num_to_mask_list,
) = pernir_preproc.create_masked_baskets(
    basket_items_dict_dh, basket_users_dict_dh, mask_percentage
)


In [241]:
# Gather the six outputs of create_masked_baskets into a single frame
# (rebinds sample_df from the earlier sections).
sample_df = pernir_preproc.create_sample_dataframe(
    sample_baskets, sample_users, full_baskets,
    masked_items_list, masked_baskets, num_to_mask_list,
)

In [242]:
sample_df

Unnamed: 0,basket_id,user_id,full_basket,masked_items,masked_basket,num_to_mask
0,27384572088,359,"[826249, 828525, 838396, 869622, 871470, 90701...","[1095700, 1062966, 1013999, 5566922, 871470, 9...","[826249, 828525, 838396, 869622, 907014, 91745...",9
1,27398557136,2422,"[823795, 862010, 956689, 960729, 973150, 10008...","[956689, 1047450, 6602576]","[823795, 862010, 960729, 973150, 1000870, 1060...",3
2,27517272089,872,"[1028944, 1082185]",[1028944],[1082185],1
3,27555107198,2201,"[1516066, 1532784, 1625932, 1627393, 1635336]",[1635336],"[1516066, 1532784, 1625932, 1627393]",1
4,27601557220,948,"[822970, 831063, 837107, 839262, 846550, 84798...","[1100379, 942525, 1054688, 1002654, 1101173, 1...","[822970, 831063, 837107, 846550, 847982, 84897...",10
...,...,...,...,...,...,...
2489,42289891395,539,"[857571, 1008547, 1038985, 1042907, 1072009, 1...",[857571],"[1008547, 1038985, 1042907, 1072009, 1126899, ...",1
2490,42289906117,1823,"[851865, 852540, 865174, 901062, 923789, 94545...","[1082185, 1130763, 1122568, 1075637, 851865, 9...","[852540, 865174, 901062, 945456, 948509, 95172...",6
2491,42291245957,2125,"[1741748, 1754035, 1766903, 1769503, 1772030, ...","[13117489, 1793630, 2024753, 2031026, 1907825,...","[1741748, 1754035, 1769503, 1772030, 1789655, ...",13
2492,42302711057,2206,"[972191, 1122428]",[1122428],[972191],1


In [243]:
sample_df.to_csv('../data/dunnhumby/test_sample_to_predict_dunnhumby.csv', index = False)