# Preprocessing Katcher's dataset for RLMREC

Note: This is an ad-hoc notebook, and the paths to the required files are hardcoded. Sorry for the inconvenience.

Input
- train_LLM.csv, val_LLM.csv, test_LLM.csv

In [1]:
import pandas as pd
import numpy as np
import re
import json
from scipy.sparse import coo_matrix
import pickle

In [2]:
# Read the three raw splits, in train / validation / test order.
raw_train_data, raw_val_data, raw_test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Jupyter/dev_src/RLMRec/data/katchers/raw/train_LLM.csv',
        '/Jupyter/dev_src/RLMRec/data/katchers/raw/val_LLM.csv',
        '/Jupyter/dev_src/RLMRec/data/katchers/raw/test_LLM.csv',
    )
)

## Make item dictionary 

iid, prod_id, title

In [3]:
def convert_each_row(row):
    """Map every product id referenced by one raw row to its title text.

    Titles of the previous purchases are the pieces of ``row.prev`` that
    follow the marker '주문 내역: ' and run up to the next YYYY-MM-DD date
    (or the end of the text); each piece is whitespace-stripped. Their ids
    are the comma-separated integers in ``row.prev_id``. The next purchase
    contributes the second line of ``row.next`` as title and ``row.next_id``
    as id.

    Ids and titles are paired by position; if the two lists differ in
    length the surplus entries are dropped, and a repeated id keeps the
    title paired with its last occurrence.

    Returns:
        dict: product id -> title.
    """
    order_pattern = r'주문 내역: (.+?)(?=\d{4}-\d{2}-\d{2}|\Z)'

    # Titles of the previously purchased products, in order of appearance.
    titles = []
    for captured in re.findall(order_pattern, row.prev, re.DOTALL):
        titles.append(captured.strip())

    # Ids of the previously purchased products.
    prod_ids = list(map(int, row.prev_id.split(',')))

    # The next purchase goes last in both lists.
    titles.append(row.next.split('\n')[1])
    prod_ids.append(row.next_id)

    return dict(zip(prod_ids, titles))
        

In [4]:
# Accumulate prod_id -> title over every row of every split.
# Splits are visited in train / validation / test order, so for a repeated
# prod_id the title seen last in that order is the one that is kept.
item_dic = {}
for split_df in (raw_train_data, raw_val_data, raw_test_data):
    for row_mapping in split_df.apply(convert_each_row, axis=1):
        item_dic.update(row_mapping)

In [5]:
# One row per product, ordered by prod_id; the resulting positional index
# (0..n-1) serves as the item id and is named 'iid'.
item_df = (
    pd.DataFrame(list(item_dic.items()), columns=['prod_id', 'title'])
    .sort_values(by='prod_id')
    .reset_index(drop=True)
)
item_df.index.name = 'iid'

### output: katchers_item.json

uniq items: 1,669

In [None]:
# Write the iid <-> prod_id mapping as JSON Lines (one record per line).
(
    item_df
    .reset_index()[['iid','prod_id']]
    .to_json(
        '/Jupyter/dev_src/RLMRec/data/mapper/katchers_item.json',
        orient='records',
        lines=True,
    )
)

In [None]:
# Keep the full item table (iid index, prod_id, title) as an intermediate pickle.
item_df.to_pickle('/Jupyter/dev_src/RLMRec/data/katchers/intermediate/item_df.pkl')

# Make user dictionary

In [None]:
# Stack the 'user' column of every split into a single Series.
all_users = pd.concat(
    [split_df['user'] for split_df in (raw_train_data, raw_val_data, raw_test_data)]
)

# Distinct users in sorted order; the position in this list becomes the uid.
unique_users = sorted(all_users.unique())

# uid (index) -> original user identifier.
user_df = pd.DataFrame(
    {'uid': range(len(unique_users)), 'user_id': unique_users}
).set_index('uid')

### output: katchers_user.json

uniq users: 16,813

In [7]:
# Write the uid <-> user_id mapping as JSON Lines (one record per line).
(
    user_df
    .reset_index()[['uid','user_id']]
    .to_json(
        '/Jupyter/dev_src/RLMRec/data/mapper/katchers_user.json',
        orient='records',
        lines=True,
    )
)

In [None]:
# Keep the user table (uid index, user_id) as an intermediate pickle.
user_df.to_pickle('/Jupyter/dev_src/RLMRec/data/katchers/intermediate/user_df.pkl')

In [8]:
# Sanity check: number of unique users and unique items.
len(user_df), len(item_df)

(16813, 1669)

# Gather purchase histories to build user profiles


Note: The resulting history file (compact_history_df.pkl) is used as the source for the GPT input prompts.

In [None]:
# Rows with label == 1 from every split, stacked in train / validation / test order.
history_df = pd.concat(
    [
        split_df.query("label==1")
        for split_df in (raw_train_data, raw_val_data, raw_test_data)
    ],
    axis=0,
)

In [None]:
# Look up each history row's uid via user_df, index the result by uid and
# keep only the original user identifier and the prev_id string.
compact_history_df = (
    pd.merge(
        history_df,
        user_df.reset_index(),
        left_on='user',
        right_on='user_id',
    )
    .set_index('uid')[['user', 'prev_id']]
)

In [9]:
# Lookup table: prod_id -> title.
prod_title_dic = (
    item_df
    .set_index('prod_id')['title']
    .to_dict()
)

In [None]:
def item_history_mapper(row):
    """Serialise the products in ``row.prev_id`` as a JSON array string.

    ``row.prev_id`` is a comma-separated string of product ids. Each distinct
    id yields one object with the product title (looked up in
    ``prod_title_dic``) and the literal placeholders 'None' for description
    and review. Products appear in the order of their first occurrence in
    ``row.prev_id``.

    Returns:
        str: JSON text; non-ASCII characters are written as-is.

    Raises:
        KeyError: if an id has no entry in ``prod_title_dic``.
        ValueError: if a piece of ``row.prev_id`` is not an integer.
    """
    # Deduplicate on the integer id while keeping first-seen order.
    # Iterating a set of id *strings* (as done previously) has an order that
    # depends on the per-process string hash seed, so the same row could be
    # serialised differently after a kernel restart.
    unique_prod_ids = dict.fromkeys(int(prod) for prod in row.prev_id.split(','))

    purchased_prods = [
        {'title': prod_title_dic[prod_id], 'description': 'None', 'review': 'None'}
        for prod_id in unique_prod_ids
    ]
    return json.dumps(purchased_prods, ensure_ascii=False)

In [None]:
# Add the JSON-encoded purchase history of each row as a new column
# (this assigns into the existing compact_history_df).
compact_history_df['user_profile_input'] = compact_history_df.apply(item_history_mapper, axis=1)

output: intermediate/compact_history_df.pkl

In [None]:
# Save the per-row history table, including the user_profile_input column when it has been added.
compact_history_df.to_pickle('/Jupyter/dev_src/RLMRec/data/katchers/intermediate/compact_history_df.pkl')

# Make coo_matrices

The prev_id in the purchase history is used to build the coo_matrix.

The prev_id values of users in the validation and test sets are also integrated into the training matrix.

However, the next_id values of users in the validation and test sets are held out and used only for validation and testing.

In [60]:
# Dimensions of the interaction matrices: one row per user, one column per item.
n_user = len(user_df)
n_item = len(item_df)
print(n_user, n_item)

16813 1669


In [73]:
# Rows with label == 1 from every split, stacked in train / validation / test order.
all_history_df = pd.concat(
    [
        split_df.query("label==1")
        for split_df in (raw_train_data, raw_val_data, raw_test_data)
    ],
    axis=0,
)

In [61]:
# Lookup table: prod_id -> iid (the index of item_df).
prod_to_iid_dic = (
    item_df
    .reset_index()
    .set_index('prod_id')['iid']
    .to_dict()
)

In [74]:
def purchased_prev_iids_wrapper(row):
    """Return the iids of the distinct products listed in ``row.prev_id``.

    ``row.prev_id`` is a comma-separated string of integer product ids.
    Duplicates are collapsed, and ids without an entry in ``prod_to_iid_dic``
    are skipped.

    Only ``row.prev_id`` is read; the unused ``row.user`` lookup was removed.

    Returns:
        list: one iid per distinct known product (possibly empty).
    """
    purchased_prods = {int(x) for x in row.prev_id.split(',')}
    return [prod_to_iid_dic[prod] for prod in purchased_prods if prod in prod_to_iid_dic]

def purchased_next_id_wrapper(row):
    """Return the iid of the product in ``row.next_id`` as a list.

    ``row.next_id`` is converted with ``int``. The result is a one-element
    list holding the matching iid, or an empty list when the product has no
    entry in ``prod_to_iid_dic``.

    Only ``row.next_id`` is read; the unused ``row.user`` lookup and the
    one-element set/loop were removed.

    Returns:
        list: ``[iid]`` or ``[]``.
    """
    prod = int(row.next_id)
    if prod in prod_to_iid_dic:
        return [prod_to_iid_dic[prod]]
    return []

In [75]:

def build_train_coo_matrix(in_df, user_df, n_user, n_items):
    """Build a user x item COO matrix from the ``prev_id`` purchase history.

    Args:
        in_df: frame with at least the columns 'user', 'prev_id' and 'label';
            only rows with label == 1 are used.
        user_df: frame indexed by 'uid' with a 'user_id' column, used to
            translate 'user' into the row index.
        n_user: number of matrix rows.
        n_items: number of matrix columns.

    Returns:
        scipy.sparse.coo_matrix of dtype float32 and shape (n_user, n_items)
        with a 1.0 stored for every (uid, iid) pair produced by
        ``purchased_prev_iids_wrapper``. A pair that occurs in several input
        rows is stored once per occurrence.
    """
    tmp_df = pd.merge(
        in_df.query("label==1")[['user', 'prev_id']],
        user_df.reset_index(),
        left_on='user',
        right_on='user_id',
    ).set_index('uid')
    tmp_df['iids'] = tmp_df.apply(purchased_prev_iids_wrapper, axis=1)

    # explode() turns an empty iid list into NaN; drop those rows so NaN can
    # never be used as a column index, then make the indices plain integers.
    v = tmp_df.explode('iids')['iids'].dropna().astype(np.int64)

    # Shape comes from the n_items argument. Previously the body read the
    # notebook-level name n_item and silently ignored the parameter.
    return coo_matrix(
        ([1.0] * len(v), (v.index.values, v.values)),
        shape=(n_user, n_items),
        dtype=np.float32,
    )

def build_other_coo_matrix(in_df, user_df, n_user, n_items):
    """Build a user x item COO matrix from the ``next_id`` target purchases.

    Args:
        in_df: frame with at least the columns 'user', 'next_id' and 'label';
            only rows with label == 1 are used.
        user_df: frame indexed by 'uid' with a 'user_id' column, used to
            translate 'user' into the row index.
        n_user: number of matrix rows.
        n_items: number of matrix columns.

    Returns:
        scipy.sparse.coo_matrix of dtype float32 and shape (n_user, n_items)
        with a 1.0 stored for every (uid, iid) pair produced by
        ``purchased_next_id_wrapper``.
    """
    tmp_df = pd.merge(
        in_df.query("label==1")[['user', 'next_id']],
        user_df.reset_index(),
        left_on='user',
        right_on='user_id',
    ).set_index('uid')
    tmp_df['iids'] = tmp_df.apply(purchased_next_id_wrapper, axis=1)

    # explode() turns an empty iid list into NaN; drop those rows so NaN can
    # never be used as a column index, then make the indices plain integers.
    v = tmp_df.explode('iids')['iids'].dropna().astype(np.int64)

    # Shape comes from the n_items argument. Previously the body read the
    # notebook-level name n_item and silently ignored the parameter.
    return coo_matrix(
        ([1.0] * len(v), (v.index.values, v.values)),
        shape=(n_user, n_items),
        dtype=np.float32,
    )

In [77]:
# Training matrix: built from the prev_id history of all three splits.
trn_mat = build_train_coo_matrix(
    in_df=all_history_df,
    user_df=user_df,
    n_user=n_user,
    n_items=n_item,
)
with open('/Jupyter/dev_src/RLMRec/data/katchers/trn_mat.pkl', 'wb') as trn_file:
    pickle.dump(trn_mat, trn_file)

In [78]:
# Validation matrix: built from the next_id targets of the validation split.
val_mat = build_other_coo_matrix(
    in_df=raw_val_data,
    user_df=user_df,
    n_user=n_user,
    n_items=n_item,
)
with open('/Jupyter/dev_src/RLMRec/data/katchers/val_mat.pkl', 'wb') as val_file:
    pickle.dump(val_mat, val_file)

In [79]:
# Test matrix: built from the next_id targets of the test split.
tst_mat = build_other_coo_matrix(
    in_df=raw_test_data,
    user_df=user_df,
    n_user=n_user,
    n_items=n_item,
)
with open('/Jupyter/dev_src/RLMRec/data/katchers/tst_mat.pkl', 'wb') as tst_file:
    pickle.dump(tst_mat, tst_file)

In [86]:
# (row, col) index pairs of the entries stored in the training matrix.
# NOTE(review): no later use of index_list is visible in this notebook — confirm it is still needed.
index_list = list(zip(trn_mat.row, trn_mat.col))

# Make extra information for test data

In our paper, each test user is assigned one positive example and 49 negative examples, and NDCG@K is measured on the predicted scores of these 50 examples. To evaluate RLMRec in the same way, we generate the tst_masked_unquery_iids information so that exactly 50 examples are evaluated per test user. Since RLMRec predicts scores for all items, this masking information ensures that only the predicted values for the 50 test examples are used and the scores of all other items are disregarded.

In [13]:
# Every test row (no label filter) with its uid, indexed by uid.
tmp_df = (
    pd.merge(
        raw_test_data[['user','next_id']],
        user_df.reset_index(),
        left_on='user',
        right_on='user_id',
    )
    .set_index('uid')
)

In [23]:
# Translate each test row's next_id into its iid; an id missing from
# prod_to_iid_dic raises KeyError.
tmp_df['test_iid'] = tmp_df.apply(
    lambda test_row: prod_to_iid_dic[int(test_row.next_id)],
    axis=1,
)

In [57]:
# Every iid present in the item table.
all_iids = set(item_df.index.values)

# uid -> set of iids that appear in that user's test rows.
test_query_iids_dic = tmp_df.reset_index().groupby('uid')['test_iid'].apply(set)

# uid -> all remaining iids, i.e. the items to mask out for that user.
masked_unquery_iids = {}
for uid, query_iids in test_query_iids_dic.items():
    masked_unquery_iids[uid] = list(all_iids - query_iids)

# NOTE(review): the file name spells "maksed" (not "masked"). It is kept
# unchanged here because code that loads this file is not visible — confirm
# before renaming.
with open('/Jupyter/dev_src/RLMRec/data/katchers/tst_maksed_unquery_iids.pkl', 'wb') as mask_file:
    pickle.dump(masked_unquery_iids, mask_file)
