# Libraries

In [1]:
import pandas as pd


# Utility Functions

In [2]:
def read_session_file(filename):
    """Load a sessions CSV and order its rows by session, then by time.

    The file is read with ``session_id`` as the index, ``item_id`` and
    ``session_id`` requested as strings, and ``date`` parsed as datetimes.

    Args:
        filename: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame indexed by ``session_id``, sorted ascending by
        ``session_id`` and then by ``date``.
    """
    sessions = pd.read_csv(
        filename,
        index_col="session_id",
        dtype={"item_id": str, "session_id": str},
        parse_dates=["date"],
    )
    # Ascending order is the default for every sort key.
    return sessions.sort_values(by=["session_id", "date"])


def read_purchases_file(filename):
    """Load a purchases CSV as a ``session_id``-indexed frame of purchased items.

    The ``date`` column is discarded, so it is deliberately *not* parsed as
    datetimes: parsing a column that is dropped on the next line is wasted work.

    Args:
        filename: Path or file-like object accepted by ``pd.read_csv``. The
            file must contain ``session_id``, ``item_id`` and ``date`` columns.

    Returns:
        DataFrame indexed by ``session_id`` with ``item_id`` renamed to
        ``item_purchase`` and the ``date`` column removed.

    Raises:
        KeyError: If the file has no ``date`` column to drop.
    """
    purchases = pd.read_csv(
        filename,
        index_col="session_id",
        dtype={"item_id": str, "session_id": str},
    )
    return purchases\
        .drop(columns="date")\
        .rename(columns={"item_id": "item_purchase"})


def read_candidate_items_file(filename):
    """Load the candidate-items CSV with every column read as a string.

    Args:
        filename: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame holding the file's contents, all columns requested as ``str``.
    """
    candidates = pd.read_csv(filename, dtype=str)
    return candidates


def build_view_history(session_df):
    """Collapse session rows into one list of viewed item ids per session.

    Args:
        session_df: DataFrame with an ``item_id`` column that can be grouped
            by ``session_id`` (a column or index level of that name).

    Returns:
        DataFrame indexed by ``session_id`` with a single ``item_views``
        column; each cell is the list of that session's ``item_id`` values in
        the row order of ``session_df``.
    """
    views_per_session = session_df.groupby("session_id")["item_id"].agg(list)
    return views_per_session.to_frame(name="item_views")


def build_itemid_docid_lookup(item_ids):
    """Pair every item id with a consecutive integer document id.

    Args:
        item_ids: Sized sequence of item ids (list, array, Series, ...).

    Returns:
        DataFrame with columns ``item_id`` and ``doc_id``, where ``doc_id``
        counts up from 0 in the order the ids were given.
    """
    lookup_columns = {
        "item_id": item_ids,
        # Position of each id in the input doubles as its document id.
        "doc_id": pd.array(range(len(item_ids))),
    }
    return pd.DataFrame(lookup_columns)


# Data

Read the given files:

In [3]:
# Single root for the dataset, so its location only has to be changed here.
DATA_DIR = "/workspaces/recsys2022-weaviate/dressipi_recsys2022_dataset"

CANDIDATE_ITEMS = DATA_DIR + "/candidate_items.csv"

TRAIN_SESSION = DATA_DIR + "/train_sessions.csv"
TRAIN_PURCHASES = DATA_DIR + "/train_purchases.csv"


In [4]:
# Load the candidate-items CSV; read_candidate_items_file reads every column as a string.
df_candidate_items = read_candidate_items_file(CANDIDATE_ITEMS)


In [5]:
# Training sessions, sorted by session_id and then date by read_session_file.
df_session = read_session_file(TRAIN_SESSION)
# Training purchases, indexed by session_id with an `item_purchase` column.
df_purchases = read_purchases_file(TRAIN_PURCHASES)
# One row per session holding the list of item ids in `item_views`.
df_view_hist = build_view_history(df_session)


# Dataset Split

## Training Set

In [6]:
# Inner join: keep only sessions present in both the view history and the purchases.
train_df = df_view_hist.join(
    df_purchases,
    on="session_id",
    how="inner",
)
train_df.head()


Unnamed: 0_level_0,item_views,item_purchase
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000004,"[26396, 5288, 3298]",18834
1000008,"[15533, 15533, 15533]",17641
1000016,"[2972, 2972, 2972, 2972, 2972, 3402]",22794
1000019,[4872],26711
1000024,"[9238, 25745]",9362


## Validation Set

In [7]:
VALID_SESSION = "/workspaces/recsys2022-weaviate/dressipi_recsys2022_dataset/test_leaderboard_sessions.csv"
VALID_PURCHASES = "/workspaces/recsys2022-weaviate/dressipi_recsys2022_dataset/test_leaderboard_purchases.csv"

# Validation-specific names: reusing df_session / df_purchases / df_view_hist
# here would overwrite the training frames, so re-running the training-set
# cell afterwards would silently build train_df from validation data.
valid_session_df = read_session_file(VALID_SESSION)
valid_purchases_df = read_purchases_file(VALID_PURCHASES)
valid_view_hist_df = build_view_history(valid_session_df)

# Inner join: keep only sessions present in both the view history and the purchases.
valid_df = valid_view_hist_df.join(valid_purchases_df, on="session_id", how="inner")
valid_df.head(5)


Unnamed: 0_level_0,item_views,item_purchase
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
100021,"[9219, 7926, 21719, 26785]",14306
1000328,"[1152, 9201, 20536]",18511
1000433,"[16922, 14881, 15861, 14881, 7963]",2794
100050,[4314],13966
100053,[2891],25278


## Test Set

In [8]:
TEST_SESSION = "/workspaces/recsys2022-weaviate/dressipi_recsys2022_dataset/test_final_sessions.csv"
TEST_PURCHASES = "/workspaces/recsys2022-weaviate/dressipi_recsys2022_dataset/test_final_purchases.csv"

# Test-specific names: reusing df_session / df_purchases / df_view_hist here
# would overwrite frames loaded by earlier cells, so re-running one of those
# cells' joins afterwards would silently use the test data instead.
test_session_df = read_session_file(TEST_SESSION)
test_purchases_df = read_purchases_file(TEST_PURCHASES)
test_view_hist_df = build_view_history(test_session_df)

# Inner join: keep only sessions present in both the view history and the purchases.
test_df = test_view_hist_df.join(test_purchases_df, on="session_id", how="inner")
test_df.head(5)


Unnamed: 0_level_0,item_views,item_purchase
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000025,"[15216, 15216, 8060, 27442, 13914]",20629
1000039,"[14932, 17014]",15842
1000134,"[8291, 16412, 19765]",3825
1000150,"[9522, 1669]",18723
1000174,"[12508, 17740, 5199, 20260, 12014, 15550, 1774...",10410


## Document ID

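Let `item_purchase` be the document id and `item_views` the "text" in the document. The lookup below assigns a consecutive integer `doc_id` to every distinct item id found in the candidate items or in the training purchases: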

In [9]:
# Distinct item ids, candidates first and then any extra purchased items,
# in order of first appearance.
candidate_ids = df_candidate_items["item_id"]
purchased_ids = train_df["item_purchase"]
item_ids = pd.concat([candidate_ids, purchased_ids]).unique()

itemid_docid_lookup = build_itemid_docid_lookup(item_ids)
itemid_docid_lookup.head()


Unnamed: 0,item_id,doc_id
0,4,0
1,8,1
2,9,2
3,19,3
4,20,4


# Save Dataset Splits

In [10]:
# Each frame is written with its index, in the order listed.
parquet_outputs = [
    (train_df, "train_df.parquet"),
    (valid_df, "valid_df.parquet"),
    (test_df, "test_df.parquet"),
    (itemid_docid_lookup, "itemid_docid_lookup.parquet"),
]
for frame, parquet_path in parquet_outputs:
    frame.to_parquet(parquet_path, index=True)
