In [1]:
import numpy as np
import pandas as pd
from configs import *

In [2]:
# Load the three raw tables from DATA_DIR.
orders = pd.read_csv(f'{DATA_DIR}/orders.csv')
order_products = pd.read_csv(f'{DATA_DIR}/order_products__prior.csv')
products = pd.read_csv(f'{DATA_DIR}/products.csv')

# Enrich every order_products row with the order's user_id / order_number and
# the product's name / department_id.
# validate='many_to_one' makes pandas raise a MergeError when order_id or
# product_id is duplicated in the lookup table; without it a duplicated key
# would silently multiply rows and inflate any counts computed from `df`.
df = (
    order_products
        .merge(
            orders[['order_id', 'user_id', 'order_number']],
            on='order_id',
            how='left',
            validate='many_to_one',
        )
        .merge(
            products[['product_id', 'product_name', 'department_id']],
            on='product_id',
            how='left',
            validate='many_to_one',
        )
)

In [3]:
# Remove every row whose product sits in one of the non-food departments.
exclude_depts = [5, 8, 11, 17, 18, 2, 21]
in_excluded_dept = df['department_id'].isin(exclude_depts)
df = df.loc[~in_excluded_dept].copy()

# Both thresholds below are evaluated on the same department-filtered frame,
# before either the user filter or the product filter has been applied.

# Users qualify with at least MIN_USER_ORDERS distinct orders.
user_order_count = df.groupby('user_id')['order_id'].nunique()
enough_orders = user_order_count >= MIN_USER_ORDERS
active_users = user_order_count.loc[enough_orders].index

# Products qualify with at least MIN_ITEM_PURCHASES purchase rows.
product_purchase_count = df.groupby('product_id').size()
enough_purchases = product_purchase_count >= MIN_ITEM_PURCHASES
popular_products = product_purchase_count.loc[enough_purchases].index

# Keep only rows that satisfy both conditions.
from_active_user = df['user_id'].isin(active_users)
of_popular_product = df['product_id'].isin(popular_products)
df = df.loc[from_active_user & of_popular_product].copy()

In [4]:
# Collapse repeat purchases: one row per distinct
# user_id / product_id / product_name combination, then write it out.
pair_columns = ['user_id', 'product_id', 'product_name']
df_condensed = df.loc[:, pair_columns].drop_duplicates().copy()
df_condensed.to_csv(f'{DATA_DIR}/filtered_purchases.csv', index=False)

In [5]:
# Time-aware export: every filtered purchase row is kept (no de-duplication),
# together with the order_number it belongs to.
time_aware_columns = ['user_id', 'product_id', 'order_number', 'product_name']
df_with_orders = df.loc[:, time_aware_columns].copy()
df_with_orders.to_csv(f'{DATA_DIR}/time_aware_filtered_purchases.csv', index=False)