# File and libraries

In [16]:
import pandas as pd
import numpy as np


file_tag = "Instacart Market Basket"

# DSLabs functions

In [17]:
%run "scripts/dslabs_functions.py"


# data functions

In [18]:
%run "scripts/data_functions.py"


data_functions lodaded


# sampling and testing

In [19]:

# test_data=True
test_data=False


# Define the sampling function
def sample_user_orders(data, fraction=0.1):
    # Get unique user_ids and order_ids
    unique_user_orders = data[['user_id', 'order_id']].drop_duplicates()
    sampled_user_orders = unique_user_orders.sample(frac=fraction)
    
    # Filter the dataset to include only the sampled users and orders
    sampled_data = data[data[['user_id', 'order_id']].apply(tuple, axis=1).isin(sampled_user_orders.apply(tuple, axis=1))]
    return sampled_data


# Load the data
orders = pd.read_csv('data/input/orders.csv')


if test_data==True:



    # Apply the sampling to each group 1%
    sample=0.01
    order_data = sample_user_orders(orders, fraction=sample)
    

else:
    
    # Apply the sampling to each group 10%
    sample=0.2
    order_data = sample_user_orders(orders, fraction=sample)    
    
    
# drop eval_set as its not necessary
order_data=order_data.drop(['eval_set'], axis=1)

print(order_data.info())

<class 'pandas.core.frame.DataFrame'>
Index: 684217 entries, 2 to 3421059
Data columns (total 6 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   order_id                684217 non-null  int64  
 1   user_id                 684217 non-null  int64  
 2   order_number            684217 non-null  int64  
 3   order_dow               684217 non-null  int64  
 4   order_hour_of_day       684217 non-null  int64  
 5   days_since_prior_order  642929 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 36.5 MB
None


## merge orders with prior

In [20]:
prior=pd.read_csv('data/input/order_products__prior.csv')


# merge the two dataframes on order id
data = pd.merge(prior,order_data , on='order_id', how='inner')

# class target column


In [21]:
target = "reordered"

values = data[target].value_counts(normalize=True) 
print(values)

reordered
1    0.589058
0    0.410942
Name: proportion, dtype: float64


In [22]:
data.shape

(6484493, 9)

In [23]:
summary5 = data.describe(include="all")

summary5

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,6484493.0,6484493.0,6484493.0,6484493.0,6484493.0,6484493.0,6484493.0,6484493.0,6068761.0
mean,1711740.0,25572.99,8.34003,0.5890578,103072.1,17.10323,2.743039,13.42976,11.10043
std,987142.0,14097.07,7.1049,0.4920048,59463.69,17.50739,2.089,4.247166,8.784925
min,3.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,857136.0,13524.0,3.0,0.0,51581.0,5.0,1.0,10.0,5.0
50%,1711463.0,25237.0,6.0,1.0,103036.0,11.0,3.0,13.0,8.0
75%,2565257.0,37935.0,11.0,1.0,154466.0,24.0,5.0,16.0,15.0
max,3421073.0,49688.0,116.0,1.0,206208.0,99.0,6.0,23.0,30.0


### additional date columns creation

In [24]:
# Categorize the time of day
def categorize_time_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 22:
        return 'Evening'
    else:
        return 'Night'

data['order_time_of_day'] = data['order_hour_of_day'].apply(categorize_time_of_day)

data['is_weekend'] = data['order_dow'].apply(lambda x: 1 if x >= 5 else 0)  # 1 for weekend, 0 for weekday


data['weeks_since_prior_order'] = data['days_since_prior_order'].apply(lambda x: round(x / 7, 0) if pd.notnull(x) else np.nan)



## encode date time to cyclic

In [25]:
from math import pi, sin, cos

# Function to apply sin and cos on an already-mapped cyclic feature
def apply_sin_cos_for_mapped_column(data, column):
    data[column + '_sin'] = np.sin(data[column])  # apply sine
    data[column + '_cos'] = np.cos(data[column])  # apply cosine
    return data

# Function to encode cyclic variables using sine and cosine
def cyclic_encode(value, x_max):
    # sine and cosine components to capture cyclic pattern
    value_sin = np.sin(2 * np.pi * value / x_max)
    value_cos = np.cos(2 * np.pi * value / x_max)
    return value_sin, value_cos


day_of_week_encoding_mapping={
    'Night':0,   
    'Morning':pi/2,
    'Afternoon':pi,
    'Evening':-pi/2,   
}

data['order_time_of_day_enc'] = encode_column_with_mapping(data, 'order_time_of_day', day_of_week_encoding_mapping)


# Encoding for local_hour (0-23)
data['order_hour_of_day_sin'], data['order_hour_of_day_cos'] = zip(*data['order_hour_of_day'].apply(lambda x: cyclic_encode(x, 23)))

# Encoding for day_of_week (0-6)
data['order_dow_sin'], data['order_dow_cos'] = zip(*data['order_dow'].apply(lambda x: cyclic_encode(x, 6)))


# user order dataframe for clustering

In [26]:
# Group by order_id and aggregate features
order_agg = data.groupby('order_id').agg({
    'add_to_cart_order': 'max',
    'reordered': 'mean'
}).rename(columns={'add_to_cart_order': 'num_products', 'reordered': 'reorder_rate'})


data=data.merge(order_agg, on='order_id', how='left')

In [31]:
# Drop product_id column and get unique values per order_id
distinct_orders = data.drop(columns=['product_id']).drop_duplicates()

# Group by user_id and aggregate features
user_agg = distinct_orders.groupby('user_id').agg({
    'order_number': 'max',  # Total number of orders
    'num_products': 'sum',  # Total number of products purchased
    'days_since_prior_order': 'mean',  # Mean lag between orders
    'num_products': 'mean',  # Mean number of products per order
    'reorder_rate': 'mean',  # Mean reorder rate
    'is_weekend': 'mean',  # Mean weekend order
    'order_dow': 'mean',  # Mean day of week of order
    'order_hour_of_day': 'mean',  # Mean hour of order
    'order_hour_of_day': 'median',  # Median hour of order
}).rename(columns={
    'order_number': 'total_orders',
    'num_products': 'total_products',
    'days_since_prior_order': 'mean_lag_between_orders',
    'num_products': 'mean_num_products',
    'reorder_rate': 'mean_reorder_rate',
    'is_weekend': 'mean_weekend_order',
    'order_dow': 'mean_order_dow',
    'order_hour_of_day': 'mean_order_hour',
    'order_hour_of_day': 'median_order_hour',
}).reset_index()


# user_agg = user_agg.dropna()
user_agg.head()

Unnamed: 0,user_id,total_orders,mean_num_products,mean_lag_between_orders,mean_reorder_rate,mean_weekend_order,mean_order_dow,median_order_hour
0,1,7,5.0,20.5,0.8,0.0,2.0,10.5
1,2,12,15.88,23.68,0.48,0.24,1.96,9.0
2,3,3,6.0,21.0,0.5,0.0,3.0,16.0
3,7,9,17.444444,17.425926,0.833333,0.185185,1.777778,16.5
4,12,3,12.0,14.0,0.25,1.0,5.0,8.0


# enrich main order product dataframe

In [28]:


enriched_data = enrich_instacart_df(data)

                   
enriched_data.head(20)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,order_time_of_day,...,order_hour_of_day_cos,order_dow_sin,order_dow_cos,num_products,reorder_rate,product_name,aisle_id,department_id,aisle,department
0,3,33754,1,1,205970,16,5,17,12.0,Evening,...,-0.068242,-0.866025,0.5,8,1.0,Total 2% with Strawberry Lowfat Greek Strained...,120,16,yogurt,dairy eggs
1,3,24838,2,1,205970,16,5,17,12.0,Evening,...,-0.068242,-0.866025,0.5,8,1.0,Unsweetened Almondmilk,91,16,soy lactosefree,dairy eggs
2,3,17704,3,1,205970,16,5,17,12.0,Evening,...,-0.068242,-0.866025,0.5,8,1.0,Lemons,123,4,packaged vegetables fruits,produce
3,3,21903,4,1,205970,16,5,17,12.0,Evening,...,-0.068242,-0.866025,0.5,8,1.0,Organic Baby Spinach,123,4,packaged vegetables fruits,produce
4,3,17668,5,1,205970,16,5,17,12.0,Evening,...,-0.068242,-0.866025,0.5,8,1.0,Unsweetened Chocolate Almond Breeze Almond Milk,91,16,soy lactosefree,dairy eggs
5,3,46667,6,1,205970,16,5,17,12.0,Evening,...,-0.068242,-0.866025,0.5,8,1.0,Organic Ginger Root,83,4,fresh vegetables,produce
6,3,17461,7,1,205970,16,5,17,12.0,Evening,...,-0.068242,-0.866025,0.5,8,1.0,Air Chilled Organic Boneless Skinless Chicken ...,35,12,poultry counter,meat seafood
7,3,32665,8,1,205970,16,5,17,12.0,Evening,...,-0.068242,-0.866025,0.5,8,1.0,Organic Ezekiel 49 Bread Cinnamon Raisin,112,3,bread,bakery
8,4,46842,1,0,178520,36,1,9,7.0,Morning,...,-0.775711,0.866025,0.5,13,0.923077,Plain Pre-Sliced Bagels,93,3,breakfast bakery,bakery
9,4,26434,2,1,178520,36,1,9,7.0,Morning,...,-0.775711,0.866025,0.5,13,0.923077,Honey/Lemon Cough Drops,11,11,cold flu allergy,personal care


# Final df to csv

In [29]:
data.to_csv('data/instacart_pre_proc.csv',index=False)

In [32]:
user_agg.to_csv('data/instacart_user_pre_proc.csv',index=False)