# File and libraries

In [21]:
import pandas as pd
import numpy as np


file_tag = "Instacart Market Basket"

# DSLabs functions

In [22]:
%run "scripts/dslabs_functions.py"


# data functions

In [23]:
%run "scripts/data_functions.py"


data_functions lodaded


# sampling and testing

In [24]:

# test_data=True
test_data=False


# Define the sampling function
def sample_user_orders(data, fraction=0.1):
    # Get unique user_ids and order_ids
    unique_user_orders = data[['user_id', 'order_id']].drop_duplicates()
    sampled_user_orders = unique_user_orders.sample(frac=fraction)
    
    # Filter the dataset to include only the sampled users and orders
    sampled_data = data[data[['user_id', 'order_id']].apply(tuple, axis=1).isin(sampled_user_orders.apply(tuple, axis=1))]
    return sampled_data


# Load the data
orders = pd.read_csv('data/input/orders.csv')


if test_data==True:



    # Apply the sampling to each group 1%
    sample=0.05
    order_data = sample_user_orders(orders, fraction=sample)
    

else:
    
    order_data = orders
    
    
# drop eval_set as its not necessary
order_data=order_data.drop(['eval_set'], axis=1)

print(order_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_number            int64  
 3   order_dow               int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(5)
memory usage: 156.6 MB
None


## merge orders with prior

In [25]:
prior=pd.read_csv('data/input/order_products__prior.csv')


# merge the two dataframes on order id
data = pd.merge(prior,order_data , on='order_id', how='inner')

# class target column


In [26]:
target = "reordered"

values = data[target].value_counts(normalize=True) 
print(values)

reordered
1    0.589697
0    0.410303
Name: proportion, dtype: float64


In [27]:
data.shape

(32434489, 9)

In [28]:
summary5 = data.describe(include="all")

summary5

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,32434490.0,32434490.0,32434490.0,32434490.0,32434490.0,32434490.0,32434490.0,32434490.0,30356420.0
mean,1710749.0,25576.34,8.351076,0.5896975,102937.2,17.14205,2.738818,13.42498,11.10407
std,987300.7,14096.69,7.126671,0.4918886,59466.48,17.53504,2.090049,4.246365,8.778914
min,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,855943.0,13530.0,3.0,0.0,51421.0,5.0,1.0,10.0,5.0
50%,1711048.0,25256.0,6.0,1.0,102611.0,11.0,3.0,13.0,8.0
75%,2565514.0,37935.0,11.0,1.0,154391.0,24.0,5.0,16.0,15.0
max,3421083.0,49688.0,145.0,1.0,206209.0,99.0,6.0,23.0,30.0


### additional date columns creation

In [29]:
# Categorize the time of day
def categorize_time_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 22:
        return 'Evening'
    else:
        return 'Night'

data['order_time_of_day'] = data['order_hour_of_day'].apply(categorize_time_of_day)

data['is_weekend'] = data['order_dow'].apply(lambda x: 1 if x >= 5 else 0)  # 1 for weekend, 0 for weekday


data['weeks_since_prior_order'] = data['days_since_prior_order'].apply(lambda x: round(x / 7, 0) if pd.notnull(x) else np.nan)



## encode date time to cyclic

In [30]:
from math import pi, sin, cos

# Function to apply sin and cos on an already-mapped cyclic feature
def apply_sin_cos_for_mapped_column(data, column):
    data[column + '_sin'] = np.sin(data[column])  # apply sine
    data[column + '_cos'] = np.cos(data[column])  # apply cosine
    return data

# Function to encode cyclic variables using sine and cosine
def cyclic_encode(value, x_max):
    # sine and cosine components to capture cyclic pattern
    value_sin = np.sin(2 * np.pi * value / x_max)
    value_cos = np.cos(2 * np.pi * value / x_max)
    return value_sin, value_cos


day_of_week_encoding_mapping={
    'Night':0,   
    'Morning':pi/2,
    'Afternoon':pi,
    'Evening':-pi/2,   
}

data['order_time_of_day_enc'] = encode_column_with_mapping(data, 'order_time_of_day', day_of_week_encoding_mapping)


# Encoding for local_hour (0-23)
data['order_hour_of_day_sin'], data['order_hour_of_day_cos'] = zip(*data['order_hour_of_day'].apply(lambda x: cyclic_encode(x, 23)))

# Encoding for day_of_week (0-6)
data['order_dow_sin'], data['order_dow_cos'] = zip(*data['order_dow'].apply(lambda x: cyclic_encode(x, 6)))


# user order dataframe for clustering

In [31]:
# Group by order_id and aggregate features
order_agg = data.groupby('order_id').agg({
    'add_to_cart_order': 'max',
    'reordered': 'mean'
}).rename(columns={'add_to_cart_order': 'num_products', 'reordered': 'reorder_rate'})


data=data.merge(order_agg, on='order_id', how='left')

In [32]:
# Drop product_id column and get unique values per order_id
distinct_orders = data.drop(columns=['product_id']).drop_duplicates()

# Group by user_id and aggregate features using named aggregations
user_agg = distinct_orders.groupby('user_id').agg(
    total_orders=('order_number', 'max'),  # Total number of orders
    total_products=('num_products', 'sum'),  # Total products purchased
    mean_lag_between_orders=('days_since_prior_order', 'mean'),
    std_lag_between_orders=('days_since_prior_order', 'std'),
    mean_reorder_rate=('reorder_rate', 'mean'),
    mean_weekend_order=('is_weekend', 'mean'),
    mean_order_dow=('order_dow', 'mean'),
    mean_order_hour=('order_hour_of_day', 'mean'),
    median_order_hour=('order_hour_of_day', 'median'),
    peak_order_hour=('order_hour_of_day', lambda x: x.mode()[0]),
    order_day_variety=('order_dow', lambda x: x.nunique()),
).reset_index()

# Calculate additional features
user_agg['order_frequency'] = user_agg['total_orders'] / user_agg['mean_lag_between_orders']
user_agg['reorder_ratio'] = user_agg['mean_reorder_rate'] / user_agg['total_products']
user_agg['unique_products'] = distinct_orders.groupby('user_id')['order_id'].nunique().values
user_agg['weekend_order_ratio'] = user_agg['mean_weekend_order'] / user_agg['total_orders']
user_agg['order_size_variability'] = distinct_orders.groupby('user_id')['num_products'].std().values

# Handle NaN values (if needed)
# user_agg = user_agg.fillna(0)  # or user_agg.dropna()

user_agg.head()

Unnamed: 0,user_id,total_orders,total_products,mean_lag_between_orders,std_lag_between_orders,mean_reorder_rate,mean_weekend_order,mean_order_dow,mean_order_hour,median_order_hour,peak_order_hour,order_day_variety,order_frequency,reorder_ratio,unique_products,weekend_order_ratio,order_size_variability
0,1,10,369,20.259259,9.304463,0.694915,0.0,2.644068,10.542373,9.0,7,4,0.493601,0.001883,10,0.0,1.582155
1,2,14,3141,15.967033,9.119769,0.476923,0.030769,2.005128,10.441026,10.0,9,5,0.876807,0.000152,14,0.002198,5.469097
2,3,12,694,11.487179,4.869048,0.625,0.0,1.011364,16.352273,16.0,16,4,1.044643,0.000901,12,0.0,2.042265
3,4,5,82,15.357143,8.580901,0.055556,0.5,4.722222,13.111111,13.0,15,3,0.325581,0.000678,5,0.1,2.12055
4,5,4,371,14.5,4.263801,0.378378,0.0,1.621622,15.72973,16.0,18,3,0.275862,0.00102,4,0.0,2.315245


# enrich main order product dataframe

In [33]:


enriched_data = enrich_instacart_df(data)

                   
enriched_data.head(20)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,order_time_of_day,...,order_hour_of_day_cos,order_dow_sin,order_dow_cos,num_products,reorder_rate,product_name,aisle_id,department_id,aisle,department
0,2,33120,1,1,202279,3,5,9,8.0,Morning,...,-0.775711,-0.866025,0.5,9,0.666667,Organic Egg Whites,86,16,eggs,dairy eggs
1,2,28985,2,1,202279,3,5,9,8.0,Morning,...,-0.775711,-0.866025,0.5,9,0.666667,Michigan Organic Kale,83,4,fresh vegetables,produce
2,2,9327,3,0,202279,3,5,9,8.0,Morning,...,-0.775711,-0.866025,0.5,9,0.666667,Garlic Powder,104,13,spices seasonings,pantry
3,2,45918,4,1,202279,3,5,9,8.0,Morning,...,-0.775711,-0.866025,0.5,9,0.666667,Coconut Butter,19,13,oils vinegars,pantry
4,2,30035,5,0,202279,3,5,9,8.0,Morning,...,-0.775711,-0.866025,0.5,9,0.666667,Natural Sweetener,17,13,baking ingredients,pantry
5,2,17794,6,1,202279,3,5,9,8.0,Morning,...,-0.775711,-0.866025,0.5,9,0.666667,Carrots,83,4,fresh vegetables,produce
6,2,40141,7,1,202279,3,5,9,8.0,Morning,...,-0.775711,-0.866025,0.5,9,0.666667,Original Unflavored Gelatine Mix,105,13,doughs gelatins bake mixes,pantry
7,2,1819,8,1,202279,3,5,9,8.0,Morning,...,-0.775711,-0.866025,0.5,9,0.666667,All Natural No Stir Creamy Almond Butter,88,13,spreads,pantry
8,2,43668,9,0,202279,3,5,9,8.0,Morning,...,-0.775711,-0.866025,0.5,9,0.666667,Classic Blend Cole Slaw,123,4,packaged vegetables fruits,produce
9,3,33754,1,1,205970,16,5,17,12.0,Evening,...,-0.068242,-0.866025,0.5,8,1.0,Total 2% with Strawberry Lowfat Greek Strained...,120,16,yogurt,dairy eggs


# Final df to csv

In [34]:
if test_data==False:
    
    data.to_csv('data/instacart_pre_proc.csv',index=False)

    user_agg.to_csv('data/instacart_user_pre_proc.csv',index=False)

## sample df to csv

In [39]:
# Apply the sampling to each group 1%
sample=0.20

In [40]:
if test_data==False:
    
    sample_data = sample_user_orders(data, fraction=sample)

    sample_data.to_csv('data/instacart_pre_proc_sample.csv',index=False)


In [41]:
if test_data==False:
    
    users_df_sample=user_agg.sample(frac=sample)
    
    users_df_sample.to_csv('data/instacart_user_pre_proc_sample.csv',index=False)
