# File and libraries

In [1]:
import pandas as pd
import numpy as np


file_tag = "Instacart Market Basket"

# DSLabs functions

In [2]:
%run "scripts/dslabs_functions.py"


# data functions

In [3]:
%run "scripts/data_functions.py"


data_functions lodaded


# sampling and testing

In [4]:

# test_data=True
test_data=False


# Define the sampling function
def sample_user_orders(data, fraction=0.1):
    # Get unique user_ids and order_ids
    unique_user_orders = data[['user_id', 'order_id']].drop_duplicates()
    sampled_user_orders = unique_user_orders.sample(frac=fraction)
    
    # Filter the dataset to include only the sampled users and orders
    sampled_data = data[data[['user_id', 'order_id']].apply(tuple, axis=1).isin(sampled_user_orders.apply(tuple, axis=1))]
    return sampled_data


# Load the data
orders = pd.read_csv('data/input/orders.csv')


if test_data==True:



    # Apply the sampling to each group 1%
    sample=0.01
    order_data = sample_user_orders(orders, fraction=sample)
    

else:
    
    # Apply the sampling to each group 10%
    sample=0.1
    order_data = sample_user_orders(orders, fraction=sample)    
    
    
# drop eval_set as its not necessary
order_data=order_data.drop(['eval_set'], axis=1)

print(order_data.info())

<class 'pandas.core.frame.DataFrame'>
Index: 342108 entries, 6 to 3421081
Data columns (total 6 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   order_id                342108 non-null  int64  
 1   user_id                 342108 non-null  int64  
 2   order_number            342108 non-null  int64  
 3   order_dow               342108 non-null  int64  
 4   order_hour_of_day       342108 non-null  int64  
 5   days_since_prior_order  321418 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 18.3 MB
None


## merge orders with prior

In [5]:
prior=pd.read_csv('data/input/order_products__prior.csv')


# merge the two dataframes on order id
data = pd.merge(prior,order_data , on='order_id', how='inner')

# class target column


In [6]:
target = "reordered"

values = data[target].value_counts(normalize=True) 
print(values)

reordered
1    0.589562
0    0.410438
Name: proportion, dtype: float64


In [7]:
data.shape

(3241233, 9)

In [8]:
summary5 = data.describe(include="all")

summary5

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3241233.0,3241233.0,3241233.0,3241233.0,3241233.0,3241233.0,3241233.0,3241233.0,3032641.0
mean,1709374.0,25571.06,8.350269,0.5895624,102988.5,17.15303,2.738741,13.42085,11.11255
std,986782.2,14099.22,7.115777,0.4919133,59451.77,17.51208,2.089052,4.251476,8.785146
min,8.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,856468.0,13517.0,3.0,0.0,51474.0,5.0,1.0,10.0,5.0
50%,1706684.0,25232.0,6.0,1.0,102628.0,11.0,3.0,13.0,8.0
75%,2564846.0,37935.0,11.0,1.0,154578.0,24.0,5.0,16.0,15.0
max,3421081.0,49688.0,116.0,1.0,206209.0,99.0,6.0,23.0,30.0


### additional date columns creation

In [9]:
# Categorize the time of day
def categorize_time_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 22:
        return 'Evening'
    else:
        return 'Night'

data['order_time_of_day'] = data['order_hour_of_day'].apply(categorize_time_of_day)

data['is_weekend'] = data['order_dow'].apply(lambda x: 1 if x >= 5 else 0)  # 1 for weekend, 0 for weekday


data['weeks_since_prior_order'] = data['days_since_prior_order'].apply(lambda x: round(x / 7, 0) if pd.notnull(x) else np.nan)



## encode date time to cyclic

In [10]:
from math import pi, sin, cos

# Function to apply sin and cos on an already-mapped cyclic feature
def apply_sin_cos_for_mapped_column(data, column):
    data[column + '_sin'] = np.sin(data[column])  # apply sine
    data[column + '_cos'] = np.cos(data[column])  # apply cosine
    return data

# Function to encode cyclic variables using sine and cosine
def cyclic_encode(value, x_max):
    # sine and cosine components to capture cyclic pattern
    value_sin = np.sin(2 * np.pi * value / x_max)
    value_cos = np.cos(2 * np.pi * value / x_max)
    return value_sin, value_cos


day_of_week_encoding_mapping={
    'Night':0,   
    'Morning':pi/2,
    'Afternoon':pi,
    'Evening':-pi/2,   
}

data['order_time_of_day_enc'] = encode_column_with_mapping(data, 'order_time_of_day', day_of_week_encoding_mapping)


# Encoding for local_hour (0-23)
data['order_hour_of_day_sin'], data['order_hour_of_day_cos'] = zip(*data['order_hour_of_day'].apply(lambda x: cyclic_encode(x, 23)))

# Encoding for day_of_week (0-6)
data['order_dow_sin'], data['order_dow_cos'] = zip(*data['order_dow'].apply(lambda x: cyclic_encode(x, 6)))


# enrich data

In [11]:


enriched_data = enrich_instacart_df(data)

                   
enriched_data.head(20)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,order_time_of_day,...,order_time_of_day_enc,order_hour_of_day_sin,order_hour_of_day_cos,order_dow_sin,order_dow_cos,product_name,aisle_id,department_id,aisle,department
0,8,23423,1,1,3107,5,4,6,17.0,Morning,...,1.570796,0.997669,-0.068242,-0.8660254,-0.5,Original Hawaiian Sweet Rolls,43,3,buns rolls,bakery
1,15,19660,1,1,54901,51,3,11,2.0,Morning,...,1.570796,0.136167,-0.990686,1.224647e-16,-1.0,Spring Water,115,7,water seltzer sparkling water,beverages
2,15,21195,2,1,54901,51,3,11,2.0,Morning,...,1.570796,0.136167,-0.990686,1.224647e-16,-1.0,Organic Extra Virgin Olive Oil,19,13,oils vinegars,pantry
3,15,7461,3,1,54901,51,3,11,2.0,Morning,...,1.570796,0.136167,-0.990686,1.224647e-16,-1.0,Pinto Beans No Salt Added,59,15,canned meals beans,canned goods
4,15,2996,4,1,54901,51,3,11,2.0,Morning,...,1.570796,0.136167,-0.990686,1.224647e-16,-1.0,Honeysuckle Hand Soap,25,11,soap,personal care
5,15,32463,5,1,54901,51,3,11,2.0,Morning,...,1.570796,0.136167,-0.990686,1.224647e-16,-1.0,Olive Oil & Aloe Vera Hand Soap,25,11,soap,personal care
6,18,8021,1,0,118860,3,4,20,6.0,Evening,...,-1.570796,-0.730836,0.682553,-0.8660254,-0.5,100% Recycled Paper Towels,54,17,paper goods,household
7,18,34969,2,1,118860,3,4,20,6.0,Evening,...,-1.570796,-0.730836,0.682553,-0.8660254,-0.5,Red Vine Tomato,83,4,fresh vegetables,produce
8,18,1000,3,0,118860,3,4,20,6.0,Evening,...,-1.570796,-0.730836,0.682553,-0.8660254,-0.5,Apricots,18,10,bulk dried fruits vegetables,bulk
9,18,5212,4,0,118860,3,4,20,6.0,Evening,...,-1.570796,-0.730836,0.682553,-0.8660254,-0.5,Watermelon Chunks,123,4,packaged vegetables fruits,produce


# Final df to csv

In [12]:
data.to_csv('data/instacart_pre_proc.csv',index=False)