# File and libraries

In [25]:
import pandas as pd
import numpy as np


file_tag = "Instacart Market Basket"

# DSLabs functions

In [26]:
%run "scripts/dslabs_functions.py"


# data functions

In [27]:
%run "scripts/data_functions.py"


data_functions lodaded


# sampling and testing

In [28]:

# test_data=True
test_data=False


# Define the sampling function
def sample_user_orders(data, fraction=0.1):
    # Get unique user_ids and order_ids
    unique_user_orders = data[['user_id', 'order_id']].drop_duplicates()
    sampled_user_orders = unique_user_orders.sample(frac=fraction)
    
    # Filter the dataset to include only the sampled users and orders
    sampled_data = data[data[['user_id', 'order_id']].apply(tuple, axis=1).isin(sampled_user_orders.apply(tuple, axis=1))]
    return sampled_data


# Load the data
orders = pd.read_csv('data/input/orders.csv')


if test_data==True:



    # Apply the sampling to each group 1%
    sample=0.01
    order_data = sample_user_orders(orders, fraction=sample)
    

else:
    
    # Apply the sampling to each group 10%
    sample=0.1
    order_data = sample_user_orders(orders, fraction=sample)    
    
    
# drop eval_set as its not necessary
order_data=order_data.drop(['eval_set'], axis=1)

print(order_data.info())

<class 'pandas.core.frame.DataFrame'>
Index: 342108 entries, 30 to 3421080
Data columns (total 6 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   order_id                342108 non-null  int64  
 1   user_id                 342108 non-null  int64  
 2   order_number            342108 non-null  int64  
 3   order_dow               342108 non-null  int64  
 4   order_hour_of_day       342108 non-null  int64  
 5   days_since_prior_order  321581 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 18.3 MB
None


## merge orders with prior

In [29]:
prior=pd.read_csv('data/input/order_products__prior.csv')


# merge the two dataframes on order id
data = pd.merge(prior,order_data , on='order_id', how='inner')

# class target column


In [30]:
target = "reordered"

values = data[target].value_counts(normalize=True) 
print(values)

reordered
1    0.589881
0    0.410119
Name: proportion, dtype: float64


In [31]:
data.shape

(3241297, 9)

In [32]:
summary5 = data.describe(include="all")

summary5

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3241297.0,3241297.0,3241297.0,3241297.0,3241297.0,3241297.0,3241297.0,3241297.0,3034599.0
mean,1715403.0,25588.66,8.35993,0.5898815,102854.9,17.11146,2.739031,13.42555,11.12681
std,987717.6,14093.87,7.140972,0.4918551,59495.35,17.50739,2.092428,4.240616,8.798426
min,15.0,1.0,1.0,0.0,3.0,1.0,0.0,0.0,0.0
25%,859161.0,13535.0,3.0,0.0,51249.0,5.0,1.0,10.0,5.0
50%,1719978.0,25273.0,6.0,1.0,102407.0,11.0,3.0,13.0,8.0
75%,2570191.0,37945.0,11.0,1.0,154342.0,24.0,5.0,16.0,15.0
max,3421074.0,49688.0,116.0,1.0,206209.0,99.0,6.0,23.0,30.0


### additional date columns creation

In [33]:
# Categorize the time of day
def categorize_time_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 22:
        return 'Evening'
    else:
        return 'Night'

data['order_time_of_day'] = data['order_hour_of_day'].apply(categorize_time_of_day)

data['is_weekend'] = data['order_dow'].apply(lambda x: 1 if x >= 5 else 0)  # 1 for weekend, 0 for weekday


data['weeks_since_prior_order'] = data['days_since_prior_order'].apply(lambda x: round(x / 7, 0) if pd.notnull(x) else np.nan)



## encode date time to cyclic

In [34]:
from math import pi, sin, cos

# Function to apply sin and cos on an already-mapped cyclic feature
def apply_sin_cos_for_mapped_column(data, column):
    data[column + '_sin'] = np.sin(data[column])  # apply sine
    data[column + '_cos'] = np.cos(data[column])  # apply cosine
    return data

# Function to encode cyclic variables using sine and cosine
def cyclic_encode(value, x_max):
    # sine and cosine components to capture cyclic pattern
    value_sin = np.sin(2 * np.pi * value / x_max)
    value_cos = np.cos(2 * np.pi * value / x_max)
    return value_sin, value_cos


day_of_week_encoding_mapping={
    'Night':0,   
    'Morning':pi/2,
    'Afternoon':pi,
    'Evening':-pi/2,   
}

data['order_time_of_day_enc'] = encode_column_with_mapping(data, 'order_time_of_day', day_of_week_encoding_mapping)


# Encoding for local_hour (0-23)
data['order_hour_of_day_sin'], data['order_hour_of_day_cos'] = zip(*data['order_hour_of_day'].apply(lambda x: cyclic_encode(x, 23)))

# Encoding for day_of_week (0-6)
data['order_dow_sin'], data['order_dow_cos'] = zip(*data['order_dow'].apply(lambda x: cyclic_encode(x, 6)))


# enrich data

In [35]:


enriched_data = enrich_instacart_df(data)

                   
enriched_data.head(20)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,order_time_of_day,...,order_time_of_day_enc,order_hour_of_day_sin,order_hour_of_day_cos,order_dow_sin,order_dow_cos,product_name,aisle_id,department_id,aisle,department
0,15,19660,1,1,54901,51,3,11,2.0,Morning,...,1.570796,0.136167,-0.990686,1.224647e-16,-1.0,Spring Water,115,7,water seltzer sparkling water,beverages
1,15,21195,2,1,54901,51,3,11,2.0,Morning,...,1.570796,0.136167,-0.990686,1.224647e-16,-1.0,Organic Extra Virgin Olive Oil,19,13,oils vinegars,pantry
2,15,7461,3,1,54901,51,3,11,2.0,Morning,...,1.570796,0.136167,-0.990686,1.224647e-16,-1.0,Pinto Beans No Salt Added,59,15,canned meals beans,canned goods
3,15,2996,4,1,54901,51,3,11,2.0,Morning,...,1.570796,0.136167,-0.990686,1.224647e-16,-1.0,Honeysuckle Hand Soap,25,11,soap,personal care
4,15,32463,5,1,54901,51,3,11,2.0,Morning,...,1.570796,0.136167,-0.990686,1.224647e-16,-1.0,Olive Oil & Aloe Vera Hand Soap,25,11,soap,personal care
5,16,9755,1,1,174840,18,3,12,13.0,Afternoon,...,3.141593,-0.136167,-0.990686,1.224647e-16,-1.0,Original Popcorn,23,19,popcorn jerky,snacks
6,16,25466,2,0,174840,18,3,12,13.0,Afternoon,...,3.141593,-0.136167,-0.990686,1.224647e-16,-1.0,Water,115,7,water seltzer sparkling water,beverages
7,16,45437,3,0,174840,18,3,12,13.0,Afternoon,...,3.141593,-0.136167,-0.990686,1.224647e-16,-1.0,Sea Salt Made With Organic Grain Rice Chips,107,19,chips pretzels,snacks
8,24,40078,1,0,193635,19,0,14,0.0,Afternoon,...,3.141593,-0.631088,-0.775711,0.0,1.0,Strawberry Lemonade Frozen Pops,37,1,ice cream ice,frozen
9,24,9065,2,0,193635,19,0,14,0.0,Afternoon,...,3.141593,-0.631088,-0.775711,0.0,1.0,Mint Chocolate Cookie Ice Cream,37,1,ice cream ice,frozen


# Final df to csv

In [36]:
data.to_csv('data/instacart_pre_proc.csv',index=False)