# File and libraries

In [1]:
import pandas as pd
import numpy as np


# Human-readable dataset label. It is not referenced in the cells visible here;
# presumably it is consumed by the helper scripts or later plotting code (TODO confirm).
file_tag = "Instacart Market Basket"

# DSLabs functions

In [2]:
# Execute the helper script in this kernel so the names it defines become available to later cells.
%run "scripts/dslabs_functions.py"

# data functions

In [3]:
# Execute the project's data helpers in this kernel.
# enrich_instacart_df and append_columns_to_excel are called further down but are not defined
# in this notebook; presumably they come from this script (verify).
%run "scripts/data_functions.py"


data_functions lodaded


# sampling and testing

In [4]:

# Sampling switch for this notebook: True samples 1% of the orders (quick test runs),
# False samples 10%.
test_data=False


# Define the sampling function
def sample_user_orders(data, fraction=0.1, random_state=None):
    """Return the rows of ``data`` belonging to a random sample of (user_id, order_id) pairs.

    The distinct (user_id, order_id) pairs are sampled first; every row of ``data``
    that matches a sampled pair is then kept, so a sampled pair is never partially
    included. Row order and the original index of ``data`` are preserved.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``user_id`` and ``order_id`` columns.
    fraction : float, default 0.1
        Fraction of the distinct pairs to draw (forwarded to ``DataFrame.sample(frac=...)``).
    random_state : int, numpy random generator or None, default None
        Seed forwarded to ``DataFrame.sample``. ``None`` keeps the previous
        non-reproducible behaviour; pass an int to get the same sample on every run.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` whose (user_id, order_id) pair was sampled.
    """
    key_columns = ['user_id', 'order_id']

    # Sample among distinct pairs so each pair has the same chance of selection
    # regardless of how many rows it occupies.
    unique_user_orders = data[key_columns].drop_duplicates()
    sampled_user_orders = unique_user_orders.sample(frac=fraction, random_state=random_state)

    # Vectorised pair-membership test. The previous implementation built a Python
    # tuple per row with apply(tuple, axis=1), which is very slow on millions of rows.
    row_keys = pd.MultiIndex.from_frame(data[key_columns])
    sampled_keys = pd.MultiIndex.from_frame(sampled_user_orders)
    return data[row_keys.isin(sampled_keys)]


# Load the raw orders table
orders = pd.read_csv('data/input/orders.csv')

# Sample 1% of the orders for quick test runs and 10% otherwise.
# (The two original branches were identical apart from this fraction.)
sample = 0.01 if test_data else 0.1
order_data = sample_user_orders(orders, fraction=sample)

# drop eval_set as it's not necessary
order_data = order_data.drop(columns=['eval_set'])

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
order_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 342108 entries, 25 to 3421081
Data columns (total 6 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   order_id                342108 non-null  int64  
 1   user_id                 342108 non-null  int64  
 2   order_number            342108 non-null  int64  
 3   order_dow               342108 non-null  int64  
 4   order_hour_of_day       342108 non-null  int64  
 5   days_since_prior_order  321524 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 18.3 MB
None


## merge orders with prior

In [5]:
# Product-level rows read from the "prior" order-products file
prior = pd.read_csv('data/input/order_products__prior.csv')

# Inner join on order_id: only product rows whose order is present in the
# sampled order_data survive, each gaining the order-level columns.
data = prior.merge(order_data, how='inner', on='order_id')

# Target column class distribution


In [6]:
# Name of the column used as the classification target
target = "reordered"

# Relative frequency of each target value (normalize=True yields proportions, not counts)
values = data[target].value_counts(normalize=True) 
print(values)

reordered
1    0.588728
0    0.411272
Name: proportion, dtype: float64


In [7]:
# (rows, columns) of the merged frame
data.shape

(3242962, 9)

In [8]:
# Descriptive statistics for every column (include="all" also covers non-numeric columns)
summary5 = data.describe(include="all")

# Bare last expression so the notebook renders the summary table
summary5

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3242962.0,3242962.0,3242962.0,3242962.0,3242962.0,3242962.0,3242962.0,3242962.0,3033254.0
mean,1711341.0,25588.9,8.360345,0.5887281,102773.7,17.07364,2.732926,13.44481,11.13585
std,987963.0,14096.05,7.160776,0.4920644,59514.03,17.4804,2.088305,4.248853,8.795639
min,6.0,1.0,1.0,0.0,3.0,1.0,0.0,0.0,0.0
25%,854647.0,13535.0,3.0,0.0,51187.0,5.0,1.0,10.0,5.0
50%,1716956.0,25305.0,6.0,1.0,102397.0,11.0,3.0,13.0,8.0
75%,2566826.0,37940.0,11.0,1.0,154261.0,24.0,5.0,16.0,15.0
max,3421078.0,49688.0,109.0,1.0,206209.0,99.0,6.0,23.0,30.0


### Create additional date/time columns

In [9]:
# Categorize the time of day
def categorize_time_of_day(hour):
    """Map an hour of the day to a coarse day-part label.

    Bands are half-open ``[start, stop)``:
    5-12 -> 'Morning', 12-17 -> 'Afternoon', 17-22 -> 'Evening'.
    Any value that falls in none of the bands (including NaN, for which every
    comparison is false) is labelled 'Night'.
    """
    day_bands = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 22, 'Evening'),
    )
    for start, stop, label in day_bands:
        if start <= hour < stop:
            return label
    return 'Night'

# Label each row with the part of the day in which the order was placed
data['order_time_of_day'] = data['order_hour_of_day'].apply(categorize_time_of_day)

# 1 when order_dow >= 5, otherwise 0. Vectorised comparison; yields the same int64
# values as the former row-wise lambda.
# NOTE(review): this treats order_dow 5 and 6 as the weekend. Which weekday the value 0
# stands for is not established anywhere in this notebook; confirm against the dataset docs.
data['is_weekend'] = (data['order_dow'] >= 5).astype('int64')

# Days since the previous order converted to whole weeks (rounded to the nearest week).
# Missing values propagate as NaN, so no explicit null check is needed.
data['weeks_since_prior_order'] = (data['days_since_prior_order'] / 7).round(0)



# enrich data

In [10]:


# enrich_instacart_df is not defined in this notebook (presumably loaded by one of the
# %run helper scripts). Judging by the displayed output it adds product_name, aisle_id,
# department_id, aisle and department columns; verify against its source.
enriched_data = enrich_instacart_df(data)

                   
# Preview the first rows of the enriched frame
enriched_data.head(20)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,order_time_of_day,is_weekend,weeks_since_prior_order,product_name,aisle_id,department_id,aisle,department
0,6,40462,1,0,22352,4,1,12,30.0,Afternoon,0,4.0,Cleanse,31,7,refrigerated,beverages
1,6,15873,2,0,22352,4,1,12,30.0,Afternoon,0,4.0,Dryer Sheets Geranium Scent,75,17,laundry,household
2,6,41897,3,0,22352,4,1,12,30.0,Afternoon,0,4.0,Clean Day Lavender Scent Room Freshener Spray,101,17,air fresheners candles,household
3,10,24852,1,1,135442,4,6,8,8.0,Morning,1,1.0,Banana,24,4,fresh fruits,produce
4,10,4796,2,1,135442,4,6,8,8.0,Morning,1,1.0,Baby Portabella Mushrooms,83,4,fresh vegetables,produce
5,10,31717,3,0,135442,4,6,8,8.0,Morning,1,1.0,Organic Cilantro,16,4,fresh herbs,produce
6,10,47766,4,1,135442,4,6,8,8.0,Morning,1,1.0,Organic Avocado,24,4,fresh fruits,produce
7,10,4605,5,1,135442,4,6,8,8.0,Morning,1,1.0,Yellow Onions,83,4,fresh vegetables,produce
8,10,1529,6,0,135442,4,6,8,8.0,Morning,1,1.0,"Parsley, Italian (Flat), New England Grown",16,4,fresh herbs,produce
9,10,21137,7,1,135442,4,6,8,8.0,Morning,1,1.0,Organic Strawberries,24,4,fresh fruits,produce


# Encoding Excel File

In [13]:

# Column groups whose distinct value combinations are exported, keyed by target sheet name
columns_to_save = {
    'product_name': ['product_id', 'product_name'],
    'aisle': ['department_id', 'department', 'aisle_id', 'aisle'],
    'department': ['department_id', 'department'],
}

# append_columns_to_excel is not defined in this notebook (presumably it comes from
# scripts/data_functions.py).
# NOTE(review): the recorded output of this cell shows the helper reporting
# "property 'book' of 'OpenpyxlWriter' object has no setter" and then creating a new file,
# so its append path looks broken and should be fixed in the helper. The success message
# below is printed regardless of that.
# The path is a plain string literal: the original f-string had no placeholders.
append_columns_to_excel(enriched_data, columns_to_save, 'data/instacart_pre_encoding.xlsx')

print("Excel file has been updated successfully.")

Error: property 'book' of 'OpenpyxlWriter' object has no setter
The file might be corrupt or invalid. Creating a new file.
Excel file has been updated successfully.


# Final df to csv

In [14]:
# Persist the pre-processed frame without the index column.
# NOTE(review): this writes `data`, not `enriched_data`; confirm that leaving the
# enrichment columns out of the CSV is intended.
data.to_csv('data/instacart_pre_proc.csv',index=False)