# Data Wrangling and Analysis

## Importing Libraries

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import os

## Defining Project Path

In [2]:
# Folder that holds the original (unmodified) Instacart CSV files.
# Raw string literals keep the Windows backslashes from being read as escape
# sequences; the two adjacent literals are joined into a single path string.
project_path = (
    r"C:\Users\mshhan\Documents\05-2024 Instacart Basket Analysis"
    r"\02 Data\Original Data"
)


## Importing Data

In [7]:
# Locations of the two raw data sets inside the project folder
orders_path = os.path.join(project_path, "orders.csv")
products_path = os.path.join(project_path, "products.csv")

# Orders: the callable passed to usecols keeps every column except 'eval_set',
# so that column is skipped while the file is being read
df_ords = pd.read_csv(
    orders_path,
    usecols=lambda column_name: column_name != 'eval_set',
)

# Products: loaded in full
df_prods = pd.read_csv(products_path)

## Additional Data Wrangling Procedures

In [8]:
# Casting 'order_id' to string; the column -> dtype mapping leaves every
# other column's dtype as it is
df_ords = df_ords.astype({'order_id': str})

In [9]:
# Giving 'order_dow' the more readable column name 'day_of_week'
df_ords = df_ords.rename({'order_dow': 'day_of_week'}, axis='columns')

In [10]:
# Finding the busiest hour for placing orders.
# The per-hour frequency table is built once and reused for both the hour
# (its index label) and the order count (its value), instead of counting the
# whole column twice.
hour_counts = df_ords['order_hour_of_day'].value_counts()
busiest_hour = hour_counts.idxmax()
busiest_hour_count = hour_counts.max()

print(f"The busiest hour for placing orders is {busiest_hour} with {busiest_hour_count} orders.")


The busiest hour for placing orders is 10 with 288418 orders.


## Reformatting Data Dictionary

In [21]:
# Loading departments.csv, the lookup table that maps each department_id to a
# department name. The file is stored in wide format: the header row holds the
# IDs (after a leading 'department_id' label) and the single data row holds
# the matching names.
data_dict_path = os.path.join(project_path, "departments.csv")
df_dep = pd.read_csv(data_dict_path)

# Print the raw wide table to confirm that layout
print(df_dep.head())

# Extract the department names from the first row (skipping the label column)
department_names = df_dep.iloc[0, 1:].values.tolist()

# Take the IDs from the column headers instead of generating 1..N, so every
# name stays paired with the ID the file assigns to it even if the IDs are
# not consecutive or the columns are reordered. read_csv keeps header labels
# as strings, hence the int() conversion.
department_ids = [int(column_label) for column_label in df_dep.columns[1:]]

# Create a new long-format dataframe with department_id and department_name
df_dep_t = pd.DataFrame({'department_id': department_ids, 'department_name': department_names})

# Finding the meaning of the department_id value 4
department_meaning = df_dep_t.loc[df_dep_t['department_id'] == 4, 'department_name'].iloc[0]
print(f"The meaning of department_id 4 is {department_meaning}.")


  department_id       1      2       3        4        5              6  \
0    department  frozen  other  bakery  produce  alcohol  international   

           7     8                9  ...            12      13         14  \
0  beverages  pets  dry goods pasta  ...  meat seafood  pantry  breakfast   

             15          16         17      18      19    20       21  
0  canned goods  dairy eggs  household  babies  snacks  deli  missing  

[1 rows x 22 columns]
The meaning of department_id 4 is produce.


In [22]:
# Creating a subset for breakfast items:
# 1) find the ID of the department named 'breakfast' in the lookup table
is_breakfast_department = df_dep_t['department_name'] == 'breakfast'
breakfast_department_id = df_dep_t[is_breakfast_department]['department_id'].values[0]

# 2) keep only the products that belong to that department
breakfast_items = df_prods[df_prods['department_id'] == breakfast_department_id]


In [23]:
# Creating a subset for dinner party items:
# 1) the departments that make up the "dinner party" selection
dinner_party_departments = ['alcohol', 'deli', 'beverages', 'meat seafood']

# 2) translate those department names into their IDs via the lookup table
is_dinner_party_department = df_dep_t['department_name'].isin(dinner_party_departments)
dinner_party_department_ids = df_dep_t[is_dinner_party_department]['department_id'].values

# 3) keep only the products whose department is one of those IDs
dinner_party_items = df_prods[df_prods['department_id'].isin(dinner_party_department_ids)]


In [24]:
# Row count of the dinner party subset, taken from the length of its index
num_rows_dinner_party = len(dinner_party_items.index)
print(f"The dinner_party_items dataframe has {num_rows_dinner_party} rows.")


The dinner_party_items dataframe has 7650 rows.


In [25]:
# Selecting every order row that belongs to user_id 1
is_user_one = df_ords['user_id'] == 1
user_info = df_ords.loc[is_user_one]


In [26]:
# Basic stats for user_id 1.
# describe() with its default arguments summarises only the numeric columns of
# a mixed-dtype frame, so 'order_id' (cast to string earlier) is not included.
# Each column's 'count' is its number of non-missing values.
user_stats = user_info.describe()
print(user_stats)


       user_id  order_number  day_of_week  order_hour_of_day  \
count     11.0     11.000000    11.000000          11.000000   
mean       1.0      6.000000     2.636364          10.090909   
std        0.0      3.316625     1.286291           3.477198   
min        1.0      1.000000     1.000000           7.000000   
25%        1.0      3.500000     1.500000           7.500000   
50%        1.0      6.000000     3.000000           8.000000   
75%        1.0      8.500000     4.000000          13.000000   
max        1.0     11.000000     4.000000          16.000000   

       days_since_prior_order  
count               10.000000  
mean                19.000000  
std                  9.030811  
min                  0.000000  
25%                 14.250000  
50%                 19.500000  
75%                 26.250000  
max                 30.000000  


## Exporting DataFrames

In [28]:
# Exporting the wrangled dataframes to the "Prepared Data" folder, which sits
# next to the folder that project_path points to (hence the ".." step).
prepared_data_path = os.path.join(project_path, "..", "Prepared Data")

# exist_ok=True creates the folder when it is missing and does nothing when it
# is already there, so no separate existence check is needed
os.makedirs(prepared_data_path, exist_ok=True)

# Exporting the df_ords dataframe; index=False leaves the row index out of the file
df_ords.to_csv(os.path.join(prepared_data_path, "orders_wrangled.csv"), index=False)

# Exporting df_dep_t, the reshaped departments table
# (columns: department_id, department_name)
df_dep_t.to_csv(os.path.join(prepared_data_path, "departments_wrangled.csv"), index=False)
