***1. Importing Libraries***

In [4]:
import pandas as pd
import numpy as np
import os

***2. Importing Data***

In [6]:
#create path
#The project folder can be overridden with the INSTACART_PROJECT_PATH environment variable,
#so the notebook also runs on other machines; when it is not set, the original local folder is used.
path=os.environ.get('INSTACART_PROJECT_PATH', r'C:\Users\lisac\10-10-2024 Instacart Basket Analysis')

In [7]:
#import orders csv
#index_col=False tells pandas not to use the first CSV column as the index
orders_csv=os.path.join(path, '02 Data', 'Original Data', 'orders.csv')
df_ords=pd.read_csv(orders_csv, index_col=False)

In [8]:
#import products csv
#index_col=False tells pandas not to use the first CSV column as the index
products_csv=os.path.join(path, '02 Data', 'Original Data', 'products.csv')
df_prods=pd.read_csv(products_csv, index_col=False)

In [9]:
#import departments csv
#index_col=False tells pandas not to use the first CSV column as the index
departments_csv=os.path.join(path, '02 Data', 'Original Data', 'departments.csv')
df_deps=pd.read_csv(departments_csv, index_col=False)

***3. Wrangling Data***

***3.1 Dropping column***

In [12]:
#Previewing df_ords without the eval_set column
#NOTE: .drop() returns a new dataframe and the result is not assigned here,
#so df_ords itself still contains eval_set after this cell
df_ords.drop(['eval_set'], axis='columns')

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


***3.2 Renaming Column***

In [14]:
#Renaming from order_dow to order_day_of_week
#The renamed frame is bound back to df_ords, so the change is permanent
df_ords=df_ords.rename(columns={'order_dow': 'order_day_of_week'})

In [15]:
#Checking the first five rows to confirm the column was renamed
df_ords.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


Observation:  correctly renamed column to order_day_of_week 

***3.3 Changing Data Type***

In [18]:
#Running descriptive statistics first to see which columns pandas currently treats as numeric
#(order_id appears here, so it is numeric at this point)
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [19]:
#Changing order_id from numeric to string datatype, which .describe() function will ignore
#(order_id is an identifier, so summary statistics on it are not meaningful)
df_ords['order_id']=df_ords['order_id'].astype(str)

In [20]:
#Checking to see if changes were made correctly:
#order_id should no longer appear among the numeric columns summarized by .describe()
df_ords.describe()

Unnamed: 0,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,102978.2,17.15486,2.776219,13.45202,11.11484
std,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,0.0,0.0,0.0
25%,51394.0,5.0,1.0,10.0,4.0
50%,102689.0,11.0,3.0,13.0,7.0
75%,154385.0,23.0,5.0,16.0,15.0
max,206209.0,100.0,6.0,23.0,30.0


Observation:  correctly changed order_id datatype from numeric to string; the .describe() function correctly did not include it in the descriptive analysis

***3.4 Transposing Data***

In [23]:
#Looking at the layout of the departments data before transposing it
#(the department ids are spread across the columns, i.e. wide format)
df_deps.head(n=5)

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [24]:
#Transposing from Wide format to Long format
#(preview only: the transposed frame is displayed but not stored)
df_deps.transpose()

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [25]:
#Storing the long-format (transposed) data in a new dataframe, df_deps_t; df_deps is left unchanged
df_deps_t=df_deps.transpose()

In [26]:
#Displaying the transposed dataframe to confirm it is in long format
df_deps_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [27]:
#Previewing the dataframe with a fresh numeric index; the old index becomes a regular "index" column
#(the result is not assigned, so df_deps_t itself is unchanged)
df_deps_t.reset_index(drop=False)

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


Observation: Long format intact; can start the procedure for 1) creating a new header, 2) removing the first row of the df_deps_t dataframe and 3) setting the new header permanently

In [29]:
#1 Creating new header: an object that holds the first row of the transposed dataframe df_deps_t
new_header=df_deps_t.iloc[0, :]

In [30]:
#Checking that new_header was created and holds the first row of df_deps_t
new_header

0    department
Name: department_id, dtype: object

In [31]:
#2 Removing the first row of df_deps_t by keeping only the rows from position 1 onward
df_deps_t_new=df_deps_t.iloc[1:]

In [32]:
#Checking that the former first row (department_id / department) is gone
df_deps_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


Observation:  removed the first row (the old "department_id / department" header row), so the data now starts at department id 1

In [34]:
#3 Using new_header (the former first row of df_deps_t) as the column labels of df_deps_t_new
#This assigns to the .columns attribute, so df_deps_t_new is modified directly
df_deps_t_new.columns=new_header

In [35]:
#Checking to see if the new header is now in the df_deps_t_new dataframe
df_deps_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


Observation: index is no longer in wide format, new header correctly listed (department_id and department) and first row starts at 1

***4. Creating Data Dictionary***

In [38]:
#Creating the data dictionary from the df_deps_t_new dataframe
#Each index label (department id) becomes a key mapping to {'department': <name>};
#the keys are strings such as '4' because the ids were column headers in the csv
data_dict=df_deps_t_new.to_dict(orient='index')

In [39]:
#Previewing the first five rows of df_prods
#(it has a department_id column whose values can be looked up in data_dict)
df_prods.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


Observation: The .head() query shows the first five rows of the df_prods dataframe. The data dictionary is not part of this output; it is a separate object (data_dict) that translates the department_id values seen here into department names.

***Steps 4-10 from Task 4.4 will be actioned below***

In [42]:
#Step 2:  Identify variable in df_ords dataframe to change datatype
#.describe() lists the columns that are currently numeric, i.e. the candidates for a datatype change
df_ords.describe()

Unnamed: 0,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,102978.2,17.15486,2.776219,13.45202,11.11484
std,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,0.0,0.0,0.0
25%,51394.0,5.0,1.0,10.0,4.0
50%,102689.0,11.0,3.0,13.0,7.0
75%,154385.0,23.0,5.0,16.0,15.0
max,206209.0,100.0,6.0,23.0,30.0


Change the variable order_day_of_week from numeric to category by using pandas mapping dictionary

In [44]:
# Create a mapping dictionary for days of the week
# Keys are the numeric day codes 0-6, values are the day names.
# NOTE(review): assumes code 0 means Sunday - confirm against the dataset documentation.
day_names = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
day_mapping = dict(enumerate(day_names))

In [45]:
# Translate the numeric day codes into day names and store them in a NEW column,
# leaving the original order_day_of_week column untouched
weekday_labels = df_ords['order_day_of_week'].map(day_mapping)
df_ords['order_day_of_week_name'] = weekday_labels

In [46]:
# include='all' also summarizes the non-numeric columns (count / unique / top / freq),
# so the new order_day_of_week_name column shows up in the output
df_ords.describe(include='all')

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,order_day_of_week_name
count,3421083.0,3421083.0,3421083,3421083.0,3421083.0,3421083.0,3214874.0,3421083
unique,3421083.0,,3,,,,,7
top,2539329.0,,prior,,,,,Sunday
freq,1.0,,3214874,,,,,600905
mean,,102978.2,,17.15486,2.776219,13.45202,11.11484,
std,,59533.72,,17.73316,2.046829,4.226088,9.206737,
min,,1.0,,1.0,0.0,0.0,0.0,
25%,,51394.0,,5.0,1.0,10.0,4.0,
50%,,102689.0,,11.0,3.0,13.0,7.0,
75%,,154385.0,,23.0,5.0,16.0,15.0,


Observation: this command ADDED a variable order_day_of_week_name instead of replacing order_day_of_week.  
Will use the .drop command to drop the order_day_of_week column WITHOUT overwriting the dataframe.  
This column will not be visible, but will still be in the df_ords dataframe.

In [48]:
# Preview df_ords without the numeric order_day_of_week column.
# The result is not assigned, so df_ords itself is unchanged.
df_ords.drop(['order_day_of_week'], axis='columns')

Unnamed: 0,order_id,user_id,eval_set,order_number,order_hour_of_day,days_since_prior_order,order_day_of_week_name
0,2539329,1,prior,1,8,,Tuesday
1,2398795,1,prior,2,7,15.0,Wednesday
2,473747,1,prior,3,12,21.0,Wednesday
3,2254736,1,prior,4,7,29.0,Thursday
4,431534,1,prior,5,15,28.0,Thursday
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,18,29.0,Friday
3421079,1854736,206209,prior,11,10,30.0,Thursday
3421080,626363,206209,prior,12,12,18.0,Monday
3421081,2977660,206209,prior,13,12,7.0,Monday


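Observation:  The order_day_of_week column is hidden in this output, but the eval_set column is visible again. This is because the .drop() in section 3.1 was not assigned back to df_ords, so it only affected that cell's displayed output.
Will execute the command again, listing both columns inside the brackets.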

In [50]:
# Preview df_ords without eval_set and order_day_of_week.
# The result is not assigned, so both columns remain in df_ords.
hidden_columns = ['eval_set', 'order_day_of_week']
df_ords.drop(columns=hidden_columns)

Unnamed: 0,order_id,user_id,order_number,order_hour_of_day,days_since_prior_order,order_day_of_week_name
0,2539329,1,1,8,,Tuesday
1,2398795,1,2,7,15.0,Wednesday
2,473747,1,3,12,21.0,Wednesday
3,2254736,1,4,7,29.0,Thursday
4,431534,1,5,15,28.0,Thursday
...,...,...,...,...,...,...
3421078,2266710,206209,10,18,29.0,Friday
3421079,1854736,206209,11,10,30.0,Thursday
3421080,626363,206209,12,12,18.0,Monday
3421081,2977660,206209,13,12,7.0,Monday


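Observation:  Both eval_set and order_day_of_week are hidden in the displayed output; the df_ords dataframe itself still contains them because the result was not assigned.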

In [52]:
#Step 3:  Look for an unintuitive variable name and change it
#Change user_id to customer_id
#NOTE: the renamed frame is bound back to df_ords, so this change is permanent;
#later cells rely on the customer_id name
df_ords=df_ords.rename(columns={'user_id': 'customer_id'})

In [53]:
#Checking the first five rows to see if the name change happened
df_ords.head(n=5)

Unnamed: 0,order_id,customer_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,order_day_of_week_name
0,2539329,1,prior,1,2,8,,Tuesday
1,2398795,1,prior,2,3,7,15.0,Wednesday
2,473747,1,prior,3,3,12,21.0,Wednesday
3,2254736,1,prior,4,4,7,29.0,Thursday
4,431534,1,prior,5,4,15,28.0,Thursday


In [54]:
#Noticed eval_set and order_day_of_week columns were visible again, used .drop to hide columns from view
#(display only: the result is not assigned, so df_ords keeps both columns)
columns_to_hide = ['eval_set', 'order_day_of_week']
df_ords.drop(columns=columns_to_hide)

Unnamed: 0,order_id,customer_id,order_number,order_hour_of_day,days_since_prior_order,order_day_of_week_name
0,2539329,1,1,8,,Tuesday
1,2398795,1,2,7,15.0,Wednesday
2,473747,1,3,12,21.0,Wednesday
3,2254736,1,4,7,29.0,Thursday
4,431534,1,5,15,28.0,Thursday
...,...,...,...,...,...,...
3421078,2266710,206209,10,18,29.0,Friday
3421079,1854736,206209,11,10,30.0,Thursday
3421080,626363,206209,12,12,18.0,Monday
3421081,2977660,206209,13,12,7.0,Monday


In [55]:
#Step 4:  Find the busiest hour for placing orders
#Using the value_counts() method on the order_hour_of_day column

In [56]:
# Find the busiest order hour and count the number of orders
# Count the orders per hour once and reuse the result for both lookups
# (the original computed value_counts() twice over the full dataframe)
orders_per_hour = df_ords['order_hour_of_day'].value_counts()
busiest_hour = orders_per_hour.idxmax()    # hour with the most orders
busiest_count = orders_per_hour.max()      # number of orders placed in that hour

In [57]:
# Report the busiest hour and its order count in one sentence
busiest_hour_message = f"The busiest order hour is {busiest_hour} with {busiest_count} orders."
print(busiest_hour_message)

The busiest order hour is 10 with 288418 orders.


In [58]:
#Step 5:  In df_prods dataframe, find the meaning behind the value of 4 in the department_id column
#Using the data dictionary (data_dict)

In [59]:
#Displaying the data dictionary to look up the department name for each department id
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

Observation:  Value of 4 in department_id is "produce"

In [61]:
#Step 6:  Create a subset containing only the breakfast item sales
#First build the True/False mask that marks the rows whose department_id is 14 (breakfast)
df_prods['department_id'].eq(14)

0        False
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [62]:
#Keep the breakfast rows (department_id 14) in a named subset so they can be reused,
#then display it; previously the filtered frame was shown but never stored
df_breakfast=df_prods[df_prods['department_id']==14]
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


In [129]:
#Step 7:  Create a subset containing catering (dinner party) items from alcohol 5, beverages 7, meat seafood 12, deli 20
catering_department_ids = [5, 7, 12, 20]
is_catering_item = df_prods['department_id'].isin(catering_department_ids)
df_catering = df_prods[is_catering_item]


In [131]:
# Preview the first five rows of the catering subset
df_catering.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1


In [136]:
#Step 8:  Track the number of rows in the df_catering dataframe
#(the first element of .shape is the row count)
df_catering.shape[0]

7650

In [142]:
#Step 9:  Extract all information on user_id "1"
#Filtering data and calling this subset customer_data (user_id changed to customer_id in Step 3)
is_customer_1 = df_ords['customer_id'] == 1
customer_data = df_ords.loc[is_customer_1]

In [144]:
# Display the subset as the cell's last expression so Jupyter renders it as a table,
# consistent with the other cells; print() produced a wrapped plain-text dump instead
customer_data

   order_id  customer_id eval_set  order_number  order_day_of_week  \
0   2539329            1    prior             1                  2   
1   2398795            1    prior             2                  3   
2    473747            1    prior             3                  3   
3   2254736            1    prior             4                  4   
4    431534            1    prior             5                  4   
5   3367565            1    prior             6                  2   
6    550135            1    prior             7                  1   
7   3108588            1    prior             8                  1   
8   2295261            1    prior             9                  1   
9   2550362            1    prior            10                  4   
10  1187899            1    train            11                  4   

    order_hour_of_day  days_since_prior_order order_day_of_week_name  
0                   8                     NaN                Tuesday  
1                

In [146]:
#Step 10:  Provide basic stats on customer_id "1"
#Shown as the cell's last expression (rich table) instead of print(), consistent with the other cells
customer_data.describe()

       customer_id  order_number  order_day_of_week  order_hour_of_day  \
count         11.0     11.000000          11.000000          11.000000   
mean           1.0      6.000000           2.636364          10.090909   
std            0.0      3.316625           1.286291           3.477198   
min            1.0      1.000000           1.000000           7.000000   
25%            1.0      3.500000           1.500000           7.500000   
50%            1.0      6.000000           3.000000           8.000000   
75%            1.0      8.500000           4.000000          13.000000   
max            1.0     11.000000           4.000000          16.000000   

       days_since_prior_order  
count               10.000000  
mean                19.000000  
std                  9.030811  
min                  0.000000  
25%                 14.250000  
50%                 19.500000  
75%                 26.250000  
max                 30.000000  


In [148]:
#Step 12: Export new dataframe to Prepared Data folder:
#index=False keeps the row index out of the file; otherwise it is written as an extra
#unnamed first column that shows up as a data column when the csv is read back with index_col=False
df_ords.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'), index=False)