# Table of Contents

1) Import libraries and data

2) Dropping 'eval_set' from orders data

3) Change data type of user_id in orders data

4) Renaming columns in orders data

5) Transform Departments Data

6) Create Data Dictionary for Department data

7) Find the busiest hours for placing orders. 

8) Create a subset of breakfast items

9) Create a subset of dinner party items

10) Export data

## 1)
Import libraries and data

In [1]:
#import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# define data path
# Absolute path of the folder that holds the original CSV files; the import
# cells join it with each file name. The r'' prefix keeps the Windows
# backslashes literal.
path = r'C:\Users\Owner\Documents\Career Foundry\Tasks\Data Immersion Tasks\Instacart Project\2 Data\original data'

In [3]:
# Load orders.csv from the data folder into df_ords.
# index_col=False tells pandas not to use the first column as the index.
df_ords = pd.read_csv(
    filepath_or_buffer=os.path.join(path, 'orders.csv'),
    index_col=False,
)

In [4]:
# Load products.csv from the data folder into df_prods.
# index_col=False tells pandas not to use the first column as the index.
df_prods = pd.read_csv(
    filepath_or_buffer=os.path.join(path, 'products.csv'),
    index_col=False,
)

In [5]:
# Load departments.csv from the data folder into df_dep.
# index_col=False tells pandas not to use the first column as the index.
df_dep = pd.read_csv(
    filepath_or_buffer=os.path.join(path, 'departments.csv'),
    index_col=False,
)

## 2
Dropping 'eval_set' from orders data

In [52]:
# Remove the eval_set column from the orders data; drop() returns a new
# frame, so the result is bound back to df_ords.
df_ords = df_ords.drop(labels=['eval_set'], axis='columns')

In [6]:
# Count every distinct value in 'days_since_prior_order', most frequent first.
# dropna=False keeps NaN as its own row so missing values show up in the tally.
df_ords['days_since_prior_order'].value_counts(
    sort=True,
    ascending=False,
    dropna=False,
)

30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: days_since_prior_order, dtype: int64

## 3
Change data type of user_id in orders data

In [None]:
# user_id is an identifier, not a quantity, so store it as a string rather
# than an integer (no arithmetic will ever be done on it).
df_ords['user_id'] = df_ords['user_id'].astype(str)

## 4
Renaming columns in orders data

In [7]:
# Give order_dow a more descriptive name (orders_day_of_week); the change is
# applied to df_ords in place.
df_ords.rename(
    mapper={'order_dow': 'orders_day_of_week'},
    axis='columns',
    inplace=True,
)

In [8]:
# preview the first five rows of the orders data to confirm the rename
df_ords.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


## 5
Transform Departments Data

In [9]:
# preview the department data as imported (first five rows)
df_dep.head(n=5)

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [11]:
# Transpose df_dep (swap rows and columns) and keep the result as df_dep_t.
df_dep_t = df_dep.transpose()

In [12]:
# Show df_dep_t with its index moved into a regular column.
# The result is not assigned, so df_dep_t itself is left unchanged.
df_dep_t.reset_index(drop=False)

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [13]:
# Take the first row of df_dep_t (position 0); its values become the column
# headers of the cleaned department table later on.
new_header = df_dep_t.iloc[0, :]

In [14]:
# Drop the header row (position 0) by keeping only row 1 and beyond in a new
# dataframe.
df_dep_t_new = df_dep_t.iloc[1:]

In [15]:
# set new header as the column headers for df_dep_t_new by renaming columns
# (new_header is the row taken from df_dep_t.iloc[0], so its values replace
# the existing column labels of df_dep_t_new)
df_dep_t_new.columns = new_header

In [16]:
# confirm df_dep_t_new looks as expected: one row per department id with a
# single 'department' column
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


## 6
Create Data Dictionary for Department data

In [34]:
# Build a data dictionary from df_dep_t_new: orient='index' yields one entry
# per index label, each mapping column name -> value.
data_dict = df_dep_t_new.to_dict(orient='index')

In [35]:
# view the data dictionary created from df_dep_t_new
# (keys are the department ids as strings, e.g. '19')
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [36]:
# Preview the products data to find department_id values (for example 19)
# that can be looked up in the data dictionary.
df_prods.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [37]:
# See what value the data dictionary holds for department 19.
# The key is the string '19' because the dictionary keys are strings.
dept_19 = data_dict.get('19')
print(dept_19)

{'department': 'snacks'}


In [17]:
# Select every row of the products data whose department_id equals 19,
# using a boolean mask that is True for the matching rows.
df_prods.loc[df_prods['department_id'].eq(19)]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


## 7)
Find the busiest hours for placing orders. 

In [36]:
# Count the orders placed in each hour of the day, busiest hour first.
# ANSWER : 10 am is the busiest time of the day for orders
df_ords['order_hour_of_day'].value_counts(
    sort=True,
    ascending=False,
    dropna=False,
)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_hour_of_day, dtype: int64

## 8)
The sales team in your client’s organization wants to know more about breakfast item sales. Create a subset containing only the required information.

In [40]:
# find the department id associated with breakfast by inspecting the
# department table
# breakfast is 14
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [None]:
#create a subset of the products data called df_prod_breakfast that only
# includes products from the breakfast department (id 14)

In [43]:
# keep only the products whose department_id is 14 (breakfast)
df_prod_breakfast = df_prods.loc[df_prods['department_id'].eq(14)]

## 9) 
They’d also like to see details about customers who might be throwing dinner parties. Your task is to find all observations from the entire dataframe that include items from the following departments: alcohol, deli, beverages, and meat/seafood. You’ll need to present this subset to your client.

In [46]:
# Department ids for the dinner-party categories:
#   alcohol (5), beverages (7), meat/seafood (12), deli (20)
# Build the subset df_prods_din_party with only the products from these
# departments.
df_prods_din_party = df_prods[df_prods['department_id'].isin([5, 7, 12, 20])]

In [49]:
# find the number of rows in df_prods_din_party
# (.shape is a (rows, columns) tuple)
# ANSWER: 7650 rows
df_prods_din_party.shape

(7650, 5)

## 10 
Export your df_ords dataframe and df_dep_t_new

In [70]:
# define path2 to be less specific than path: it points at the '2 Data'
# folder, so sub-folders such as 'prepared data' can be joined onto it
path2 = r'C:\Users\Owner\Documents\Career Foundry\Tasks\Data Immersion Tasks\Instacart Project\2 Data'

In [71]:
# Write df_ords to the 'prepared data' folder as orders_wrangled.csv.
# NOTE: to_csv writes the row index as the first column by default.
df_ords.to_csv(
    path_or_buf=os.path.join(path2, 'prepared data', 'orders_wrangled.csv'),
)

In [72]:
# Write df_dep_t_new to the 'prepared data' folder as departments_wrangled.csv.
# NOTE: to_csv writes the index (here the department ids) as the first column
# by default.
df_dep_t_new.to_csv(
    path_or_buf=os.path.join(path2, 'prepared data', 'departments_wrangled.csv'),
)