# 4.4 - Data Wrangling & Subsetting - Part 1







### This script contains the following points:

* 01 - Importing Libraries
* 02 - Importing Data
* 03 - Wrangling Procedures
    * a) Looking for missing values (NaN)
    * b) Renaming column names
    * c) Changing Variable’s Data Types
    * d) Transposing Data
* 04 - Creating Subsets
* 05 - Exporting Data

---

# 01 - Importing Libraries

In [85]:
# Import libraries

import numpy as np
import pandas as pd
import os

---

# 02 - Importing Data

In [86]:
# Define path

path = r'/Users/juanigalvalisi/01-07-2022 - Instacart Basket Analysis'

In [87]:
# Import CSVs

df_ords = pd.read_csv(r'/Users/juanigalvalisi/01-07-2022 - Instacart Basket Analysis/02 - Data/Original Data/orders.csv', index_col = False)

In [88]:
df_prods = pd.read_csv(r'/Users/juanigalvalisi/01-07-2022 - Instacart Basket Analysis/02 - Data/Original Data/products.csv', index_col = False)

---

# 03 - Wrangling Procedures

In [89]:
# Dropp eval_set column from orders.csv

df_ords.drop(columns = ['eval_set'])

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


#### a) Looking for missing values (NaN)

In [90]:
# Look for missing values (NaN)

df_ords['days_since_prior_order'].value_counts(dropna = False)

30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: days_since_prior_order, dtype: int64

#### b) Renaming column names

In [91]:
# Rename 'order_dow' column name

df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)

In [92]:
# Check whether the code was executed successfully

df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


#### c) Changing Variable’s Data Types

In [93]:
# Change the data type to 'str' because they act as a key

df_ords['order_id'] = df_ords['order_id'].astype('str')
df_ords['user_id'] = df_ords['user_id'].astype('str')

In [94]:
# Change the data type to reduce size

df_ords['order_number'] = df_ords['order_number'].astype('int8')
df_ords['orders_day_of_week'] = df_ords['orders_day_of_week'].astype('int8')
df_ords['order_hour_of_day'] = df_ords['order_hour_of_day'].astype('int8')
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].astype('float16')

In [95]:
# Check the output

df_ords.dtypes

order_id                   object
user_id                    object
eval_set                   object
order_number                 int8
orders_day_of_week           int8
order_hour_of_day            int8
days_since_prior_order    float16
dtype: object

In [96]:
# Return the data type of your new 'order_id' column
# Use the dtype() function whenever you want to check the data type of a single column

df_ords['order_id'].dtype

dtype('O')

#### d) Transposing Data

In [97]:
# Imoporting data set departments.csv

df_dep = pd.read_csv(r'/Users/juanigalvalisi/01-07-2022 - Instacart Basket Analysis/02 - Data/Original Data/departments.csv', index_col = False)

In [98]:
# Function Transposing Data (only view)

df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [99]:
# Function Transposing Data (overwrite)

df_dep_t = df_dep.T

In [100]:
# Check the output

df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [101]:
# To get rid of the “0” and turn the first row of your dataframe into your headers

df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [102]:
# Create a new header for your dataframe

In [103]:
# 1) Create a new header: Take the first row of table_name for the header

new_header = df_dep_t.iloc[0]

In [104]:
# Check the output

new_header

0    department
Name: department_id, dtype: object

In [105]:
# 2) Remove the first row (copy everything from the first row onward
# from the df_dep_tdataframe into a new dataframe, df_dep_t_new)

df_dep_t_new = df_dep_t[1:]

In [106]:
# Check the output

df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [107]:
# 3) Add a new header: Tell Python to use the list of column names in your new_header variable as your new header

df_dep_t_new.columns = new_header

In [108]:
# Check the output

df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [109]:
# Creating Dictionaries

data_dict = df_dep_t_new.to_dict('index')

In [110]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [111]:
# Check the output

df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [112]:
print(data_dict.get('19'))

{'department': 'snacks'}


---

# 04 - Creating Subsets

In [113]:
# Create a subset for your df_prods dataframe that only contains data from the snacks department

df_snacks =  df_prods[df_prods['department_id'] == 19]

In [114]:
# Part I - Simply searches for the data in question (assigning each value
# either True or False depending on whether it meets the criteria)

df_prods['department_id'] == 19

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [115]:
# Part II - A list of only those values within df_prods that are true

df_prods[df_prods['department_id'] == 19]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [116]:
# Part III - Full comamnd

df_snacks = df_prods[df_prods['department_id'] == 19]

In [117]:
# Check the output

df_snacks

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [118]:
# OPTION B - Using LOC function

df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19]

In [119]:
# Check the output

df_snacks_2

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [120]:
# OPTION C - Using LOC + ISIN functions: tell the loc function to look into a list: isin([19])

df_snacks_3 = df_prods.loc[df_prods['department_id'].isin([19])]

In [121]:
# Check the output

df_snacks_3

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


---

# 05 - Exporting Data

In [122]:
# Export df_ords as .csv 

df_ords.to_csv(os.path.join(path, '02 - Data','Prepared Data', 'orders_wrangled.csv'))